From ac44e7b7793c9839acb1a38f1dad4d078d1d6725 Mon Sep 17 00:00:00 2001
From: Chunwei
Date: Thu, 12 Sep 2019 19:55:50 +0800
Subject: [PATCH] init from wiki

---
 .gitmodules | 12 -
 CMakeLists.txt | 183 -
 Home.md | 54 +
 README.md | 74 -
 README_cn.md | 62 -
 add_new_operation.md | 189 +
 architecture-intro.md | 247 +
 architecture.md | 94 +
 benchmark.md | 162 +
 benchmark_tools.md | 196 +
 benchmark_tools.md.toc.2019-08-25_233116 | 11 +
 benchmark_tools.md.toc.2019-08-25_233528 | 11 +
 cmake/FindGflags.cmake | 582 -
 cmake/FindGlog.cmake | 24 -
 cmake/FindGperftools.cmake | 63 -
 cmake/FindJeMalloc.cmake | 28 -
 cmake/FindNumPy.cmake | 38 -
 cmake/cblas.cmake | 94 -
 cmake/ccache.cmake | 9 -
 cmake/configure.cmake | 160 -
 cmake/coveralls.cmake | 103 -
 cmake/coverallsGcovJsons.cmake | 401 -
 cmake/cross_compiling/android.cmake | 85 -
 cmake/cross_compiling/armlinux.cmake | 41 -
 cmake/cross_compiling/findar.cmake | 33 -
 cmake/cross_compiling/host.cmake | 48 -
 cmake/cross_compiling/ios.cmake | 692 --
 cmake/cross_compiling/npu.cmake | 90 -
 cmake/cross_compiling/postproject.cmake | 99 -
 cmake/cross_compiling/preproject.cmake | 59 -
 cmake/cuda.cmake | 228 -
 cmake/cudnn.cmake | 99 -
 cmake/cupti.cmake | 41 -
 cmake/external/eigen.cmake | 54 -
 cmake/external/gflags.cmake | 75 -
 cmake/external/glog.cmake | 77 -
 cmake/external/gtest.cmake | 86 -
 cmake/external/libxsmm.cmake | 55 -
 cmake/external/mkldnn.cmake | 120 -
 cmake/external/mklml.cmake | 77 -
 cmake/external/openblas.cmake | 93 -
 cmake/external/opencl-clhpp.cmake | 36 -
 cmake/external/opencl-headers.cmake | 33 -
 cmake/external/protobuf.cmake | 308 -
 cmake/external/xbyak.cmake | 57 -
 cmake/external/xxhash.cmake | 73 -
 cmake/flags.cmake | 194 -
 cmake/generic.cmake | 567 -
 cmake/hip.cmake | 53 -
 cmake/lite.cmake | 435 -
 cmake/lite_utils.cmake | 56 -
 cmake/make_resource.py | 25 -
 cmake/operators.cmake | 227 -
 cmake/package.cmake | 21 -
 cmake/simd.cmake | 99 -
 cmake/system.cmake | 89 -
 cmake/tensorrt.cmake | 38 -
 cmake/util.cmake | 55 -
 cmake/version.cmake | 66 -
 cpp_demo.md | 271 +
 cxx_api.md | 63 +
 debug_tools.md | 77 +
 demos.md.toc.2019-08-26_222115 | 19 +
 demos.md.toc.2019-08-26_222307 | 19 +
 for-developer.md | 15 +
 fpga.md | 107 +
 images/architecture.jpg | Bin 0 -> 80268 bytes
 images/benchmark_result.png | Bin 0 -> 160704 bytes
 images/img_mobilenetv1_inference.png | Bin 0 -> 72038 bytes
 images/lite1.png | Bin 0 -> 258476 bytes
 images/model_quan_fig.png | Bin 0 -> 315888 bytes
 images/model_quan_table1.png | Bin 0 -> 128197 bytes
 images/phone_list.png | Bin 0 -> 16772 bytes
 images/run_benchmark.png | Bin 0 -> 136012 bytes
 java_demo.md | 112 +
 lite/CMakeLists.txt | 159 -
 lite/api/CMakeLists.txt | 239 -
 lite/api/_paddle_use_kernels.h | 209 -
 lite/api/_paddle_use_ops.h | 127 -
 lite/api/android/.gitignore | 2 -
 lite/api/android/CMakeLists.txt | 5 -
 lite/api/android/jni/.gitignore | 3 -
 lite/api/android/jni/CMakeLists.txt | 52 -
 lite/api/android/jni/native/CMakeLists.txt | 32 -
 .../api/android/jni/native/convert_util_jni.h | 197 -
 .../api/android/jni/native/paddle_lite_jni.cc | 164 -
 lite/api/android/jni/native/paddle_lite_jni.h | 113 -
 lite/api/android/jni/native/tensor_jni.cc | 168 -
 lite/api/android/jni/native/tensor_jni.h | 90 -
 .../jni/src/com/baidu/paddle/lite/.gitignore | 2 -
 .../src/com/baidu/paddle/lite/ConfigBase.java | 31 -
 .../src/com/baidu/paddle/lite/CxxConfig.java | 39 -
 .../com/baidu/paddle/lite/MobileConfig.java | 69 -
 .../paddle/lite/PaddleLiteInitializer.java | 23 -
 .../baidu/paddle/lite/PaddlePredictor.java | 192 -
 .../jni/src/com/baidu/paddle/lite/Place.java | 148 -
 .../src/com/baidu/paddle/lite/PowerMode.java | 36 -
 .../jni/src/com/baidu/paddle/lite/Tensor.java | 141 -
 .../paddle/lite/PaddlePredictorTest.java | 54 -
 lite/api/apis_test.cc | 118 -
 lite/api/benchmark.cc | 190 -
 lite/api/cxx_api.cc | 177 -
 lite/api/cxx_api.h | 173 -
 lite/api/cxx_api_bin.cc | 129 -
 lite/api/cxx_api_impl.cc | 90 -
 lite/api/cxx_api_test.cc | 157 -
 lite/api/detection_model_test.cc | 137 -
 lite/api/efficientnet_b0_test.cc | 102 -
 lite/api/inceptionv4_test.cc | 94 -
 lite/api/light_api.cc | 95 -
 lite/api/light_api.h | 80 -
 lite/api/light_api_impl.cc | 79 -
 lite/api/light_api_test.cc | 90 -
 lite/api/lite_api_test_helper.cc | 62 -
 lite/api/lite_api_test_helper.h | 31 -
 lite/api/mobilenetv1_int8_test.cc | 89 -
 lite/api/mobilenetv1_ssd_test.cc | 112 -
 lite/api/mobilenetv1_test.cc | 145 -
 lite/api/mobilenetv1_yolov3_test.cc | 119 -
 lite/api/mobilenetv2_test.cc | 147 -
 lite/api/model_optimize_tool.cc | 119 -
 lite/api/model_run_test_image.cc | 79 -
 lite/api/model_test.cc | 181 -
 lite/api/ocr_attention_test.cc | 115 -
 lite/api/paddle_api.cc | 73 -
 lite/api/paddle_api.h | 167 -
 lite/api/paddle_api_test.cc | 122 -
 lite/api/paddle_lite_factory_helper.h | 37 -
 lite/api/paddle_place.cc | 117 -
 lite/api/paddle_place.h | 164 -
 lite/api/paddle_use_passes.h | 41 -
 lite/api/resnet18_test.cc | 88 -
 lite/api/resnet50_test.cc | 107 -
 lite/api/resnet50_test_fpga.cc | 63 -
 lite/api/shufflenetv2_test.cc | 92 -
 lite/api/test_googlenet_lite.cc | 80 -
 lite/api/test_helper.h | 40 -
 lite/api/test_inceptionv4_lite_x86.cc | 112 -
 lite/api/test_mobilenetv1_lite_x86.cc | 109 -
 lite/api/test_mobilenetv2_lite_x86.cc | 112 -
 lite/api/unet_test.cc | 106 -
 lite/backends/CMakeLists.txt | 7 -
 lite/backends/arm/CMakeLists.txt | 1 -
 lite/backends/arm/math/CMakeLists.txt | 111 -
 lite/backends/arm/math/activation.cc | 698 --
 lite/backends/arm/math/activation.h | 71 -
 lite/backends/arm/math/affine_channel.cc | 69 -
 lite/backends/arm/math/affine_channel.h | 41 -
 lite/backends/arm/math/anchor_generator.cc | 82 -
 lite/backends/arm/math/anchor_generator.h | 41 -
 lite/backends/arm/math/argmax.cc | 65 -
 lite/backends/arm/math/argmax.h | 35 -
 lite/backends/arm/math/axpy.cc | 203 -
 lite/backends/arm/math/axpy.h | 49 -
 lite/backends/arm/math/beam_search.cc | 271 -
 lite/backends/arm/math/beam_search.h | 41 -
 lite/backends/arm/math/box_coder.cc | 92 -
 lite/backends/arm/math/box_coder.h | 36 -
 lite/backends/arm/math/col_im_transform.cc | 75 -
 lite/backends/arm/math/col_im_transform.h | 40 -
 lite/backends/arm/math/concat.cc | 60 -
 lite/backends/arm/math/concat.h | 35 -
 .../arm/math/conv3x3s1_direct_int8.cc | 806 --
 .../arm/math/conv3x3s2_direct_int8.cc | 1081 --
 lite/backends/arm/math/conv_block_utils.h | 4292 --------
 lite/backends/arm/math/conv_depthwise.cc | 239 -
 lite/backends/arm/math/conv_depthwise.h | 100 -
 .../arm/math/conv_depthwise_3x3_int8.cc | 5832 ----------
 .../backends/arm/math/conv_depthwise_3x3p0.cc | 4178 -------
 .../backends/arm/math/conv_depthwise_3x3p1.cc | 4850 ---------
 .../backends/arm/math/conv_depthwise_5x5s1.cc | 9615 -----------------
 .../arm/math/conv_depthwise_5x5s1_int8.cc | 618 --
 .../backends/arm/math/conv_depthwise_5x5s2.cc | 3746 -------
 lite/backends/arm/math/conv_direct.cc | 242 -
 lite/backends/arm/math/conv_direct.h | 107 -
 lite/backends/arm/math/conv_direct_3x3s1.cc | 1067 --
 lite/backends/arm/math/conv_direct_3x3s2.cc | 1209 ---
 lite/backends/arm/math/conv_gemmlike.cc | 285 -
 lite/backends/arm/math/conv_gemmlike.h | 108 -
 lite/backends/arm/math/conv_impl.cc | 900 --
 lite/backends/arm/math/conv_impl.h | 423 -
 lite/backends/arm/math/conv_winograd.cc | 141 -
 lite/backends/arm/math/conv_winograd.h | 65 -
 lite/backends/arm/math/conv_winograd_3x3.cc | 479 -
 lite/backends/arm/math/decode_bboxes.cc | 651 --
 lite/backends/arm/math/decode_bboxes.h | 39 -
 .../backends/arm/math/dot_toolchain_support.h | 196 -
 lite/backends/arm/math/dropout.cc | 93 -
 lite/backends/arm/math/dropout.h | 32 -
 lite/backends/arm/math/elementwise.cc | 1290 ---
 lite/backends/arm/math/elementwise.h | 95 -
 lite/backends/arm/math/fill_bias_relu.cc | 122 -
 lite/backends/arm/math/fill_bias_relu.h | 44 -
 lite/backends/arm/math/funcs.cc | 153 -
 lite/backends/arm/math/funcs.h | 427 -
 lite/backends/arm/math/gemm_prepacked_int8.cc | 3942 -------
 lite/backends/arm/math/gemm_prepacked_int8.h | 94 -
 lite/backends/arm/math/gemv_arm_int8.cc | 480 -
 lite/backends/arm/math/gemv_arm_int8.h | 40 -
 lite/backends/arm/math/gru_utils.h | 434 -
 lite/backends/arm/math/im2sequence.cc | 72 -
 lite/backends/arm/math/im2sequence.h | 44 -
 lite/backends/arm/math/increment.cc | 37 -
 lite/backends/arm/math/increment.h | 33 -
 lite/backends/arm/math/interpolate.cc | 534 -
 lite/backends/arm/math/interpolate.h | 58 -
 lite/backends/arm/math/lrn.cc | 101 -
 lite/backends/arm/math/lrn.h | 49 -
 lite/backends/arm/math/negative.cc | 37 -
 lite/backends/arm/math/negative.h | 33 -
 lite/backends/arm/math/norm.cc | 52 -
 lite/backends/arm/math/norm.h | 35 -
 lite/backends/arm/math/packed_sgemm.cc | 3481 ------
 lite/backends/arm/math/packed_sgemm.h | 84 -
 lite/backends/arm/math/pad2d.cc | 413 -
 lite/backends/arm/math/pad2d.h | 71 -
 lite/backends/arm/math/pooling.cc | 3173 ------
 lite/backends/arm/math/pooling.h | 154 -
 lite/backends/arm/math/power.cc | 96 -
 lite/backends/arm/math/power.h | 33 -
 lite/backends/arm/math/prior_box.cc | 362 -
 lite/backends/arm/math/prior_box.h | 68 -
 lite/backends/arm/math/reduce_max.cc | 207 -
 lite/backends/arm/math/reduce_max.h | 89 -
 lite/backends/arm/math/reduce_mean.cc | 204 -
 lite/backends/arm/math/reduce_mean.h | 89 -
 lite/backends/arm/math/saturate.h | 320 -
 lite/backends/arm/math/scale.cc | 177 -
 lite/backends/arm/math/scale.h | 45 -
 lite/backends/arm/math/sequence2batch.h | 210 -
 lite/backends/arm/math/sequence_expand.cc | 63 -
 lite/backends/arm/math/sequence_expand.h | 35 -
 lite/backends/arm/math/sequence_pool.cc | 224 -
 lite/backends/arm/math/sequence_pool.h | 69 -
 lite/backends/arm/math/sequence_softmax.cc | 49 -
 lite/backends/arm/math/sequence_softmax.h | 34 -
 lite/backends/arm/math/sgemm.cc | 68 -
 lite/backends/arm/math/sgemm.h | 48 -
 lite/backends/arm/math/sgemv.cc | 1054 --
 lite/backends/arm/math/sgemv.h | 38 -
 lite/backends/arm/math/shuffle_channel.cc | 81 -
 lite/backends/arm/math/shuffle_channel.h | 34 -
 lite/backends/arm/math/slice.cc | 93 -
 lite/backends/arm/math/slice.h | 38 -
 lite/backends/arm/math/softmax.cc | 616 --
 lite/backends/arm/math/softmax.h | 71 -
 lite/backends/arm/math/split.cc | 85 -
 lite/backends/arm/math/split.h | 37 -
 lite/backends/arm/math/stack.cc | 55 -
 lite/backends/arm/math/stack.h | 30 -
 lite/backends/arm/math/topk.cc | 53 -
 lite/backends/arm/math/topk.h | 34 -
 lite/backends/arm/math/type_trans.cc | 919 --
 lite/backends/arm/math/type_trans.h | 117 -
 lite/backends/arm/math/yolo_box.cc | 168 -
 lite/backends/arm/math/yolo_box.h | 37 -
 lite/backends/cuda/CMakeLists.txt | 8 -
 lite/backends/cuda/blas.cc | 57 -
 lite/backends/cuda/blas.h | 99 -
 lite/backends/cuda/cuda_utils.h | 124 -
 lite/backends/cuda/math/CMakeLists.txt | 21 -
 lite/backends/cuda/math/activation.cu | 285 -
 lite/backends/cuda/math/activation.h | 58 -
 lite/backends/cuda/math/cudnn_conv.cc | 481 -
 lite/backends/cuda/math/cudnn_conv.h | 132 -
 lite/backends/cuda/math/cudnn_helper.h | 24 -
 lite/backends/cuda/math/scale.cu | 74 -
 lite/backends/cuda/math/scale.h | 37 -
 lite/backends/cuda/math/transpose.cu | 191 -
 lite/backends/cuda/math/transpose.h | 44 -
 lite/backends/cuda/math/type_trans.cu | 71 -
 lite/backends/cuda/math/type_trans.h | 37 -
 lite/backends/cuda/math/utils.h | 51 -
 lite/backends/cuda/target_wrapper.cc | 80 -
 lite/backends/cuda/target_wrapper.h | 64 -
 lite/backends/fpga/CMakeLists.txt | 15 -
 lite/backends/fpga/KD/alignment.h | 26 -
 lite/backends/fpga/KD/context.hpp | 50 -
 lite/backends/fpga/KD/dl_engine.cpp | 27 -
 lite/backends/fpga/KD/dl_engine.hpp | 36 -
 lite/backends/fpga/KD/float16.hpp | 508 -
 lite/backends/fpga/KD/fpga_cv.cpp | 78 -
 lite/backends/fpga/KD/fpga_cv.hpp | 28 -
 lite/backends/fpga/KD/layout.hpp | 99 -
 lite/backends/fpga/KD/llapi/bias_scale.cpp | 102 -
 lite/backends/fpga/KD/llapi/bias_scale.h | 30 -
 lite/backends/fpga/KD/llapi/config.h | 19 -
 lite/backends/fpga/KD/llapi/filter.cpp | 317 -
 lite/backends/fpga/KD/llapi/filter.h | 58 -
 lite/backends/fpga/KD/llapi/zynqmp_api.cpp | 327 -
 lite/backends/fpga/KD/llapi/zynqmp_api.h | 347 -
 lite/backends/fpga/KD/pe.hpp | 37 -
 lite/backends/fpga/KD/pe_params.hpp | 233 -
 lite/backends/fpga/KD/pes/batchnorm_pe.hpp | 105 -
 lite/backends/fpga/KD/pes/concat_pe.hpp | 135 -
 lite/backends/fpga/KD/pes/conv_pe.hpp | 138 -
 lite/backends/fpga/KD/pes/conv_process.hpp | 418 -
 lite/backends/fpga/KD/pes/crop_pe.cpp | 88 -
 lite/backends/fpga/KD/pes/crop_pe.hpp | 45 -
 .../fpga/KD/pes/depthwise_conv_pe.hpp | 102 -
 .../fpga/KD/pes/elementwise_add_pe.hpp | 81 -
 .../fpga/KD/pes/fully_connected_pe.hpp | 94 -
 lite/backends/fpga/KD/pes/input_pe.hpp | 54 -
 lite/backends/fpga/KD/pes/norm_pe.hpp | 121 -
 lite/backends/fpga/KD/pes/output_pe.hpp | 53 -
 lite/backends/fpga/KD/pes/pooling_pe.hpp | 176 -
 lite/backends/fpga/KD/pes/prior_box_pe.cpp | 273 -
 lite/backends/fpga/KD/pes/prior_box_pe.hpp | 46 -
 lite/backends/fpga/KD/pes/relu_pe.hpp | 75 -
 lite/backends/fpga/KD/pes/resize.hpp | 89 -
 lite/backends/fpga/KD/pes/scale_pe.hpp | 120 -
 lite/backends/fpga/KD/pes/softmax_pe.cpp | 162 -
 lite/backends/fpga/KD/pes/softmax_pe.hpp | 44 -
 lite/backends/fpga/KD/pes/split_pe.hpp | 124 -
 lite/backends/fpga/KD/shape.hpp | 116 -
 lite/backends/fpga/KD/tensor.hpp | 456 -
 lite/backends/fpga/KD/tensor_util.cpp | 32 -
 lite/backends/fpga/KD/tensor_util.hpp | 25 -
 lite/backends/fpga/lite_tensor.cc | 110 -
 lite/backends/fpga/lite_tensor.h | 251 -
 lite/backends/fpga/target_wrapper.cc | 37 -
 lite/backends/host/CMakeLists.txt | 3 -
 lite/backends/host/target_wrapper.cc | 49 -
 lite/backends/npu/CMakeLists.txt | 6 -
 lite/backends/npu/bridge/CMakeLists.txt | 67 -
 lite/backends/npu/bridge/act_op.cc | 88 -
 lite/backends/npu/bridge/act_op_test.cc | 100 -
 lite/backends/npu/bridge/batch_norm_op.cc | 96 -
 .../backends/npu/bridge/batch_norm_op_test.cc | 166 -
 lite/backends/npu/bridge/concat_op.cc | 74 -
 lite/backends/npu/bridge/concat_op_test.cc | 128 -
 lite/backends/npu/bridge/conv_op.cc | 216 -
 lite/backends/npu/bridge/conv_op_test.cc | 280 -
 lite/backends/npu/bridge/conv_transpose_op.cc | 146 -
 .../npu/bridge/conv_transpose_op_test.cc | 369 -
 lite/backends/npu/bridge/elementwise_ops.cc | 79 -
 .../npu/bridge/elementwise_ops_test.cc | 182 -
 lite/backends/npu/bridge/fc_op.cc | 119 -
 lite/backends/npu/bridge/fc_op_test.cc | 146 -
 lite/backends/npu/bridge/interpolate_op.cc | 143 -
 .../npu/bridge/interpolate_op_test.cc | 405 -
 lite/backends/npu/bridge/mul_op.cc | 122 -
 lite/backends/npu/bridge/mul_op_test.cc | 125 -
 lite/backends/npu/bridge/pad2d_op.cc | 86 -
 lite/backends/npu/bridge/pad2d_op_test.cc | 189 -
 .../npu/bridge/paddle_use_npu_bridges.h | 37 -
 lite/backends/npu/bridge/pool_op.cc | 89 -
 lite/backends/npu/bridge/pool_op_test.cc | 249 -
 lite/backends/npu/bridge/registry.cc | 39 -
 lite/backends/npu/bridge/registry.h | 84 -
 lite/backends/npu/bridge/reshape_op.cc | 121 -
 lite/backends/npu/bridge/reshape_op_test.cc | 202 -
 lite/backends/npu/bridge/scale_op.cc | 89 -
 lite/backends/npu/bridge/scale_op_test.cc | 123 -
 .../backends/npu/bridge/shuffle_channel_op.cc | 60 -
 .../npu/bridge/shuffle_channel_op_test.cc | 115 -
 lite/backends/npu/bridge/softmax_op.cc | 67 -
 lite/backends/npu/bridge/softmax_op_test.cc | 134 -
 lite/backends/npu/bridge/split_op.cc | 86 -
 lite/backends/npu/bridge/split_op_test.cc | 170 -
 lite/backends/npu/bridge/test_helper.cc | 101 -
 lite/backends/npu/bridge/test_helper.h | 64 -
 lite/backends/npu/bridge/transpose_op.cc | 78 -
 lite/backends/npu/bridge/transpose_op_test.cc | 151 -
 lite/backends/npu/bridge/utils.cc | 137 -
 lite/backends/npu/bridge/utils.h | 94 -
 lite/backends/npu/npu_helper.cc | 139 -
 lite/backends/npu/npu_helper.h | 110 -
 lite/backends/opencl/CMakeLists.txt | 18 -
 lite/backends/opencl/cl_caller.cc | 169 -
 lite/backends/opencl/cl_caller.h | 52 -
 lite/backends/opencl/cl_context.cc | 126 -
 lite/backends/opencl/cl_context.h | 54 -
 lite/backends/opencl/cl_functions_test.cc | 451 -
 lite/backends/opencl/cl_im2col_test.cc | 330 -
 lite/backends/opencl/cl_image.cc | 160 -
 lite/backends/opencl/cl_image.h | 114 -
 lite/backends/opencl/cl_image_converter.cc | 461 -
 lite/backends/opencl/cl_image_converter.h | 139 -
 lite/backends/opencl/cl_include.h | 21 -
 .../buffer/depthwise_conv2d_kernel.cl | 70 -
 .../buffer/elementwise_add_kernel.cl | 45 -
 .../opencl/cl_kernel/buffer/fc_kernel.cl | 424 -
 .../opencl/cl_kernel/buffer/im2col_kernel.cl | 64 -
 .../opencl/cl_kernel/buffer/mat_mul_kernel.cl | 93 -
 .../opencl/cl_kernel/buffer/pool_kernel.cl | 112 -
 .../opencl/cl_kernel/buffer/relu_kernel.cl | 22 -
 lite/backends/opencl/cl_kernel/cl_common.h | 38 -
 .../cl_kernel/image/channel_add_kernel.cl | 29 -
 .../cl_kernel/image/elementwise_add_kernel.cl | 26 -
 .../opencl/cl_kernel/image/pool_kernel.cl | 90 -
 lite/backends/opencl/cl_runtime.cc | 170 -
 lite/backends/opencl/cl_runtime.h | 101 -
 lite/backends/opencl/cl_utility.cc | 84 -
 lite/backends/opencl/cl_utility.h | 46 -
 lite/backends/opencl/cl_wrapper.cc | 732 --
 lite/backends/opencl/cl_wrapper.h | 572 -
 lite/backends/opencl/target_wrapper.cc | 341 -
 lite/backends/opencl/target_wrapper.h | 83 -
 lite/backends/x86/CMakeLists.txt | 14 -
 lite/backends/x86/cpu_info.cc | 160 -
 lite/backends/x86/cpu_info.h | 80 -
 lite/backends/x86/cupti_lib_path.h.in | 17 -
 lite/backends/x86/dynamic_loader.cc | 263 -
 lite/backends/x86/dynamic_loader.h | 38 -
 lite/backends/x86/jit/CMakeLists.txt | 26 -
 lite/backends/x86/jit/README.en.md | 103 -
 lite/backends/x86/jit/README.md | 94 -
 lite/backends/x86/jit/benchmark.cc | 576 -
 lite/backends/x86/jit/gen/CMakeLists.txt | 36 -
 lite/backends/x86/jit/gen/act.cc | 164 -
 lite/backends/x86/jit/gen/act.h | 347 -
 lite/backends/x86/jit/gen/blas.cc | 190 -
 lite/backends/x86/jit/gen/blas.h | 125 -
 lite/backends/x86/jit/gen/embseqpool.cc | 148 -
 lite/backends/x86/jit/gen/embseqpool.h | 81 -
 lite/backends/x86/jit/gen/gru.cc | 116 -
 lite/backends/x86/jit/gen/gru.h | 116 -
 lite/backends/x86/jit/gen/hopv.cc | 103 -
 lite/backends/x86/jit/gen/hopv.h | 92 -
 lite/backends/x86/jit/gen/jitcode.h | 133 -
 lite/backends/x86/jit/gen/lstm.cc | 142 -
 lite/backends/x86/jit/gen/lstm.h | 121 -
 lite/backends/x86/jit/gen/matmul.cc | 127 -
 lite/backends/x86/jit/gen/matmul.h | 62 -
 lite/backends/x86/jit/gen/seqpool.cc | 85 -
 lite/backends/x86/jit/gen/seqpool.h | 216 -
 lite/backends/x86/jit/gen/sgd.cc | 130 -
 lite/backends/x86/jit/gen/sgd.h | 60 -
 lite/backends/x86/jit/gen/vbroadcast.cc | 91 -
 lite/backends/x86/jit/gen/vbroadcast.h | 54 -
 lite/backends/x86/jit/gen_base.cc | 95 -
 lite/backends/x86/jit/gen_base.h | 87 -
 lite/backends/x86/jit/helper.cc | 139 -
 lite/backends/x86/jit/helper.h | 267 -
 lite/backends/x86/jit/kernel_base.h | 365 -
 lite/backends/x86/jit/kernel_key.cc | 71 -
 lite/backends/x86/jit/kernel_key.h | 55 -
 lite/backends/x86/jit/kernel_pool.cc | 41 -
 lite/backends/x86/jit/kernel_pool.h | 116 -
 lite/backends/x86/jit/macro.h | 32 -
 lite/backends/x86/jit/more/CMakeLists.txt | 18 -
 .../x86/jit/more/intrinsic/CMakeLists.txt | 9 -
 .../x86/jit/more/intrinsic/crf_decoding.cc | 185 -
 .../x86/jit/more/intrinsic/crf_decoding.h | 45 -
 .../x86/jit/more/intrinsic/layer_norm.cc | 181 -
 .../x86/jit/more/intrinsic/layer_norm.h | 48 -
 lite/backends/x86/jit/more/mix/CMakeLists.txt | 15 -
 lite/backends/x86/jit/more/mix/mix.cc | 255 -
 lite/backends/x86/jit/more/mix/mix.h | 65 -
 lite/backends/x86/jit/more/mkl/CMakeLists.txt | 20 -
 lite/backends/x86/jit/more/mkl/mkl.cc | 336 -
 lite/backends/x86/jit/more/mkl/mkl.h | 244 -
 lite/backends/x86/jit/refer/CMakeLists.txt | 40 -
 lite/backends/x86/jit/refer/refer.cc | 61 -
 lite/backends/x86/jit/refer/refer.h | 603 --
 lite/backends/x86/jit/registry.h | 178 -
 lite/backends/x86/jit/test.cc | 1447 ---
 lite/backends/x86/legacy_place.h | 30 -
 lite/backends/x86/math/CMakeLists.txt | 62 -
 lite/backends/x86/math/beam_search.cc | 322 -
 lite/backends/x86/math/beam_search.h | 125 -
 lite/backends/x86/math/beam_search_test.cc | 152 -
 lite/backends/x86/math/blas.cc | 57 -
 lite/backends/x86/math/blas.h | 408 -
 lite/backends/x86/math/blas_impl.h | 812 --
 lite/backends/x86/math/concat_and_split.cc | 131 -
 lite/backends/x86/math/concat_and_split.h | 83 -
 lite/backends/x86/math/context_project.cc | 28 -
 lite/backends/x86/math/context_project.h | 361 -
 lite/backends/x86/math/cos_sim_functor.cc | 57 -
 lite/backends/x86/math/cos_sim_functor.h | 187 -
 lite/backends/x86/math/cpu_vec.h | 662 --
 lite/backends/x86/math/cross_entropy.cc | 78 -
 lite/backends/x86/math/cross_entropy.h | 74 -
 lite/backends/x86/math/detail/CMakeLists.txt | 1 -
 .../x86/math/detail/activation_functions.h | 193 -
 .../backends/x86/math/detail/avx_functions.cc | 91 -
 lite/backends/x86/math/detail/avx_mathfun.h | 731 --
 .../backends/x86/math/detail/gru_cpu_kernel.h | 608 --
 lite/backends/x86/math/detail/gru_kernel.h | 222 -
 lite/backends/x86/math/gru_compute.cc | 181 -
 lite/backends/x86/math/gru_compute.h | 69 -
 lite/backends/x86/math/im2col.cc | 292 -
 lite/backends/x86/math/im2col.h | 108 -
 lite/backends/x86/math/im2col_cfo_cpu.h | 256 -
 lite/backends/x86/math/im2col_test.cc | 331 -
 lite/backends/x86/math/math_function.cc | 158 -
 lite/backends/x86/math/math_function.h | 93 -
 lite/backends/x86/math/math_function_impl.h | 192 -
 lite/backends/x86/math/math_function_test.cc | 344 -
 lite/backends/x86/math/maxouting.cc | 106 -
 lite/backends/x86/math/maxouting.h | 47 -
 lite/backends/x86/math/pooling.cc | 906 --
 lite/backends/x86/math/pooling.h | 258 -
 lite/backends/x86/math/prelu.h | 51 -
 lite/backends/x86/math/sample_prob.cc | 28 -
 lite/backends/x86/math/sample_prob.h | 128 -
 lite/backends/x86/math/sampler.cc | 102 -
 lite/backends/x86/math/sampler.h | 131 -
 lite/backends/x86/math/sequence2batch.cc | 67 -
 lite/backends/x86/math/sequence2batch.h | 190 -
 lite/backends/x86/math/sequence_padding.cc | 187 -
 lite/backends/x86/math/sequence_padding.h | 114 -
 lite/backends/x86/math/sequence_pooling.cc | 406 -
 lite/backends/x86/math/sequence_pooling.h | 52 -
 .../x86/math/sequence_pooling_test.cc | 130 -
 lite/backends/x86/math/sequence_scale.cc | 51 -
 lite/backends/x86/math/sequence_scale.h | 59 -
 lite/backends/x86/math/softmax.cc | 33 -
 lite/backends/x86/math/softmax.h | 67 -
 lite/backends/x86/math/softmax_impl.h | 245 -
 lite/backends/x86/math/tree2col.cc | 204 -
 lite/backends/x86/math/tree2col.h | 95 -
 lite/backends/x86/math/unpooling.cc | 96 -
 lite/backends/x86/math/unpooling.h | 44 -
 lite/backends/x86/math/vol2col.cc | 204 -
 lite/backends/x86/math/vol2col.h | 92 -
 lite/backends/x86/mklml.cc | 30 -
 lite/backends/x86/mklml.h | 99 -
 lite/backends/x86/port.h | 175 -
 lite/backends/x86/target_wrapper.cc | 36 -
 lite/backends/x86/target_wrapper.h | 22 -
 lite/backends/x86/warpctc_lib_path.h.in | 17 -
 lite/core/CMakeLists.txt | 124 -
 lite/core/arena/CMakeLists.txt | 10 -
 lite/core/arena/framework.cc | 70 -
 lite/core/arena/framework.h | 258 -
 lite/core/arena/framework_test.cc | 83 -
 lite/core/context.cc | 23 -
 lite/core/context.h | 400 -
 lite/core/context_test.cc | 51 -
 lite/core/device_info.cc | 1151 --
 lite/core/device_info.h | 209 -
 lite/core/framework.proto | 188 -
 lite/core/kernel.cc | 104 -
 lite/core/kernel.h | 189 -
 lite/core/kernel_test.cc | 63 -
 lite/core/lite.map | 6 -
 lite/core/lite_gtest_main.cc | 23 -
 lite/core/lite_tensor_test.cc | 32 -
 lite/core/memory.cc | 109 -
 lite/core/memory.h | 115 -
 lite/core/memory_test.cc | 34 -
 lite/core/mir/CMakeLists.txt | 109 -
 lite/core/mir/argument_type_display_pass.cc | 46 -
 lite/core/mir/demo_pass.cc | 37 -
 lite/core/mir/dot.h | 167 -
 lite/core/mir/elimination/CMakeLists.txt | 10 -
 .../identity_scale_eliminate_pass.cc | 73 -
 .../identity_scale_eliminate_pass_test.cc | 93 -
 lite/core/mir/fusion/CMakeLists.txt | 48 -
 .../mir/fusion/conv_activation_fuse_pass.cc | 42 -
 .../mir/fusion/conv_activation_fuse_pass.h | 32 -
 lite/core/mir/fusion/conv_activation_fuser.cc | 83 -
 lite/core/mir/fusion/conv_activation_fuser.h | 50 -
 lite/core/mir/fusion/conv_bn_fuse_pass.cc | 38 -
 lite/core/mir/fusion/conv_bn_fuse_pass.h | 32 -
 .../core/mir/fusion/conv_bn_fuse_pass_test.cc | 140 -
 lite/core/mir/fusion/conv_bn_fuser.cc | 163 -
 lite/core/mir/fusion/conv_bn_fuser.h | 58 -
 ...ementwise_add_activation_fuse_pass_test.cc | 157 -
 .../mir/fusion/conv_elementwise_fuse_pass.cc | 42 -
 .../mir/fusion/conv_elementwise_fuse_pass.h | 32 -
 .../core/mir/fusion/conv_elementwise_fuser.cc | 102 -
 lite/core/mir/fusion/conv_elementwise_fuser.h | 43 -
 .../elementwise_add_activation_fuse_pass.cc | 37 -
 .../elementwise_add_activation_fuse_pass.h | 32 -
 ...ementwise_add_activation_fuse_pass_test.cc | 117 -
 .../elementwise_add_activation_fuser.cc | 87 -
 .../fusion/elementwise_add_activation_fuser.h | 41 -
 lite/core/mir/fusion/fc_fuse_pass.cc | 35 -
 lite/core/mir/fusion/fc_fuse_pass.h | 32 -
 lite/core/mir/fusion/fc_fuse_pass_test.cc | 117 -
 lite/core/mir/fusion/fc_fuser.cc | 78 -
 lite/core/mir/fusion/fc_fuser.h | 38 -
 lite/core/mir/fusion/interpolate_fuse_pass.cc | 39 -
 lite/core/mir/fusion/interpolate_fuse_pass.h | 32 -
 lite/core/mir/fusion/interpolate_fuser.cc | 95 -
 lite/core/mir/fusion/interpolate_fuser.h | 42 -
 .../mir/fusion/quant_dequant_fuse_pass.cc | 47 -
 .../core/mir/fusion/quant_dequant_fuse_pass.h | 33 -
 .../core/mir/fusion/quant_dequant_op_fuser.cc | 200 -
 lite/core/mir/fusion/quant_dequant_op_fuser.h | 59 -
 .../mir/fusion/shuffle_channel_fuse_pass.cc | 39 -
 .../mir/fusion/shuffle_channel_fuse_pass.h | 32 -
 lite/core/mir/fusion/shuffle_channel_fuser.cc | 109 -
 lite/core/mir/fusion/shuffle_channel_fuser.h | 44 -
 .../transpose_softmax_transpose_fuse_pass.cc | 40 -
 .../transpose_softmax_transpose_fuse_pass.h | 32 -
 .../transpose_softmax_transpose_fuser.cc | 99 -
 .../transpose_softmax_transpose_fuser.h | 44 -
 lite/core/mir/generate_program_pass.cc | 42 -
 lite/core/mir/generate_program_pass.h | 50 -
 lite/core/mir/graph_visualize_pass.cc | 102 -
 lite/core/mir/graph_visualize_pass.h | 39 -
 lite/core/mir/io_copy_kernel_pick_pass.cc | 75 -
 lite/core/mir/node.cc | 74 -
 lite/core/mir/node.h | 173 -
 lite/core/mir/pass.cc | 15 -
 lite/core/mir/pass.h | 88 -
 lite/core/mir/pass_manager.cc | 21 -
 lite/core/mir/pass_manager.h | 87 -
 lite/core/mir/pass_manager_test.cc | 33 -
 lite/core/mir/pass_registry.cc | 21 -
 lite/core/mir/pass_registry.h | 55 -
 lite/core/mir/pattern_matcher.cc | 528 -
 lite/core/mir/pattern_matcher.h | 432 -
 lite/core/mir/pattern_matcher_high_api.cc | 80 -
 lite/core/mir/pattern_matcher_high_api.h | 83 -
 .../core/mir/pattern_matcher_high_api_test.cc | 150 -
 lite/core/mir/pattern_matcher_test.cc | 233 -
 lite/core/mir/pattern_matcher_tester.cc | 233 -
 lite/core/mir/runtime_context_assign_pass.cc | 42 -
 lite/core/mir/ssa_graph.cc | 240 -
 lite/core/mir/ssa_graph.h | 144 -
 lite/core/mir/ssa_graph_test.cc | 59 -
 lite/core/mir/static_kernel_pick_pass.cc | 136 -
 lite/core/mir/static_kernel_pick_pass.h | 97 -
 lite/core/mir/subgraph/CMakeLists.txt | 34 -
 .../mir/subgraph/generate_npu_program_pass.cc | 218 -
 .../mir/subgraph/generate_npu_program_pass.h | 65 -
 .../generate_npu_program_pass_test.cc | 114 -
 .../mir/subgraph/subgraph_program_pass.cc | 314 -
 .../core/mir/subgraph/subgraph_program_pass.h | 105 -
 .../subgraph/subgraph_program_pass_test.cc | 223 -
 lite/core/mir/type_layout_cast_pass.cc | 177 -
 lite/core/mir/type_layout_cast_pass.h | 62 -
 lite/core/mir/type_precision_cast_pass.cc | 183 -
 lite/core/mir/type_precision_cast_pass.h | 66 -
 lite/core/mir/type_target_cast_pass.cc | 183 -
 lite/core/mir/type_target_cast_pass.h | 66 -
 .../core/mir/variable_place_inference_pass.cc | 35 -
 lite/core/mir/variable_place_inference_pass.h | 157 -
 .../mir/variable_place_inference_pass_test.cc | 101 -
 lite/core/naive_test_model.py | 56 -
 lite/core/op_lite.cc | 105 -
 lite/core/op_lite.h | 231 -
 lite/core/op_lite_test.cc | 24 -
 lite/core/op_registry.cc | 154 -
 lite/core/op_registry.h | 306 -
 lite/core/optimizer.cc | 34 -
 lite/core/optimizer.h | 213 -
 lite/core/optimizer_test.cc | 51 -
 lite/core/profile/CMakeLists.txt | 8 -
 lite/core/profile/basic_profiler.cc | 26 -
 lite/core/profile/basic_profiler.h | 210 -
 lite/core/profile/basic_profiler_test.cc | 46 -
 lite/core/profile/precision_profiler.h | 137 -
 lite/core/program.cc | 208 -
 lite/core/program.h | 156 -
 lite/core/program_fake_utils.cc | 22 -
 lite/core/program_fake_utils.h | 142 -
 lite/core/scope.cc | 72 -
 lite/core/scope.h | 79 -
 lite/core/scope_test.cc | 37 -
 lite/core/target_wrapper.cc | 21 -
 lite/core/target_wrapper.h | 170 -
 lite/core/tensor.cc | 115 -
 lite/core/tensor.h | 249 -
 lite/core/type_system.cc | 157 -
 lite/core/type_system.h | 390 -
 lite/core/type_system_test.cc | 35 -
 lite/core/types.cc | 95 -
 lite/core/types.h | 147 -
 lite/core/types_test.cc | 43 -
 lite/core/variable.cc | 19 -
 lite/core/variable.h | 52 -
 lite/core/workspace.cc | 15 -
 lite/core/workspace.h | 83 -
 lite/demo/cxx/Makefile.def | 35 -
 lite/demo/cxx/README.md | 42 -
 .../mobile_full/Makefile.android.armv7 | 22 -
 .../mobile_full/Makefile.android.armv8 | 22 -
 .../mobile_light/Makefile.android.armv7 | 22 -
 .../mobile_light/Makefile.android.armv8 | 22 -
 .../cxx/mobile_full/mobilenetv1_full_api.cc | 83 -
 .../cxx/mobile_light/mobilenetv1_light_api.cc | 65 -
 lite/demo/java/README.md | 118 -
 .../java/android/PaddlePredictor/.gitignore | 13 -
 .../android/PaddlePredictor/app/.gitignore | 1 -
 .../android/PaddlePredictor/app/build.gradle | 28 -
 .../PaddlePredictor/app/proguard-rules.pro | 21 -
 .../paddle/lite/ExampleInstrumentedTest.java | 114 -
 .../app/src/main/AndroidManifest.xml | 21 -
 .../app/src/main/assets/README.txt | 8 -
 .../com/baidu/paddle/lite/MainActivity.java | 206 -
 .../drawable-v24/ic_launcher_foreground.xml | 34 -
 .../res/drawable/ic_launcher_background.xml | 170 -
 .../app/src/main/res/layout/activity_main.xml | 19 -
 .../res/mipmap-anydpi-v26/ic_launcher.xml | 5 -
 .../mipmap-anydpi-v26/ic_launcher_round.xml | 5 -
 .../src/main/res/mipmap-hdpi/ic_launcher.png | Bin 2963 -> 0 bytes
 .../res/mipmap-hdpi/ic_launcher_round.png | Bin 4905 -> 0 bytes
 .../src/main/res/mipmap-mdpi/ic_launcher.png | Bin 2060 -> 0 bytes
 .../res/mipmap-mdpi/ic_launcher_round.png | Bin 2783 -> 0 bytes
 .../src/main/res/mipmap-xhdpi/ic_launcher.png | Bin 4490 -> 0 bytes
 .../res/mipmap-xhdpi/ic_launcher_round.png | Bin 6895 -> 0 bytes
 .../main/res/mipmap-xxhdpi/ic_launcher.png | Bin 6387 -> 0 bytes
 .../res/mipmap-xxhdpi/ic_launcher_round.png | Bin 10413 -> 0 bytes
 .../main/res/mipmap-xxxhdpi/ic_launcher.png | Bin 9128 -> 0 bytes
 .../res/mipmap-xxxhdpi/ic_launcher_round.png | Bin 15132 -> 0 bytes
 .../app/src/main/res/values/colors.xml | 6 -
 .../app/src/main/res/values/strings.xml | 3 -
 .../app/src/main/res/values/styles.xml | 11 -
 .../baidu/paddle/lite/ExampleUnitTest.java | 17 -
 .../java/android/PaddlePredictor/build.gradle | 27 -
 .../android/PaddlePredictor/gradle.properties | 13 -
 .../gradle/wrapper/gradle-wrapper.jar | Bin 54329 -> 0 bytes
 .../gradle/wrapper/gradle-wrapper.properties | 6 -
 .../demo/java/android/PaddlePredictor/gradlew | 172 -
 .../java/android/PaddlePredictor/gradlew.bat | 84 -
 .../android/PaddlePredictor/settings.gradle | 1 -
 lite/demo/java/android/prepare_demo.bash | 23 -
 lite/fluid/CMakeLists.txt | 4 -
 lite/fluid/data_type.cc | 101 -
 lite/fluid/data_type.h | 88 -
 lite/fluid/data_type_test.cc | 40 -
 lite/fluid/eigen.h | 141 -
 lite/fluid/float16.h | 1100 --
 lite/fluid/lod.h | 38 -
 lite/fluid/math.h | 42 -
 lite/gen_code/CMakeLists.txt | 49 -
 lite/gen_code/gen_code.cc | 223 -
 lite/gen_code/gen_code.h | 258 -
 lite/gen_code/gen_code_test.cc | 168 -
 lite/gen_code/generated_code_test.cc | 87 -
 lite/gen_code/paddle_code_generator.cc | 56 -
 lite/gen_code/paddle_infer.cc | 145 -
 lite/gen_code/paddle_infer.h | 72 -
 lite/kernels/CMakeLists.txt | 11 -
 lite/kernels/arm/CMakeLists.txt | 95 -
 lite/kernels/arm/activation_compute.cc | 247 -
 lite/kernels/arm/activation_compute.h | 136 -
 lite/kernels/arm/affine_channel_compute.cc | 77 -
 lite/kernels/arm/affine_channel_compute.h | 38 -
 lite/kernels/arm/anchor_generator_compute.cc | 66 -
 lite/kernels/arm/anchor_generator_compute.h | 38 -
 lite/kernels/arm/argmax_compute.cc | 51 -
 lite/kernels/arm/argmax_compute.h | 37 -
 lite/kernels/arm/argmax_compute_test.cc | 139 -
 lite/kernels/arm/assign_compute.cc | 47 -
 lite/kernels/arm/assign_compute.h | 37 -
 lite/kernels/arm/assign_value_compute.cc | 66 -
 lite/kernels/arm/assign_value_compute.h | 37 -
 lite/kernels/arm/axpy_compute.cc | 62 -
 lite/kernels/arm/axpy_compute.h | 37 -
 lite/kernels/arm/axpy_compute_test.cc | 142 -
 lite/kernels/arm/batch_norm_compute.cc | 123 -
 lite/kernels/arm/batch_norm_compute.h | 42 -
 lite/kernels/arm/batch_norm_compute_test.cc | 221 -
 lite/kernels/arm/beam_search_compute.cc | 60 -
 lite/kernels/arm/beam_search_compute.h | 42 -
 .../kernels/arm/beam_search_decode_compute.cc | 296 -
 lite/kernels/arm/beam_search_decode_compute.h | 39 -
 lite/kernels/arm/box_clip_compute.cc | 87 -
 lite/kernels/arm/box_clip_compute.h | 37 -
 lite/kernels/arm/box_coder_compute.cc | 241 -
 lite/kernels/arm/box_coder_compute.h | 36 -
 lite/kernels/arm/calib_compute.cc | 90 -
 lite/kernels/arm/calib_compute.h | 51 -
 lite/kernels/arm/calib_compute_test.cc | 156 -
 lite/kernels/arm/cast_compute.cc | 62 -
 lite/kernels/arm/cast_compute.h | 42 -
 lite/kernels/arm/compare_compute.cc | 186 -
 lite/kernels/arm/compare_compute.h | 43 -
 lite/kernels/arm/concat_compute.cc | 87 -
 lite/kernels/arm/concat_compute.h | 37 -
 lite/kernels/arm/concat_compute_test.cc | 236 -
 lite/kernels/arm/conv_compute.cc | 241 -
 lite/kernels/arm/conv_compute.h | 67 -
 lite/kernels/arm/conv_compute_test.cc | 1045 --
 lite/kernels/arm/conv_transpose_compute.cc | 164 -
 lite/kernels/arm/conv_transpose_compute.h | 40 -
 .../arm/conv_transpose_compute_test.cc | 371 -
 lite/kernels/arm/crop_compute.cc | 77 -
 lite/kernels/arm/crop_compute.h | 49 -
 lite/kernels/arm/decode_bboxes_compute.cc | 68 -
 lite/kernels/arm/decode_bboxes_compute.h | 36 -
 .../kernels/arm/decode_bboxes_compute_test.cc | 185 -
 lite/kernels/arm/density_prior_box_compute.cc | 121 -
 lite/kernels/arm/density_prior_box_compute.h | 37 -
 lite/kernels/arm/dropout_compute.cc | 51 -
 lite/kernels/arm/dropout_compute.h | 35 -
 lite/kernels/arm/dropout_compute_test.cc | 106 -
 lite/kernels/arm/elementwise_compute.cc | 417 -
 lite/kernels/arm/elementwise_compute.h | 108 -
 lite/kernels/arm/elementwise_compute_test.cc | 721 --
 lite/kernels/arm/expand_compute.cc | 72 -
 lite/kernels/arm/expand_compute.h | 34 -
 lite/kernels/arm/fc_compute.cc | 263 -
 lite/kernels/arm/fc_compute.h | 68 -
 lite/kernels/arm/fc_compute_test.cc | 211 -
 lite/kernels/arm/fill_constant_compute.cc | 54 -
 .../kernels/arm/generate_proposals_compute.cc | 494 -
 lite/kernels/arm/generate_proposals_compute.h | 38 -
 lite/kernels/arm/gru_compute.cc | 146 -
 lite/kernels/arm/gru_compute.h | 38 -
 lite/kernels/arm/gru_unit_compute.cc | 116 -
 lite/kernels/arm/gru_unit_compute.h | 38 -
 lite/kernels/arm/im2sequence_compute.cc | 141 -
 lite/kernels/arm/im2sequence_compute.h | 42 -
 lite/kernels/arm/increment_compute.cc | 49 -
 lite/kernels/arm/increment_compute.h | 42 -
 lite/kernels/arm/interpolate_compute.cc | 94 -
 lite/kernels/arm/interpolate_compute.h | 44 -
 lite/kernels/arm/is_empty_compute.cc | 47 -
 lite/kernels/arm/is_empty_compute.h | 40 -
 lite/kernels/arm/lod_reset_compute.cc | 64 -
 lite/kernels/arm/lod_reset_compute.h | 41 -
 lite/kernels/arm/logical_compute.cc | 128 -
 lite/kernels/arm/logical_compute.h | 53 -
 lite/kernels/arm/lookup_table_compute.cc | 77 -
 lite/kernels/arm/lookup_table_compute.h | 38 -
 lite/kernels/arm/lrn_compute.cc | 56 -
 lite/kernels/arm/lrn_compute.h | 36 -
 lite/kernels/arm/lrn_compute_test.cc | 196 -
 lite/kernels/arm/matmul_compute.cc | 277 -
 lite/kernels/arm/matmul_compute.h | 42 -
 lite/kernels/arm/mul_compute.cc | 98 -
 lite/kernels/arm/mul_compute.h | 42 -
 lite/kernels/arm/mul_compute_test.cc | 182 -
 lite/kernels/arm/negative_compute.cc | 53 -
 lite/kernels/arm/negative_compute.h | 37 -
 lite/kernels/arm/norm_compute.cc | 50 -
 lite/kernels/arm/norm_compute.h | 42 -
 lite/kernels/arm/pad2d_compute.cc | 72 -
 lite/kernels/arm/pad2d_compute.h | 46 -
 lite/kernels/arm/pool_compute.cc | 228 -
 lite/kernels/arm/pool_compute.h | 38 -
 lite/kernels/arm/pool_compute_test.cc | 286 -
 lite/kernels/arm/power_compute.cc | 45 -
 lite/kernels/arm/power_compute.h | 34 -
 lite/kernels/arm/prior_box_compute.cc | 103 -
 lite/kernels/arm/prior_box_compute.h | 36 -
 lite/kernels/arm/read_from_array_compute.cc | 57 -
 lite/kernels/arm/read_from_array_compute.h | 43 -
 lite/kernels/arm/reduce_max_compute.cc | 91 -
 lite/kernels/arm/reduce_max_compute.h | 38 -
 lite/kernels/arm/reduce_mean_compute.cc | 91 -
 lite/kernels/arm/reduce_mean_compute.h | 38 -
 lite/kernels/arm/roi_align_compute.cc | 236 -
 lite/kernels/arm/roi_align_compute.h | 37 -
 lite/kernels/arm/scale_compute.cc | 49 -
 lite/kernels/arm/scale_compute.h | 34 -
 lite/kernels/arm/scale_compute_test.cc | 117 -
 lite/kernels/arm/sequence_expand_compute.cc | 132 -
 lite/kernels/arm/sequence_expand_compute.h | 39 -
 lite/kernels/arm/sequence_pool_compute.cc | 79 -
 lite/kernels/arm/sequence_pool_compute.h | 40 -
 lite/kernels/arm/sequence_softmax_compute.cc | 58 -
 lite/kernels/arm/sequence_softmax_compute.h | 43 -
 lite/kernels/arm/shape_compute.cc | 41 -
 lite/kernels/arm/shape_compute.h | 34 -
 lite/kernels/arm/shuffle_channel_compute.cc | 50 -
 lite/kernels/arm/shuffle_channel_compute.h | 35 -
 lite/kernels/arm/slice_compute.cc | 57 -
 lite/kernels/arm/slice_compute.h | 41 -
 lite/kernels/arm/softmax_compute.cc | 80 -
 lite/kernels/arm/softmax_compute.h | 35 -
 lite/kernels/arm/softmax_compute_test.cc | 135 -
 lite/kernels/arm/split_compute.cc | 46 -
 lite/kernels/arm/split_compute.h | 35 -
 lite/kernels/arm/split_compute_test.cc | 179 -
 lite/kernels/arm/squeeze_compute.cc | 70 -
 lite/kernels/arm/squeeze_compute.h | 42 -
 lite/kernels/arm/stack_compute.cc | 42 -
 lite/kernels/arm/stack_compute.h | 34 -
 lite/kernels/arm/topk_compute.cc | 47 -
 lite/kernels/arm/topk_compute.h | 34 -
 lite/kernels/arm/transpose_compute.cc | 185 -
 lite/kernels/arm/transpose_compute.h | 48 -
 lite/kernels/arm/transpose_compute_test.cc | 205 -
 lite/kernels/arm/while_compute.cc | 54 -
 lite/kernels/arm/while_compute.h | 83 -
 lite/kernels/arm/write_to_array_compute.cc | 61 -
 lite/kernels/arm/write_to_array_compute.h | 42 -
 lite/kernels/arm/yolo_box_compute.cc | 60 -
 lite/kernels/arm/yolo_box_compute.h | 34 -
 lite/kernels/cuda/CMakeLists.txt | 42 -
 lite/kernels/cuda/calib_compute.cu | 131 -
 lite/kernels/cuda/calib_compute.h | 52 -
 lite/kernels/cuda/calib_compute_cuda_test.cc | 178 -
 lite/kernels/cuda/concat_compute.cu | 276 -
 lite/kernels/cuda/concat_compute.h | 34 -
 lite/kernels/cuda/concat_compute_test.cc | 227 -
 lite/kernels/cuda/conv_compute.cc | 103 -
 lite/kernels/cuda/conv_compute.h | 53 -
 lite/kernels/cuda/conv_compute_test.cc | 248 -
 lite/kernels/cuda/elementwise_add_compute.cu | 79 -
 lite/kernels/cuda/elementwise_add_compute.h | 35 -
 .../cuda/elementwise_add_compute_test.cc | 107 -
 lite/kernels/cuda/io_copy_compute.cc | 143 -
 lite/kernels/cuda/leaky_relu_compute.cu | 69 -
 lite/kernels/cuda/leaky_relu_compute.h | 34 -
 lite/kernels/cuda/leaky_relu_compute_test.cc | 72 -
 lite/kernels/cuda/mul_compute.cc | 31 -
 lite/kernels/cuda/mul_compute.h | 84 -
 lite/kernels/cuda/nearest_interp_compute.cu | 160 -
 lite/kernels/cuda/nearest_interp_compute.h | 35 -
 .../cuda/nearest_interp_compute_test.cc | 152 -
 lite/kernels/cuda/transpose_compute.cu | 86 -
 lite/kernels/cuda/transpose_compute.h | 38 -
 lite/kernels/cuda/transpose_compute_test.cc | 290 -
 lite/kernels/cuda/use_kernels.h | 24 -
 lite/kernels/cuda/yolo_box_compute.cu | 224 -
 lite/kernels/cuda/yolo_box_compute.h | 37 -
 lite/kernels/cuda/yolo_box_compute_test.cc | 258 -
 lite/kernels/fpga/CMakeLists.txt | 32 -
 lite/kernels/fpga/activation_compute.cc | 53 -
 lite/kernels/fpga/activation_compute.h | 46 -
 lite/kernels/fpga/activation_compute_test.cc | 97 -
 lite/kernels/fpga/calib_compute.cc | 114 -
 lite/kernels/fpga/calib_compute.h | 51 -
 lite/kernels/fpga/conv_compute.cc | 71 -
 lite/kernels/fpga/conv_compute.h | 45 -
 lite/kernels/fpga/conv_compute_test.cc | 315 -
 lite/kernels/fpga/elementwise_compute.cc | 102 -
 lite/kernels/fpga/elementwise_compute.h | 56 -
 lite/kernels/fpga/elementwise_compute_test.cc | 286 -
 lite/kernels/fpga/fc_compute.cc | 65 -
 lite/kernels/fpga/fc_compute.h | 49 -
 lite/kernels/fpga/fc_compute_test.cc | 205 -
 lite/kernels/fpga/feed_compute.cc | 60 -
 lite/kernels/fpga/feed_compute.h | 42 -
 lite/kernels/fpga/fetch_compute.cc | 59 -
 lite/kernels/fpga/fetch_compute.h | 41 -
 lite/kernels/fpga/io_copy_compute.cc | 157 -
 lite/kernels/fpga/layout_compute.cc | 146 -
 lite/kernels/fpga/pooling_compute.cc | 65 -
 lite/kernels/fpga/pooling_compute.h | 44 -
 lite/kernels/fpga/pooling_compute_test.cc | 291 -
 lite/kernels/fpga/scale_compute.cc | 39 -
 lite/kernels/fpga/scale_compute.h | 35 -
 lite/kernels/fpga/softmax_compute.cc | 57 -
 lite/kernels/fpga/softmax_compute.h | 46 -
 lite/kernels/fpga/softmax_compute_test.cc | 136 -
 lite/kernels/host/CMakeLists.txt | 9 -
 lite/kernels/host/feed_compute.cc | 46 -
 lite/kernels/host/fetch_compute.cc | 53 -
 lite/kernels/host/multiclass_nms_compute.cc | 398 -
 lite/kernels/host/multiclass_nms_compute.h | 36 -
 .../host/multiclass_nms_compute_test.cc | 368 -
 lite/kernels/host/reshape_compute.cc | 138 -
 lite/kernels/host/reshape_compute.h | 36 -
 lite/kernels/host/reshape_compute_test.cc | 101 -
 lite/kernels/host/use_kernels.h | 21 -
 lite/kernels/npu/CMakeLists.txt | 9 -
 lite/kernels/npu/graph_compute.cc | 151 -
 lite/kernels/npu/graph_compute.h | 56 -
 lite/kernels/opencl/CMakeLists.txt | 49 -
 lite/kernels/opencl/conv_compute.cc | 296 -
 lite/kernels/opencl/conv_compute.h | 63 -
 lite/kernels/opencl/conv_compute_test.cc | 602 --
 .../opencl/depthwise_conv2d_compute.cc | 132 -
 .../opencl/depthwise_conv2d_compute_test.cc | 181 -
 .../kernels/opencl/elementwise_add_compute.cc | 107 -
 lite/kernels/opencl/elementwise_add_compute.h | 51 -
 .../opencl/elementwise_add_compute_test.cc | 251 -
 lite/kernels/opencl/fc_compute.cc | 126 -
 lite/kernels/opencl/fc_compute_test.cc | 200 -
 ...sion_elementwise_add_activation_compute.cc | 56 -
 lite/kernels/opencl/io_copy_compute.cc | 145 -
 lite/kernels/opencl/io_copy_compute_test.cc | 83 -
 lite/kernels/opencl/mul_compute.cc | 119 -
 lite/kernels/opencl/mul_compute_test.cc | 170 -
 lite/kernels/opencl/pool_compute.cc | 127 -
 lite/kernels/opencl/pool_compute_test.cc | 147 -
 lite/kernels/opencl/relu_compute.cc | 91 -
 lite/kernels/opencl/relu_compute_test.cc | 94 -
 lite/kernels/x86/CMakeLists.txt | 48 -
 lite/kernels/x86/activation_compute.cc | 127 -
 lite/kernels/x86/batch_norm_compute.cc | 34 -
 lite/kernels/x86/batch_norm_compute.h | 159 -
 lite/kernels/x86/batch_norm_compute_test.cc | 139 -
 lite/kernels/x86/concat_compute.cc | 25 -
 lite/kernels/x86/concat_compute.h | 71 -
 lite/kernels/x86/concat_compute_test.cc | 82 -
 lite/kernels/x86/conv_compute.cc | 39 -
 lite/kernels/x86/conv_compute.h | 167 -
 lite/kernels/x86/conv_compute_test.cc | 92 -
 lite/kernels/x86/dropout_compute.cc | 26 -
 lite/kernels/x86/dropout_compute.h | 82 -
 lite/kernels/x86/dropout_compute_test.cc | 78 -
 lite/kernels/x86/elementwise_compute.cc | 55 -
 lite/kernels/x86/elementwise_compute.h | 142 -
 lite/kernels/x86/elementwise_compute_test.cc | 88 -
 lite/kernels/x86/fc_compute.cc | 23 -
 lite/kernels/x86/fc_compute.h | 106 -
 lite/kernels/x86/fc_compute_test.cc | 100 -
 lite/kernels/x86/fill_constant_compute.cc | 59 -
 lite/kernels/x86/mean_compute.cc | 108 -
 lite/kernels/x86/mul_compute.cc | 44 -
 lite/kernels/x86/mul_compute.h | 159 -
 lite/kernels/x86/mul_compute_test.cc | 86 -
 lite/kernels/x86/pool_compute.cc | 25 -
 lite/kernels/x86/pool_compute.h | 87 -
 lite/kernels/x86/pool_compute_test.cc | 79 -
 lite/kernels/x86/relu_compute.cc | 25 -
 lite/kernels/x86/relu_compute.h | 52 -
 lite/kernels/x86/relu_compute_test.cc | 75 -
 lite/kernels/x86/reshape_compute.cc | 36 -
 lite/kernels/x86/reshape_compute.h | 79 -
 lite/kernels/x86/reshape_compute_test.cc | 156 -
 lite/kernels/x86/scale_compute.cc | 25 -
 lite/kernels/x86/scale_compute.h | 58 -
 lite/kernels/x86/scale_compute_test.cc | 76 -
 lite/kernels/x86/sequence_pool_compute.cc | 25 -
 lite/kernels/x86/sequence_pool_compute.h | 59 -
 .../kernels/x86/sequence_pool_compute_test.cc | 88 -
 lite/kernels/x86/sgd_compute.cc | 82 -
 lite/kernels/x86/shape_compute.cc | 25 -
 lite/kernels/x86/shape_compute.h | 45 -
 lite/kernels/x86/shape_compute_test.cc | 73 -
 lite/kernels/x86/slice_compute.cc | 25 -
 lite/kernels/x86/slice_compute.h | 145 -
 lite/kernels/x86/slice_compute_test.cc | 265 -
 lite/kernels/x86/softmax_compute.cc | 25 -
 lite/kernels/x86/softmax_compute.h | 82 -
 lite/kernels/x86/softmax_compute_test.cc | 84 -
 lite/kernels/x86/squeeze_compute.cc | 36 -
 lite/kernels/x86/squeeze_compute.h | 70 -
 lite/kernels/x86/squeeze_compute_test.cc | 142 -
 lite/kernels/x86/uniform_random_compute.cc | 70 -
 lite/model_parser/CMakeLists.txt | 34 -
 lite/model_parser/compatible_pb.cc | 286 -
 lite/model_parser/compatible_pb.h | 71 -
 lite/model_parser/compatible_pb_test.cc | 433 -
 lite/model_parser/cpp/CMakeLists.txt | 6 -
 lite/model_parser/cpp/block_desc.cc | 47 -
 lite/model_parser/cpp/block_desc.h | 75 -
 lite/model_parser/cpp/op_desc.cc | 122 -
 lite/model_parser/cpp/op_desc.h | 122 -
 lite/model_parser/cpp/program_desc.cc | 35 -
 lite/model_parser/cpp/program_desc.h | 57 -
 lite/model_parser/cpp/var_desc.cc | 15 -
 lite/model_parser/cpp/var_desc.h | 53 -
 lite/model_parser/desc_apis.h | 229 -
 lite/model_parser/model_parser.cc | 794 --
 lite/model_parser/model_parser.h | 108 -
 lite/model_parser/model_parser_test.cc | 138 -
 lite/model_parser/naive_buffer/CMakeLists.txt | 19 -
 lite/model_parser/naive_buffer/block_desc.cc | 103 -
 lite/model_parser/naive_buffer/block_desc.h | 86 -
 .../naive_buffer/combined_params_desc.cc | 15 -
 .../naive_buffer/combined_params_desc.h | 63 -
 .../model_parser/naive_buffer/naive_buffer.cc | 144 -
 lite/model_parser/naive_buffer/naive_buffer.h | 374 -
 .../naive_buffer/naive_buffer_test.cc | 178 -
 .../naive_buffer_wrapper_helper.h | 47 -
 .../naive_buffer/naive_buffer_wrapper_test.cc | 316 -
 lite/model_parser/naive_buffer/op_desc.cc | 129 -
 lite/model_parser/naive_buffer/op_desc.h | 234 -
 lite/model_parser/naive_buffer/param_desc.cc | 228 -
 lite/model_parser/naive_buffer/param_desc.h | 92 -
 .../model_parser/naive_buffer/program_desc.cc | 58 -
 lite/model_parser/naive_buffer/program_desc.h | 66 -
 .../naive_buffer/proto/CMakeLists.txt | 1 -
 .../naive_buffer/proto/framework.nb.cc | 15 -
 .../naive_buffer/proto/framework.nb.h | 203 -
 lite/model_parser/naive_buffer/var_desc.cc | 109 -
 lite/model_parser/naive_buffer/var_desc.h | 63 -
 lite/model_parser/pb/CMakeLists.txt | 6 -
 lite/model_parser/pb/block_desc.cc | 47 -
 lite/model_parser/pb/block_desc.h | 80 -
 lite/model_parser/pb/op_desc.cc | 132 -
 lite/model_parser/pb/op_desc.h | 215 -
 lite/model_parser/pb/program_desc.cc | 36 -
 lite/model_parser/pb/program_desc.h | 62 -
 lite/model_parser/pb/var_desc.cc | 317 -
 lite/model_parser/pb/var_desc.h | 125 -
 lite/model_parser/runtime.cc | 109 -
 lite/model_parser/runtime.h | 122 -
 lite/operators/CMakeLists.txt | 120 -
 lite/operators/activation_ops.cc | 123 -
 lite/operators/activation_ops.h | 63 -
 lite/operators/affine_channel_op.cc | 76 -
 lite/operators/affine_channel_op.h | 48 -
 lite/operators/anchor_generator_op.cc | 71 -
 lite/operators/anchor_generator_op.h | 49 -
 lite/operators/argmax_op.cc | 62 -
 lite/operators/argmax_op.h | 48 -
 lite/operators/assign_op.cc | 52 -
 lite/operators/assign_op.h | 46 -
 lite/operators/assign_value_op.cc | 62 -
 lite/operators/assign_value_op.h | 48 -
 lite/operators/axpy_op.cc | 63 -
 lite/operators/axpy_op.h | 48 -
 lite/operators/batch_norm_op.cc | 112 -
 lite/operators/batch_norm_op.h | 46 -
 lite/operators/batch_norm_op_test.cc | 139 -
 lite/operators/beam_search_decode_op.cc | 59 -
 lite/operators/beam_search_decode_op.h | 47 -
 lite/operators/beam_search_op.cc | 69 -
 lite/operators/beam_search_op.h | 47 -
 lite/operators/box_clip_op.cc | 61 -
 lite/operators/box_clip_op.h | 48 -
 lite/operators/box_coder_op.cc | 106 -
 lite/operators/box_coder_op.h | 45 -
 lite/operators/calib_once_op.cc | 30 -
 lite/operators/calib_once_op.h | 33 -
 lite/operators/calib_op.cc | 52 -
 lite/operators/calib_op.h | 59 -
 lite/operators/calib_op_test.cc | 62 -
 lite/operators/cast_op.cc | 52 -
 lite/operators/cast_op.h | 47 -
 lite/operators/compare_op.cc | 61 -
 lite/operators/compare_op.h | 47 -
 lite/operators/concat_op.cc | 77 -
 lite/operators/concat_op.h | 46 -
 lite/operators/concat_op_test.cc | 59 -
 lite/operators/conv_op.cc | 80 -
 lite/operators/conv_op.h | 107 -
 lite/operators/conv_transpose_op.cc | 99 -
 lite/operators/conv_transpose_op.h | 51 -
 lite/operators/crop_op.cc | 55 -
 lite/operators/crop_op.h | 46 -
 lite/operators/decode_bboxes_op.cc | 60 -
 lite/operators/decode_bboxes_op.h | 45 -
 lite/operators/density_prior_box_op.cc | 94 -
 lite/operators/density_prior_box_op.h | 46 -
 lite/operators/dropout_op.cc | 78 -
 lite/operators/elementwise_ops.cc | 97 -
 lite/operators/elementwise_ops.h | 66 -
 lite/operators/expand_op.cc | 57 -
 lite/operators/expand_op.h | 44 -
 lite/operators/fake_dequantize_max_abs.cc | 25 -
 lite/operators/fake_dequantize_max_abs.h | 64 -
 .../fake_quantize_moving_avg_max_abs.cc | 25 -
 .../fake_quantize_moving_avg_max_abs.h | 69 -
 lite/operators/fake_quantize_range_abs_max.cc | 25 -
 lite/operators/fake_quantize_range_abs_max.h | 69 -
 lite/operators/fc_op.cc | 107 -
 lite/operators/fc_op.h | 61 -
 lite/operators/fc_op_test.cc | 78 -
 lite/operators/feed_op.cc | 65 -
 lite/operators/fetch_op.cc | 60 -
 lite/operators/fill_constant_op.cc | 59 -
 lite/operators/flatten_op.cc | 99 -
 lite/operators/flatten_op.h | 62 -
 .../fusion_elementwise_activation_ops.cc | 107 -
 .../fusion_elementwise_activation_ops.h | 71 -
 .../fusion_elementwise_activation_ops_test.cc | 63 -
 lite/operators/generate_proposals_op.cc | 86 -
 lite/operators/generate_proposals_op.h | 49 -
 lite/operators/graph_op.cc | 52 -
 lite/operators/graph_op.h | 52 -
 lite/operators/gru_op.cc | 108 -
 lite/operators/gru_op.h | 46 -
 lite/operators/gru_unit_op.cc | 105 -
 lite/operators/gru_unit_op.h | 46 -
 lite/operators/im2sequence_op.cc | 77 -
 lite/operators/im2sequence_op.h | 47 -
 lite/operators/increment_op.cc | 51 -
 lite/operators/increment_op.h | 47 -
 lite/operators/interpolate_op.cc | 101 -
 lite/operators/interpolate_op.h | 47 -
 lite/operators/io_copy_once_op.cc | 30 -
 lite/operators/io_copy_once_op.h | 33 -
 lite/operators/io_copy_op.cc | 46 -
 lite/operators/io_copy_op.h | 42 -
 lite/operators/is_empty_op.cc | 40 -
 lite/operators/is_empty_op.h | 47 -
 lite/operators/layout_once_op.cc | 30 -
 lite/operators/layout_once_op.h | 33 -
 lite/operators/layout_op.cc | 46 -
 lite/operators/layout_op.h | 42 -
 lite/operators/lod_reset_op.cc | 60 -
 lite/operators/lod_reset_op.h | 47 -
 lite/operators/logical_op.cc | 80 -
 lite/operators/logical_op.h | 66 -
 lite/operators/lookup_table_op.cc | 75 -
 lite/operators/lookup_table_op.h | 46 -
 lite/operators/lrn_op.cc | 52 -
 lite/operators/lrn_op.h | 44 -
 lite/operators/matmul_op.cc | 165 -
 lite/operators/matmul_op.h | 50 -
 lite/operators/mean_op.cc | 100 -
 lite/operators/mul_op.cc | 122 -
 lite/operators/mul_op.h | 93 -
 lite/operators/multiclass_nms_op.cc | 80 -
 lite/operators/multiclass_nms_op.h | 45 -
 lite/operators/negative_op.cc | 51 -
 lite/operators/negative_op.h | 46 -
 lite/operators/norm_op.cc | 52 -
 lite/operators/norm_op.h | 47 -
 lite/operators/op_params.cc | 15 -
 lite/operators/op_params.h | 824 --
 lite/operators/pad2d_op.cc | 58 -
 lite/operators/pad2d_op.h | 46 -
 lite/operators/pool_op.cc | 90 -
 lite/operators/pool_op.h | 82 -
 lite/operators/pool_op_test.cc | 90 -
 lite/operators/power_op.cc | 53 -
 lite/operators/power_op.h | 47 -
 lite/operators/prior_box_op.cc | 77 -
 lite/operators/prior_box_op.h | 45 -
 lite/operators/read_from_array_op.cc | 47 -
 lite/operators/read_from_array_op.h | 47 -
 lite/operators/reduce_max_op.cc | 112 -
 lite/operators/reduce_max_op.h | 43 -
 lite/operators/reduce_mean_op.cc | 112 -
 lite/operators/reduce_mean_op.h | 43 -
 lite/operators/relu_op.cc | 49 -
 lite/operators/relu_op.h | 46 -
 lite/operators/reshape_op.cc | 182 -
 lite/operators/reshape_op.h | 63 -
 lite/operators/reshape_op_test.cc | 145 -
 lite/operators/roi_align_op.cc | 71 -
 lite/operators/roi_align_op.h | 48 -
 lite/operators/scale_op.cc | 49 -
 lite/operators/scale_op.h | 46 -
 lite/operators/scale_op_test.cc | 58 -
 lite/operators/sequence_expand_op.cc | 86 -
 lite/operators/sequence_expand_op.h | 46 -
 lite/operators/sequence_pool_op.cc | 55 -
 lite/operators/sequence_pool_op.h | 43 -
 lite/operators/sequence_softmax_op.cc | 50 -
 lite/operators/sequence_softmax_op.h | 47 -
 lite/operators/sgd_op.cc | 55 -
 lite/operators/sgd_op.h | 50 -
 lite/operators/shape_op.cc | 49 -
 lite/operators/shape_op.h | 44 -
 lite/operators/shuffle_channel_op.cc | 52 -
 lite/operators/shuffle_channel_op.h | 50 -
 lite/operators/slice_op.cc | 92 -
 lite/operators/slice_op.h | 47 -
 lite/operators/softmax_op.cc | 59 -
 lite/operators/softmax_op.h | 46 -
 lite/operators/softmax_op_test.cc | 54 -
 lite/operators/split_op.cc | 82 -
 lite/operators/split_op.h | 46 -
 lite/operators/squeeze_op.cc | 133 -
 lite/operators/squeeze_op.h | 61 -
 lite/operators/stack_op.cc | 62 -
 lite/operators/stack_op.h | 47 -
 lite/operators/topk_op.cc | 59 -
 lite/operators/topk_op.h | 46 -
 lite/operators/transpose_op.cc | 165 -
 lite/operators/transpose_op.h | 66 -
 lite/operators/transpose_op_test.cc | 93 -
 lite/operators/uniform_random_op.cc | 45 -
 lite/operators/uniform_random_op.h | 50 -
 lite/operators/while_op.cc | 55 -
 lite/operators/while_op.h | 48 -
 lite/operators/write_to_array_op.cc | 48 -
 lite/operators/write_to_array_op.h | 47 -
 lite/operators/yolo_box_op.cc | 70 -
 lite/operators/yolo_box_op.h | 46 -
 lite/tests/CMakeLists.txt | 1 -
 lite/tests/README.md | 1 -
 lite/tests/kernels/CMakeLists.txt | 54 -
 lite/tests/kernels/activation_compute_test.cc | 557 -
 .../kernels/affine_channel_compute_test.cc | 162 -
 .../kernels/anchor_generator_compute_test.cc | 177 -
 lite/tests/kernels/argmax_compute_test.cc | 130 -
 lite/tests/kernels/assign_compute_test.cc | 80 -
 .../kernels/assign_value_compute_test.cc | 121 -
 lite/tests/kernels/axpy_compute_test.cc | 136 -
 .../kernels/bilinear_interp_compute_test.cc | 282 -
 lite/tests/kernels/box_clip_compute_test.cc | 97 -
 lite/tests/kernels/box_coder_compute_test.cc | 212 -
 lite/tests/kernels/cast_compute_test.cc | 89 -
 lite/tests/kernels/compare_compute_test.cc | 243 -
 .../kernels/conv2d_transpose_compute_test.cc | 465 -
 lite/tests/kernels/crop_compute_test.cc | 129 -
 .../kernels/decode_bboxes_compute_test.cc | 225 -
 .../tests/kernels/elementwise_compute_test.cc | 665 --
 lite/tests/kernels/expand_compute_test.cc | 135 -
 lite/tests/kernels/fc_compute_test.cc | 201 -
 lite/tests/kernels/fill_data.h | 33 -
 .../generate_proposals_compute_test.cc | 183 -
 lite/tests/kernels/gru_unit_test.cc | 363 -
 .../tests/kernels/im2sequence_compute_test.cc | 249 -
 lite/tests/kernels/increment_compute_test.cc | 94 -
 lite/tests/kernels/logical_compute_test.cc | 106 -
 lite/tests/kernels/lrn_compute_test.cc | 206 -
 lite/tests/kernels/matmul_compute_test.cc | 592 -
 .../kernels/nearest_interp_compute_test.cc | 192 -
 lite/tests/kernels/negative_compute_test.cc | 80 -
 lite/tests/kernels/norm_compute_test.cc | 110 -
 lite/tests/kernels/pad2d_compute_test.cc | 182 -
 lite/tests/kernels/power_compute_test.cc | 99 -
 lite/tests/kernels/prior_box_compute_test.cc | 752 --
 .../kernels/read_from_array_compute_test.cc | 105 -
 lite/tests/kernels/reduce_max_compute_test.cc | 347 -
 .../tests/kernels/reduce_mean_compute_test.cc | 346 -
 lite/tests/kernels/roi_align_compute_test.cc | 133 -
 lite/tests/kernels/scale_compute_test.cc | 125 -
 .../kernels/sequence_expand_compute_test.cc | 188 -
 .../kernels/sequence_pool_compute_test.cc | 195 -
 .../kernels/sequence_softmax_compute_test.cc | 123 -
 lite/tests/kernels/shape_compute_test.cc | 87 -
 .../kernels/shuffle_channel_compute_test.cc | 110 -
 lite/tests/kernels/slice_compute_test.cc | 190 -
 lite/tests/kernels/squeeze_compute_test.cc | 253 -
 lite/tests/kernels/stack_compute_test.cc | 116 -
 lite/tests/kernels/test_funcs.h | 191 -
 lite/tests/kernels/test_sgemm.cc | 353 -
 lite/tests/kernels/topk_compute_test.cc | 119 -
 .../kernels/write_to_array_compute_test.cc | 116 -
 lite/tests/kernels/yolo_box_compute_test.cc | 254 -
 lite/tools/CMakeLists.txt | 1 -
 lite/tools/Dockerfile.mobile | 96 -
 lite/tools/benchmark.sh | 58 -
 lite/tools/build.sh | 272 -
 lite/tools/build_fpga.sh | 26 -
 lite/tools/build_npu.sh | 178 -
 lite/tools/ci_build.sh | 955 --
 lite/tools/cmake_tools/ast.py | 321 -
 .../create_fake_kernel_registry.py | 104 -
 .../cmake_tools/parse_kernel_registry.py | 50 -
 lite/tools/cmake_tools/parse_op_registry.py | 49 -
 lite/tools/cmake_tools/utils.py | 18 -
 lite/tools/debug/CMakeLists.txt | 15 -
 lite/tools/debug/analysis_tool.py | 401 -
 lite/tools/debug/check_model.sh | 182 -
 lite/tools/debug/debug_utils.cc | 15 -
lite/tools/debug/debug_utils.h | 337 - lite/tools/debug/model_debug_tool.cc | 112 - lite/tools/gitlab_review.sh | 75 - lite/tools/mobile_readme.md | 135 - lite/tools/prepare_benchmark.sh | 46 - lite/tools/python/lite_test.py | 103 - lite/tools/search_support_ops.py | 66 - lite/utils/CMakeLists.txt | 26 - lite/utils/all.h | 28 - lite/utils/any.cc | 23 - lite/utils/any.h | 71 - lite/utils/check.h | 41 - lite/utils/container.h | 51 - lite/utils/cp_logging.cc | 19 - lite/utils/cp_logging.h | 21 - lite/utils/factory.h | 100 - lite/utils/hash.h | 28 - lite/utils/io.h | 56 - lite/utils/logging.cc | 63 - lite/utils/logging.h | 185 - lite/utils/logging_test.cc | 31 - lite/utils/macros.h | 55 - lite/utils/paddle_enforce.h | 39 - lite/utils/replace_stl/stream.cc | 105 - lite/utils/replace_stl/stream.h | 76 - lite/utils/string.cc | 19 - lite/utils/string.h | 97 - lite/utils/varient.h | 151 - lite/utils/varient_test.cc | 58 - .../MobileNetDemo.xcodeproj/project.pbxproj | 504 - .../contents.xcworkspacedata | 7 - .../xcshareddata/IDEWorkspaceChecks.plist | 8 - .../MobileNetDemo/AppDelegate.swift | 46 - .../AppIcon.appiconset/Contents.json | 98 - .../Assets.xcassets/Contents.json | 6 - .../Base.lproj/LaunchScreen.storyboard | 25 - .../MobileNetDemo/Base.lproj/Main.storyboard | 166 - metal/MobileNetDemo/MobileNetDemo/Info.plist | 47 - .../MobileNetDemo/MobileNet.swift | 76 - .../MobileNetDemo/MobilenetPreProcess.metal | 38 - .../MobileNetDemo/ViewController.swift | 94 - .../project.pbxproj | 457 - .../contents.xcworkspacedata | 7 - .../xcshareddata/IDEWorkspaceChecks.plist | 8 - .../xcschemes/PaddleMobileTest.xcscheme | 91 - .../PaddleMobileTest/AppDelegate.swift | 46 - .../AppIcon.appiconset/Contents.json | 98 - .../Assets.xcassets/Contents.json | 6 - .../Base.lproj/LaunchScreen.storyboard | 25 - .../Base.lproj/Main.storyboard | 45 - .../PaddleMobileTest/Info.plist | 52 - .../PaddleMobileTest/TestViewController.swift | 478 - .../PaddleMobileTest/ViewController.swift | 122 - metal/Podfile | 40 - metal/README.md | 12 - .../project.pbxproj | 742 -- .../contents.xcworkspacedata | 7 - .../xcshareddata/IDEWorkspaceChecks.plist | 8 - .../UserInterfaceState.xcuserstate | Bin 5181 -> 0 bytes .../xcschemes/paddle-mobile-demo.xcscheme | 91 - .../paddle-mobile-demo/AppDelegate.swift | 51 - .../AppIcon.appiconset/Contents.json | 98 - .../Assets.xcassets/Contents.json | 6 - .../paddle-mobile.imageset/Contents.json | 21 - .../paddle-mobile.imageset/paddle-mobile.png | Bin 5331 -> 0 bytes .../Base.lproj/LaunchScreen.storyboard | 25 - .../Base.lproj/Main.storyboard | 325 - .../paddle-mobile-demo/Info.plist | 47 - .../paddle-mobile-demo/MetalHelper.swift | 31 - .../MultiPredictViewController.swift | 66 - .../Net/BufferToTexture.metal | 35 - .../paddle-mobile-demo/Net/CPUCompute.h | 44 - .../paddle-mobile-demo/Net/CPUCompute.mm | 318 - .../paddle-mobile-demo/Net/Genet.swift | 61 - .../paddle-mobile-demo/Net/MobileNet.swift | 75 - .../Net/MobileNetCombined.swift | 70 - .../paddle-mobile-demo/Net/MobileNetSSD.swift | 64 - .../Net/MobilenetSSD_AR.swift | 62 - .../Net/PreProcessKernel.metal | 117 - .../paddle-mobile-demo/Net/YoloNet.swift | 47 - .../paddle-mobile-demo/OC/ImageTool.h | 22 - .../paddle-mobile-demo/OC/ImageTool.m | 38 - .../OCDemo/LoadPointerViewController.h | 23 - .../OCDemo/LoadPointerViewController.m | 116 - .../OCDemo/OCDemoViewController.h | 22 - .../OCDemo/OCDemoViewController.m | 19 - .../OCInterface/PaddleMobileGPU.h | 106 - .../OCInterface/PaddleMobileGPU.m | 107 - .../OCInterface/SuperResolutionNet.swift 
.../VideoCapture/FPSCounter.swift | 31 -
.../VideoCapture/VideoCapture.swift | 218 -
.../paddle-mobile-demo/ViewController.swift | 302 -
.../metal/BatchNormKernel.metal | 42 -
.../metal/BatchNormRelu.metal | 36 -
.../metal/BilinearInterp.inc.metal | 49 -
.../metal/BilinearInterp.metal | 29 -
.../metal/BoxCoder.inc.metal | 54 -
.../paddle-mobile-demo/metal/BoxCoder.metal | 23 -
.../paddle-mobile-demo/metal/Common.metal | 120 -
.../metal/ConcatKernel.inc.metal | 318 -
.../metal/ConcatKernel.metal | 171 -
.../metal/ConvAddBNReluKernel.metal | 310 -
.../metal/ConvAddMetal.metal | 622 --
.../metal/ConvAddPrelu.inc.metal | 447 -
.../metal/ConvAddPreluKernel.metal | 65 -
.../metal/ConvBNReluKernel.metal | 297 -
.../paddle-mobile-demo/metal/ConvKernel.metal | 280 -
.../metal/ConvTransposeKernel.metal | 174 -
.../metal/Elementwise.metal | 100 -
.../metal/ElementwiseAddPreluKernel.inc.metal | 91 -
.../metal/ElementwiseAddPreluKernel.metal | 75 -
.../metal/FetchKernel.inc.metal | 46 -
.../metal/FetchKernel.metal | 40 -
.../paddle-mobile-demo/metal/Kernels.metal | 69 -
.../paddle-mobile-demo/metal/Macro.metal | 29 -
.../metal/NMSFetchResultKernel.metal | 80 -
.../metal/PoolKernel.inc.metal | 44 -
.../paddle-mobile-demo/metal/PoolKernel.metal | 36 -
.../metal/PreluKernel.metal | 151 -
.../metal/PriorBoxKernel.metal | 367 -
.../paddle-mobile-demo/metal/ReluKernel.metal | 41 -
.../metal/ReshapeKernel.inc.metal | 66 -
.../metal/ReshapeKernel.metal | 150 -
.../metal/ResizeBilinear.metal | 75 -
.../paddle-mobile-demo/metal/Shape.metal | 21 -
.../metal/Softmax.inc.metal | 61 -
.../paddle-mobile-demo/metal/Softmax.metal | 29 -
.../paddle-mobile-demo/metal/Split.inc.metal | 122 -
.../paddle-mobile-demo/metal/Split.metal | 64 -
.../metal/TransposeKernel.inc.metal | 60 -
.../metal/TransposeKernel.metal | 63 -
.../paddle-mobile-demo-Bridging-Header.h | 6 -
.../project.pbxproj | 407 -
.../contents.xcworkspacedata | 7 -
.../xcshareddata/IDEWorkspaceChecks.plist | 8 -
.../xcschemes/paddle-mobile-metallib.xcscheme | 80 -
.../ActivationKernel.metal | 64 -
.../BatchNormKernel.metal | 42 -
.../BatchNormRelu.metal | 28 -
.../BilinearInterp.inc.metal | 49 -
.../BilinearInterp.metal | 29 -
.../paddle-mobile-metallib/BoxCoder.inc.metal | 54 -
.../paddle-mobile-metallib/BoxCoder.metal | 23 -
.../BufferToTexture.metal | 67 -
.../paddle-mobile-metallib/Common.metal | 136 -
.../ConcatKernel.inc.metal | 318 -
.../paddle-mobile-metallib/ConcatKernel.metal | 219 -
.../ConvAddBNReluKernel.metal | 310 -
.../ConvAddPrelu.inc.metal | 447 -
.../ConvAddPreluKernel.metal | 65 -
.../ConvAddReluMetal.metal | 889 --
.../ConvBNReluKernel.metal | 297 -
.../ConvTransposeKernel.metal | 174 -
.../paddle-mobile-metallib/Elementwise.metal | 90 -
.../ElementwiseAddPreluKernel.inc.metal | 91 -
.../ElementwiseAddPreluKernel.metal | 65 -
.../FetchKernel.inc.metal | 60 -
.../paddle-mobile-metallib/FetchKernel.metal | 40 -
.../paddle-mobile-metallib/Kernels.metal | 69 -
.../paddle-mobile-metallib/Macro.metal | 29 -
.../NMSFetchResultKernel.metal | 80 -
.../NearestInterpKernel.metal | 50 -
.../PoolKernel.inc.metal | 50 -
.../paddle-mobile-metallib/PoolKernel.metal | 36 -
.../paddle-mobile-metallib/PreluKernel.metal | 151 -
.../PriorBoxKernel.metal | 367 -
.../paddle-mobile-metallib/ReluKernel.metal | 104 -
.../ReshapeKernel.inc.metal | 66 -
.../ReshapeKernel.metal | 150 -
.../ResizeBilinear.metal | 75 -
.../paddle-mobile-metallib/Scale.metal | 30 -
.../paddle-mobile-metallib/ScaleKernel.metal | 82 -
.../paddle-mobile-metallib/Shape.metal | 21 -
.../paddle-mobile-metallib/SliceKernel.metal | 75 -
.../paddle-mobile-metallib/Softmax.inc.metal | 61 -
.../paddle-mobile-metallib/Softmax.metal | 29 -
.../paddle-mobile-metallib/Split.inc.metal | 122 -
.../paddle-mobile-metallib/Split.metal | 64 -
.../TransposeKernel.inc.metal | 60 -
.../TransposeKernel.metal | 63 -
.../project.pbxproj | 478 -
.../contents.xcworkspacedata | 7 -
.../xcshareddata/IDEWorkspaceChecks.plist | 8 -
.../UserInterfaceState.xcuserstate | Bin 5178 -> 0 bytes
.../paddle-mobile-unit-test/AppDelegate.swift | 50 -
.../AppIcon.appiconset/Contents.json | 98 -
.../Assets.xcassets/Contents.json | 6 -
.../Base.lproj/LaunchScreen.storyboard | 25 -
.../Base.lproj/Main.storyboard | 24 -
.../paddle-mobile-unit-test/Info.plist | 45 -
.../ViewController.swift | 35 -
.../paddle-mobile.xcodeproj/project.pbxproj | 913 --
.../contents.xcworkspacedata | 7 -
.../xcshareddata/IDEWorkspaceChecks.plist | 8 -
.../UserInterfaceState.xcuserstate | Bin 9571 -> 0 bytes
.../xcschemes/paddle-mobile.xcscheme | 80 -
.../paddle-mobile/API/GlobalConfig.swift | 40 -
.../paddle-mobile/paddle-mobile/API/Net.swift | 99 -
.../paddle-mobile/API/Runner.swift | 415 -
metal/paddle-mobile/paddle-mobile/Info.plist | 24 -
.../paddle-mobile/Src/Common/Errors.swift | 56 -
.../paddle-mobile/Src/Common/Extensions.swift | 124 -
.../Src/Common/MetalExtension.swift | 666 --
.../Src/Common/PaddleMobileUnitTest.swift | 362 -
.../paddle-mobile/Src/Common/Tools.swift | 56 -
.../paddle-mobile/Src/Common/Types.swift | 224 -
.../paddle-mobile/Src/Framework/Dim.swift | 56 -
.../Src/Framework/Executor.swift | 168 -
.../paddle-mobile/Src/Framework/Loader.swift | 297 -
.../paddle-mobile/Src/Framework/Tensor.swift | 585 -
.../paddle-mobile/Src/Framework/Texture.swift | 262 -
.../paddle-mobile/Src/Framework/Utils.swift | 29 -
.../Src/Operators/Base/OpCreator.swift | 84 -
.../Src/Operators/Base/OpParam.swift | 159 -
.../Src/Operators/Base/Operator.swift | 219 -
.../Src/Operators/BatchNormOp.swift | 57 -
.../Src/Operators/BilinearInterpOp.swift | 58 -
.../Src/Operators/BoxcoderOp.swift | 79 -
.../Src/Operators/CNNMPSConvOp.swift | 75 -
.../Src/Operators/ConcatOp.swift | 75 -
.../Src/Operators/ConvAddAddPreluOp.swift | 105 -
.../Operators/ConvAddBatchNormReluOp.swift | 125 -
.../Src/Operators/ConvAddOp.swift | 70 -
.../Src/Operators/ConvAddPreluOp.swift | 97 -
.../Src/Operators/ConvAddReluOp.swift | 121 -
.../Src/Operators/ConvBNReluOp.swift | 111 -
.../paddle-mobile/Src/Operators/ConvOp.swift | 75 -
.../Src/Operators/ConvReluOp.swift | 73 -
.../Src/Operators/ConvTransposeOp.swift | 53 -
.../Src/Operators/DepthwiseConvOp.swift | 55 -
.../Src/Operators/DwConvBNReluOp.swift | 70 -
.../Src/Operators/ElementwiseAddOp.swift | 88 -
.../Src/Operators/ElementwiseAddPreluOp.swift | 108 -
.../paddle-mobile/Src/Operators/ExpOp.swift | 47 -
.../paddle-mobile/Src/Operators/FeedOp.swift | 67 -
.../paddle-mobile/Src/Operators/FetchOp.swift | 50 -
.../Src/Operators/FlattenOp.swift | 78 -
.../Src/Operators/Kernels/Base/Kernel.swift | 238 -
.../Operators/Kernels/BatchNormKernel.swift | 68 -
.../Kernels/BatchNormReluKernel.swift | 91 -
.../Kernels/BilinearInterpKernel.swift | 69 -
.../Operators/Kernels/BoxcoderKernel.swift | 66 -
.../Src/Operators/Kernels/CNNConvKernel.swift | 176 -
.../Src/Operators/Kernels/Concat.swift | 31 -
.../Src/Operators/Kernels/ConcatKernel.swift | 164 -
.../Kernels/ConvAddAddPreluKernel.swift | 175 -
.../Kernels/ConvAddBatchNormReluKernel.swift | 217 -
.../Src/Operators/Kernels/ConvAddKernel.swift | 27 -
.../Kernels/ConvAddPreluKernel.swift | 175 -
.../Operators/Kernels/ConvAddReluKernel.swift | 394 -
.../Operators/Kernels/ConvBNReluKernel.swift | 214 -
.../Src/Operators/Kernels/ConvKernel.swift | 207 -
.../Operators/Kernels/ConvReluKernel.swift | 27 -
.../Kernels/ConvTransposeKernel.swift | 102 -
.../Kernels/ElementwiseAddKernel.swift | 100 -
.../Kernels/ElementwiseAddPreluKernel.swift | 77 -
.../Src/Operators/Kernels/ExpKernel.swift | 52 -
.../Src/Operators/Kernels/FetchKernel.swift | 92 -
.../Src/Operators/Kernels/FlattenKernel.swift | 149 -
.../Operators/Kernels/LeakyReluKernel.swift | 58 -
.../Kernels/MulticlassNMSKernel.swift | 70 -
.../Kernels/NearestInterpKernel.swift | 58 -
.../Src/Operators/Kernels/PoolKernel.swift | 84 -
.../Src/Operators/Kernels/PreluKernel.swift | 67 -
.../Operators/Kernels/PriorBoxKernel.swift | 162 -
.../Src/Operators/Kernels/Relu6Kernel.swift | 58 -
.../Src/Operators/Kernels/ReluKernel.swift | 51 -
.../Src/Operators/Kernels/ReshapeKernel.swift | 97 -
.../Kernels/ResizeBilinearKernel.swift | 64 -
.../Src/Operators/Kernels/Scale.swift | 36 -
.../Src/Operators/Kernels/ScaleOpKernel.swift | 74 -
.../Src/Operators/Kernels/ShapeKernel.swift | 44 -
.../Src/Operators/Kernels/SigmoidKernel.swift | 52 -
.../Src/Operators/Kernels/SliceKernel.swift | 101 -
.../Src/Operators/Kernels/SoftmaxKernel.swift | 66 -
.../Src/Operators/Kernels/SplitKernel.swift | 108 -
.../Kernels/Texture2DTo2DArrayKernel.swift | 55 -
.../Operators/Kernels/TransposeKernel.swift | 91 -
.../Kernels/metal/BatchNormRelu.metal | 36 -
.../Kernels/metal/ResizeBilinear.metal | 75 -
.../Src/Operators/LeakyReluOp.swift | 49 -
.../Src/Operators/MulticlassNMSOp.swift | 68 -
.../Src/Operators/NearestInterpOp.swift | 52 -
.../paddle-mobile/Src/Operators/PoolOp.swift | 71 -
.../paddle-mobile/Src/Operators/PreluOp.swift | 65 -
.../Src/Operators/PriorBoxOp.swift | 76 -
.../paddle-mobile/Src/Operators/Relu6Op.swift | 52 -
.../paddle-mobile/Src/Operators/ReluOp.swift | 48 -
.../Src/Operators/ReshapeOp.swift | 73 -
.../Src/Operators/ResizeBilinearOp.swift | 47 -
.../paddle-mobile/Src/Operators/ScaleOp.swift | 53 -
.../paddle-mobile/Src/Operators/ShapeOp.swift | 49 -
.../Src/Operators/SigmoidOp.swift | 47 -
.../paddle-mobile/Src/Operators/SliceOp.swift | 63 -
.../Src/Operators/SoftmaxOp.swift | 55 -
.../paddle-mobile/Src/Operators/SplitOp.swift | 69 -
.../Src/Operators/TransposeOp.swift | 49 -
.../paddle-mobile/Src/Program/Attribute.swift | 84 -
.../Src/Program/Framework.pbobjc.h | 599 -
.../Src/Program/Framework.pbobjc.m | 1417 ---
.../Src/Program/MemoryOptimze.swift | 206 -
.../Src/Program/PMBlockDesc.swift | 67 -
.../paddle-mobile/Src/Program/PMOpDesc.swift | 85 -
.../Src/Program/PMProgramDesc.swift | 44 -
.../paddle-mobile/Src/Program/PMVarDesc.swift | 104 -
.../paddle-mobile/Src/Program/Program.swift | 31 -
.../Src/Program/ProgramOptimize.swift | 308 -
.../paddle-mobile/Src/Program/Scope.swift | 55 -
.../Src/Program/TensorDesc.swift | 89 -
.../Src/Program/framework.pb.swift | 1820 ----
.../paddle-mobile/paddle_mobile.h | 24 -
mobile.md | 7 +
mobile/.clang-format | 5 -
mobile/.clang-tidy | 67 -
mobile/.gitignore | 103 -
mobile/.pre-commit-config.yaml | 69 -
mobile/.travis.yml | 36 -
mobile/.travis/pre-commit-job.sh | 21 -
mobile/CMakeLists.txt | 293 -
mobile/CONTRIBUTING.md | 234 -
mobile/Dockerfile | 38 -
mobile/LICENSE | 204 -
mobile/README.md | 137 -
mobile/benchmark/arm_benchmark.md | 36 -
mobile/benchmark/metal_benchmark.md | 10 -
mobile/demo/ReadMe.md | 10 -
mobile/demo/getDemo.sh | 8 -
mobile/doc/build.md | 63 -
mobile/doc/design_doc.md | 171 -
mobile/doc/development_android.md | 189 -
mobile/doc/development_android_GPU.md | 77 -
mobile/doc/development_arm_linux.md | 62 -
mobile/doc/development_fpga.md | 5 -
mobile/doc/development_ios.md | 85 -
mobile/doc/quantification.md | 33 -
mobile/src/common/common.h | 31 -
mobile/src/common/enforce.h | 73 -
mobile/src/common/log.h | 235 -
mobile/src/common/threadpool.h | 126 -
mobile/src/common/type_define.h | 187 -
mobile/src/common/types.cpp | 260 -
mobile/src/common/types.h | 268 -
mobile/src/common/util.cpp | 46 -
mobile/src/common/util.h | 26 -
mobile/src/common/variant.h | 106 -
mobile/src/fpga/KD/alignment.h | 32 -
mobile/src/fpga/KD/context.hpp | 55 -
mobile/src/fpga/KD/dl_engine.cpp | 15 -
mobile/src/fpga/KD/dl_engine.hpp | 33 -
mobile/src/fpga/KD/float16.hpp | 506 -
mobile/src/fpga/KD/layout.hpp | 99 -
mobile/src/fpga/KD/llapi/bias_scale.cpp | 100 -
mobile/src/fpga/KD/llapi/bias_scale.h | 29 -
mobile/src/fpga/KD/llapi/config.h | 19 -
mobile/src/fpga/KD/llapi/filter.cpp | 346 -
mobile/src/fpga/KD/llapi/filter.h | 54 -
mobile/src/fpga/KD/llapi/image.cpp | 149 -
mobile/src/fpga/KD/llapi/image.h | 38 -
mobile/src/fpga/KD/llapi/zynqmp_api.cpp | 384 -
mobile/src/fpga/KD/llapi/zynqmp_api.h | 329 -
mobile/src/fpga/KD/pe.hpp | 45 -
mobile/src/fpga/KD/pe_params.hpp | 179 -
mobile/src/fpga/KD/pes/concat_pe.hpp | 70 -
mobile/src/fpga/KD/pes/conv_pe.hpp | 96 -
mobile/src/fpga/KD/pes/conv_process.hpp | 374 -
mobile/src/fpga/KD/pes/depthwise_conv_pe.hpp | 98 -
mobile/src/fpga/KD/pes/elementwise_add_pe.hpp | 74 -
mobile/src/fpga/KD/pes/fully_connected_pe.hpp | 98 -
mobile/src/fpga/KD/pes/input_pe.hpp | 53 -
mobile/src/fpga/KD/pes/math_func_neon.h | 330 -
mobile/src/fpga/KD/pes/output_pe.hpp | 52 -
mobile/src/fpga/KD/pes/pooling_pe.hpp | 72 -
mobile/src/fpga/KD/pes/softmax_pe.cpp | 162 -
mobile/src/fpga/KD/pes/softmax_pe.hpp | 44 -
mobile/src/fpga/KD/shape.hpp | 112 -
mobile/src/fpga/KD/tensor.hpp | 281 -
mobile/src/fpga/KD/tensor_util.cpp | 31 -
mobile/src/fpga/KD/tensor_util.hpp | 25 -
mobile/src/fpga/V1/api.cpp | 1021 --
mobile/src/fpga/V1/api.h | 102 -
mobile/src/fpga/V1/bias_scale.cpp | 102 -
mobile/src/fpga/V1/bias_scale.h | 29 -
mobile/src/fpga/V1/deconv_bias_scale.cpp | 48 -
mobile/src/fpga/V1/deconv_bias_scale.h | 26 -
mobile/src/fpga/V1/deconv_filter.cpp | 280 -
mobile/src/fpga/V1/deconv_filter.h | 39 -
mobile/src/fpga/V1/filter.cpp | 362 -
mobile/src/fpga/V1/filter.h | 50 -
mobile/src/fpga/V1/image.cpp | 138 -
mobile/src/fpga/V1/image.h | 76 -
mobile/src/fpga/V1/pe.cpp | 1180 --
mobile/src/fpga/V2/api.cpp | 1011 --
mobile/src/fpga/V2/api.h | 94 -
mobile/src/fpga/V2/bias_scale.cpp | 102 -
mobile/src/fpga/V2/bias_scale.h | 29 -
mobile/src/fpga/V2/deconv_bias_scale.cpp | 48 -
mobile/src/fpga/V2/deconv_bias_scale.h | 26 -
mobile/src/fpga/V2/deconv_filter.cpp | 280 -
mobile/src/fpga/V2/deconv_filter.h | 39 -
mobile/src/fpga/V2/filter.cpp | 362 -
mobile/src/fpga/V2/filter.h | 50 -
mobile/src/fpga/V2/image.cpp | 146 -
mobile/src/fpga/V2/image.h | 71 -
mobile/src/fpga/V2/pe.cpp | 1175 --
mobile/src/fpga/common/config.h | 18 -
mobile/src/fpga/common/driver.cpp | 295 -
mobile/src/fpga/common/driver.h | 141 -
mobile/src/fpga/common/fpga_common.cpp | 214 -
mobile/src/fpga/common/fpga_common.h | 330 -
mobile/src/fpga/common/pe.h | 35 -
mobile/src/framework/CMakeLists.txt | 0
mobile/src/framework/attribute.cpp | 40 -
mobile/src/framework/attribute.h | 183 -
mobile/src/framework/cl/cl_deleter.h | 59 -
mobile/src/framework/cl/cl_engine.cpp | 136 -
mobile/src/framework/cl/cl_engine.h | 256 -
mobile/src/framework/cl/cl_half.cpp | 518 -
mobile/src/framework/cl/cl_half.h | 32 -
mobile/src/framework/cl/cl_helper.h | 91 -
mobile/src/framework/cl/cl_image.cpp | 156 -
mobile/src/framework/cl/cl_image.h | 312 -
.../src/framework/cl/cl_image_converter.cpp | 510 -
mobile/src/framework/cl/cl_image_converter.h | 121 -
mobile/src/framework/cl/cl_scope.h | 125 -
mobile/src/framework/cl/cl_tensor.h | 193 -
mobile/src/framework/cl/cl_tool.cpp | 84 -
mobile/src/framework/cl/cl_tool.h | 34 -
mobile/src/framework/context.cpp | 605 --
mobile/src/framework/context.h | 81 -
mobile/src/framework/data_layout.h | 63 -
mobile/src/framework/data_type.cpp | 106 -
mobile/src/framework/data_type.h | 80 -
mobile/src/framework/ddim.cpp | 327 -
mobile/src/framework/ddim.h | 192 -
mobile/src/framework/dim.h | 335 -
mobile/src/framework/executor.cpp | 1102 --
mobile/src/framework/executor.h | 124 -
mobile/src/framework/framework.pb-c.cpp | 1465 ---
mobile/src/framework/framework.pb-c.h | 615 --
mobile/src/framework/framework.proto | 196 -
mobile/src/framework/load_ops.h | 379 -
mobile/src/framework/loader.cpp | 290 -
mobile/src/framework/loader.h | 65 -
mobile/src/framework/lod_tensor.cpp | 192 -
mobile/src/framework/lod_tensor.h | 234 -
mobile/src/framework/mixed_vector.h | 271 -
mobile/src/framework/op_info.h | 96 -
mobile/src/framework/op_kernel_type.h | 60 -
mobile/src/framework/op_proto_maker.h | 22 -
mobile/src/framework/op_registry.h | 125 -
mobile/src/framework/operator.cpp | 154 -
mobile/src/framework/operator.h | 203 -
mobile/src/framework/program/block_desc.cpp | 44 -
mobile/src/framework/program/block_desc.h | 86 -
mobile/src/framework/program/op_desc.cpp | 100 -
mobile/src/framework/program/op_desc.h | 78 -
.../program-optimize/fusion_op_register.h | 82 -
.../program/program-optimize/node.cpp | 281 -
.../framework/program/program-optimize/node.h | 81 -
.../program-optimize/program_optimize.cpp | 300 -
.../program-optimize/program_optimize.h | 45 -
mobile/src/framework/program/program.h | 40 -
mobile/src/framework/program/program_desc.cpp | 118 -
mobile/src/framework/program/program_desc.h | 62 -
mobile/src/framework/program/tensor_desc.h | 75 -
mobile/src/framework/program/var_desc.h | 80 -
mobile/src/framework/scope.cpp | 155 -
mobile/src/framework/scope.h | 113 -
mobile/src/framework/selected_rows.cpp | 127 -
mobile/src/framework/selected_rows.h | 138 -
mobile/src/framework/tensor.h | 355 -
mobile/src/framework/tensor_base.h | 147 -
mobile/src/framework/tensor_util.cpp | 30 -
mobile/src/framework/tensor_util.h | 39 -
mobile/src/framework/type_trait.h | 44 -
mobile/src/framework/variable.h | 96 -
mobile/src/framework/zynqmp/ztensor.hpp | 312 -
mobile/src/io/api.cc | 85 -
mobile/src/io/api_paddle_mobile.cc | 259 -
mobile/src/io/api_paddle_mobile.h | 53 -
mobile/src/io/ios_io/PaddleMobileCPU.h | 184 -
mobile/src/io/ios_io/PaddleMobileCPU.mm | 410 -
mobile/src/io/jni/PML.java | 66 -
mobile/src/io/jni/paddle_mobile_jni.cpp | 465 -
mobile/src/io/jni/paddle_mobile_jni.h | 91 -
mobile/src/io/loader.h | 49 -
mobile/src/io/opencl_interface.cpp | 35 -
mobile/src/io/opencl_interface.h | 27 -
mobile/src/io/paddle_inference_api.h | 178 -
mobile/src/io/paddle_mobile.cpp | 545 -
mobile/src/io/paddle_mobile.h | 124 -
mobile/src/io/paddle_mobile_wrap.cpp | 361 -
mobile/src/io/paddle_mobile_wrap.h | 97 -
mobile/src/io/paddle_test_inference_api.cpp | 36 -
mobile/src/io/paddle_test_inference_api.h | 35 -
mobile/src/memory/t_malloc.cpp | 92 -
mobile/src/memory/t_malloc.h | 63 -
mobile/src/operators/activation_op.cpp | 105 -
mobile/src/operators/activation_op.h | 47 -
mobile/src/operators/assign_op.cpp | 39 -
mobile/src/operators/assign_op.h | 33 -
mobile/src/operators/assign_value_op.cpp | 37 -
mobile/src/operators/assign_value_op.h | 33 -
mobile/src/operators/batchnorm_op.cpp | 44 -
mobile/src/operators/batchnorm_op.h | 48 -
.../src/operators/beam_search_decode_op.cpp | 34 -
mobile/src/operators/beam_search_decode_op.h | 32 -
mobile/src/operators/beam_search_op.cpp | 34 -
mobile/src/operators/beam_search_op.h | 31 -
mobile/src/operators/bilinear_interp_op.cpp | 55 -
mobile/src/operators/bilinear_interp_op.h | 48 -
mobile/src/operators/box_coder_op.cpp | 64 -
mobile/src/operators/box_coder_op.h | 49 -
mobile/src/operators/cast_op.cpp | 36 -
mobile/src/operators/cast_op.h | 45 -
mobile/src/operators/compare_op.cpp | 45 -
mobile/src/operators/compare_op.h | 34 -
mobile/src/operators/concat_op.cpp | 77 -
mobile/src/operators/concat_op.h | 45 -
mobile/src/operators/conditional_block_op.cpp | 34 -
mobile/src/operators/conditional_block_op.h | 34 -
.../tensor_array_read_write_op.cpp | 43 -
.../controlflow/tensor_array_read_write_op.h | 34 -
mobile/src/operators/controlflow/while_op.cpp | 36 -
mobile/src/operators/controlflow/while_op.h | 30 -
mobile/src/operators/conv_op.cpp | 67 -
mobile/src/operators/conv_op.h | 45 -
mobile/src/operators/conv_transpose_op.cpp | 36 -
mobile/src/operators/conv_transpose_op.h | 97 -
mobile/src/operators/crf_op.cpp | 55 -
mobile/src/operators/crf_op.h | 46 -
mobile/src/operators/depthwise_conv_op.cpp | 62 -
mobile/src/operators/depthwise_conv_op.h | 43 -
mobile/src/operators/dequantize_op.cpp | 36 -
mobile/src/operators/dequantize_op.h | 46 -
mobile/src/operators/detection_ops.cpp | 145 -
mobile/src/operators/detection_ops.h | 46 -
mobile/src/operators/dropout_op.cpp | 40 -
mobile/src/operators/dropout_op.h | 49 -
mobile/src/operators/elementwise_add_op.cpp | 44 -
mobile/src/operators/elementwise_add_op.h | 47 -
mobile/src/operators/elementwise_mul_op.cpp | 39 -
mobile/src/operators/elementwise_mul_op.h | 51 -
mobile/src/operators/elementwise_sub_op.cpp | 38 -
mobile/src/operators/elementwise_sub_op.h | 51 -
mobile/src/operators/exp_op.cpp | 36 -
mobile/src/operators/exp_op.h | 30 -
mobile/src/operators/feed_op.cpp | 47 -
mobile/src/operators/feed_op.h | 45 -
mobile/src/operators/fetch_op.cpp | 39 -
mobile/src/operators/fetch_op.h | 44 -
.../fill_constant_batch_size_like_op.cpp | 25 -
.../fill_constant_batch_size_like_op.h | 96 -
mobile/src/operators/fill_constant_op.cpp | 27 -
mobile/src/operators/fill_constant_op.h | 79 -
mobile/src/operators/flatten2_op.cpp | 48 -
mobile/src/operators/flatten2_op.h | 34 -
mobile/src/operators/flatten_op.cpp | 52 -
mobile/src/operators/flatten_op.h | 71 -
.../src/operators/fusion_conv_add_bn_op.cpp | 61 -
mobile/src/operators/fusion_conv_add_bn_op.h | 76 -
.../operators/fusion_conv_add_bn_relu_op.cpp | 64 -
.../operators/fusion_conv_add_bn_relu_op.h | 77 -
mobile/src/operators/fusion_conv_add_op.cpp | 64 -
mobile/src/operators/fusion_conv_add_op.h | 66 -
.../src/operators/fusion_conv_add_relu_op.cpp | 62 -
.../src/operators/fusion_conv_add_relu_op.h | 68 -
.../operators/fusion_conv_bn_add_relu_op.cpp | 65 -
.../operators/fusion_conv_bn_add_relu_op.h | 83 -
mobile/src/operators/fusion_conv_bn_op.cpp | 61 -
mobile/src/operators/fusion_conv_bn_op.h | 72 -
.../src/operators/fusion_conv_bn_relu_op.cpp | 64 -
mobile/src/operators/fusion_conv_bn_relu_op.h | 74 -
mobile/src/operators/fusion_conv_relu_op.cpp | 64 -
mobile/src/operators/fusion_conv_relu_op.h | 66 -
.../src/operators/fusion_deconv_add_bn_op.cpp | 32 -
.../src/operators/fusion_deconv_add_bn_op.h | 116 -
.../fusion_deconv_add_bn_relu_op.cpp | 33 -
.../operators/fusion_deconv_add_bn_relu_op.h | 118 -
mobile/src/operators/fusion_deconv_add_op.cpp | 32 -
mobile/src/operators/fusion_deconv_add_op.h | 108 -
.../operators/fusion_deconv_add_relu_op.cpp | 33 -
.../src/operators/fusion_deconv_add_relu_op.h | 110 -
.../operators/fusion_deconv_bn_relu_op.cpp | 32 -
.../src/operators/fusion_deconv_bn_relu_op.h | 115 -
.../src/operators/fusion_deconv_relu_op.cpp | 31 -
mobile/src/operators/fusion_deconv_relu_op.h | 107 -
.../operators/fusion_dequant_add_bn_op.cpp | 38 -
.../src/operators/fusion_dequant_add_bn_op.h | 75 -
.../fusion_dequant_add_bn_relu_op.cpp | 40 -
.../operators/fusion_dequant_add_bn_relu_op.h | 77 -
.../fusion_dequant_add_bn_relu_quant_op.cpp | 62 -
.../fusion_dequant_add_bn_relu_quant_op.h | 123 -
mobile/src/operators/fusion_dequant_bn_op.cpp | 54 -
mobile/src/operators/fusion_dequant_bn_op.h | 101 -
.../src/operators/fusion_dequant_bn_relu_op.h | 74 -
.../operators/fusion_dwconv_bn_relu_op.cpp | 63 -
.../src/operators/fusion_dwconv_bn_relu_op.h | 76 -
.../fusion_elementwise_add_relu_op.cpp | 44 -
.../fusion_elementwise_add_relu_op.h | 68 -
mobile/src/operators/fusion_fc_op.cpp | 70 -
mobile/src/operators/fusion_fc_op.h | 64 -
mobile/src/operators/fusion_fc_relu_op.cpp | 67 -
mobile/src/operators/fusion_fc_relu_op.h | 66 -
.../operators/fusion_instancenorm_relu_op.cpp | 39 -
.../operators/fusion_instancenorm_relu_op.h | 68 -
mobile/src/operators/gru_op.cpp | 66 -
mobile/src/operators/gru_op.h | 46 -
mobile/src/operators/gru_unit_op.cpp | 69 -
mobile/src/operators/gru_unit_op.h | 44 -
mobile/src/operators/im2sequence_op.cpp | 55 -
mobile/src/operators/im2sequence_op.h | 48 -
mobile/src/operators/increment_op.cpp | 49 -
mobile/src/operators/increment_op.h | 48 -
mobile/src/operators/instancenorm_op.cpp | 39 -
mobile/src/operators/instancenorm_op.h | 48 -
mobile/src/operators/is_empty_op.cpp | 44 -
mobile/src/operators/is_empty_op.h | 47 -
.../src/operators/kernel/activation_kernel.h | 44 -
.../kernel/arm/activation_kernel.cpp | 116 -
.../kernel/arm/anchor_generator_kernel.cpp | 37 -
.../operators/kernel/arm/assign_kernel.cpp | 39 -
.../kernel/arm/assign_value_kernel.cpp | 73 -
.../operators/kernel/arm/batchnorm_kernel.cpp | 36 -
.../kernel/arm/beam_search_decode_kernel.cpp | 278 -
.../kernel/arm/beam_search_kernel.cpp | 262 -
.../kernel/arm/bilinear_interp_kernel.cpp | 37 -
.../operators/kernel/arm/box_coder_kernel.cpp | 36 -
.../src/operators/kernel/arm/cast_kernel.cpp | 84 -
.../operators/kernel/arm/compare_kernel.cpp | 274 -
.../operators/kernel/arm/concat_kernel.cpp | 41 -
.../kernel/arm/conditional_block_kernel.cpp | 100 -
.../convolution/conv_add_bn_relu_kernel.cpp | 178 -
.../arm/convolution/conv_add_kernel.cpp | 79 -
.../arm/convolution/conv_add_relu_kernel.cpp | 77 -
.../convolution/conv_bn_add_relu_kernel.cpp | 96 -
.../arm/convolution/conv_bn_relu_kernel.cpp | 146 -
.../kernel/arm/convolution/conv_common.cpp | 113 -
.../kernel/arm/convolution/conv_common.h | 25 -
.../kernel/arm/convolution/conv_kernel.cpp | 75 -
.../arm/convolution/conv_relu_kernel.cpp | 66 -
.../arm/convolution/conv_transpose_kernel.cpp | 39 -
.../arm/convolution/dwconv_bn_relu_kernel.cpp | 95 -
.../src/operators/kernel/arm/crf_kernel.cpp | 39 -
.../kernel/arm/density_prior_box_kernel.cpp | 37 -
.../kernel/arm/dequantize_bn_kernel.cpp | 340 -
.../kernel/arm/dequantize_kernel.cpp | 81 -
.../operators/kernel/arm/dropout_kernel.cpp | 51 -
.../kernel/arm/elementwise_add_kernel.cpp | 43 -
.../kernel/arm/elementwise_mul_kernel.cpp | 38 -
.../kernel/arm/elementwise_sub_kernel.cpp | 38 -
.../src/operators/kernel/arm/exp_kernel.cpp | 47 -
.../src/operators/kernel/arm/feed_kernel.cpp | 35 -
.../src/operators/kernel/arm/fetch_kernel.cpp | 31 -
.../operators/kernel/arm/flatten_kernel.cpp | 36 -
.../operators/kernel/arm/fusion_fc_kernel.cpp | 75 -
.../src/operators/kernel/arm/gru_kernel.cpp | 39 -
.../operators/kernel/arm/gru_unit_kernel.cpp | 38 -
.../kernel/arm/im2sequence_kernel.cpp | 87 -
.../operators/kernel/arm/increment_kernel.cpp | 36 -
.../operators/kernel/arm/is_empty_kernel.cpp | 37 -
.../operators/kernel/arm/lod_reset_kernel.cpp | 68 -
.../operators/kernel/arm/logical_kernel.cpp | 125 -
.../operators/kernel/arm/lookup_kernel.cpp | 36 -
.../src/operators/kernel/arm/lrn_kernel.cpp | 36 -
.../src/operators/kernel/arm/mul_kernel.cpp | 39 -
.../kernel/arm/multiclass_nms_kernel.cpp | 37 -
.../kernel/arm/nearest_interp_kernel.cpp | 88 -
.../src/operators/kernel/arm/norm_kernel.cpp | 36 -
.../operators/kernel/arm/one_hot_kernel.cpp | 61 -
.../src/operators/kernel/arm/pad2d_kernel.cpp | 45 -
.../arm/polygon_box_transform_kernel.cpp | 38 -
.../src/operators/kernel/arm/pool_kernel.cpp | 36 -
.../src/operators/kernel/arm/prelu_kernel.cpp | 122 -
.../operators/kernel/arm/prior_box_kernel.cpp | 36 -
.../operators/kernel/arm/proposal_kernel.cpp | 36 -
.../kernel/arm/psroi_pool_kernel.cpp | 36 -
.../operators/kernel/arm/quantize_kernel.cpp | 221 -
.../operators/kernel/arm/reshape2_kernel.cpp | 36 -
.../operators/kernel/arm/reshape_kernel.cpp | 36 -
.../operators/kernel/arm/resize_kernel.cpp | 124 -
.../kernel/arm/roi_perspective_kernel.cpp | 291 -
.../src/operators/kernel/arm/scale_kernel.cpp | 88 -
.../kernel/arm/sequence_expand_kernel.cpp | 115 -
.../kernel/arm/sequence_pool_kernel.cpp | 215 -
.../kernel/arm/sequence_softmax_kernel.cpp | 44 -
.../src/operators/kernel/arm/shape_kernel.cpp | 36 -
.../src/operators/kernel/arm/slice_kernel.cpp | 86 -
.../operators/kernel/arm/softmax_kernel.cpp | 38 -
.../src/operators/kernel/arm/split_kernel.cpp | 36 -
.../src/operators/kernel/arm/sum_kernel.cpp | 37 -
.../arm/tensor_array_read_write_kernel.cpp | 66 -
.../src/operators/kernel/arm/top_k_kernel.cpp | 68 -
.../kernel/arm/transpose2_kernel.cpp | 146 -
.../operators/kernel/arm/transpose_kernel.cpp | 35 -
.../src/operators/kernel/arm/while_kernel.cpp | 128 -
mobile/src/operators/kernel/assign_kernel.h | 53 -
.../operators/kernel/assign_value_kernel.h | 53 -
.../src/operators/kernel/batchnorm_kernel.h | 36 -
.../kernel/beam_search_decode_kernel.h | 58 -
.../src/operators/kernel/beam_search_kernel.h | 74 -
.../operators/kernel/bilinear_interp_kernel.h | 38 -
.../src/operators/kernel/box_coder_kernel.h | 38 -
.../central-arm-func/activation_arm_func.h | 107 -
.../central-arm-func/batchnorm_arm_func.h | 83 -
.../bilinear_interp_arm_func.h | 91 -
.../central-arm-func/box_coder_arm_func.h | 142 -
.../kernel/central-arm-func/concat_arm_func.h | 90 -
.../central-arm-func/conv_add_arm_func.h | 151 -
.../conv_add_bn_relu_arm_func.h | 143 -
.../central-arm-func/conv_add_relu_arm_func.h | 154 -
.../kernel/central-arm-func/conv_arm_func.cpp | 377 -
.../kernel/central-arm-func/conv_arm_func.h | 58 -
.../conv_bn_add_relu_arm_func.h | 148 -
.../central-arm-func/conv_bn_relu_arm_func.h | 146 -
.../conv_transpose_arm_func.h | 111 -
.../kernel/central-arm-func/crf_arm_func.h | 118 -
.../density_prior_box_arm_func.h | 161 -
.../dwconv_bn_relu_arm_func.h | 144 -
.../elementwise_add_arm_func.h | 78 -
.../elementwise_mul_arm_func.h | 45 -
.../elementwise_sub_arm_func.h | 65 -
.../central-arm-func/flatten_arm_func.h | 50 -
.../central-arm-func/fusion_fc_arm_func.h | 75 -
.../kernel/central-arm-func/gru_arm_func.h | 107 -
.../central-arm-func/gru_unit_arm_func.h | 72 -
.../central-arm-func/increment_arm_func.h | 39 -
.../kernel/central-arm-func/lookup_arm_func.h | 58 -
.../kernel/central-arm-func/lrn_arm_func.h | 47 -
.../kernel/central-arm-func/mul_arm_func.h | 59 -
.../multiclass_nms_arm_func.h | 307 -
.../kernel/central-arm-func/norm_arm_func.h | 106 -
.../polygon_box_transform_arm_func.h | 53 -
.../kernel/central-arm-func/pool_arm_func.h | 91 -
.../central-arm-func/prior_box_arm_func.h | 199 -
.../central-arm-func/reshape2_arm_func.h | 59 -
.../central-arm-func/reshape_arm_func.h | 56 -
.../kernel/central-arm-func/shape_arm_func.h | 38 -
.../central-arm-func/softmax_arm_func.h | 32 -
.../kernel/central-arm-func/split_arm_func.h | 86 -
.../kernel/central-arm-func/sum_arm_func.h | 153 -
.../central-arm-func/transpose_arm_func.h | 70 -
.../operators/kernel/cl/batchnorm_kernel.cpp | 111 -
.../operators/kernel/cl/box_coder_kernel.cpp | 78 -
.../kernel/cl/cl-kernel-func/conv_func.cpp | 760 --
.../kernel/cl/cl-kernel-func/conv_func.h | 77 -
.../kernel/cl/cl_kernel/batchnorm_kernel.cl | 37 -
.../kernel/cl/cl_kernel/box_coder_kernel.cl | 147 -
.../kernel/cl/cl_kernel/channel_add_kernel.cl | 51 -
.../operators/kernel/cl/cl_kernel/cl_common.h | 34 -
.../kernel/cl/cl_kernel/concat_kernel.cl | 291 -
.../kernel/cl/cl_kernel/conv_kernel.cl | 15 -
.../kernel/cl/cl_kernel/conv_kernel.inc.cl | 2801 -----
.../cl/cl_kernel/conv_transpose_kernel.cl | 443 -
.../cl/cl_kernel/density_prior_box_kernel.cl | 114 -
.../depthwise_conv_add_bn_relu_kernel.cl | 18 -
.../cl/cl_kernel/depthwise_conv_kernel.cl | 15 -
.../kernel/cl/cl_kernel/dropout_kernel.cl | 42 -
.../cl/cl_kernel/elementwise_add_kernel.cl | 27 -
.../kernel/cl/cl_kernel/exp_kernel.cl | 34 -
.../kernel/cl/cl_kernel/feed_kernel.cl | 62 -
.../kernel/cl/cl_kernel/fetch_kernel.cl | 69 -
.../kernel/cl/cl_kernel/flatten2_kernel.cl | 48 -
.../cl/cl_kernel/instancenorm_kernel.cl | 119 -
.../kernel/cl/cl_kernel/leakyrelu_kernel.cl | 38 -
.../kernel/cl/cl_kernel/lrn_kernel.cl | 136 -
.../cl/cl_kernel/nearest_interp_kernel.cl | 37 -
.../kernel/cl/cl_kernel/pad2d_kernel.cl | 57 -
.../kernel/cl/cl_kernel/pool_kernel.cl | 95 -
.../kernel/cl/cl_kernel/prior_box_kernel.cl | 129 -
.../src/operators/kernel/cl/cl_kernel/relu.cl | 58 -
.../operators/kernel/cl/cl_kernel/relu6.cl | 32 -
.../operators/kernel/cl/cl_kernel/reshape.cl | 202 -
.../kernel/cl/cl_kernel/scale_kernel.cl | 35 -
.../operators/kernel/cl/cl_kernel/sigmoid.cl | 34 -
.../kernel/cl/cl_kernel/slice_kernel.cl | 77 -
.../operators/kernel/cl/cl_kernel/softmax.cl | 92 -
.../kernel/cl/cl_kernel/tanh_kernel.cl | 31 -
.../kernel/cl/cl_kernel/transpose_kernel.cl | 169 -
.../src/operators/kernel/cl/concat_kernel.cpp | 196 -
.../kernel/cl/conv_add_bn_relu_kernel.cpp | 230 -
.../operators/kernel/cl/conv_add_kernel.cpp | 142 -
.../kernel/cl/conv_add_relu_kernel.cpp | 144 -
.../kernel/cl/conv_bn_add_relu_kernel.cpp | 184 -
.../kernel/cl/conv_bn_relu_kernel.cpp | 183 -
.../src/operators/kernel/cl/conv_kernel.cpp | 130 -
.../operators/kernel/cl/conv_relu_kernel.cpp | 136 -
.../kernel/cl/conv_transpose_kernel.cpp | 71 -
.../kernel/cl/density_prior_box_kernel.cpp | 156 -
.../kernel/cl/depthwise_conv_kernel.cpp | 96 -
.../operators/kernel/cl/dropout_kernel.cpp | 59 -
.../kernel/cl/dwconv_bn_relu_kernel.cpp | 176 -
.../kernel/cl/elementwise_add_kernel.cpp | 129 -
mobile/src/operators/kernel/cl/exp_kernel.cpp | 52 -
.../src/operators/kernel/cl/feed_kernel.cpp | 78 -
.../src/operators/kernel/cl/fetch_kernel.cpp | 101 -
.../operators/kernel/cl/flatten2_kernel.cpp | 79 -
.../operators/kernel/cl/fusion_fc_kernel.cpp | 123 -
mobile/src/operators/kernel/cl/gen_code.py | 208 -
.../kernel/cl/instancenorm_kernel.cpp | 94 -
.../kernel/cl/instancenorm_relu_kernel.cpp | 95 -
.../operators/kernel/cl/leakyrelu_kernel.cpp | 59 -
mobile/src/operators/kernel/cl/lrn_kernel.cpp | 79 -
mobile/src/operators/kernel/cl/mul_kernel.cpp | 88 -
.../kernel/cl/multiclass_nms_kernel.cpp | 340 -
.../kernel/cl/nearest_interp_kernel.cpp | 73 -
.../src/operators/kernel/cl/pad2d_kernel.cpp | 94 -
.../src/operators/kernel/cl/pool_kernel.cpp | 99 -
.../operators/kernel/cl/prior_box_kernel.cpp | 216 -
.../src/operators/kernel/cl/relu6_kernel.cpp | 53 -
.../src/operators/kernel/cl/relu_kernel.cpp | 72 -
.../operators/kernel/cl/reshape2_kernel.cpp | 150 -
.../operators/kernel/cl/reshape_kernel.cpp | 106 -
.../src/operators/kernel/cl/scale_kernel.cpp | 62 -
.../operators/kernel/cl/sigmoid_kernel.cpp | 50 -
.../src/operators/kernel/cl/slice_kernel.cpp | 64 -
.../operators/kernel/cl/softmax_kernel.cpp | 65 -
.../src/operators/kernel/cl/split_kernel.cpp | 116 -
.../src/operators/kernel/cl/tanh_kernel.cpp | 51 -
.../operators/kernel/cl/transpose2_kernel.cpp | 213 -
.../operators/kernel/cl/transpose_kernel.cpp | 134 -
mobile/src/operators/kernel/compare_kernel.h | 32 -
mobile/src/operators/kernel/concat_kernel.h | 37 -
.../kernel/conditional_block_kernel.h | 70 -
.../src/operators/kernel/conv_add_bn_kernel.h | 44 -
.../kernel/conv_add_bn_relu_kernel.h | 49 -
mobile/src/operators/kernel/conv_add_kernel.h | 49 -
.../operators/kernel/conv_add_relu_kernel.h | 44 -
.../kernel/conv_bn_add_relu_kernel.h | 44 -
mobile/src/operators/kernel/conv_bn_kernel.h | 44 -
.../operators/kernel/conv_bn_relu_kernel.h | 48 -
mobile/src/operators/kernel/conv_kernel.h | 41 -
.../src/operators/kernel/conv_relu_kernel.h | 42 -
.../operators/kernel/conv_transpose_kernel.h | 39 -
mobile/src/operators/kernel/crf_kernel.h | 37 -
.../operators/kernel/deconv_add_bn_kernel.h | 39 -
.../kernel/deconv_add_bn_relu_kernel.h | 39 -
.../src/operators/kernel/deconv_add_kernel.h | 39 -
.../operators/kernel/deconv_add_relu_kernel.h | 39 -
.../operators/kernel/deconv_bn_relu_kernel.h | 39 -
.../src/operators/kernel/deconv_relu_kernel.h | 39 -
.../src/operators/kernel/dequant_bn_kernel.h | 48 -
.../src/operators/kernel/dequantize_kernel.h | 36 -
.../src/operators/kernel/detection_kernel.h | 232 -
mobile/src/operators/kernel/dropout_kernel.h | 35 -
.../operators/kernel/dwconv_bn_relu_kernel.h | 44 -
.../operators/kernel/elementwise_add_kernel.h | 39 -
.../kernel/elementwise_add_relu_kernel.h | 38 -
.../operators/kernel/elementwise_mul_kernel.h | 36 -
.../operators/kernel/elementwise_sub_kernel.h | 38 -
mobile/src/operators/kernel/exp_kernel.h | 24 -
mobile/src/operators/kernel/fc_relu_kernel.h | 37 -
mobile/src/operators/kernel/feed_kernel.h | 32 -
mobile/src/operators/kernel/fetch_kernel.h | 34 -
mobile/src/operators/kernel/flatten2_kernel.h | 28 -
mobile/src/operators/kernel/flatten_kernel.h | 37 -
.../kernel/fpga/KD/conv_add_bn_kernel.cpp | 47 -
.../kernel/fpga/KD/conv_add_kernel.cpp | 34 -
.../kernel/fpga/KD/conv_add_relu_kernel.cpp | 34 -
.../kernel/fpga/KD/conv_bn_kernel.cpp | 69 -
.../kernel/fpga/KD/conv_bn_relu_kernel.cpp | 76 -
.../fpga/KD/elementwise_add_relu_kernel.cpp | 60 -
.../operators/kernel/fpga/KD/feed_kernel.cpp | 65 -
.../operators/kernel/fpga/KD/fetch_kernel.cpp | 55 -
.../kernel/fpga/KD/fusion_fc_kernel.cpp | 56 -
.../operators/kernel/fpga/KD/pool_kernel.cpp | 62 -
.../kernel/fpga/KD/softmax_kernel.cpp | 55 -
.../fpga/V1/anchor_generator_kernel.cpp | 88 -
.../kernel/fpga/V1/concat_kernel.cpp | 69 -
.../kernel/fpga/V1/conv_add_bn_kernel.cpp | 86 -
.../fpga/V1/conv_add_bn_relu_kernel.cpp | 100 -
.../kernel/fpga/V1/conv_add_kernel.cpp | 63 -
.../kernel/fpga/V1/conv_add_relu_kernel.cpp | 63 -
.../kernel/fpga/V1/conv_bn_kernel.cpp | 75 -
.../kernel/fpga/V1/conv_bn_relu_kernel.cpp | 85 -
.../operators/kernel/fpga/V1/conv_kernel.cpp | 56 -
.../kernel/fpga/V1/conv_transpose_kernel.cpp | 89 -
.../kernel/fpga/V1/deconv_add_bn_kernel.cpp | 90 -
.../fpga/V1/deconv_add_bn_relu_kernel.cpp | 91 -
.../kernel/fpga/V1/deconv_add_kernel.cpp | 90 -
.../kernel/fpga/V1/deconv_add_relu_kernel.cpp | 91 -
.../kernel/fpga/V1/deconv_bn_relu_kernel.cpp | 108 -
.../kernel/fpga/V1/dropout_kernel.cpp | 34 -
.../kernel/fpga/V1/elementwise_add_kernel.cpp | 191 -
.../fpga/V1/elementwise_add_relu_kernel.cpp | 72 -
.../kernel/fpga/V1/elementwise_mul_kernel.cpp | 93 -
.../operators/kernel/fpga/V1/feed_kernel.cpp | 108 -
.../operators/kernel/fpga/V1/fetch_kernel.cpp | 127 -
.../kernel/fpga/V1/fusion_fc_kernel.cpp | 74 -
.../kernel/fpga/V1/fusion_fc_relu_kernel.cpp | 75 -
.../operators/kernel/fpga/V1/pad2d_kernel.cpp | 60 -
.../operators/kernel/fpga/V1/pool_kernel.cpp | 104 -
.../kernel/fpga/V1/proposal_kernel.cpp | 567 -
.../kernel/fpga/V1/psroi_pool_kernel.cpp | 284 -
.../operators/kernel/fpga/V1/relu_kernel.cpp | 35 -
.../kernel/fpga/V1/reshape2_kernel.cpp | 127 -
.../kernel/fpga/V1/reshape_kernel.cpp | 40 -
.../kernel/fpga/V1/roialign_pool_kernel.cpp | 296 -
.../kernel/fpga/V1/sigmoid_kernel.cpp | 54 -
.../operators/kernel/fpga/V1/slice_kernel.cpp | 63 -
.../kernel/fpga/V1/softmax_kernel.cpp | 138 -
.../operators/kernel/fpga/V1/split_kernel.cpp | 74 -
.../operators/kernel/fpga/V1/tanh_kernel.cpp | 79 -
.../kernel/fpga/V1/transpose2_kernel.cpp | 55 -
.../fpga/V2/anchor_generator_kernel.cpp | 87 -
.../kernel/fpga/V2/concat_kernel.cpp | 69 -
.../kernel/fpga/V2/conv_add_bn_kernel.cpp | 89 -
.../fpga/V2/conv_add_bn_relu_kernel.cpp | 104 -
.../kernel/fpga/V2/conv_add_kernel.cpp | 64 -
.../kernel/fpga/V2/conv_add_relu_kernel.cpp | 64 -
.../kernel/fpga/V2/conv_bn_kernel.cpp | 76 -
.../kernel/fpga/V2/conv_bn_relu_kernel.cpp | 93 -
.../operators/kernel/fpga/V2/conv_kernel.cpp | 58 -
.../kernel/fpga/V2/conv_transpose_kernel.cpp | 94 -
.../kernel/fpga/V2/deconv_add_bn_kernel.cpp | 98 -
.../fpga/V2/deconv_add_bn_relu_kernel.cpp | 98 -
.../kernel/fpga/V2/deconv_add_kernel.cpp | 98 -
.../kernel/fpga/V2/deconv_add_relu_kernel.cpp | 93 -
.../kernel/fpga/V2/deconv_bn_relu_kernel.cpp | 114 -
.../kernel/fpga/V2/dropout_kernel.cpp | 34 -
.../kernel/fpga/V2/elementwise_add_kernel.cpp | 71 -
.../fpga/V2/elementwise_add_relu_kernel.cpp | 69 -
.../kernel/fpga/V2/elementwise_mul_kernel.cpp | 93 -
.../operators/kernel/fpga/V2/feed_kernel.cpp | 64 -
.../operators/kernel/fpga/V2/fetch_kernel.cpp | 118 -
.../kernel/fpga/V2/fusion_fc_kernel.cpp | 75 -
.../kernel/fpga/V2/fusion_fc_relu_kernel.cpp | 76 -
.../operators/kernel/fpga/V2/pool_kernel.cpp | 106 -
.../kernel/fpga/V2/proposal_kernel.cpp | 501 -
.../kernel/fpga/V2/psroi_pool_kernel.cpp | 202 -
.../operators/kernel/fpga/V2/relu_kernel.cpp | 33 -
.../kernel/fpga/V2/reshape2_kernel.cpp | 128 -
.../kernel/fpga/V2/reshape_kernel.cpp | 40 -
.../kernel/fpga/V2/roialign_pool_kernel.cpp | 296 -
.../kernel/fpga/V2/sigmoid_kernel.cpp | 57 -
.../operators/kernel/fpga/V2/slice_kernel.cpp | 63 -
.../kernel/fpga/V2/softmax_kernel.cpp | 123 -
.../operators/kernel/fpga/V2/split_kernel.cpp | 74 -
.../operators/kernel/fpga/V2/tanh_kernel.cpp | 79 -
.../kernel/fpga/V2/transpose2_kernel.cpp | 55 -
.../src/operators/kernel/fusion_fc_kernel.h | 37 -
mobile/src/operators/kernel/gru_kernel.h | 37 -
mobile/src/operators/kernel/gru_unit_kernel.h | 35 -
.../src/operators/kernel/im2sequence_kernel.h | 38 -
.../src/operators/kernel/increment_kernel.h | 36 -
.../operators/kernel/instancenorm_kernel.h | 37 -
.../kernel/instancenorm_relu_kernel.h | 42 -
mobile/src/operators/kernel/is_empty_kernel.h | 36 -
mobile/src/operators/kernel/kernels.h | 36 -
mobile/src/operators/kernel/logical_kernel.h | 42 -
mobile/src/operators/kernel/lookup_kernel.h | 37 -
mobile/src/operators/kernel/lrn_kernel.h | 181 -
mobile/src/operators/kernel/mul_kernel.h | 38 -
.../operators/kernel/multiclass_nms_kernel.h | 37 -
.../operators/kernel/nearest_interp_kernel.h | 38 -
mobile/src/operators/kernel/norm_kernel.h | 36 -
mobile/src/operators/kernel/one_hot_kernel.h | 51 -
mobile/src/operators/kernel/pad2d_kernel.h | 54 -
.../kernel/polygon_box_transform_kernel.h | 36 -
mobile/src/operators/kernel/pool_kernel.h | 35 -
mobile/src/operators/kernel/prelu_kernel.h | 30 -
.../src/operators/kernel/prior_box_kernel.h | 114 -
mobile/src/operators/kernel/quantize_kernel.h | 36 -
mobile/src/operators/kernel/range_kernel.cpp | 49 -
mobile/src/operators/kernel/range_kernel.h | 71 -
.../operators/kernel/reduce_prod_kernel.cpp | 65 -
.../src/operators/kernel/reduce_prod_kernel.h | 65 -
mobile/src/operators/kernel/reshape2_kernel.h | 36 -
mobile/src/operators/kernel/reshape_kernel.h | 80 -
mobile/src/operators/kernel/resize_kernel.h | 82 -
mobile/src/operators/kernel/scale_kernel.h | 35 -
.../src/operators/kernel/sequence_kernels.h | 36 -
mobile/src/operators/kernel/shape_kernel.h | 37 -
mobile/src/operators/kernel/slice_kernel.h | 31 -
mobile/src/operators/kernel/softmax_kernel.h | 36 -
mobile/src/operators/kernel/split_kernel.h | 37 -
mobile/src/operators/kernel/sum_kernel.h | 35 -
mobile/src/operators/kernel/tanh_kernel.h | 37 -
.../kernel/tensor_array_read_write_kernel.h | 32 -
.../src/operators/kernel/transpose2_kernel.h | 37 -
.../src/operators/kernel/transpose_kernel.h | 37 -
mobile/src/operators/kernel/while_kernel.h | 47 -
mobile/src/operators/lod_reset_op.cpp | 41 -
mobile/src/operators/lod_reset_op.h | 32 -
mobile/src/operators/logical_op.cpp | 69 -
mobile/src/operators/logical_op.h | 42 -
mobile/src/operators/lookup_op.cpp | 66 -
mobile/src/operators/lookup_op.h | 46 -
mobile/src/operators/lrn_op.cpp | 39 -
mobile/src/operators/lrn_op.h | 46 -
mobile/src/operators/math/activation.h | 187 -
.../math/depthwise/faster_depthwise_conv3x3.h | 34 -
.../depthwise/faster_depthwise_conv3x3p1.cpp | 2011 ----
.../src/operators/math/depthwise_conv3x3.cpp | 1060 --
mobile/src/operators/math/depthwise_conv3x3.h | 47 -
.../operators/math/depthwise_conv3x3_int8.cpp | 1660 ---
.../src/operators/math/depthwise_conv5x5.cpp | 1106 --
mobile/src/operators/math/depthwise_conv5x5.h | 47 -
.../operators/math/depthwise_conv5x5_int8.cpp | 1041 --
mobile/src/operators/math/element_wise.h | 396 -
.../operators/math/elementwise_op_function.h | 178 -
mobile/src/operators/math/gemm.cpp | 3807 -------
mobile/src/operators/math/gemm.h | 492 -
mobile/src/operators/math/gemm/cblas.cc | 50 -
mobile/src/operators/math/gemm/cblas.h | 32 -
mobile/src/operators/math/gemm/executor.h | 266 -
mobile/src/operators/math/gemm/gemm1x1s1.cpp | 2221 ----
mobile/src/operators/math/gemm/gemm1x1s1.h | 81 -
mobile/src/operators/math/gemm/gemm_kernel.h | 792 --
mobile/src/operators/math/gemm/pack_kernel.h | 801 --
mobile/src/operators/math/gemm/strategy.h | 120 -
mobile/src/operators/math/gemm_int8.cpp | 2077 ----
mobile/src/operators/math/gemm_omp_int8.cpp | 453 -
mobile/src/operators/math/gpc.cpp | 2142 ----
mobile/src/operators/math/gpc.h | 222 -
mobile/src/operators/math/gru_compute.cpp | 56 -
mobile/src/operators/math/gru_compute.h | 40 -
mobile/src/operators/math/gru_cpu_kernel.h | 203 -
mobile/src/operators/math/im2col.cpp | 668 --
mobile/src/operators/math/im2col.h | 129 -
mobile/src/operators/math/math.h | 342 -
mobile/src/operators/math/math_function.cpp | 176 -
mobile/src/operators/math/math_function.h | 62 -
.../src/operators/math/math_function_int8.cpp | 109 -
mobile/src/operators/math/pad.cpp | 54 -
mobile/src/operators/math/pad.h | 32 -
mobile/src/operators/math/poly_util.cpp | 120 -
mobile/src/operators/math/poly_util.h | 70 -
mobile/src/operators/math/pooling.cpp | 82 -
mobile/src/operators/math/pooling.h | 199 -
mobile/src/operators/math/pooling2x2.cpp | 791 --
mobile/src/operators/math/pooling3x3.cpp | 1317 ---
mobile/src/operators/math/quantize.h | 108 -
.../operators/math/selected_rows_functor.h | 174 -
mobile/src/operators/math/sequence2batch.cpp | 60 -
mobile/src/operators/math/sequence2batch.h | 169 -
.../operators/math/slidingwindow_conv3x3.cpp | 5668 ----
.../operators/math/slidingwindow_conv3x3.h | 51 -
.../operators/math/slidingwindow_utils.cpp | 365 -
.../src/operators/math/slidingwindow_utils.h | 159 -
mobile/src/operators/math/softmax.cpp | 157 -
mobile/src/operators/math/softmax.h | 42 -
mobile/src/operators/math/transform.h | 55 -
mobile/src/operators/math/vol2col.cpp | 147 -
mobile/src/operators/math/vol2col.h | 94 -
.../math/winograd/winograd_transform.h | 42 -
.../math/winograd/winograd_transform_f6k3.cpp | 1681 ---
mobile/src/operators/mul_op.cpp | 67 -
mobile/src/operators/mul_op.h | 46 -
mobile/src/operators/multiclass_nms_op.cpp | 50 -
mobile/src/operators/multiclass_nms_op.h | 50 -
mobile/src/operators/nearest_interp_op.cpp | 56 -
mobile/src/operators/nearest_interp_op.h | 50 -
mobile/src/operators/norm_op.cpp | 51 -
mobile/src/operators/norm_op.h | 47 -
mobile/src/operators/one_hot_op.cpp | 43 -
mobile/src/operators/one_hot_op.h | 31 -
mobile/src/operators/op_param.cpp | 98 -
mobile/src/operators/op_param.h | 3566 ------
mobile/src/operators/pad2d_op.cpp | 46 -
mobile/src/operators/pad2d_op.h | 32 -
.../operators/polygon_box_transform_op.cpp | 45 -
.../src/operators/polygon_box_transform_op.h | 56 -
mobile/src/operators/pool_op.cpp | 73 -
mobile/src/operators/pool_op.h | 46 -
mobile/src/operators/prelu_op.cpp | 40 -
mobile/src/operators/prelu_op.h | 49 -
mobile/src/operators/prior_box_op.cpp | 101 -
mobile/src/operators/prior_box_op.h | 34 -
mobile/src/operators/quantize_op.cpp | 39 -
mobile/src/operators/quantize_op.h | 45 -
mobile/src/operators/range_op.cpp | 45 -
mobile/src/operators/range_op.h | 33 -
mobile/src/operators/reduce_prod_op.cpp | 86 -
mobile/src/operators/reduce_prod_op.h | 33 -
mobile/src/operators/reshape2_op.cpp | 100 -
mobile/src/operators/reshape2_op.h | 53 -
mobile/src/operators/reshape_op.cpp | 45 -
mobile/src/operators/reshape_op.h | 49 -
mobile/src/operators/resize_op.cpp | 36 -
mobile/src/operators/resize_op.h | 48 -
mobile/src/operators/scale_op.cpp | 38 -
mobile/src/operators/scale_op.h | 49 -
.../sequence_ops/sequence_expand_op.cpp | 56 -
.../sequence_ops/sequence_expand_op.h | 47 -
.../sequence_ops/sequence_pool_op.cpp | 38 -
.../operators/sequence_ops/sequence_pool_op.h | 46 -
.../sequence_ops/sequence_softmax_op.cpp | 39 -
.../sequence_ops/sequence_softmax_op.h | 47 -
mobile/src/operators/shape_op.cpp | 38 -
mobile/src/operators/shape_op.h | 47 -
mobile/src/operators/slice_op.cpp | 109 -
mobile/src/operators/slice_op.h | 49 -
mobile/src/operators/softmax_op.cpp | 40 -
mobile/src/operators/softmax_op.h | 45 -
mobile/src/operators/split_op.cpp | 93 -
mobile/src/operators/split_op.h | 46 -
mobile/src/operators/sum_op.cpp | 67 -
mobile/src/operators/sum_op.h | 49 -
mobile/src/operators/top_k_op.cpp | 44 -
mobile/src/operators/top_k_op.h | 45 -
mobile/src/operators/transpose2_op.cpp | 121 -
mobile/src/operators/transpose2_op.h | 52 -
mobile/src/operators/transpose_op.cpp | 62 -
mobile/src/operators/transpose_op.h | 48 -
mobile/src/pass/memory_optimize.cpp | 170 -
mobile/src/pass/memory_optimize.h | 62 -
mobile/src/pass/memory_optimize_super.cpp | 209 -
mobile/src/pass/memory_optimize_super.h | 70 -
mobile/src/pass/model_obfuscate.cpp | 36 -
mobile/src/pass/model_obfuscate.h | 36 -
mobile/src/pass/pass_base.h | 27 -
mobile/src/protobuf-c/protobuf-c.cpp | 2249 ----
mobile/src/protobuf-c/protobuf-c.h | 962 --
mobile/test/CMakeLists.txt | 542 -
mobile/test/common/test_enforce.cpp | 21 -
mobile/test/common/test_gemm_accuracy.cpp | 131 -
.../test/common/test_gemm_int8_accuracy.cpp | 346 -
mobile/test/common/test_gemm_perf.cpp | 164 -
mobile/test/common/test_lib_size.cpp | 21 -
mobile/test/common/test_lib_size.h | 97 -
mobile/test/common/test_log.cpp | 34 -
mobile/test/common/test_openmp.cpp | 29 -
mobile/test/executor_for_test.h | 138 -
mobile/test/fpga/test_concat_op.cpp | 87 -
mobile/test/fpga/test_densebox_combine.cpp | 49 -
mobile/test/fpga/test_format_data.cpp | 93 -
mobile/test/fpga/test_marker.cpp | 125 -
mobile/test/fpga/test_marker2.cpp | 181 -
mobile/test/fpga/test_marker_api.cpp | 241 -
mobile/test/fpga/test_mobilenet_api.cpp | 158 -
mobile/test/fpga/test_pe.cpp | 111 -
mobile/test/fpga/test_resnet50.cpp | 140 -
mobile/test/fpga/test_rfcn.cpp | 152 -
mobile/test/fpga/test_rfcn_api.cpp | 172 -
mobile/test/fpga/test_ssd.cpp | 46 -
mobile/test/fpga/test_tensor_quant.cpp | 45 -
mobile/test/fpga/test_yolo_api.cpp | 158 -
mobile/test/framework/test_inference_api.cpp | 62 -
mobile/test/framework/test_load.cpp | 34 -
mobile/test/framework/test_load_memory.cpp | 68 -
.../test_load_memory_inference_api.cpp | 80 -
mobile/test/framework/test_optimize.cpp | 33 -
mobile/test/net/test_alexnet.cpp | 59 -
mobile/test/net/test_benchmark.cpp | 79 -
mobile/test/net/test_eng.cpp | 50 -
mobile/test/net/test_genet_combine.cpp | 51 -
mobile/test/net/test_gesture.cpp | 97 -
mobile/test/net/test_googlenet.cpp | 85 -
mobile/test/net/test_googlenet_quali.cpp | 55 -
mobile/test/net/test_googlenetv1_combine.cpp | 60 -
mobile/test/net/test_inceptionv4.cpp | 59 -
mobile/test/net/test_mobilenet+ssd.cpp | 48 -
mobile/test/net/test_mobilenet.cpp | 60 -
mobile/test/net/test_mobilenet_025_fssd.cpp | 61 -
mobile/test/net/test_mobilenet_GPU.cpp | 64 -
mobile/test/net/test_mobilenet_combine.cpp | 59 -
.../test/net/test_multi_inference_predict.cpp | 104 -
mobile/test/net/test_net.cpp | 272 -
mobile/test/net/test_net_benchmark.cpp | 59 -
mobile/test/net/test_nlp.cpp | 94 -
mobile/test/net/test_ocr.cpp | 108 -
mobile/test/net/test_op_in_net.cpp | 125 -
mobile/test/net/test_resnet.cpp | 73 -
mobile/test/net/test_squeezenet.cpp | 49 -
mobile/test/net/test_super.cpp | 119 -
mobile/test/net/test_vgg16ssd.cpp | 46 -
mobile/test/net/test_wrap.cpp | 65 -
mobile/test/net/test_yolo.cpp | 50 -
mobile/test/net/test_yolo_combined.cpp | 53 -
mobile/test/net/test_yologpu.cpp | 190 -
mobile/test/operators/test_batchnorm_op.cpp | 122 -
mobile/test/operators/test_box_coder_op.cpp | 196 -
mobile/test/operators/test_cast_op.cpp | 126 -
mobile/test/operators/test_concat_op.cpp | 136 -
.../test/operators/test_conv_add_relu_op.cpp | 45 -
.../test/operators/test_conv_bn_relu_op.cpp | 172 -
mobile/test/operators/test_conv_gpu.cpp | 199 -
mobile/test/operators/test_conv_op.cpp | 358 -
.../test/operators/test_depthwise_conv_op.cpp | 45 -
mobile/test/operators/test_dequantize_op.cpp | 76 -
.../test/operators/test_dwconv_bn_relu_op.cpp | 145 -
.../operators/test_elementwise_add_op.cpp | 62 -
.../operators/test_elementwise_sub_op.cpp | 157 -
.../test/operators/test_fill_constant_op.cpp | 112 -
.../test_fusion_conv_add_bn_relu_op.cpp | 63 -
mobile/test/operators/test_fusion_fc_op.cpp | 166 -
mobile/test/operators/test_gru_op.cpp | 100 -
mobile/test/operators/test_im2sequence_op.cpp | 137 -
mobile/test/operators/test_increment_op.cpp | 75 -
mobile/test/operators/test_is_empty_op.cpp | 69 -
mobile/test/operators/test_leaky_relu_op.cpp | 80 -
mobile/test/operators/test_less_than_op.cpp | 122 -
mobile/test/operators/test_log_op.cpp | 80 -
mobile/test/operators/test_logical_and_op.cpp | 84 -
mobile/test/operators/test_logical_not_op.cpp | 76 -
mobile/test/operators/test_logical_or_op.cpp | 84 -
mobile/test/operators/test_logical_xor_op.cpp | 86 -
mobile/test/operators/test_lrn_op.cpp | 83 -
mobile/test/operators/test_mul_op.cpp | 102 -
.../test/operators/test_multiclass_nms_op.cpp | 162 -
.../test_polygon_box_transform_op.cpp | 125 -
mobile/test/operators/test_pool_op.cpp | 231 -
mobile/test/operators/test_prelu_op.cpp | 58 -
mobile/test/operators/test_prior_box_op.cpp | 152 -
mobile/test/operators/test_quantize_op.cpp | 153 -
mobile/test/operators/test_relu6_op.cpp | 83 -
mobile/test/operators/test_relu_op.cpp | 82 -
mobile/test/operators/test_reshape2_op.cpp | 142 -
mobile/test/operators/test_reshape_op.cpp | 47 -
mobile/test/operators/test_resize_op.cpp | 47 -
mobile/test/operators/test_scale_op.cpp | 18 -
.../operators/test_sequence_expand_op.cpp | 97 -
.../test/operators/test_sequence_pool_op.cpp | 293 -
.../operators/test_sequence_softmax_op.cpp | 100 -
mobile/test/operators/test_sigmoid_op.cpp | 80 -
mobile/test/operators/test_slice_op.cpp | 18 -
mobile/test/operators/test_softmax_op.cpp | 100 -
mobile/test/operators/test_sum_op.cpp | 131 -
mobile/test/operators/test_tanh_op.cpp | 81 -
mobile/test/operators/test_topk_op.cpp | 139 -
mobile/test/operators/test_transpose2_op.cpp | 143 -
mobile/test/operators/test_transpose_op.cpp | 49 -
mobile/test/test_helper.h | 147 -
mobile/test/test_include.h | 39 -
.../third_party/opencl/OpenCL-Headers/CL/cl.h | 1782 ---
.../opencl/OpenCL-Headers/CL/cl_d3d10.h | 130 -
.../opencl/OpenCL-Headers/CL/cl_d3d11.h | 130 -
.../OpenCL-Headers/CL/cl_dx9_media_sharing.h | 131 -
.../CL/cl_dx9_media_sharing_intel.h | 181 -
.../opencl/OpenCL-Headers/CL/cl_egl.h | 136 -
.../opencl/OpenCL-Headers/CL/cl_ext.h | 723 --
.../opencl/OpenCL-Headers/CL/cl_ext_intel.h | 428 -
.../opencl/OpenCL-Headers/CL/cl_gl.h | 175 -
.../opencl/OpenCL-Headers/CL/cl_gl_ext.h | 74 -
.../opencl/OpenCL-Headers/CL/cl_platform.h | 1460 ---
.../CL/cl_va_api_media_sharing_intel.h | 171 -
.../opencl/OpenCL-Headers/CL/cl_version.h | 86 -
.../opencl/OpenCL-Headers/CL/opencl.h | 58 -
.../third_party/opencl/OpenCL-Headers/LICENSE | 25 -
.../opencl/OpenCL-Headers/README.md | 50 -
.../android-cmake/android.toolchain.cmake | 784 --
.../android-debug-script/push2android.sh | 42 -
.../android-debug-script/run_on_android.sh | 37 -
mobile/tools/arm-platform.cmake | 9 -
mobile/tools/build.sh | 225 -
mobile/tools/ci_build.sh | 270 -
mobile/tools/ci_run_test.sh | 43 -
mobile/tools/docker_build_fpga.sh | 7 -
mobile/tools/ios-cmake/ios.toolchain.cmake | 216 -
mobile/tools/net-detail.awk | 91 -
mobile/tools/net.awk | 27 -
mobile/tools/op.cmake | 753 --
.../tools/pre-commit.hooks/clang-format.hook | 23 -
mobile/tools/pre-commit.hooks/clang-tidy.hook | 18 -
mobile/tools/pre-commit.hooks/copyright.hook | 124 -
mobile/tools/pre-commit.hooks/cpplint.hook | 13 -
mobile/tools/prepare_images_and_models.sh | 20 -
mobile/tools/profile_show.sh | 138 -
mobile/tools/python/caffetools/run.py | 30 -
mobile/tools/python/fluidtools/.gitignore | 5 -
mobile/tools/python/fluidtools/run.py | 639 --
mobile/tools/python/fluidtools/test_wrap.py | 546 -
mobile/tools/python/imagetools/README.md | 24 -
mobile/tools/python/imagetools/imagetools.py | 71 -
mobile/tools/python/imagetools/img2nchw.py | 88 -
mobile/tools/python/imagetools/img2nhwc.py | 34 -
.../tools/python/imagetools/numpy2binary.py | 60 -
mobile/tools/python/misc/.gitignore | 4 -
mobile/tools/python/misc/fluidtools.py | 175 -
mobile/tools/python/misc/ios-test-server.py | 126 -
mobile/tools/python/misc/restore-git.py | 54 -
.../python/misc/test-fluid-op-feature.py | 13 -
mobile/tools/python/modeltools/.gitignore | 109 -
.../tools/python/modeltools/core/__init__.py | 0
.../python/modeltools/core/framework.proto | 176 -
.../python/modeltools/core/framework_pb2.py | 1141 --
.../tools/python/modeltools/core/op_types.py | 93 -
.../python/modeltools/mobilenet/__init__.py | 0
.../mobilenet/converter_mobilenet.py | 509 -
.../python/modeltools/mobilenet/swicher.py | 119 -
.../tools/python/modeltools/tools/__init__.py | 0
.../modeltools/tools/float2halffloat.py | 70 -
.../tools/python/modeltools/tools/loader.py | 11 -
.../python/modeltools/tools/model_combine.py | 19 -
.../python/modeltools/tools/model_reader.py | 30 -
.../tools/python/modeltools/yolo/__init__.py | 0
.../tools/python/modeltools/yolo/mdl2fluid.py | 333 -
.../tools/python/modeltools/yolo/swicher.py | 115 -
mobile/tools/quantification/CMakeLists.txt | 12 -
mobile/tools/quantification/README.md | 37 -
mobile/tools/quantification/convert.cpp | 438 -
.../quantification/src/block_desc_local.cpp | 48 -
.../quantification/src/block_desc_local.h | 56 -
mobile/tools/quantification/src/enforce.h | 67 -
.../tools/quantification/src/framework.pb-c.c | 1403 ---
.../tools/quantification/src/framework.pb-c.h | 579 -
.../tools/quantification/src/program_desc.cpp | 30 -
.../tools/quantification/src/program_desc.h | 41 -
mobile/tools/quantification/src/protobuf-c.c | 2098 ----
mobile/tools/quantification/src/protobuf-c.h | 921 --
mobile/tools/quantification/src/tensor_desc.h | 72 -
mobile/tools/quantification/src/var_desc.h | 80 -
mobile/tools/shell/change_mobile_namespace.sh | 39 -
mobile/tools/shell/check-bitcode.sh | 34 -
mobile/tools/shell/check-filename.sh | 41 -
.../tools/shell/generate-include/.gitignore | 2 -
.../generate-include/check_include_diff.sh | 30 -
mobile/tools/shell/generate-include/main.cpp | 6 -
mobile/tools/shell/generate-include/parse.py | 21 -
mobile/tools/shell/generate-include/run.sh | 9 -
mobile/tools/shell/merge.sh | 60 -
mobile/tools/shell/prune_static_library.sh | 41 -
mobile/tools/shell/restore-private-repo.sh | 5 -
.../tools/toolchains/arm-android-neon.cmake | 5 -
.../tools/toolchains/arm-linux-gnueabi.cmake | 16 -
.../toolchains/arm-linux-gnueabihf.cmake | 10 -
model_optimize_tool.md | 61 +
model_quantization.md | 305 +
npu.md | 125 +
opencl.md | 107 +
paddle-mobile.md | 1 +
roadmap.md | 28 +
source_compile.md | 286 +
source_compile.md.toc.2019-08-29_160045 | 20 +
support_operation_list.md | 196 +
tech_highlights.md | 45 +
third-party/gflags | 1 -
third-party/googletest | 1 -
third-party/protobuf-host | 1 -
third-party/protobuf-mobile | 1 -
tools/codestyle/.gitignore | 1 -
tools/codestyle/clang_format.hook | 15 -
tools/codestyle/copyright.hook | 121 -
tools/codestyle/cpplint_pre_commit.hook | 27 -
tools/codestyle/docstring_checker.py | 349 -
tools/codestyle/pylint_pre_commit.hook | 19 -
tools/codestyle/test_docstring_checker.py | 232 -
tools/document_preview.sh | 13 -
tutorial.md | 74 +
web/.editorconfig | 9 -
web/.gitignore | 78 -
web/.npmrc | 1 -
web/README.md | 46 -
web/README_cn.md | 43 -
web/demo/index.es6 | 457 -
web/demo/index.html | 39 -
web/demo/videoDemo.es6 | 57 -
web/demo/videoDemo.html | 36 -
web/package.json | 32 -
web/scripts/build.sh | 8 -
web/src/banana.jpeg | Bin 30262 -> 0 bytes
web/src/executor/camera.es6 | 142 -
web/src/executor/executor.es6 | 105 -
web/src/executor/loader.es6 | 423 -
web/src/executor/postProcess.es6 | 262 -
web/src/executor/runner.es6 | 153 -
web/src/factory/fshader/factory.es6 | 83 -
web/src/factory/fshader/ops.es6 | 166 -
web/src/feed/ImageFeed.es6 | 237 -
web/src/feed/dataFeed.es6 | 42 -
web/src/feed/io.es6 | 854 --
web/src/gpu/gpu.es6 | 550 -
web/src/index.es6 | 20 -
web/src/index.html | 13 -
web/src/runtime/runtime.es6 | 98 -
web/src/shader/atom/common_func.es6 | 35 -
web/src/shader/atom/common_params.es6 | 18 -
.../atom/getArrayIndexFromTensorPos.es6 | 15 -
.../atom/getArrayIndexFromTexturePos.es6 | 14 -
web/src/shader/atom/getOutputTensorPos.es6 | 16 -
.../shader/atom/getPixelsFromTexturePos.es6 | 9 -
.../atom/getRangePowSumFromArrayIndex.es6 | 15 -
.../shader/atom/getRangeSumFromArrayIndex.es6 | 15 -
.../atom/getTensorPosFromArrayIndex.es6 | 18 -
.../atom/getTexturePosFromArrayIndex.es6 | 25 -
web/src/shader/atom/getValueFromTensorPos.es6 | 32 -
.../atom/getValueFromTensorPosPacked.es6 | 26 -
.../shader/atom/getValueFromTexturePos.es6 | 22 -
web/src/shader/atom/moveTexture2PosToReal.es6 | 19 -
web/src/shader/atom/prefix.es6 | 18 -
web/src/shader/atom/prefix2.es6 | 22 -
web/src/shader/atom/prelu.es6 | 15 -
web/src/shader/atom/scale.es6 | 11 -
web/src/shader/atom/sigmoid.es6 | 12 -
web/src/shader/atom/softmax.es6 | 14 -
web/src/shader/atom/suffix.es6 | 42 -
web/src/shader/atom/type_ivec56.es6 | 23 -
web/src/shader/batchnorm/conf.es6 | 58 -
web/src/shader/batchnorm/main.es6 | 18 -
web/src/shader/batchnorm/params.es6 | 24 -
web/src/shader/conv2d/conf.es6 | 86 -
web/src/shader/conv2d/main.es6 | 48 -
web/src/shader/conv2d/params.es6 | 45 -
web/src/shader/conv2d_depthwise/conf.es6 | 67 -
web/src/shader/conv2d_depthwise/main.es6 | 42 -
web/src/shader/conv2d_depthwise/params.es6 | 43 -
.../shader/conv2d_elementwise_add/conf.es6 | 77 -
.../shader/conv2d_elementwise_add/main.es6 | 53 - .../shader/conv2d_elementwise_add/params.es6 | 54 - .../conv2d_elementwise_add_winograd/conf.es6 | 72 - .../conv2d_elementwise_add_winograd/main.es6 | 98 - .../params.es6 | 47 - web/src/shader/dynamic/conf.es6 | 35 - web/src/shader/dynamic/main.es6 | 14 - web/src/shader/dynamic/params.es6 | 9 - web/src/shader/elementwise_add/conf.es6 | 57 - web/src/shader/elementwise_add/main.es6 | 17 - web/src/shader/elementwise_add/params.es6 | 20 - web/src/shader/mul/conf.es6 | 57 - web/src/shader/mul/main.es6 | 18 - web/src/shader/mul/params.es6 | 27 - web/src/shader/pool2d/conf.es6 | 48 - web/src/shader/pool2d/main.es6 | 49 - web/src/shader/pool2d/params.es6 | 30 - web/src/shader/pool2d_avg/conf.es6 | 47 - web/src/shader/pool2d_avg/main.es6 | 40 - web/src/shader/pool2d_avg/params.es6 | 30 - web/src/shader/pool2d_max/conf.es6 | 47 - web/src/shader/pool2d_max/main.es6 | 45 - web/src/shader/pool2d_max/params.es6 | 29 - web/src/shader/pool2d_winograd/conf.es6 | 50 - web/src/shader/pool2d_winograd/main.es6 | 63 - web/src/shader/pool2d_winograd/params.es6 | 33 - web/src/shader/softmax/conf.es6 | 29 - web/src/shader/softmax/main.es6 | 55 - web/src/shader/softmax/params.es6 | 15 - web/src/shader/v_shader.es6 | 15 - web/src/shader/v_shader2.es6 | 15 - web/src/test/getMaxUniforms.es6 | 59 - web/src/utils/models.es6 | 46 - web/src/utils/opData.es6 | 407 - web/src/utils/tensor.es6 | 161 - web/src/utils/utils.es6 | 190 - web/tools/logger.es6 | 101 - web/tools/toBinaryFile.py | 111 - web/webpack.config.js | 48 - x2paddle.md | 44 + ...20\350\241\214\345\215\225\346\265\213.md" | 28 + 2757 files changed, 2974 insertions(+), 395109 deletions(-) delete mode 100644 CMakeLists.txt create mode 100644 Home.md delete mode 100644 README.md delete mode 100644 README_cn.md create mode 100644 add_new_operation.md create mode 100644 architecture-intro.md create mode 100644 architecture.md create mode 100644 benchmark.md create mode 100644 benchmark_tools.md create mode 100644 benchmark_tools.md.toc.2019-08-25_233116 create mode 100644 benchmark_tools.md.toc.2019-08-25_233528 delete mode 100644 cmake/FindGflags.cmake delete mode 100644 cmake/FindGlog.cmake delete mode 100644 cmake/FindGperftools.cmake delete mode 100644 cmake/FindJeMalloc.cmake delete mode 100644 cmake/FindNumPy.cmake delete mode 100644 cmake/cblas.cmake delete mode 100644 cmake/ccache.cmake delete mode 100644 cmake/configure.cmake delete mode 100644 cmake/coveralls.cmake delete mode 100644 cmake/coverallsGcovJsons.cmake delete mode 100644 cmake/cross_compiling/android.cmake delete mode 100644 cmake/cross_compiling/armlinux.cmake delete mode 100644 cmake/cross_compiling/findar.cmake delete mode 100644 cmake/cross_compiling/host.cmake delete mode 100644 cmake/cross_compiling/ios.cmake delete mode 100644 cmake/cross_compiling/npu.cmake delete mode 100644 cmake/cross_compiling/postproject.cmake delete mode 100644 cmake/cross_compiling/preproject.cmake delete mode 100644 cmake/cuda.cmake delete mode 100644 cmake/cudnn.cmake delete mode 100644 cmake/cupti.cmake delete mode 100644 cmake/external/eigen.cmake delete mode 100644 cmake/external/gflags.cmake delete mode 100644 cmake/external/glog.cmake delete mode 100644 cmake/external/gtest.cmake delete mode 100644 cmake/external/libxsmm.cmake delete mode 100644 cmake/external/mkldnn.cmake delete mode 100644 cmake/external/mklml.cmake delete mode 100644 cmake/external/openblas.cmake delete mode 100644 cmake/external/opencl-clhpp.cmake delete mode 100644 
cmake/external/opencl-headers.cmake delete mode 100644 cmake/external/protobuf.cmake delete mode 100644 cmake/external/xbyak.cmake delete mode 100644 cmake/external/xxhash.cmake delete mode 100644 cmake/flags.cmake delete mode 100644 cmake/generic.cmake delete mode 100644 cmake/hip.cmake delete mode 100644 cmake/lite.cmake delete mode 100644 cmake/lite_utils.cmake delete mode 100644 cmake/make_resource.py delete mode 100644 cmake/operators.cmake delete mode 100644 cmake/package.cmake delete mode 100644 cmake/simd.cmake delete mode 100644 cmake/system.cmake delete mode 100644 cmake/tensorrt.cmake delete mode 100644 cmake/util.cmake delete mode 100644 cmake/version.cmake create mode 100644 cpp_demo.md create mode 100644 cxx_api.md create mode 100644 debug_tools.md create mode 100644 demos.md.toc.2019-08-26_222115 create mode 100644 demos.md.toc.2019-08-26_222307 create mode 100644 for-developer.md create mode 100644 fpga.md create mode 100644 images/architecture.jpg create mode 100644 images/benchmark_result.png create mode 100644 images/img_mobilenetv1_inference.png create mode 100644 images/lite1.png create mode 100644 images/model_quan_fig.png create mode 100644 images/model_quan_table1.png create mode 100644 images/phone_list.png create mode 100644 images/run_benchmark.png create mode 100644 java_demo.md delete mode 100644 lite/CMakeLists.txt delete mode 100644 lite/api/CMakeLists.txt delete mode 100644 lite/api/_paddle_use_kernels.h delete mode 100644 lite/api/_paddle_use_ops.h delete mode 100644 lite/api/android/.gitignore delete mode 100644 lite/api/android/CMakeLists.txt delete mode 100644 lite/api/android/jni/.gitignore delete mode 100644 lite/api/android/jni/CMakeLists.txt delete mode 100644 lite/api/android/jni/native/CMakeLists.txt delete mode 100644 lite/api/android/jni/native/convert_util_jni.h delete mode 100644 lite/api/android/jni/native/paddle_lite_jni.cc delete mode 100644 lite/api/android/jni/native/paddle_lite_jni.h delete mode 100644 lite/api/android/jni/native/tensor_jni.cc delete mode 100644 lite/api/android/jni/native/tensor_jni.h delete mode 100644 lite/api/android/jni/src/com/baidu/paddle/lite/.gitignore delete mode 100644 lite/api/android/jni/src/com/baidu/paddle/lite/ConfigBase.java delete mode 100644 lite/api/android/jni/src/com/baidu/paddle/lite/CxxConfig.java delete mode 100644 lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java delete mode 100644 lite/api/android/jni/src/com/baidu/paddle/lite/PaddleLiteInitializer.java delete mode 100644 lite/api/android/jni/src/com/baidu/paddle/lite/PaddlePredictor.java delete mode 100644 lite/api/android/jni/src/com/baidu/paddle/lite/Place.java delete mode 100644 lite/api/android/jni/src/com/baidu/paddle/lite/PowerMode.java delete mode 100644 lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java delete mode 100644 lite/api/android/jni/test/com/baidu/paddle/lite/PaddlePredictorTest.java delete mode 100644 lite/api/apis_test.cc delete mode 100644 lite/api/benchmark.cc delete mode 100644 lite/api/cxx_api.cc delete mode 100644 lite/api/cxx_api.h delete mode 100644 lite/api/cxx_api_bin.cc delete mode 100644 lite/api/cxx_api_impl.cc delete mode 100644 lite/api/cxx_api_test.cc delete mode 100644 lite/api/detection_model_test.cc delete mode 100644 lite/api/efficientnet_b0_test.cc delete mode 100644 lite/api/inceptionv4_test.cc delete mode 100644 lite/api/light_api.cc delete mode 100644 lite/api/light_api.h delete mode 100644 lite/api/light_api_impl.cc delete mode 100644 lite/api/light_api_test.cc delete mode 
100644 lite/api/lite_api_test_helper.cc delete mode 100644 lite/api/lite_api_test_helper.h delete mode 100644 lite/api/mobilenetv1_int8_test.cc delete mode 100644 lite/api/mobilenetv1_ssd_test.cc delete mode 100644 lite/api/mobilenetv1_test.cc delete mode 100644 lite/api/mobilenetv1_yolov3_test.cc delete mode 100644 lite/api/mobilenetv2_test.cc delete mode 100644 lite/api/model_optimize_tool.cc delete mode 100644 lite/api/model_run_test_image.cc delete mode 100644 lite/api/model_test.cc delete mode 100644 lite/api/ocr_attention_test.cc delete mode 100644 lite/api/paddle_api.cc delete mode 100644 lite/api/paddle_api.h delete mode 100644 lite/api/paddle_api_test.cc delete mode 100644 lite/api/paddle_lite_factory_helper.h delete mode 100644 lite/api/paddle_place.cc delete mode 100644 lite/api/paddle_place.h delete mode 100644 lite/api/paddle_use_passes.h delete mode 100644 lite/api/resnet18_test.cc delete mode 100644 lite/api/resnet50_test.cc delete mode 100644 lite/api/resnet50_test_fpga.cc delete mode 100644 lite/api/shufflenetv2_test.cc delete mode 100644 lite/api/test_googlenet_lite.cc delete mode 100644 lite/api/test_helper.h delete mode 100644 lite/api/test_inceptionv4_lite_x86.cc delete mode 100644 lite/api/test_mobilenetv1_lite_x86.cc delete mode 100644 lite/api/test_mobilenetv2_lite_x86.cc delete mode 100644 lite/api/unet_test.cc delete mode 100644 lite/backends/CMakeLists.txt delete mode 100644 lite/backends/arm/CMakeLists.txt delete mode 100644 lite/backends/arm/math/CMakeLists.txt delete mode 100644 lite/backends/arm/math/activation.cc delete mode 100644 lite/backends/arm/math/activation.h delete mode 100644 lite/backends/arm/math/affine_channel.cc delete mode 100644 lite/backends/arm/math/affine_channel.h delete mode 100644 lite/backends/arm/math/anchor_generator.cc delete mode 100644 lite/backends/arm/math/anchor_generator.h delete mode 100644 lite/backends/arm/math/argmax.cc delete mode 100644 lite/backends/arm/math/argmax.h delete mode 100644 lite/backends/arm/math/axpy.cc delete mode 100644 lite/backends/arm/math/axpy.h delete mode 100644 lite/backends/arm/math/beam_search.cc delete mode 100644 lite/backends/arm/math/beam_search.h delete mode 100644 lite/backends/arm/math/box_coder.cc delete mode 100644 lite/backends/arm/math/box_coder.h delete mode 100644 lite/backends/arm/math/col_im_transform.cc delete mode 100644 lite/backends/arm/math/col_im_transform.h delete mode 100644 lite/backends/arm/math/concat.cc delete mode 100644 lite/backends/arm/math/concat.h delete mode 100644 lite/backends/arm/math/conv3x3s1_direct_int8.cc delete mode 100644 lite/backends/arm/math/conv3x3s2_direct_int8.cc delete mode 100644 lite/backends/arm/math/conv_block_utils.h delete mode 100644 lite/backends/arm/math/conv_depthwise.cc delete mode 100644 lite/backends/arm/math/conv_depthwise.h delete mode 100644 lite/backends/arm/math/conv_depthwise_3x3_int8.cc delete mode 100644 lite/backends/arm/math/conv_depthwise_3x3p0.cc delete mode 100644 lite/backends/arm/math/conv_depthwise_3x3p1.cc delete mode 100644 lite/backends/arm/math/conv_depthwise_5x5s1.cc delete mode 100644 lite/backends/arm/math/conv_depthwise_5x5s1_int8.cc delete mode 100644 lite/backends/arm/math/conv_depthwise_5x5s2.cc delete mode 100644 lite/backends/arm/math/conv_direct.cc delete mode 100644 lite/backends/arm/math/conv_direct.h delete mode 100644 lite/backends/arm/math/conv_direct_3x3s1.cc delete mode 100644 lite/backends/arm/math/conv_direct_3x3s2.cc delete mode 100644 lite/backends/arm/math/conv_gemmlike.cc delete mode 100644 
lite/backends/arm/math/conv_gemmlike.h delete mode 100644 lite/backends/arm/math/conv_impl.cc delete mode 100644 lite/backends/arm/math/conv_impl.h delete mode 100644 lite/backends/arm/math/conv_winograd.cc delete mode 100644 lite/backends/arm/math/conv_winograd.h delete mode 100644 lite/backends/arm/math/conv_winograd_3x3.cc delete mode 100644 lite/backends/arm/math/decode_bboxes.cc delete mode 100644 lite/backends/arm/math/decode_bboxes.h delete mode 100644 lite/backends/arm/math/dot_toolchain_support.h delete mode 100644 lite/backends/arm/math/dropout.cc delete mode 100644 lite/backends/arm/math/dropout.h delete mode 100644 lite/backends/arm/math/elementwise.cc delete mode 100644 lite/backends/arm/math/elementwise.h delete mode 100644 lite/backends/arm/math/fill_bias_relu.cc delete mode 100644 lite/backends/arm/math/fill_bias_relu.h delete mode 100644 lite/backends/arm/math/funcs.cc delete mode 100644 lite/backends/arm/math/funcs.h delete mode 100644 lite/backends/arm/math/gemm_prepacked_int8.cc delete mode 100644 lite/backends/arm/math/gemm_prepacked_int8.h delete mode 100644 lite/backends/arm/math/gemv_arm_int8.cc delete mode 100644 lite/backends/arm/math/gemv_arm_int8.h delete mode 100644 lite/backends/arm/math/gru_utils.h delete mode 100644 lite/backends/arm/math/im2sequence.cc delete mode 100644 lite/backends/arm/math/im2sequence.h delete mode 100644 lite/backends/arm/math/increment.cc delete mode 100644 lite/backends/arm/math/increment.h delete mode 100644 lite/backends/arm/math/interpolate.cc delete mode 100644 lite/backends/arm/math/interpolate.h delete mode 100644 lite/backends/arm/math/lrn.cc delete mode 100644 lite/backends/arm/math/lrn.h delete mode 100644 lite/backends/arm/math/negative.cc delete mode 100644 lite/backends/arm/math/negative.h delete mode 100644 lite/backends/arm/math/norm.cc delete mode 100644 lite/backends/arm/math/norm.h delete mode 100644 lite/backends/arm/math/packed_sgemm.cc delete mode 100644 lite/backends/arm/math/packed_sgemm.h delete mode 100644 lite/backends/arm/math/pad2d.cc delete mode 100644 lite/backends/arm/math/pad2d.h delete mode 100644 lite/backends/arm/math/pooling.cc delete mode 100644 lite/backends/arm/math/pooling.h delete mode 100644 lite/backends/arm/math/power.cc delete mode 100644 lite/backends/arm/math/power.h delete mode 100644 lite/backends/arm/math/prior_box.cc delete mode 100644 lite/backends/arm/math/prior_box.h delete mode 100644 lite/backends/arm/math/reduce_max.cc delete mode 100644 lite/backends/arm/math/reduce_max.h delete mode 100644 lite/backends/arm/math/reduce_mean.cc delete mode 100644 lite/backends/arm/math/reduce_mean.h delete mode 100644 lite/backends/arm/math/saturate.h delete mode 100644 lite/backends/arm/math/scale.cc delete mode 100644 lite/backends/arm/math/scale.h delete mode 100644 lite/backends/arm/math/sequence2batch.h delete mode 100644 lite/backends/arm/math/sequence_expand.cc delete mode 100644 lite/backends/arm/math/sequence_expand.h delete mode 100644 lite/backends/arm/math/sequence_pool.cc delete mode 100644 lite/backends/arm/math/sequence_pool.h delete mode 100644 lite/backends/arm/math/sequence_softmax.cc delete mode 100644 lite/backends/arm/math/sequence_softmax.h delete mode 100644 lite/backends/arm/math/sgemm.cc delete mode 100644 lite/backends/arm/math/sgemm.h delete mode 100644 lite/backends/arm/math/sgemv.cc delete mode 100644 lite/backends/arm/math/sgemv.h delete mode 100644 lite/backends/arm/math/shuffle_channel.cc delete mode 100644 lite/backends/arm/math/shuffle_channel.h delete mode 
100644 lite/backends/arm/math/slice.cc delete mode 100644 lite/backends/arm/math/slice.h delete mode 100644 lite/backends/arm/math/softmax.cc delete mode 100644 lite/backends/arm/math/softmax.h delete mode 100644 lite/backends/arm/math/split.cc delete mode 100644 lite/backends/arm/math/split.h delete mode 100644 lite/backends/arm/math/stack.cc delete mode 100644 lite/backends/arm/math/stack.h delete mode 100644 lite/backends/arm/math/topk.cc delete mode 100644 lite/backends/arm/math/topk.h delete mode 100644 lite/backends/arm/math/type_trans.cc delete mode 100644 lite/backends/arm/math/type_trans.h delete mode 100644 lite/backends/arm/math/yolo_box.cc delete mode 100644 lite/backends/arm/math/yolo_box.h delete mode 100644 lite/backends/cuda/CMakeLists.txt delete mode 100644 lite/backends/cuda/blas.cc delete mode 100644 lite/backends/cuda/blas.h delete mode 100644 lite/backends/cuda/cuda_utils.h delete mode 100644 lite/backends/cuda/math/CMakeLists.txt delete mode 100644 lite/backends/cuda/math/activation.cu delete mode 100644 lite/backends/cuda/math/activation.h delete mode 100644 lite/backends/cuda/math/cudnn_conv.cc delete mode 100644 lite/backends/cuda/math/cudnn_conv.h delete mode 100644 lite/backends/cuda/math/cudnn_helper.h delete mode 100644 lite/backends/cuda/math/scale.cu delete mode 100644 lite/backends/cuda/math/scale.h delete mode 100644 lite/backends/cuda/math/transpose.cu delete mode 100644 lite/backends/cuda/math/transpose.h delete mode 100644 lite/backends/cuda/math/type_trans.cu delete mode 100644 lite/backends/cuda/math/type_trans.h delete mode 100644 lite/backends/cuda/math/utils.h delete mode 100644 lite/backends/cuda/target_wrapper.cc delete mode 100644 lite/backends/cuda/target_wrapper.h delete mode 100644 lite/backends/fpga/CMakeLists.txt delete mode 100644 lite/backends/fpga/KD/alignment.h delete mode 100644 lite/backends/fpga/KD/context.hpp delete mode 100644 lite/backends/fpga/KD/dl_engine.cpp delete mode 100644 lite/backends/fpga/KD/dl_engine.hpp delete mode 100755 lite/backends/fpga/KD/float16.hpp delete mode 100644 lite/backends/fpga/KD/fpga_cv.cpp delete mode 100644 lite/backends/fpga/KD/fpga_cv.hpp delete mode 100644 lite/backends/fpga/KD/layout.hpp delete mode 100644 lite/backends/fpga/KD/llapi/bias_scale.cpp delete mode 100644 lite/backends/fpga/KD/llapi/bias_scale.h delete mode 100755 lite/backends/fpga/KD/llapi/config.h delete mode 100644 lite/backends/fpga/KD/llapi/filter.cpp delete mode 100644 lite/backends/fpga/KD/llapi/filter.h delete mode 100644 lite/backends/fpga/KD/llapi/zynqmp_api.cpp delete mode 100644 lite/backends/fpga/KD/llapi/zynqmp_api.h delete mode 100644 lite/backends/fpga/KD/pe.hpp delete mode 100644 lite/backends/fpga/KD/pe_params.hpp delete mode 100644 lite/backends/fpga/KD/pes/batchnorm_pe.hpp delete mode 100644 lite/backends/fpga/KD/pes/concat_pe.hpp delete mode 100644 lite/backends/fpga/KD/pes/conv_pe.hpp delete mode 100644 lite/backends/fpga/KD/pes/conv_process.hpp delete mode 100644 lite/backends/fpga/KD/pes/crop_pe.cpp delete mode 100755 lite/backends/fpga/KD/pes/crop_pe.hpp delete mode 100755 lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp delete mode 100755 lite/backends/fpga/KD/pes/elementwise_add_pe.hpp delete mode 100644 lite/backends/fpga/KD/pes/fully_connected_pe.hpp delete mode 100755 lite/backends/fpga/KD/pes/input_pe.hpp delete mode 100644 lite/backends/fpga/KD/pes/norm_pe.hpp delete mode 100644 lite/backends/fpga/KD/pes/output_pe.hpp delete mode 100644 lite/backends/fpga/KD/pes/pooling_pe.hpp delete mode 100644 
lite/backends/fpga/KD/pes/prior_box_pe.cpp delete mode 100755 lite/backends/fpga/KD/pes/prior_box_pe.hpp delete mode 100755 lite/backends/fpga/KD/pes/relu_pe.hpp delete mode 100644 lite/backends/fpga/KD/pes/resize.hpp delete mode 100755 lite/backends/fpga/KD/pes/scale_pe.hpp delete mode 100755 lite/backends/fpga/KD/pes/softmax_pe.cpp delete mode 100644 lite/backends/fpga/KD/pes/softmax_pe.hpp delete mode 100644 lite/backends/fpga/KD/pes/split_pe.hpp delete mode 100755 lite/backends/fpga/KD/shape.hpp delete mode 100644 lite/backends/fpga/KD/tensor.hpp delete mode 100644 lite/backends/fpga/KD/tensor_util.cpp delete mode 100644 lite/backends/fpga/KD/tensor_util.hpp delete mode 100644 lite/backends/fpga/lite_tensor.cc delete mode 100644 lite/backends/fpga/lite_tensor.h delete mode 100644 lite/backends/fpga/target_wrapper.cc delete mode 100644 lite/backends/host/CMakeLists.txt delete mode 100644 lite/backends/host/target_wrapper.cc delete mode 100644 lite/backends/npu/CMakeLists.txt delete mode 100644 lite/backends/npu/bridge/CMakeLists.txt delete mode 100644 lite/backends/npu/bridge/act_op.cc delete mode 100644 lite/backends/npu/bridge/act_op_test.cc delete mode 100644 lite/backends/npu/bridge/batch_norm_op.cc delete mode 100644 lite/backends/npu/bridge/batch_norm_op_test.cc delete mode 100644 lite/backends/npu/bridge/concat_op.cc delete mode 100644 lite/backends/npu/bridge/concat_op_test.cc delete mode 100644 lite/backends/npu/bridge/conv_op.cc delete mode 100644 lite/backends/npu/bridge/conv_op_test.cc delete mode 100644 lite/backends/npu/bridge/conv_transpose_op.cc delete mode 100644 lite/backends/npu/bridge/conv_transpose_op_test.cc delete mode 100644 lite/backends/npu/bridge/elementwise_ops.cc delete mode 100644 lite/backends/npu/bridge/elementwise_ops_test.cc delete mode 100644 lite/backends/npu/bridge/fc_op.cc delete mode 100644 lite/backends/npu/bridge/fc_op_test.cc delete mode 100644 lite/backends/npu/bridge/interpolate_op.cc delete mode 100644 lite/backends/npu/bridge/interpolate_op_test.cc delete mode 100644 lite/backends/npu/bridge/mul_op.cc delete mode 100644 lite/backends/npu/bridge/mul_op_test.cc delete mode 100644 lite/backends/npu/bridge/pad2d_op.cc delete mode 100644 lite/backends/npu/bridge/pad2d_op_test.cc delete mode 100644 lite/backends/npu/bridge/paddle_use_npu_bridges.h delete mode 100644 lite/backends/npu/bridge/pool_op.cc delete mode 100644 lite/backends/npu/bridge/pool_op_test.cc delete mode 100644 lite/backends/npu/bridge/registry.cc delete mode 100644 lite/backends/npu/bridge/registry.h delete mode 100644 lite/backends/npu/bridge/reshape_op.cc delete mode 100644 lite/backends/npu/bridge/reshape_op_test.cc delete mode 100644 lite/backends/npu/bridge/scale_op.cc delete mode 100644 lite/backends/npu/bridge/scale_op_test.cc delete mode 100644 lite/backends/npu/bridge/shuffle_channel_op.cc delete mode 100644 lite/backends/npu/bridge/shuffle_channel_op_test.cc delete mode 100644 lite/backends/npu/bridge/softmax_op.cc delete mode 100644 lite/backends/npu/bridge/softmax_op_test.cc delete mode 100644 lite/backends/npu/bridge/split_op.cc delete mode 100644 lite/backends/npu/bridge/split_op_test.cc delete mode 100644 lite/backends/npu/bridge/test_helper.cc delete mode 100644 lite/backends/npu/bridge/test_helper.h delete mode 100644 lite/backends/npu/bridge/transpose_op.cc delete mode 100644 lite/backends/npu/bridge/transpose_op_test.cc delete mode 100644 lite/backends/npu/bridge/utils.cc delete mode 100644 lite/backends/npu/bridge/utils.h delete mode 100644 
lite/backends/npu/npu_helper.cc delete mode 100644 lite/backends/npu/npu_helper.h delete mode 100644 lite/backends/opencl/CMakeLists.txt delete mode 100644 lite/backends/opencl/cl_caller.cc delete mode 100644 lite/backends/opencl/cl_caller.h delete mode 100644 lite/backends/opencl/cl_context.cc delete mode 100644 lite/backends/opencl/cl_context.h delete mode 100644 lite/backends/opencl/cl_functions_test.cc delete mode 100644 lite/backends/opencl/cl_im2col_test.cc delete mode 100644 lite/backends/opencl/cl_image.cc delete mode 100644 lite/backends/opencl/cl_image.h delete mode 100644 lite/backends/opencl/cl_image_converter.cc delete mode 100644 lite/backends/opencl/cl_image_converter.h delete mode 100644 lite/backends/opencl/cl_include.h delete mode 100644 lite/backends/opencl/cl_kernel/buffer/depthwise_conv2d_kernel.cl delete mode 100644 lite/backends/opencl/cl_kernel/buffer/elementwise_add_kernel.cl delete mode 100644 lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl delete mode 100644 lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl delete mode 100644 lite/backends/opencl/cl_kernel/buffer/mat_mul_kernel.cl delete mode 100644 lite/backends/opencl/cl_kernel/buffer/pool_kernel.cl delete mode 100644 lite/backends/opencl/cl_kernel/buffer/relu_kernel.cl delete mode 100644 lite/backends/opencl/cl_kernel/cl_common.h delete mode 100644 lite/backends/opencl/cl_kernel/image/channel_add_kernel.cl delete mode 100644 lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl delete mode 100644 lite/backends/opencl/cl_kernel/image/pool_kernel.cl delete mode 100644 lite/backends/opencl/cl_runtime.cc delete mode 100644 lite/backends/opencl/cl_runtime.h delete mode 100644 lite/backends/opencl/cl_utility.cc delete mode 100644 lite/backends/opencl/cl_utility.h delete mode 100644 lite/backends/opencl/cl_wrapper.cc delete mode 100644 lite/backends/opencl/cl_wrapper.h delete mode 100644 lite/backends/opencl/target_wrapper.cc delete mode 100644 lite/backends/opencl/target_wrapper.h delete mode 100644 lite/backends/x86/CMakeLists.txt delete mode 100644 lite/backends/x86/cpu_info.cc delete mode 100644 lite/backends/x86/cpu_info.h delete mode 100644 lite/backends/x86/cupti_lib_path.h.in delete mode 100644 lite/backends/x86/dynamic_loader.cc delete mode 100644 lite/backends/x86/dynamic_loader.h delete mode 100644 lite/backends/x86/jit/CMakeLists.txt delete mode 100644 lite/backends/x86/jit/README.en.md delete mode 100644 lite/backends/x86/jit/README.md delete mode 100644 lite/backends/x86/jit/benchmark.cc delete mode 100644 lite/backends/x86/jit/gen/CMakeLists.txt delete mode 100644 lite/backends/x86/jit/gen/act.cc delete mode 100644 lite/backends/x86/jit/gen/act.h delete mode 100644 lite/backends/x86/jit/gen/blas.cc delete mode 100644 lite/backends/x86/jit/gen/blas.h delete mode 100644 lite/backends/x86/jit/gen/embseqpool.cc delete mode 100644 lite/backends/x86/jit/gen/embseqpool.h delete mode 100644 lite/backends/x86/jit/gen/gru.cc delete mode 100644 lite/backends/x86/jit/gen/gru.h delete mode 100644 lite/backends/x86/jit/gen/hopv.cc delete mode 100644 lite/backends/x86/jit/gen/hopv.h delete mode 100644 lite/backends/x86/jit/gen/jitcode.h delete mode 100644 lite/backends/x86/jit/gen/lstm.cc delete mode 100644 lite/backends/x86/jit/gen/lstm.h delete mode 100644 lite/backends/x86/jit/gen/matmul.cc delete mode 100644 lite/backends/x86/jit/gen/matmul.h delete mode 100644 lite/backends/x86/jit/gen/seqpool.cc delete mode 100644 lite/backends/x86/jit/gen/seqpool.h delete mode 100644 
lite/backends/x86/jit/gen/sgd.cc delete mode 100644 lite/backends/x86/jit/gen/sgd.h delete mode 100644 lite/backends/x86/jit/gen/vbroadcast.cc delete mode 100644 lite/backends/x86/jit/gen/vbroadcast.h delete mode 100644 lite/backends/x86/jit/gen_base.cc delete mode 100644 lite/backends/x86/jit/gen_base.h delete mode 100644 lite/backends/x86/jit/helper.cc delete mode 100644 lite/backends/x86/jit/helper.h delete mode 100644 lite/backends/x86/jit/kernel_base.h delete mode 100644 lite/backends/x86/jit/kernel_key.cc delete mode 100644 lite/backends/x86/jit/kernel_key.h delete mode 100644 lite/backends/x86/jit/kernel_pool.cc delete mode 100644 lite/backends/x86/jit/kernel_pool.h delete mode 100644 lite/backends/x86/jit/macro.h delete mode 100644 lite/backends/x86/jit/more/CMakeLists.txt delete mode 100644 lite/backends/x86/jit/more/intrinsic/CMakeLists.txt delete mode 100644 lite/backends/x86/jit/more/intrinsic/crf_decoding.cc delete mode 100644 lite/backends/x86/jit/more/intrinsic/crf_decoding.h delete mode 100644 lite/backends/x86/jit/more/intrinsic/layer_norm.cc delete mode 100644 lite/backends/x86/jit/more/intrinsic/layer_norm.h delete mode 100644 lite/backends/x86/jit/more/mix/CMakeLists.txt delete mode 100644 lite/backends/x86/jit/more/mix/mix.cc delete mode 100644 lite/backends/x86/jit/more/mix/mix.h delete mode 100644 lite/backends/x86/jit/more/mkl/CMakeLists.txt delete mode 100644 lite/backends/x86/jit/more/mkl/mkl.cc delete mode 100644 lite/backends/x86/jit/more/mkl/mkl.h delete mode 100644 lite/backends/x86/jit/refer/CMakeLists.txt delete mode 100644 lite/backends/x86/jit/refer/refer.cc delete mode 100644 lite/backends/x86/jit/refer/refer.h delete mode 100644 lite/backends/x86/jit/registry.h delete mode 100644 lite/backends/x86/jit/test.cc delete mode 100644 lite/backends/x86/legacy_place.h delete mode 100644 lite/backends/x86/math/CMakeLists.txt delete mode 100644 lite/backends/x86/math/beam_search.cc delete mode 100644 lite/backends/x86/math/beam_search.h delete mode 100644 lite/backends/x86/math/beam_search_test.cc delete mode 100644 lite/backends/x86/math/blas.cc delete mode 100644 lite/backends/x86/math/blas.h delete mode 100644 lite/backends/x86/math/blas_impl.h delete mode 100644 lite/backends/x86/math/concat_and_split.cc delete mode 100644 lite/backends/x86/math/concat_and_split.h delete mode 100644 lite/backends/x86/math/context_project.cc delete mode 100644 lite/backends/x86/math/context_project.h delete mode 100644 lite/backends/x86/math/cos_sim_functor.cc delete mode 100644 lite/backends/x86/math/cos_sim_functor.h delete mode 100644 lite/backends/x86/math/cpu_vec.h delete mode 100644 lite/backends/x86/math/cross_entropy.cc delete mode 100644 lite/backends/x86/math/cross_entropy.h delete mode 100644 lite/backends/x86/math/detail/CMakeLists.txt delete mode 100644 lite/backends/x86/math/detail/activation_functions.h delete mode 100644 lite/backends/x86/math/detail/avx_functions.cc delete mode 100644 lite/backends/x86/math/detail/avx_mathfun.h delete mode 100644 lite/backends/x86/math/detail/gru_cpu_kernel.h delete mode 100644 lite/backends/x86/math/detail/gru_kernel.h delete mode 100644 lite/backends/x86/math/gru_compute.cc delete mode 100644 lite/backends/x86/math/gru_compute.h delete mode 100644 lite/backends/x86/math/im2col.cc delete mode 100644 lite/backends/x86/math/im2col.h delete mode 100644 lite/backends/x86/math/im2col_cfo_cpu.h delete mode 100644 lite/backends/x86/math/im2col_test.cc delete mode 100644 lite/backends/x86/math/math_function.cc delete mode 100644 
lite/backends/x86/math/math_function.h delete mode 100644 lite/backends/x86/math/math_function_impl.h delete mode 100644 lite/backends/x86/math/math_function_test.cc delete mode 100644 lite/backends/x86/math/maxouting.cc delete mode 100644 lite/backends/x86/math/maxouting.h delete mode 100644 lite/backends/x86/math/pooling.cc delete mode 100644 lite/backends/x86/math/pooling.h delete mode 100644 lite/backends/x86/math/prelu.h delete mode 100644 lite/backends/x86/math/sample_prob.cc delete mode 100644 lite/backends/x86/math/sample_prob.h delete mode 100644 lite/backends/x86/math/sampler.cc delete mode 100644 lite/backends/x86/math/sampler.h delete mode 100644 lite/backends/x86/math/sequence2batch.cc delete mode 100644 lite/backends/x86/math/sequence2batch.h delete mode 100644 lite/backends/x86/math/sequence_padding.cc delete mode 100644 lite/backends/x86/math/sequence_padding.h delete mode 100644 lite/backends/x86/math/sequence_pooling.cc delete mode 100644 lite/backends/x86/math/sequence_pooling.h delete mode 100644 lite/backends/x86/math/sequence_pooling_test.cc delete mode 100644 lite/backends/x86/math/sequence_scale.cc delete mode 100644 lite/backends/x86/math/sequence_scale.h delete mode 100644 lite/backends/x86/math/softmax.cc delete mode 100644 lite/backends/x86/math/softmax.h delete mode 100644 lite/backends/x86/math/softmax_impl.h delete mode 100644 lite/backends/x86/math/tree2col.cc delete mode 100644 lite/backends/x86/math/tree2col.h delete mode 100644 lite/backends/x86/math/unpooling.cc delete mode 100644 lite/backends/x86/math/unpooling.h delete mode 100644 lite/backends/x86/math/vol2col.cc delete mode 100644 lite/backends/x86/math/vol2col.h delete mode 100644 lite/backends/x86/mklml.cc delete mode 100644 lite/backends/x86/mklml.h delete mode 100644 lite/backends/x86/port.h delete mode 100644 lite/backends/x86/target_wrapper.cc delete mode 100644 lite/backends/x86/target_wrapper.h delete mode 100644 lite/backends/x86/warpctc_lib_path.h.in delete mode 100644 lite/core/CMakeLists.txt delete mode 100644 lite/core/arena/CMakeLists.txt delete mode 100644 lite/core/arena/framework.cc delete mode 100644 lite/core/arena/framework.h delete mode 100644 lite/core/arena/framework_test.cc delete mode 100644 lite/core/context.cc delete mode 100644 lite/core/context.h delete mode 100644 lite/core/context_test.cc delete mode 100644 lite/core/device_info.cc delete mode 100644 lite/core/device_info.h delete mode 100644 lite/core/framework.proto delete mode 100644 lite/core/kernel.cc delete mode 100644 lite/core/kernel.h delete mode 100644 lite/core/kernel_test.cc delete mode 100644 lite/core/lite.map delete mode 100644 lite/core/lite_gtest_main.cc delete mode 100644 lite/core/lite_tensor_test.cc delete mode 100644 lite/core/memory.cc delete mode 100644 lite/core/memory.h delete mode 100644 lite/core/memory_test.cc delete mode 100644 lite/core/mir/CMakeLists.txt delete mode 100644 lite/core/mir/argument_type_display_pass.cc delete mode 100644 lite/core/mir/demo_pass.cc delete mode 100644 lite/core/mir/dot.h delete mode 100644 lite/core/mir/elimination/CMakeLists.txt delete mode 100644 lite/core/mir/elimination/identity_scale_eliminate_pass.cc delete mode 100644 lite/core/mir/elimination/identity_scale_eliminate_pass_test.cc delete mode 100644 lite/core/mir/fusion/CMakeLists.txt delete mode 100644 lite/core/mir/fusion/conv_activation_fuse_pass.cc delete mode 100644 lite/core/mir/fusion/conv_activation_fuse_pass.h delete mode 100644 lite/core/mir/fusion/conv_activation_fuser.cc delete mode 100644 
lite/core/mir/fusion/conv_activation_fuser.h delete mode 100644 lite/core/mir/fusion/conv_bn_fuse_pass.cc delete mode 100644 lite/core/mir/fusion/conv_bn_fuse_pass.h delete mode 100644 lite/core/mir/fusion/conv_bn_fuse_pass_test.cc delete mode 100644 lite/core/mir/fusion/conv_bn_fuser.cc delete mode 100644 lite/core/mir/fusion/conv_bn_fuser.h delete mode 100644 lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass_test.cc delete mode 100644 lite/core/mir/fusion/conv_elementwise_fuse_pass.cc delete mode 100644 lite/core/mir/fusion/conv_elementwise_fuse_pass.h delete mode 100644 lite/core/mir/fusion/conv_elementwise_fuser.cc delete mode 100644 lite/core/mir/fusion/conv_elementwise_fuser.h delete mode 100644 lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc delete mode 100644 lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h delete mode 100644 lite/core/mir/fusion/elementwise_add_activation_fuse_pass_test.cc delete mode 100644 lite/core/mir/fusion/elementwise_add_activation_fuser.cc delete mode 100644 lite/core/mir/fusion/elementwise_add_activation_fuser.h delete mode 100644 lite/core/mir/fusion/fc_fuse_pass.cc delete mode 100644 lite/core/mir/fusion/fc_fuse_pass.h delete mode 100644 lite/core/mir/fusion/fc_fuse_pass_test.cc delete mode 100644 lite/core/mir/fusion/fc_fuser.cc delete mode 100644 lite/core/mir/fusion/fc_fuser.h delete mode 100644 lite/core/mir/fusion/interpolate_fuse_pass.cc delete mode 100644 lite/core/mir/fusion/interpolate_fuse_pass.h delete mode 100644 lite/core/mir/fusion/interpolate_fuser.cc delete mode 100644 lite/core/mir/fusion/interpolate_fuser.h delete mode 100644 lite/core/mir/fusion/quant_dequant_fuse_pass.cc delete mode 100644 lite/core/mir/fusion/quant_dequant_fuse_pass.h delete mode 100644 lite/core/mir/fusion/quant_dequant_op_fuser.cc delete mode 100644 lite/core/mir/fusion/quant_dequant_op_fuser.h delete mode 100644 lite/core/mir/fusion/shuffle_channel_fuse_pass.cc delete mode 100644 lite/core/mir/fusion/shuffle_channel_fuse_pass.h delete mode 100644 lite/core/mir/fusion/shuffle_channel_fuser.cc delete mode 100644 lite/core/mir/fusion/shuffle_channel_fuser.h delete mode 100644 lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.cc delete mode 100644 lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.h delete mode 100644 lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc delete mode 100644 lite/core/mir/fusion/transpose_softmax_transpose_fuser.h delete mode 100644 lite/core/mir/generate_program_pass.cc delete mode 100644 lite/core/mir/generate_program_pass.h delete mode 100644 lite/core/mir/graph_visualize_pass.cc delete mode 100644 lite/core/mir/graph_visualize_pass.h delete mode 100644 lite/core/mir/io_copy_kernel_pick_pass.cc delete mode 100644 lite/core/mir/node.cc delete mode 100644 lite/core/mir/node.h delete mode 100644 lite/core/mir/pass.cc delete mode 100644 lite/core/mir/pass.h delete mode 100644 lite/core/mir/pass_manager.cc delete mode 100644 lite/core/mir/pass_manager.h delete mode 100644 lite/core/mir/pass_manager_test.cc delete mode 100644 lite/core/mir/pass_registry.cc delete mode 100644 lite/core/mir/pass_registry.h delete mode 100644 lite/core/mir/pattern_matcher.cc delete mode 100644 lite/core/mir/pattern_matcher.h delete mode 100644 lite/core/mir/pattern_matcher_high_api.cc delete mode 100644 lite/core/mir/pattern_matcher_high_api.h delete mode 100644 lite/core/mir/pattern_matcher_high_api_test.cc delete mode 100644 lite/core/mir/pattern_matcher_test.cc delete mode 100644 
lite/core/mir/pattern_matcher_tester.cc delete mode 100644 lite/core/mir/runtime_context_assign_pass.cc delete mode 100644 lite/core/mir/ssa_graph.cc delete mode 100644 lite/core/mir/ssa_graph.h delete mode 100644 lite/core/mir/ssa_graph_test.cc delete mode 100644 lite/core/mir/static_kernel_pick_pass.cc delete mode 100644 lite/core/mir/static_kernel_pick_pass.h delete mode 100644 lite/core/mir/subgraph/CMakeLists.txt delete mode 100644 lite/core/mir/subgraph/generate_npu_program_pass.cc delete mode 100644 lite/core/mir/subgraph/generate_npu_program_pass.h delete mode 100644 lite/core/mir/subgraph/generate_npu_program_pass_test.cc delete mode 100644 lite/core/mir/subgraph/subgraph_program_pass.cc delete mode 100644 lite/core/mir/subgraph/subgraph_program_pass.h delete mode 100644 lite/core/mir/subgraph/subgraph_program_pass_test.cc delete mode 100644 lite/core/mir/type_layout_cast_pass.cc delete mode 100644 lite/core/mir/type_layout_cast_pass.h delete mode 100644 lite/core/mir/type_precision_cast_pass.cc delete mode 100644 lite/core/mir/type_precision_cast_pass.h delete mode 100644 lite/core/mir/type_target_cast_pass.cc delete mode 100644 lite/core/mir/type_target_cast_pass.h delete mode 100644 lite/core/mir/variable_place_inference_pass.cc delete mode 100644 lite/core/mir/variable_place_inference_pass.h delete mode 100644 lite/core/mir/variable_place_inference_pass_test.cc delete mode 100644 lite/core/naive_test_model.py delete mode 100644 lite/core/op_lite.cc delete mode 100644 lite/core/op_lite.h delete mode 100644 lite/core/op_lite_test.cc delete mode 100644 lite/core/op_registry.cc delete mode 100644 lite/core/op_registry.h delete mode 100644 lite/core/optimizer.cc delete mode 100644 lite/core/optimizer.h delete mode 100644 lite/core/optimizer_test.cc delete mode 100644 lite/core/profile/CMakeLists.txt delete mode 100644 lite/core/profile/basic_profiler.cc delete mode 100644 lite/core/profile/basic_profiler.h delete mode 100644 lite/core/profile/basic_profiler_test.cc delete mode 100644 lite/core/profile/precision_profiler.h delete mode 100644 lite/core/program.cc delete mode 100644 lite/core/program.h delete mode 100644 lite/core/program_fake_utils.cc delete mode 100644 lite/core/program_fake_utils.h delete mode 100644 lite/core/scope.cc delete mode 100644 lite/core/scope.h delete mode 100644 lite/core/scope_test.cc delete mode 100644 lite/core/target_wrapper.cc delete mode 100644 lite/core/target_wrapper.h delete mode 100644 lite/core/tensor.cc delete mode 100644 lite/core/tensor.h delete mode 100644 lite/core/type_system.cc delete mode 100644 lite/core/type_system.h delete mode 100644 lite/core/type_system_test.cc delete mode 100644 lite/core/types.cc delete mode 100644 lite/core/types.h delete mode 100644 lite/core/types_test.cc delete mode 100644 lite/core/variable.cc delete mode 100644 lite/core/variable.h delete mode 100644 lite/core/workspace.cc delete mode 100644 lite/core/workspace.h delete mode 100644 lite/demo/cxx/Makefile.def delete mode 100644 lite/demo/cxx/README.md delete mode 100644 lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 delete mode 100644 lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 delete mode 100644 lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 delete mode 100644 lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 delete mode 100644 lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc delete mode 100644 lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc delete mode 100644 lite/demo/java/README.md delete mode 
100644 lite/demo/java/android/PaddlePredictor/.gitignore delete mode 100644 lite/demo/java/android/PaddlePredictor/app/.gitignore delete mode 100644 lite/demo/java/android/PaddlePredictor/app/build.gradle delete mode 100644 lite/demo/java/android/PaddlePredictor/app/proguard-rules.pro delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/androidTest/java/com/baidu/paddle/lite/ExampleInstrumentedTest.java delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/AndroidManifest.xml delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/assets/README.txt delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/java/com/baidu/paddle/lite/MainActivity.java delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable-v24/ic_launcher_foreground.xml delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable/ic_launcher_background.xml delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/layout/activity_main.xml delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-hdpi/ic_launcher.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-hdpi/ic_launcher_round.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-mdpi/ic_launcher.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-mdpi/ic_launcher_round.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xhdpi/ic_launcher.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xhdpi/ic_launcher_round.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxhdpi/ic_launcher.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/values/colors.xml delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/values/strings.xml delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/values/styles.xml delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/test/java/com/baidu/paddle/lite/ExampleUnitTest.java delete mode 100644 lite/demo/java/android/PaddlePredictor/build.gradle delete mode 100644 lite/demo/java/android/PaddlePredictor/gradle.properties delete mode 100644 lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.jar delete mode 100644 lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.properties delete mode 100755 lite/demo/java/android/PaddlePredictor/gradlew delete mode 100644 lite/demo/java/android/PaddlePredictor/gradlew.bat delete mode 100644 lite/demo/java/android/PaddlePredictor/settings.gradle delete mode 100644 lite/demo/java/android/prepare_demo.bash delete mode 100644 lite/fluid/CMakeLists.txt delete mode 100644 lite/fluid/data_type.cc delete mode 100644 lite/fluid/data_type.h delete mode 100644 lite/fluid/data_type_test.cc delete mode 100644 
lite/fluid/eigen.h delete mode 100644 lite/fluid/float16.h delete mode 100644 lite/fluid/lod.h delete mode 100644 lite/fluid/math.h delete mode 100644 lite/gen_code/CMakeLists.txt delete mode 100644 lite/gen_code/gen_code.cc delete mode 100644 lite/gen_code/gen_code.h delete mode 100644 lite/gen_code/gen_code_test.cc delete mode 100644 lite/gen_code/generated_code_test.cc delete mode 100644 lite/gen_code/paddle_code_generator.cc delete mode 100644 lite/gen_code/paddle_infer.cc delete mode 100644 lite/gen_code/paddle_infer.h delete mode 100644 lite/kernels/CMakeLists.txt delete mode 100644 lite/kernels/arm/CMakeLists.txt delete mode 100644 lite/kernels/arm/activation_compute.cc delete mode 100644 lite/kernels/arm/activation_compute.h delete mode 100644 lite/kernels/arm/affine_channel_compute.cc delete mode 100644 lite/kernels/arm/affine_channel_compute.h delete mode 100644 lite/kernels/arm/anchor_generator_compute.cc delete mode 100644 lite/kernels/arm/anchor_generator_compute.h delete mode 100644 lite/kernels/arm/argmax_compute.cc delete mode 100644 lite/kernels/arm/argmax_compute.h delete mode 100644 lite/kernels/arm/argmax_compute_test.cc delete mode 100644 lite/kernels/arm/assign_compute.cc delete mode 100644 lite/kernels/arm/assign_compute.h delete mode 100644 lite/kernels/arm/assign_value_compute.cc delete mode 100644 lite/kernels/arm/assign_value_compute.h delete mode 100644 lite/kernels/arm/axpy_compute.cc delete mode 100644 lite/kernels/arm/axpy_compute.h delete mode 100644 lite/kernels/arm/axpy_compute_test.cc delete mode 100644 lite/kernels/arm/batch_norm_compute.cc delete mode 100644 lite/kernels/arm/batch_norm_compute.h delete mode 100644 lite/kernels/arm/batch_norm_compute_test.cc delete mode 100644 lite/kernels/arm/beam_search_compute.cc delete mode 100644 lite/kernels/arm/beam_search_compute.h delete mode 100644 lite/kernels/arm/beam_search_decode_compute.cc delete mode 100644 lite/kernels/arm/beam_search_decode_compute.h delete mode 100644 lite/kernels/arm/box_clip_compute.cc delete mode 100644 lite/kernels/arm/box_clip_compute.h delete mode 100644 lite/kernels/arm/box_coder_compute.cc delete mode 100644 lite/kernels/arm/box_coder_compute.h delete mode 100644 lite/kernels/arm/calib_compute.cc delete mode 100644 lite/kernels/arm/calib_compute.h delete mode 100644 lite/kernels/arm/calib_compute_test.cc delete mode 100644 lite/kernels/arm/cast_compute.cc delete mode 100644 lite/kernels/arm/cast_compute.h delete mode 100644 lite/kernels/arm/compare_compute.cc delete mode 100644 lite/kernels/arm/compare_compute.h delete mode 100644 lite/kernels/arm/concat_compute.cc delete mode 100644 lite/kernels/arm/concat_compute.h delete mode 100644 lite/kernels/arm/concat_compute_test.cc delete mode 100644 lite/kernels/arm/conv_compute.cc delete mode 100644 lite/kernels/arm/conv_compute.h delete mode 100644 lite/kernels/arm/conv_compute_test.cc delete mode 100644 lite/kernels/arm/conv_transpose_compute.cc delete mode 100644 lite/kernels/arm/conv_transpose_compute.h delete mode 100644 lite/kernels/arm/conv_transpose_compute_test.cc delete mode 100644 lite/kernels/arm/crop_compute.cc delete mode 100644 lite/kernels/arm/crop_compute.h delete mode 100644 lite/kernels/arm/decode_bboxes_compute.cc delete mode 100644 lite/kernels/arm/decode_bboxes_compute.h delete mode 100644 lite/kernels/arm/decode_bboxes_compute_test.cc delete mode 100644 lite/kernels/arm/density_prior_box_compute.cc delete mode 100644 lite/kernels/arm/density_prior_box_compute.h delete mode 100644 
lite/kernels/arm/dropout_compute.cc delete mode 100644 lite/kernels/arm/dropout_compute.h delete mode 100644 lite/kernels/arm/dropout_compute_test.cc delete mode 100644 lite/kernels/arm/elementwise_compute.cc delete mode 100644 lite/kernels/arm/elementwise_compute.h delete mode 100644 lite/kernels/arm/elementwise_compute_test.cc delete mode 100644 lite/kernels/arm/expand_compute.cc delete mode 100644 lite/kernels/arm/expand_compute.h delete mode 100644 lite/kernels/arm/fc_compute.cc delete mode 100644 lite/kernels/arm/fc_compute.h delete mode 100644 lite/kernels/arm/fc_compute_test.cc delete mode 100644 lite/kernels/arm/fill_constant_compute.cc delete mode 100644 lite/kernels/arm/generate_proposals_compute.cc delete mode 100644 lite/kernels/arm/generate_proposals_compute.h delete mode 100644 lite/kernels/arm/gru_compute.cc delete mode 100644 lite/kernels/arm/gru_compute.h delete mode 100644 lite/kernels/arm/gru_unit_compute.cc delete mode 100644 lite/kernels/arm/gru_unit_compute.h delete mode 100644 lite/kernels/arm/im2sequence_compute.cc delete mode 100644 lite/kernels/arm/im2sequence_compute.h delete mode 100644 lite/kernels/arm/increment_compute.cc delete mode 100644 lite/kernels/arm/increment_compute.h delete mode 100644 lite/kernels/arm/interpolate_compute.cc delete mode 100644 lite/kernels/arm/interpolate_compute.h delete mode 100644 lite/kernels/arm/is_empty_compute.cc delete mode 100644 lite/kernels/arm/is_empty_compute.h delete mode 100644 lite/kernels/arm/lod_reset_compute.cc delete mode 100644 lite/kernels/arm/lod_reset_compute.h delete mode 100644 lite/kernels/arm/logical_compute.cc delete mode 100644 lite/kernels/arm/logical_compute.h delete mode 100644 lite/kernels/arm/lookup_table_compute.cc delete mode 100644 lite/kernels/arm/lookup_table_compute.h delete mode 100644 lite/kernels/arm/lrn_compute.cc delete mode 100644 lite/kernels/arm/lrn_compute.h delete mode 100644 lite/kernels/arm/lrn_compute_test.cc delete mode 100644 lite/kernels/arm/matmul_compute.cc delete mode 100644 lite/kernels/arm/matmul_compute.h delete mode 100644 lite/kernels/arm/mul_compute.cc delete mode 100644 lite/kernels/arm/mul_compute.h delete mode 100644 lite/kernels/arm/mul_compute_test.cc delete mode 100644 lite/kernels/arm/negative_compute.cc delete mode 100644 lite/kernels/arm/negative_compute.h delete mode 100644 lite/kernels/arm/norm_compute.cc delete mode 100644 lite/kernels/arm/norm_compute.h delete mode 100644 lite/kernels/arm/pad2d_compute.cc delete mode 100644 lite/kernels/arm/pad2d_compute.h delete mode 100644 lite/kernels/arm/pool_compute.cc delete mode 100644 lite/kernels/arm/pool_compute.h delete mode 100644 lite/kernels/arm/pool_compute_test.cc delete mode 100644 lite/kernels/arm/power_compute.cc delete mode 100644 lite/kernels/arm/power_compute.h delete mode 100644 lite/kernels/arm/prior_box_compute.cc delete mode 100644 lite/kernels/arm/prior_box_compute.h delete mode 100644 lite/kernels/arm/read_from_array_compute.cc delete mode 100644 lite/kernels/arm/read_from_array_compute.h delete mode 100644 lite/kernels/arm/reduce_max_compute.cc delete mode 100644 lite/kernels/arm/reduce_max_compute.h delete mode 100644 lite/kernels/arm/reduce_mean_compute.cc delete mode 100644 lite/kernels/arm/reduce_mean_compute.h delete mode 100644 lite/kernels/arm/roi_align_compute.cc delete mode 100644 lite/kernels/arm/roi_align_compute.h delete mode 100644 lite/kernels/arm/scale_compute.cc delete mode 100644 lite/kernels/arm/scale_compute.h delete mode 100644 lite/kernels/arm/scale_compute_test.cc delete 
mode 100644 lite/kernels/arm/sequence_expand_compute.cc delete mode 100644 lite/kernels/arm/sequence_expand_compute.h delete mode 100644 lite/kernels/arm/sequence_pool_compute.cc delete mode 100644 lite/kernels/arm/sequence_pool_compute.h delete mode 100644 lite/kernels/arm/sequence_softmax_compute.cc delete mode 100644 lite/kernels/arm/sequence_softmax_compute.h delete mode 100644 lite/kernels/arm/shape_compute.cc delete mode 100644 lite/kernels/arm/shape_compute.h delete mode 100644 lite/kernels/arm/shuffle_channel_compute.cc delete mode 100644 lite/kernels/arm/shuffle_channel_compute.h delete mode 100644 lite/kernels/arm/slice_compute.cc delete mode 100644 lite/kernels/arm/slice_compute.h delete mode 100644 lite/kernels/arm/softmax_compute.cc delete mode 100644 lite/kernels/arm/softmax_compute.h delete mode 100644 lite/kernels/arm/softmax_compute_test.cc delete mode 100644 lite/kernels/arm/split_compute.cc delete mode 100644 lite/kernels/arm/split_compute.h delete mode 100644 lite/kernels/arm/split_compute_test.cc delete mode 100644 lite/kernels/arm/squeeze_compute.cc delete mode 100644 lite/kernels/arm/squeeze_compute.h delete mode 100644 lite/kernels/arm/stack_compute.cc delete mode 100644 lite/kernels/arm/stack_compute.h delete mode 100644 lite/kernels/arm/topk_compute.cc delete mode 100644 lite/kernels/arm/topk_compute.h delete mode 100644 lite/kernels/arm/transpose_compute.cc delete mode 100644 lite/kernels/arm/transpose_compute.h delete mode 100644 lite/kernels/arm/transpose_compute_test.cc delete mode 100644 lite/kernels/arm/while_compute.cc delete mode 100644 lite/kernels/arm/while_compute.h delete mode 100644 lite/kernels/arm/write_to_array_compute.cc delete mode 100644 lite/kernels/arm/write_to_array_compute.h delete mode 100644 lite/kernels/arm/yolo_box_compute.cc delete mode 100644 lite/kernels/arm/yolo_box_compute.h delete mode 100644 lite/kernels/cuda/CMakeLists.txt delete mode 100644 lite/kernels/cuda/calib_compute.cu delete mode 100644 lite/kernels/cuda/calib_compute.h delete mode 100644 lite/kernels/cuda/calib_compute_cuda_test.cc delete mode 100644 lite/kernels/cuda/concat_compute.cu delete mode 100644 lite/kernels/cuda/concat_compute.h delete mode 100644 lite/kernels/cuda/concat_compute_test.cc delete mode 100644 lite/kernels/cuda/conv_compute.cc delete mode 100644 lite/kernels/cuda/conv_compute.h delete mode 100644 lite/kernels/cuda/conv_compute_test.cc delete mode 100644 lite/kernels/cuda/elementwise_add_compute.cu delete mode 100644 lite/kernels/cuda/elementwise_add_compute.h delete mode 100644 lite/kernels/cuda/elementwise_add_compute_test.cc delete mode 100644 lite/kernels/cuda/io_copy_compute.cc delete mode 100644 lite/kernels/cuda/leaky_relu_compute.cu delete mode 100644 lite/kernels/cuda/leaky_relu_compute.h delete mode 100644 lite/kernels/cuda/leaky_relu_compute_test.cc delete mode 100644 lite/kernels/cuda/mul_compute.cc delete mode 100644 lite/kernels/cuda/mul_compute.h delete mode 100644 lite/kernels/cuda/nearest_interp_compute.cu delete mode 100644 lite/kernels/cuda/nearest_interp_compute.h delete mode 100644 lite/kernels/cuda/nearest_interp_compute_test.cc delete mode 100644 lite/kernels/cuda/transpose_compute.cu delete mode 100644 lite/kernels/cuda/transpose_compute.h delete mode 100644 lite/kernels/cuda/transpose_compute_test.cc delete mode 100644 lite/kernels/cuda/use_kernels.h delete mode 100644 lite/kernels/cuda/yolo_box_compute.cu delete mode 100644 lite/kernels/cuda/yolo_box_compute.h delete mode 100644 lite/kernels/cuda/yolo_box_compute_test.cc 
 delete mode 100644 lite/kernels/fpga/CMakeLists.txt
 delete mode 100644 lite/kernels/fpga/activation_compute.cc
 delete mode 100644 lite/kernels/fpga/activation_compute.h
 delete mode 100644 lite/kernels/fpga/activation_compute_test.cc
 delete mode 100644 lite/kernels/fpga/calib_compute.cc
 delete mode 100644 lite/kernels/fpga/calib_compute.h
 delete mode 100644 lite/kernels/fpga/conv_compute.cc
 delete mode 100644 lite/kernels/fpga/conv_compute.h
 delete mode 100644 lite/kernels/fpga/conv_compute_test.cc
 delete mode 100644 lite/kernels/fpga/elementwise_compute.cc
 delete mode 100644 lite/kernels/fpga/elementwise_compute.h
 delete mode 100644 lite/kernels/fpga/elementwise_compute_test.cc
 delete mode 100644 lite/kernels/fpga/fc_compute.cc
 delete mode 100644 lite/kernels/fpga/fc_compute.h
 delete mode 100644 lite/kernels/fpga/fc_compute_test.cc
 delete mode 100644 lite/kernels/fpga/feed_compute.cc
 delete mode 100644 lite/kernels/fpga/feed_compute.h
 delete mode 100644 lite/kernels/fpga/fetch_compute.cc
 delete mode 100644 lite/kernels/fpga/fetch_compute.h
 delete mode 100644 lite/kernels/fpga/io_copy_compute.cc
 delete mode 100644 lite/kernels/fpga/layout_compute.cc
 delete mode 100644 lite/kernels/fpga/pooling_compute.cc
 delete mode 100644 lite/kernels/fpga/pooling_compute.h
 delete mode 100644 lite/kernels/fpga/pooling_compute_test.cc
 delete mode 100644 lite/kernels/fpga/scale_compute.cc
 delete mode 100644 lite/kernels/fpga/scale_compute.h
 delete mode 100644 lite/kernels/fpga/softmax_compute.cc
 delete mode 100644 lite/kernels/fpga/softmax_compute.h
 delete mode 100644 lite/kernels/fpga/softmax_compute_test.cc
 delete mode 100644 lite/kernels/host/CMakeLists.txt
 delete mode 100644 lite/kernels/host/feed_compute.cc
 delete mode 100644 lite/kernels/host/fetch_compute.cc
 delete mode 100644 lite/kernels/host/multiclass_nms_compute.cc
 delete mode 100644 lite/kernels/host/multiclass_nms_compute.h
 delete mode 100644 lite/kernels/host/multiclass_nms_compute_test.cc
 delete mode 100644 lite/kernels/host/reshape_compute.cc
 delete mode 100644 lite/kernels/host/reshape_compute.h
 delete mode 100644 lite/kernels/host/reshape_compute_test.cc
 delete mode 100644 lite/kernels/host/use_kernels.h
 delete mode 100644 lite/kernels/npu/CMakeLists.txt
 delete mode 100644 lite/kernels/npu/graph_compute.cc
 delete mode 100644 lite/kernels/npu/graph_compute.h
 delete mode 100644 lite/kernels/opencl/CMakeLists.txt
 delete mode 100644 lite/kernels/opencl/conv_compute.cc
 delete mode 100644 lite/kernels/opencl/conv_compute.h
 delete mode 100644 lite/kernels/opencl/conv_compute_test.cc
 delete mode 100644 lite/kernels/opencl/depthwise_conv2d_compute.cc
 delete mode 100644 lite/kernels/opencl/depthwise_conv2d_compute_test.cc
 delete mode 100644 lite/kernels/opencl/elementwise_add_compute.cc
 delete mode 100644 lite/kernels/opencl/elementwise_add_compute.h
 delete mode 100644 lite/kernels/opencl/elementwise_add_compute_test.cc
 delete mode 100644 lite/kernels/opencl/fc_compute.cc
 delete mode 100644 lite/kernels/opencl/fc_compute_test.cc
 delete mode 100644 lite/kernels/opencl/fusion_elementwise_add_activation_compute.cc
 delete mode 100644 lite/kernels/opencl/io_copy_compute.cc
 delete mode 100644 lite/kernels/opencl/io_copy_compute_test.cc
 delete mode 100644 lite/kernels/opencl/mul_compute.cc
 delete mode 100644 lite/kernels/opencl/mul_compute_test.cc
 delete mode 100644 lite/kernels/opencl/pool_compute.cc
 delete mode 100644 lite/kernels/opencl/pool_compute_test.cc
 delete mode 100644 lite/kernels/opencl/relu_compute.cc
 delete mode 100644 lite/kernels/opencl/relu_compute_test.cc
 delete mode 100644 lite/kernels/x86/CMakeLists.txt
 delete mode 100644 lite/kernels/x86/activation_compute.cc
 delete mode 100644 lite/kernels/x86/batch_norm_compute.cc
 delete mode 100644 lite/kernels/x86/batch_norm_compute.h
 delete mode 100644 lite/kernels/x86/batch_norm_compute_test.cc
 delete mode 100644 lite/kernels/x86/concat_compute.cc
 delete mode 100644 lite/kernels/x86/concat_compute.h
 delete mode 100644 lite/kernels/x86/concat_compute_test.cc
 delete mode 100644 lite/kernels/x86/conv_compute.cc
 delete mode 100644 lite/kernels/x86/conv_compute.h
 delete mode 100644 lite/kernels/x86/conv_compute_test.cc
 delete mode 100644 lite/kernels/x86/dropout_compute.cc
 delete mode 100644 lite/kernels/x86/dropout_compute.h
 delete mode 100644 lite/kernels/x86/dropout_compute_test.cc
 delete mode 100644 lite/kernels/x86/elementwise_compute.cc
 delete mode 100644 lite/kernels/x86/elementwise_compute.h
 delete mode 100644 lite/kernels/x86/elementwise_compute_test.cc
 delete mode 100644 lite/kernels/x86/fc_compute.cc
 delete mode 100644 lite/kernels/x86/fc_compute.h
 delete mode 100644 lite/kernels/x86/fc_compute_test.cc
 delete mode 100644 lite/kernels/x86/fill_constant_compute.cc
 delete mode 100644 lite/kernels/x86/mean_compute.cc
 delete mode 100644 lite/kernels/x86/mul_compute.cc
 delete mode 100644 lite/kernels/x86/mul_compute.h
 delete mode 100644 lite/kernels/x86/mul_compute_test.cc
 delete mode 100644 lite/kernels/x86/pool_compute.cc
 delete mode 100644 lite/kernels/x86/pool_compute.h
 delete mode 100644 lite/kernels/x86/pool_compute_test.cc
 delete mode 100644 lite/kernels/x86/relu_compute.cc
 delete mode 100644 lite/kernels/x86/relu_compute.h
 delete mode 100644 lite/kernels/x86/relu_compute_test.cc
 delete mode 100644 lite/kernels/x86/reshape_compute.cc
 delete mode 100644 lite/kernels/x86/reshape_compute.h
 delete mode 100644 lite/kernels/x86/reshape_compute_test.cc
 delete mode 100644 lite/kernels/x86/scale_compute.cc
 delete mode 100644 lite/kernels/x86/scale_compute.h
 delete mode 100644 lite/kernels/x86/scale_compute_test.cc
 delete mode 100644 lite/kernels/x86/sequence_pool_compute.cc
 delete mode 100644 lite/kernels/x86/sequence_pool_compute.h
 delete mode 100644 lite/kernels/x86/sequence_pool_compute_test.cc
 delete mode 100644 lite/kernels/x86/sgd_compute.cc
 delete mode 100644 lite/kernels/x86/shape_compute.cc
 delete mode 100644 lite/kernels/x86/shape_compute.h
 delete mode 100644 lite/kernels/x86/shape_compute_test.cc
 delete mode 100644 lite/kernels/x86/slice_compute.cc
 delete mode 100644 lite/kernels/x86/slice_compute.h
 delete mode 100644 lite/kernels/x86/slice_compute_test.cc
 delete mode 100644 lite/kernels/x86/softmax_compute.cc
 delete mode 100644 lite/kernels/x86/softmax_compute.h
 delete mode 100644 lite/kernels/x86/softmax_compute_test.cc
 delete mode 100644 lite/kernels/x86/squeeze_compute.cc
 delete mode 100644 lite/kernels/x86/squeeze_compute.h
 delete mode 100644 lite/kernels/x86/squeeze_compute_test.cc
 delete mode 100644 lite/kernels/x86/uniform_random_compute.cc
 delete mode 100644 lite/model_parser/CMakeLists.txt
 delete mode 100644 lite/model_parser/compatible_pb.cc
 delete mode 100644 lite/model_parser/compatible_pb.h
 delete mode 100644 lite/model_parser/compatible_pb_test.cc
 delete mode 100644 lite/model_parser/cpp/CMakeLists.txt
 delete mode 100644 lite/model_parser/cpp/block_desc.cc
 delete mode 100644 lite/model_parser/cpp/block_desc.h
 delete mode 100644 lite/model_parser/cpp/op_desc.cc
 delete mode 100644 lite/model_parser/cpp/op_desc.h
 delete mode 100644 lite/model_parser/cpp/program_desc.cc
 delete mode 100644 lite/model_parser/cpp/program_desc.h
 delete mode 100644 lite/model_parser/cpp/var_desc.cc
 delete mode 100644 lite/model_parser/cpp/var_desc.h
 delete mode 100644 lite/model_parser/desc_apis.h
 delete mode 100644 lite/model_parser/model_parser.cc
 delete mode 100644 lite/model_parser/model_parser.h
 delete mode 100644 lite/model_parser/model_parser_test.cc
 delete mode 100644 lite/model_parser/naive_buffer/CMakeLists.txt
 delete mode 100644 lite/model_parser/naive_buffer/block_desc.cc
 delete mode 100644 lite/model_parser/naive_buffer/block_desc.h
 delete mode 100644 lite/model_parser/naive_buffer/combined_params_desc.cc
 delete mode 100644 lite/model_parser/naive_buffer/combined_params_desc.h
 delete mode 100644 lite/model_parser/naive_buffer/naive_buffer.cc
 delete mode 100644 lite/model_parser/naive_buffer/naive_buffer.h
 delete mode 100644 lite/model_parser/naive_buffer/naive_buffer_test.cc
 delete mode 100644 lite/model_parser/naive_buffer/naive_buffer_wrapper_helper.h
 delete mode 100644 lite/model_parser/naive_buffer/naive_buffer_wrapper_test.cc
 delete mode 100644 lite/model_parser/naive_buffer/op_desc.cc
 delete mode 100644 lite/model_parser/naive_buffer/op_desc.h
 delete mode 100644 lite/model_parser/naive_buffer/param_desc.cc
 delete mode 100644 lite/model_parser/naive_buffer/param_desc.h
 delete mode 100644 lite/model_parser/naive_buffer/program_desc.cc
 delete mode 100644 lite/model_parser/naive_buffer/program_desc.h
 delete mode 100644 lite/model_parser/naive_buffer/proto/CMakeLists.txt
 delete mode 100644 lite/model_parser/naive_buffer/proto/framework.nb.cc
 delete mode 100644 lite/model_parser/naive_buffer/proto/framework.nb.h
 delete mode 100644 lite/model_parser/naive_buffer/var_desc.cc
 delete mode 100644 lite/model_parser/naive_buffer/var_desc.h
 delete mode 100644 lite/model_parser/pb/CMakeLists.txt
 delete mode 100644 lite/model_parser/pb/block_desc.cc
 delete mode 100644 lite/model_parser/pb/block_desc.h
 delete mode 100644 lite/model_parser/pb/op_desc.cc
 delete mode 100644 lite/model_parser/pb/op_desc.h
 delete mode 100644 lite/model_parser/pb/program_desc.cc
 delete mode 100644 lite/model_parser/pb/program_desc.h
 delete mode 100644 lite/model_parser/pb/var_desc.cc
 delete mode 100644 lite/model_parser/pb/var_desc.h
 delete mode 100644 lite/model_parser/runtime.cc
 delete mode 100644 lite/model_parser/runtime.h
 delete mode 100644 lite/operators/CMakeLists.txt
 delete mode 100644 lite/operators/activation_ops.cc
 delete mode 100644 lite/operators/activation_ops.h
 delete mode 100644 lite/operators/affine_channel_op.cc
 delete mode 100644 lite/operators/affine_channel_op.h
 delete mode 100644 lite/operators/anchor_generator_op.cc
 delete mode 100644 lite/operators/anchor_generator_op.h
 delete mode 100644 lite/operators/argmax_op.cc
 delete mode 100644 lite/operators/argmax_op.h
 delete mode 100644 lite/operators/assign_op.cc
 delete mode 100644 lite/operators/assign_op.h
 delete mode 100644 lite/operators/assign_value_op.cc
 delete mode 100644 lite/operators/assign_value_op.h
 delete mode 100644 lite/operators/axpy_op.cc
 delete mode 100644 lite/operators/axpy_op.h
 delete mode 100644 lite/operators/batch_norm_op.cc
 delete mode 100644 lite/operators/batch_norm_op.h
 delete mode 100644 lite/operators/batch_norm_op_test.cc
 delete mode 100644 lite/operators/beam_search_decode_op.cc
 delete mode 100644 lite/operators/beam_search_decode_op.h
 delete mode 100644 lite/operators/beam_search_op.cc
 delete mode 100644 lite/operators/beam_search_op.h
 delete mode 100644 lite/operators/box_clip_op.cc
 delete mode 100644 lite/operators/box_clip_op.h
 delete mode 100644 lite/operators/box_coder_op.cc
 delete mode 100644 lite/operators/box_coder_op.h
 delete mode 100644 lite/operators/calib_once_op.cc
 delete mode 100644 lite/operators/calib_once_op.h
 delete mode 100644 lite/operators/calib_op.cc
 delete mode 100644 lite/operators/calib_op.h
 delete mode 100644 lite/operators/calib_op_test.cc
 delete mode 100644 lite/operators/cast_op.cc
 delete mode 100644 lite/operators/cast_op.h
 delete mode 100644 lite/operators/compare_op.cc
 delete mode 100644 lite/operators/compare_op.h
 delete mode 100644 lite/operators/concat_op.cc
 delete mode 100644 lite/operators/concat_op.h
 delete mode 100644 lite/operators/concat_op_test.cc
 delete mode 100644 lite/operators/conv_op.cc
 delete mode 100644 lite/operators/conv_op.h
 delete mode 100644 lite/operators/conv_transpose_op.cc
 delete mode 100644 lite/operators/conv_transpose_op.h
 delete mode 100644 lite/operators/crop_op.cc
 delete mode 100644 lite/operators/crop_op.h
 delete mode 100644 lite/operators/decode_bboxes_op.cc
 delete mode 100644 lite/operators/decode_bboxes_op.h
 delete mode 100644 lite/operators/density_prior_box_op.cc
 delete mode 100644 lite/operators/density_prior_box_op.h
 delete mode 100644 lite/operators/dropout_op.cc
 delete mode 100644 lite/operators/elementwise_ops.cc
 delete mode 100644 lite/operators/elementwise_ops.h
 delete mode 100644 lite/operators/expand_op.cc
 delete mode 100644 lite/operators/expand_op.h
 delete mode 100644 lite/operators/fake_dequantize_max_abs.cc
 delete mode 100644 lite/operators/fake_dequantize_max_abs.h
 delete mode 100644 lite/operators/fake_quantize_moving_avg_max_abs.cc
 delete mode 100644 lite/operators/fake_quantize_moving_avg_max_abs.h
 delete mode 100644 lite/operators/fake_quantize_range_abs_max.cc
 delete mode 100644 lite/operators/fake_quantize_range_abs_max.h
 delete mode 100644 lite/operators/fc_op.cc
 delete mode 100644 lite/operators/fc_op.h
 delete mode 100644 lite/operators/fc_op_test.cc
 delete mode 100644 lite/operators/feed_op.cc
 delete mode 100644 lite/operators/fetch_op.cc
 delete mode 100644 lite/operators/fill_constant_op.cc
 delete mode 100644 lite/operators/flatten_op.cc
 delete mode 100644 lite/operators/flatten_op.h
 delete mode 100644 lite/operators/fusion_elementwise_activation_ops.cc
 delete mode 100644 lite/operators/fusion_elementwise_activation_ops.h
 delete mode 100644 lite/operators/fusion_elementwise_activation_ops_test.cc
 delete mode 100644 lite/operators/generate_proposals_op.cc
 delete mode 100644 lite/operators/generate_proposals_op.h
 delete mode 100644 lite/operators/graph_op.cc
 delete mode 100644 lite/operators/graph_op.h
 delete mode 100644 lite/operators/gru_op.cc
 delete mode 100644 lite/operators/gru_op.h
 delete mode 100644 lite/operators/gru_unit_op.cc
 delete mode 100644 lite/operators/gru_unit_op.h
 delete mode 100644 lite/operators/im2sequence_op.cc
 delete mode 100644 lite/operators/im2sequence_op.h
 delete mode 100644 lite/operators/increment_op.cc
 delete mode 100644 lite/operators/increment_op.h
 delete mode 100644 lite/operators/interpolate_op.cc
 delete mode 100644 lite/operators/interpolate_op.h
 delete mode 100644 lite/operators/io_copy_once_op.cc
 delete mode 100644 lite/operators/io_copy_once_op.h
 delete mode 100644 lite/operators/io_copy_op.cc
 delete mode 100644 lite/operators/io_copy_op.h
 delete mode 100644 lite/operators/is_empty_op.cc
 delete mode 100644 lite/operators/is_empty_op.h
 delete mode 100644 lite/operators/layout_once_op.cc
 delete mode 100644 lite/operators/layout_once_op.h
 delete mode 100644 lite/operators/layout_op.cc
 delete mode 100644 lite/operators/layout_op.h
 delete mode 100644 lite/operators/lod_reset_op.cc
 delete mode 100644 lite/operators/lod_reset_op.h
 delete mode 100644 lite/operators/logical_op.cc
 delete mode 100644 lite/operators/logical_op.h
 delete mode 100644 lite/operators/lookup_table_op.cc
 delete mode 100644 lite/operators/lookup_table_op.h
 delete mode 100644 lite/operators/lrn_op.cc
 delete mode 100644 lite/operators/lrn_op.h
 delete mode 100644 lite/operators/matmul_op.cc
 delete mode 100644 lite/operators/matmul_op.h
 delete mode 100644 lite/operators/mean_op.cc
 delete mode 100644 lite/operators/mul_op.cc
 delete mode 100644 lite/operators/mul_op.h
 delete mode 100644 lite/operators/multiclass_nms_op.cc
 delete mode 100644 lite/operators/multiclass_nms_op.h
 delete mode 100644 lite/operators/negative_op.cc
 delete mode 100644 lite/operators/negative_op.h
 delete mode 100644 lite/operators/norm_op.cc
 delete mode 100644 lite/operators/norm_op.h
 delete mode 100644 lite/operators/op_params.cc
 delete mode 100644 lite/operators/op_params.h
 delete mode 100644 lite/operators/pad2d_op.cc
 delete mode 100644 lite/operators/pad2d_op.h
 delete mode 100644 lite/operators/pool_op.cc
 delete mode 100644 lite/operators/pool_op.h
 delete mode 100644 lite/operators/pool_op_test.cc
 delete mode 100644 lite/operators/power_op.cc
 delete mode 100644 lite/operators/power_op.h
 delete mode 100644 lite/operators/prior_box_op.cc
 delete mode 100644 lite/operators/prior_box_op.h
 delete mode 100644 lite/operators/read_from_array_op.cc
 delete mode 100644 lite/operators/read_from_array_op.h
 delete mode 100644 lite/operators/reduce_max_op.cc
 delete mode 100644 lite/operators/reduce_max_op.h
 delete mode 100644 lite/operators/reduce_mean_op.cc
 delete mode 100644 lite/operators/reduce_mean_op.h
 delete mode 100644 lite/operators/relu_op.cc
 delete mode 100644 lite/operators/relu_op.h
 delete mode 100644 lite/operators/reshape_op.cc
 delete mode 100644 lite/operators/reshape_op.h
 delete mode 100644 lite/operators/reshape_op_test.cc
 delete mode 100644 lite/operators/roi_align_op.cc
 delete mode 100644 lite/operators/roi_align_op.h
 delete mode 100644 lite/operators/scale_op.cc
 delete mode 100644 lite/operators/scale_op.h
 delete mode 100644 lite/operators/scale_op_test.cc
 delete mode 100644 lite/operators/sequence_expand_op.cc
 delete mode 100644 lite/operators/sequence_expand_op.h
 delete mode 100644 lite/operators/sequence_pool_op.cc
 delete mode 100644 lite/operators/sequence_pool_op.h
 delete mode 100644 lite/operators/sequence_softmax_op.cc
 delete mode 100644 lite/operators/sequence_softmax_op.h
 delete mode 100644 lite/operators/sgd_op.cc
 delete mode 100644 lite/operators/sgd_op.h
 delete mode 100644 lite/operators/shape_op.cc
 delete mode 100644 lite/operators/shape_op.h
 delete mode 100644 lite/operators/shuffle_channel_op.cc
 delete mode 100644 lite/operators/shuffle_channel_op.h
 delete mode 100644 lite/operators/slice_op.cc
 delete mode 100644 lite/operators/slice_op.h
 delete mode 100644 lite/operators/softmax_op.cc
 delete mode 100644 lite/operators/softmax_op.h
 delete mode 100644 lite/operators/softmax_op_test.cc
 delete mode 100644 lite/operators/split_op.cc
 delete mode 100644 lite/operators/split_op.h
 delete mode 100644 lite/operators/squeeze_op.cc
 delete mode 100644 lite/operators/squeeze_op.h
 delete mode 100644 lite/operators/stack_op.cc
 delete mode 100644 lite/operators/stack_op.h
 delete mode 100644 lite/operators/topk_op.cc
 delete mode 100644 lite/operators/topk_op.h
 delete mode 100644 lite/operators/transpose_op.cc
 delete mode 100644 lite/operators/transpose_op.h
 delete mode 100644 lite/operators/transpose_op_test.cc
 delete mode 100644 lite/operators/uniform_random_op.cc
 delete mode 100644 lite/operators/uniform_random_op.h
 delete mode 100644 lite/operators/while_op.cc
 delete mode 100644 lite/operators/while_op.h
 delete mode 100644 lite/operators/write_to_array_op.cc
 delete mode 100644 lite/operators/write_to_array_op.h
 delete mode 100644 lite/operators/yolo_box_op.cc
 delete mode 100644 lite/operators/yolo_box_op.h
 delete mode 100644 lite/tests/CMakeLists.txt
 delete mode 100644 lite/tests/README.md
 delete mode 100644 lite/tests/kernels/CMakeLists.txt
 delete mode 100644 lite/tests/kernels/activation_compute_test.cc
 delete mode 100644 lite/tests/kernels/affine_channel_compute_test.cc
 delete mode 100644 lite/tests/kernels/anchor_generator_compute_test.cc
 delete mode 100644 lite/tests/kernels/argmax_compute_test.cc
 delete mode 100644 lite/tests/kernels/assign_compute_test.cc
 delete mode 100644 lite/tests/kernels/assign_value_compute_test.cc
 delete mode 100644 lite/tests/kernels/axpy_compute_test.cc
 delete mode 100644 lite/tests/kernels/bilinear_interp_compute_test.cc
 delete mode 100644 lite/tests/kernels/box_clip_compute_test.cc
 delete mode 100644 lite/tests/kernels/box_coder_compute_test.cc
 delete mode 100644 lite/tests/kernels/cast_compute_test.cc
 delete mode 100644 lite/tests/kernels/compare_compute_test.cc
 delete mode 100644 lite/tests/kernels/conv2d_transpose_compute_test.cc
 delete mode 100644 lite/tests/kernels/crop_compute_test.cc
 delete mode 100644 lite/tests/kernels/decode_bboxes_compute_test.cc
 delete mode 100644 lite/tests/kernels/elementwise_compute_test.cc
 delete mode 100644 lite/tests/kernels/expand_compute_test.cc
 delete mode 100644 lite/tests/kernels/fc_compute_test.cc
 delete mode 100644 lite/tests/kernels/fill_data.h
 delete mode 100644 lite/tests/kernels/generate_proposals_compute_test.cc
 delete mode 100644 lite/tests/kernels/gru_unit_test.cc
 delete mode 100644 lite/tests/kernels/im2sequence_compute_test.cc
 delete mode 100644 lite/tests/kernels/increment_compute_test.cc
 delete mode 100644 lite/tests/kernels/logical_compute_test.cc
 delete mode 100644 lite/tests/kernels/lrn_compute_test.cc
 delete mode 100644 lite/tests/kernels/matmul_compute_test.cc
 delete mode 100644 lite/tests/kernels/nearest_interp_compute_test.cc
 delete mode 100644 lite/tests/kernels/negative_compute_test.cc
 delete mode 100644 lite/tests/kernels/norm_compute_test.cc
 delete mode 100644 lite/tests/kernels/pad2d_compute_test.cc
 delete mode 100644 lite/tests/kernels/power_compute_test.cc
 delete mode 100644 lite/tests/kernels/prior_box_compute_test.cc
 delete mode 100644 lite/tests/kernels/read_from_array_compute_test.cc
 delete mode 100644 lite/tests/kernels/reduce_max_compute_test.cc
 delete mode 100644 lite/tests/kernels/reduce_mean_compute_test.cc
 delete mode 100644 lite/tests/kernels/roi_align_compute_test.cc
 delete mode 100644 lite/tests/kernels/scale_compute_test.cc
 delete mode 100644 lite/tests/kernels/sequence_expand_compute_test.cc
 delete mode 100644 lite/tests/kernels/sequence_pool_compute_test.cc
 delete mode 100644 lite/tests/kernels/sequence_softmax_compute_test.cc
 delete mode 100644 lite/tests/kernels/shape_compute_test.cc
 delete mode 100644 lite/tests/kernels/shuffle_channel_compute_test.cc
 delete mode 100644 lite/tests/kernels/slice_compute_test.cc
 delete mode 100644 lite/tests/kernels/squeeze_compute_test.cc
 delete mode 100644 lite/tests/kernels/stack_compute_test.cc
 delete mode 100644 lite/tests/kernels/test_funcs.h
 delete mode 100644 lite/tests/kernels/test_sgemm.cc
 delete mode 100644 lite/tests/kernels/topk_compute_test.cc
 delete mode 100644 lite/tests/kernels/write_to_array_compute_test.cc
 delete mode 100644 lite/tests/kernels/yolo_box_compute_test.cc
 delete mode 100644 lite/tools/CMakeLists.txt
 delete mode 100644 lite/tools/Dockerfile.mobile
 delete mode 100644 lite/tools/benchmark.sh
 delete mode 100755 lite/tools/build.sh
 delete mode 100755 lite/tools/build_fpga.sh
 delete mode 100755 lite/tools/build_npu.sh
 delete mode 100755 lite/tools/ci_build.sh
 delete mode 100644 lite/tools/cmake_tools/ast.py
 delete mode 100644 lite/tools/cmake_tools/create_fake_kernel_registry.py
 delete mode 100644 lite/tools/cmake_tools/parse_kernel_registry.py
 delete mode 100644 lite/tools/cmake_tools/parse_op_registry.py
 delete mode 100644 lite/tools/cmake_tools/utils.py
 delete mode 100644 lite/tools/debug/CMakeLists.txt
 delete mode 100644 lite/tools/debug/analysis_tool.py
 delete mode 100755 lite/tools/debug/check_model.sh
 delete mode 100644 lite/tools/debug/debug_utils.cc
 delete mode 100644 lite/tools/debug/debug_utils.h
 delete mode 100644 lite/tools/debug/model_debug_tool.cc
 delete mode 100755 lite/tools/gitlab_review.sh
 delete mode 100644 lite/tools/mobile_readme.md
 delete mode 100755 lite/tools/prepare_benchmark.sh
 delete mode 100644 lite/tools/python/lite_test.py
 delete mode 100644 lite/tools/search_support_ops.py
 delete mode 100644 lite/utils/CMakeLists.txt
 delete mode 100644 lite/utils/all.h
 delete mode 100644 lite/utils/any.cc
 delete mode 100644 lite/utils/any.h
 delete mode 100644 lite/utils/check.h
 delete mode 100644 lite/utils/container.h
 delete mode 100644 lite/utils/cp_logging.cc
 delete mode 100644 lite/utils/cp_logging.h
 delete mode 100644 lite/utils/factory.h
 delete mode 100644 lite/utils/hash.h
 delete mode 100644 lite/utils/io.h
 delete mode 100644 lite/utils/logging.cc
 delete mode 100644 lite/utils/logging.h
 delete mode 100644 lite/utils/logging_test.cc
 delete mode 100644 lite/utils/macros.h
 delete mode 100644 lite/utils/paddle_enforce.h
 delete mode 100644 lite/utils/replace_stl/stream.cc
 delete mode 100644 lite/utils/replace_stl/stream.h
 delete mode 100644 lite/utils/string.cc
 delete mode 100644 lite/utils/string.h
 delete mode 100644 lite/utils/varient.h
 delete mode 100644 lite/utils/varient_test.cc
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo.xcodeproj/project.pbxproj
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo/Assets.xcassets/Contents.json
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo/Base.lproj/LaunchScreen.storyboard
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo/Base.lproj/Main.storyboard
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo/Info.plist
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo/MobileNet.swift
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo/MobilenetPreProcess.metal
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo/ViewController.swift
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest.xcodeproj/project.pbxproj
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest.xcodeproj/project.xcworkspace/contents.xcworkspacedata
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest.xcodeproj/xcshareddata/xcschemes/PaddleMobileTest.xcscheme
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest/AppDelegate.swift
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest/Assets.xcassets/AppIcon.appiconset/Contents.json
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest/Assets.xcassets/Contents.json
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest/Base.lproj/LaunchScreen.storyboard
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest/Base.lproj/Main.storyboard
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest/Info.plist
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest/TestViewController.swift
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest/ViewController.swift
 delete mode 100644 metal/Podfile
 delete mode 100644 metal/README.md
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.xcworkspace/contents.xcworkspacedata
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcshareddata/xcschemes/paddle-mobile-demo.xcscheme
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Assets.xcassets/AppIcon.appiconset/Contents.json
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Assets.xcassets/Contents.json
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Assets.xcassets/paddle-mobile.imageset/Contents.json
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Assets.xcassets/paddle-mobile.imageset/paddle-mobile.png
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/LaunchScreen.storyboard
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Info.plist
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/BufferToTexture.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.h
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.mm
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/Genet.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetCombined.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobilenetSSD_AR.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/OC/ImageTool.h
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/OC/ImageTool.m
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.h
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.m
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/OCDemoViewController.h
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/OCDemoViewController.m
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.h
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.m
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/BatchNormKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/BatchNormRelu.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/BilinearInterp.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/BilinearInterp.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/BoxCoder.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/BoxCoder.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/Common.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ConcatKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ConcatKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ConvAddBNReluKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ConvAddMetal.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ConvAddPrelu.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ConvAddPreluKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ConvBNReluKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ConvKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ConvTransposeKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/Elementwise.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ElementwiseAddPreluKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ElementwiseAddPreluKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/FetchKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/FetchKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/Kernels.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/Macro.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/NMSFetchResultKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/PoolKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/PoolKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/PreluKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/PriorBoxKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ReluKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ReshapeKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ReshapeKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ResizeBilinear.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/Shape.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/Softmax.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/Softmax.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/Split.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/Split.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/TransposeKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/TransposeKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/paddle-mobile-demo-Bridging-Header.h
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.xcworkspace/contents.xcworkspacedata
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/xcshareddata/xcschemes/paddle-mobile-metallib.xcscheme
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ActivationKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/BufferToTexture.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddBNReluKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPrelu.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPreluKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvBNReluKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvTransposeKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Kernels.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Macro.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/NMSFetchResultKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/NearestInterpKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/PreluKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/PriorBoxKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ReluKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ResizeBilinear.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Scale.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ScaleKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Shape.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/SliceKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.metal
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.pbxproj
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.xcworkspace/contents.xcworkspacedata
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Assets.xcassets/AppIcon.appiconset/Contents.json
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Assets.xcassets/Contents.json
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Base.lproj/LaunchScreen.storyboard
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Base.lproj/Main.storyboard
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Info.plist
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
 delete mode 100644 metal/paddle-mobile/paddle-mobile.xcodeproj/project.xcworkspace/contents.xcworkspacedata
 delete mode 100644 metal/paddle-mobile/paddle-mobile.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
 delete mode 100644 metal/paddle-mobile/paddle-mobile.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate
 delete mode 100644 metal/paddle-mobile/paddle-mobile.xcodeproj/xcshareddata/xcschemes/paddle-mobile.xcscheme
 delete mode 100644 metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/API/Net.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/API/Runner.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Info.plist
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Common/Errors.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Common/Extensions.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Common/Tools.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Common/Types.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Framework/Dim.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Framework/Utils.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpParam.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/BatchNormOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/BilinearInterpOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/BoxcoderOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/CNNMPSConvOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddAddPreluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddBatchNormReluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddPreluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConvBNReluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConvReluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConvTransposeOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/DepthwiseConvOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/DwConvBNReluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddPreluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ExpOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/FlattenOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormReluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BilinearInterpKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BoxcoderKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/CNNConvKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Concat.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConcatKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvReluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvTransposeKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddPreluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ExpKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FetchKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FlattenKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/LeakyReluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/MulticlassNMSKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/NearestInterpKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PoolKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PreluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PriorBoxKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Relu6Kernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReshapeKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ResizeBilinearKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ShapeKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SigmoidKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SliceKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SoftmaxKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SplitKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Texture2DTo2DArrayKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/TransposeKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BatchNormRelu.metal
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ResizeBilinear.metal
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/LeakyReluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/MulticlassNMSOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/NearestInterpOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/PoolOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/PreluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/PriorBoxOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Relu6Op.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ReluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ReshapeOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ResizeBilinearOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ScaleOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ShapeOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/SigmoidOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/SliceOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/SoftmaxOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/SplitOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/TransposeOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/Attribute.swift
 delete mode 100755 metal/paddle-mobile/paddle-mobile/Src/Program/Framework.pbobjc.h
 delete mode 100755 metal/paddle-mobile/paddle-mobile/Src/Program/Framework.pbobjc.m
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/MemoryOptimze.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/PMBlockDesc.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/PMOpDesc.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/PMProgramDesc.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/PMVarDesc.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/Program.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/Scope.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/TensorDesc.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/framework.pb.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/paddle_mobile.h
 create mode 100644 mobile.md
 delete mode 100644 mobile/.clang-format
 delete mode 100644 mobile/.clang-tidy
 delete mode 100644 mobile/.gitignore
 delete mode 100644 mobile/.pre-commit-config.yaml
 delete mode 100644 mobile/.travis.yml
 delete mode 100755 mobile/.travis/pre-commit-job.sh
 delete mode 100644 mobile/CMakeLists.txt
 delete mode 100644 mobile/CONTRIBUTING.md
 delete mode 100644 mobile/Dockerfile
 delete mode 100644 mobile/LICENSE
 delete mode 100644 mobile/README.md
 delete mode 100644 mobile/benchmark/arm_benchmark.md
 delete mode 100644 mobile/benchmark/metal_benchmark.md
 delete mode 100644 mobile/demo/ReadMe.md
 delete mode 100644 mobile/demo/getDemo.sh
 delete mode 100644 mobile/doc/build.md
 delete mode 100644 mobile/doc/design_doc.md
 delete mode 100644 mobile/doc/development_android.md
 delete mode 100644 mobile/doc/development_android_GPU.md
 delete mode 100644 mobile/doc/development_arm_linux.md
 delete mode 100644 mobile/doc/development_fpga.md
 delete mode 100644 mobile/doc/development_ios.md
 delete mode 100644 mobile/doc/quantification.md
 delete mode 100644 mobile/src/common/common.h
 delete mode 100644 mobile/src/common/enforce.h
 delete mode 100644 mobile/src/common/log.h
 delete mode 100644 mobile/src/common/threadpool.h
 delete mode 100644 mobile/src/common/type_define.h
 delete mode 100755 mobile/src/common/types.cpp
 delete mode 100644 mobile/src/common/types.h
 delete mode 100644 mobile/src/common/util.cpp
 delete mode 100644 mobile/src/common/util.h
 delete mode 100644 mobile/src/common/variant.h
 delete mode 100644 mobile/src/fpga/KD/alignment.h
 delete mode 100644 mobile/src/fpga/KD/context.hpp
 delete mode 100644 mobile/src/fpga/KD/dl_engine.cpp
 delete mode 100644 mobile/src/fpga/KD/dl_engine.hpp
 delete mode 100644 mobile/src/fpga/KD/float16.hpp
 delete mode 100644 mobile/src/fpga/KD/layout.hpp
 delete mode 100644 mobile/src/fpga/KD/llapi/bias_scale.cpp
 delete mode 100644 mobile/src/fpga/KD/llapi/bias_scale.h
 delete mode 100755 mobile/src/fpga/KD/llapi/config.h
 delete mode 100644 mobile/src/fpga/KD/llapi/filter.cpp
 delete mode 100644 mobile/src/fpga/KD/llapi/filter.h
 delete mode 100644 mobile/src/fpga/KD/llapi/image.cpp
 delete mode 100644 mobile/src/fpga/KD/llapi/image.h
 delete mode 100644 mobile/src/fpga/KD/llapi/zynqmp_api.cpp
 delete mode 100644 mobile/src/fpga/KD/llapi/zynqmp_api.h
 delete mode 100644 mobile/src/fpga/KD/pe.hpp
 delete mode 100644 mobile/src/fpga/KD/pe_params.hpp
 delete mode 100644 mobile/src/fpga/KD/pes/concat_pe.hpp
 delete mode 100644 mobile/src/fpga/KD/pes/conv_pe.hpp
 delete mode 100644 mobile/src/fpga/KD/pes/conv_process.hpp
 delete mode 100644 mobile/src/fpga/KD/pes/depthwise_conv_pe.hpp
 delete mode 100644 mobile/src/fpga/KD/pes/elementwise_add_pe.hpp
 delete mode 100644 mobile/src/fpga/KD/pes/fully_connected_pe.hpp
 delete mode 100644 mobile/src/fpga/KD/pes/input_pe.hpp
 delete mode 100755 mobile/src/fpga/KD/pes/math_func_neon.h
 delete mode 100644 mobile/src/fpga/KD/pes/output_pe.hpp
 delete mode 100644 mobile/src/fpga/KD/pes/pooling_pe.hpp
 delete mode 100644 mobile/src/fpga/KD/pes/softmax_pe.cpp
 delete mode 100644 mobile/src/fpga/KD/pes/softmax_pe.hpp
 delete mode 100644 mobile/src/fpga/KD/shape.hpp
 delete mode 100644 mobile/src/fpga/KD/tensor.hpp
 delete mode 100644 mobile/src/fpga/KD/tensor_util.cpp
 delete mode 100644 mobile/src/fpga/KD/tensor_util.hpp
 delete mode 100644 mobile/src/fpga/V1/api.cpp
 delete mode 100644 mobile/src/fpga/V1/api.h
 delete mode 100644 mobile/src/fpga/V1/bias_scale.cpp
 delete mode 100755 mobile/src/fpga/V1/bias_scale.h
 delete mode 100644 mobile/src/fpga/V1/deconv_bias_scale.cpp
 delete mode 100644 mobile/src/fpga/V1/deconv_bias_scale.h
 delete mode 100644 mobile/src/fpga/V1/deconv_filter.cpp
 delete mode 100644 mobile/src/fpga/V1/deconv_filter.h
 delete mode 100644 mobile/src/fpga/V1/filter.cpp
 delete mode 100755 mobile/src/fpga/V1/filter.h
 delete mode 100644 mobile/src/fpga/V1/image.cpp
 delete mode 100644 mobile/src/fpga/V1/image.h
 delete mode 100644 mobile/src/fpga/V1/pe.cpp
 delete mode 100644 mobile/src/fpga/V2/api.cpp
 delete mode 100644 mobile/src/fpga/V2/api.h
 delete mode 100644 mobile/src/fpga/V2/bias_scale.cpp
 delete mode 100644 mobile/src/fpga/V2/bias_scale.h
 delete mode 100644 mobile/src/fpga/V2/deconv_bias_scale.cpp
 delete mode 100644 mobile/src/fpga/V2/deconv_bias_scale.h
 delete mode 100644 mobile/src/fpga/V2/deconv_filter.cpp
 delete mode 100644 mobile/src/fpga/V2/deconv_filter.h
 delete mode 100644 mobile/src/fpga/V2/filter.cpp
 delete mode 100644 mobile/src/fpga/V2/filter.h
 delete mode 100644 mobile/src/fpga/V2/image.cpp
 delete mode 100644 mobile/src/fpga/V2/image.h
 delete mode 100644 mobile/src/fpga/V2/pe.cpp
 delete mode 100644 mobile/src/fpga/common/config.h
 delete mode 100644 mobile/src/fpga/common/driver.cpp
 delete mode 100644 mobile/src/fpga/common/driver.h
 delete mode 100644 mobile/src/fpga/common/fpga_common.cpp
 delete mode 100644 mobile/src/fpga/common/fpga_common.h
 delete mode 100644 mobile/src/fpga/common/pe.h
 delete mode 100644 mobile/src/framework/CMakeLists.txt
 delete mode 100644 mobile/src/framework/attribute.cpp
 delete mode 100644 mobile/src/framework/attribute.h
 delete mode 100644 mobile/src/framework/cl/cl_deleter.h
 delete mode 100644 mobile/src/framework/cl/cl_engine.cpp
 delete mode 100644 mobile/src/framework/cl/cl_engine.h
 delete mode 100644 mobile/src/framework/cl/cl_half.cpp
 delete mode 100644 mobile/src/framework/cl/cl_half.h
 delete mode 100644 mobile/src/framework/cl/cl_helper.h
 delete mode 100644 mobile/src/framework/cl/cl_image.cpp
 delete mode 100644 mobile/src/framework/cl/cl_image.h
 delete mode 100644 mobile/src/framework/cl/cl_image_converter.cpp
 delete mode 100644 mobile/src/framework/cl/cl_image_converter.h
 delete mode 100644 mobile/src/framework/cl/cl_scope.h
 delete mode 100644 mobile/src/framework/cl/cl_tensor.h
 delete mode 100644 mobile/src/framework/cl/cl_tool.cpp
 delete mode 100644 mobile/src/framework/cl/cl_tool.h
 delete mode 100644 mobile/src/framework/context.cpp
 delete mode 100644 mobile/src/framework/context.h
 delete mode 100644 mobile/src/framework/data_layout.h
 delete mode 100644 mobile/src/framework/data_type.cpp
 delete mode 100644 mobile/src/framework/data_type.h
 delete mode 100644 mobile/src/framework/ddim.cpp
 delete mode 100644 mobile/src/framework/ddim.h
 delete mode 100644 mobile/src/framework/dim.h
 delete mode 100644 mobile/src/framework/executor.cpp
 delete mode 100644 mobile/src/framework/executor.h
 delete mode 100644 mobile/src/framework/framework.pb-c.cpp
 delete mode 100644 mobile/src/framework/framework.pb-c.h
 delete mode 100644 mobile/src/framework/framework.proto
 delete mode 100755 mobile/src/framework/load_ops.h
 delete mode 100644 mobile/src/framework/loader.cpp
 delete mode 100644 mobile/src/framework/loader.h
 delete mode 100644 mobile/src/framework/lod_tensor.cpp
 delete mode 100644 mobile/src/framework/lod_tensor.h
 delete mode 100644 mobile/src/framework/mixed_vector.h
 delete mode 100644 mobile/src/framework/op_info.h
 delete mode 100644 mobile/src/framework/op_kernel_type.h
 delete mode 100644 mobile/src/framework/op_proto_maker.h
 delete mode 100644 mobile/src/framework/op_registry.h
 delete mode 100644 mobile/src/framework/operator.cpp
 delete mode 100644 mobile/src/framework/operator.h
 delete mode 100644 mobile/src/framework/program/block_desc.cpp
 delete mode 100644 mobile/src/framework/program/block_desc.h
 delete mode 100644 mobile/src/framework/program/op_desc.cpp
 delete mode 100644 mobile/src/framework/program/op_desc.h
 delete mode 100644 mobile/src/framework/program/program-optimize/fusion_op_register.h
 delete mode 100644 mobile/src/framework/program/program-optimize/node.cpp
 delete mode 100644 mobile/src/framework/program/program-optimize/node.h
 delete mode 100644 mobile/src/framework/program/program-optimize/program_optimize.cpp
 delete mode 100644 mobile/src/framework/program/program-optimize/program_optimize.h
 delete mode 100644 mobile/src/framework/program/program.h
 delete mode 100644 mobile/src/framework/program/program_desc.cpp
 delete mode 100644 mobile/src/framework/program/program_desc.h
 delete mode 100644 mobile/src/framework/program/tensor_desc.h
 delete mode 100644 mobile/src/framework/program/var_desc.h
 delete mode 100644 mobile/src/framework/scope.cpp
 delete mode 100644 mobile/src/framework/scope.h
 delete mode 100644 mobile/src/framework/selected_rows.cpp
 delete mode 100644 mobile/src/framework/selected_rows.h
 delete mode 100644 mobile/src/framework/tensor.h
 delete mode 100644 mobile/src/framework/tensor_base.h
 delete mode 100644 mobile/src/framework/tensor_util.cpp
 delete mode 100644 mobile/src/framework/tensor_util.h
 delete mode 100644 mobile/src/framework/type_trait.h
 delete mode 100644 mobile/src/framework/variable.h
 delete mode 100644 mobile/src/framework/zynqmp/ztensor.hpp
 delete mode 100644 mobile/src/io/api.cc
 delete mode 100644 mobile/src/io/api_paddle_mobile.cc
 delete mode 100644 mobile/src/io/api_paddle_mobile.h
 delete mode 100644 mobile/src/io/ios_io/PaddleMobileCPU.h
 delete mode 100644 mobile/src/io/ios_io/PaddleMobileCPU.mm
 delete mode 100644 mobile/src/io/jni/PML.java
 delete mode 100644 mobile/src/io/jni/paddle_mobile_jni.cpp
 delete mode 100644 mobile/src/io/jni/paddle_mobile_jni.h
 delete mode 100644 mobile/src/io/loader.h
 delete mode 100644 mobile/src/io/opencl_interface.cpp
 delete mode 100644 mobile/src/io/opencl_interface.h
 delete mode 100644 mobile/src/io/paddle_inference_api.h
 delete mode 100644 mobile/src/io/paddle_mobile.cpp
 delete mode 100644 mobile/src/io/paddle_mobile.h
 delete mode 100644 mobile/src/io/paddle_mobile_wrap.cpp
 delete mode 100644 mobile/src/io/paddle_mobile_wrap.h
 delete mode 100644 mobile/src/io/paddle_test_inference_api.cpp
 delete mode 100644 mobile/src/io/paddle_test_inference_api.h
 delete mode 100755 mobile/src/memory/t_malloc.cpp
 delete mode 100644 mobile/src/memory/t_malloc.h
 delete mode 100755 mobile/src/operators/activation_op.cpp
 delete mode 100644 mobile/src/operators/activation_op.h
 delete mode 100644 mobile/src/operators/assign_op.cpp
 delete mode 100644 mobile/src/operators/assign_op.h
 delete mode 100644 mobile/src/operators/assign_value_op.cpp
 delete mode 100644 mobile/src/operators/assign_value_op.h
 delete mode 100644 mobile/src/operators/batchnorm_op.cpp
 delete mode 100644 mobile/src/operators/batchnorm_op.h
 delete mode 100644 mobile/src/operators/beam_search_decode_op.cpp
 delete mode 100644 mobile/src/operators/beam_search_decode_op.h
 delete mode 100644 mobile/src/operators/beam_search_op.cpp
 delete mode 100644 mobile/src/operators/beam_search_op.h
 delete mode 100644 mobile/src/operators/bilinear_interp_op.cpp
 delete mode 100644 mobile/src/operators/bilinear_interp_op.h
 delete mode 100644 mobile/src/operators/box_coder_op.cpp
 delete mode 100644 mobile/src/operators/box_coder_op.h
 delete mode 100644 mobile/src/operators/cast_op.cpp
 delete mode 100644 mobile/src/operators/cast_op.h
 delete mode 100644 mobile/src/operators/compare_op.cpp
 delete mode 100644 mobile/src/operators/compare_op.h
 delete mode 100644 mobile/src/operators/concat_op.cpp
 delete mode 100644 mobile/src/operators/concat_op.h
 delete mode 100644 mobile/src/operators/conditional_block_op.cpp
 delete mode 100644 mobile/src/operators/conditional_block_op.h
 delete mode 100644 mobile/src/operators/controlflow/tensor_array_read_write_op.cpp
 delete mode 100644 mobile/src/operators/controlflow/tensor_array_read_write_op.h
 delete mode 100644 mobile/src/operators/controlflow/while_op.cpp
 delete mode 100644 mobile/src/operators/controlflow/while_op.h
 delete mode 100644 mobile/src/operators/conv_op.cpp
 delete mode 100644 mobile/src/operators/conv_op.h
 delete mode 100755 mobile/src/operators/conv_transpose_op.cpp
 delete mode 100755 mobile/src/operators/conv_transpose_op.h
 delete mode 100644 mobile/src/operators/crf_op.cpp
 delete mode 100644 mobile/src/operators/crf_op.h
 delete mode 100644 mobile/src/operators/depthwise_conv_op.cpp
 delete mode 100644 mobile/src/operators/depthwise_conv_op.h
 delete mode 100644 mobile/src/operators/dequantize_op.cpp
 delete mode 100644 mobile/src/operators/dequantize_op.h
 delete mode 100644 mobile/src/operators/detection_ops.cpp
 delete mode 100644 mobile/src/operators/detection_ops.h
 delete mode 100644 mobile/src/operators/dropout_op.cpp
 delete mode 100644 mobile/src/operators/dropout_op.h
 delete mode 100644 mobile/src/operators/elementwise_add_op.cpp
 delete mode 100644 mobile/src/operators/elementwise_add_op.h
 delete mode 100644 mobile/src/operators/elementwise_mul_op.cpp
 delete mode 100644 mobile/src/operators/elementwise_mul_op.h
 delete mode 100644 mobile/src/operators/elementwise_sub_op.cpp
 delete mode 100644 mobile/src/operators/elementwise_sub_op.h
 delete mode 100644 mobile/src/operators/exp_op.cpp
 delete mode 100644 mobile/src/operators/exp_op.h
 delete mode 100644 mobile/src/operators/feed_op.cpp
 delete mode 100644 mobile/src/operators/feed_op.h
 delete mode 100644 mobile/src/operators/fetch_op.cpp
 delete mode 100644 mobile/src/operators/fetch_op.h
 delete mode 100644 mobile/src/operators/fill_constant_batch_size_like_op.cpp
 delete mode 100644 mobile/src/operators/fill_constant_batch_size_like_op.h
 delete mode 100644 mobile/src/operators/fill_constant_op.cpp
 delete mode 100644 mobile/src/operators/fill_constant_op.h
 delete mode 100644 mobile/src/operators/flatten2_op.cpp
 delete mode 100644 mobile/src/operators/flatten2_op.h
 delete mode 100644 mobile/src/operators/flatten_op.cpp
 delete mode 100644 mobile/src/operators/flatten_op.h
 delete mode 100644 mobile/src/operators/fusion_conv_add_bn_op.cpp
 delete mode 100644 mobile/src/operators/fusion_conv_add_bn_op.h
 delete mode 100644 mobile/src/operators/fusion_conv_add_bn_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_conv_add_bn_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_conv_add_op.cpp
 delete mode 100644 mobile/src/operators/fusion_conv_add_op.h
 delete mode 100644 mobile/src/operators/fusion_conv_add_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_conv_add_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_conv_bn_add_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_conv_bn_add_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_conv_bn_op.cpp
 delete mode 100644 mobile/src/operators/fusion_conv_bn_op.h
 delete mode 100644 mobile/src/operators/fusion_conv_bn_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_conv_bn_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_conv_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_conv_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_deconv_add_bn_op.cpp
 delete mode 100644 mobile/src/operators/fusion_deconv_add_bn_op.h
 delete mode 100755 mobile/src/operators/fusion_deconv_add_bn_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_deconv_add_bn_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_deconv_add_op.cpp
 delete mode 100644 mobile/src/operators/fusion_deconv_add_op.h
 delete mode 100644 mobile/src/operators/fusion_deconv_add_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_deconv_add_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_deconv_bn_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_deconv_bn_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_deconv_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_deconv_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_dequant_add_bn_op.cpp
 delete mode 100644 mobile/src/operators/fusion_dequant_add_bn_op.h
 delete mode 100644 mobile/src/operators/fusion_dequant_add_bn_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_dequant_add_bn_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_dequant_add_bn_relu_quant_op.cpp
 delete mode 100644 mobile/src/operators/fusion_dequant_add_bn_relu_quant_op.h
 delete mode 100644 mobile/src/operators/fusion_dequant_bn_op.cpp
 delete mode 100644 mobile/src/operators/fusion_dequant_bn_op.h
 delete mode 100644 mobile/src/operators/fusion_dequant_bn_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_dwconv_bn_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_dwconv_bn_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_elementwise_add_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_elementwise_add_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_fc_op.cpp
 delete mode 100644 mobile/src/operators/fusion_fc_op.h
 delete mode 100644 mobile/src/operators/fusion_fc_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_fc_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_instancenorm_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_instancenorm_relu_op.h
 delete mode 100644 mobile/src/operators/gru_op.cpp
 delete mode 100644 mobile/src/operators/gru_op.h
 delete mode 100644 mobile/src/operators/gru_unit_op.cpp
 delete mode 100644 mobile/src/operators/gru_unit_op.h
 delete mode 100644 mobile/src/operators/im2sequence_op.cpp
 delete mode 100644 mobile/src/operators/im2sequence_op.h
 delete mode 100644 mobile/src/operators/increment_op.cpp
 delete mode 100644 mobile/src/operators/increment_op.h
 delete mode 100644 mobile/src/operators/instancenorm_op.cpp
 delete mode 100644 mobile/src/operators/instancenorm_op.h
 delete mode 100644 mobile/src/operators/is_empty_op.cpp
 delete mode 100644 mobile/src/operators/is_empty_op.h
 delete mode 100644 mobile/src/operators/kernel/activation_kernel.h
 delete mode 100644 mobile/src/operators/kernel/arm/activation_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/anchor_generator_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/assign_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/assign_value_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/batchnorm_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/beam_search_decode_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/beam_search_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/bilinear_interp_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/box_coder_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/cast_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/compare_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/concat_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/conditional_block_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_add_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_common.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_common.h
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_relu_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_transpose_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/crf_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/density_prior_box_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/dequantize_bn_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/dequantize_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/dropout_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/elementwise_add_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/elementwise_mul_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/elementwise_sub_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/exp_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/feed_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/fetch_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/flatten_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/fusion_fc_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/gru_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/gru_unit_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/im2sequence_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/increment_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/is_empty_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/lod_reset_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/logical_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/lookup_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/lrn_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/mul_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/multiclass_nms_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/nearest_interp_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/norm_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/one_hot_kernel.cpp
 delete mode 100755 mobile/src/operators/kernel/arm/pad2d_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/polygon_box_transform_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/pool_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/prelu_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/prior_box_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/proposal_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/psroi_pool_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/quantize_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/reshape2_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/reshape_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/resize_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/roi_perspective_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/scale_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/sequence_expand_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/sequence_pool_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/sequence_softmax_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/shape_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/slice_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/softmax_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/split_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/sum_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/tensor_array_read_write_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/top_k_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/transpose2_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/transpose_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/while_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/assign_kernel.h
 delete mode 100644 mobile/src/operators/kernel/assign_value_kernel.h
 delete mode 100644 mobile/src/operators/kernel/batchnorm_kernel.h
 delete mode 100644 mobile/src/operators/kernel/beam_search_decode_kernel.h
 delete mode 100644 mobile/src/operators/kernel/beam_search_kernel.h
 delete mode 100644 mobile/src/operators/kernel/bilinear_interp_kernel.h
 delete mode 100644 mobile/src/operators/kernel/box_coder_kernel.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/activation_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/batchnorm_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/bilinear_interp_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/box_coder_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/concat_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_add_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_arm_func.cpp
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/crf_arm_func.h
 delete mode 100644
mobile/src/operators/kernel/central-arm-func/density_prior_box_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/elementwise_mul_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/elementwise_sub_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/flatten_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/gru_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/gru_unit_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/increment_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/lookup_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/lrn_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/mul_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/norm_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/pool_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/prior_box_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/reshape2_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/reshape_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/shape_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/softmax_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/split_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/sum_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/transpose_arm_func.h delete mode 100644 mobile/src/operators/kernel/cl/batchnorm_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/box_coder_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp delete mode 100644 mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.h delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/box_coder_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/cl_common.h delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/concat_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.cl delete mode 100755 mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/conv_transpose_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/density_prior_box_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/dropout_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/exp_kernel.cl delete mode 
100644 mobile/src/operators/kernel/cl/cl_kernel/feed_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/flatten2_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/instancenorm_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/leakyrelu_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/lrn_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/nearest_interp_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/pad2d_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/pool_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/relu.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/relu6.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/reshape.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/scale_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/sigmoid.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/slice_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/softmax.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/tanh_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/transpose_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/concat_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp delete mode 100755 mobile/src/operators/kernel/cl/conv_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/density_prior_box_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/depthwise_conv_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/dropout_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/dwconv_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/elementwise_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/exp_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/feed_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/fetch_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/flatten2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/fusion_fc_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/gen_code.py delete mode 100644 mobile/src/operators/kernel/cl/instancenorm_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/instancenorm_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/leakyrelu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/lrn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/mul_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/multiclass_nms_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/nearest_interp_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/pad2d_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/pool_kernel.cpp delete mode 100644 
mobile/src/operators/kernel/cl/prior_box_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/relu6_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/reshape2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/reshape_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/scale_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/sigmoid_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/slice_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/softmax_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/split_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/tanh_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/transpose2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/transpose_kernel.cpp delete mode 100644 mobile/src/operators/kernel/compare_kernel.h delete mode 100644 mobile/src/operators/kernel/concat_kernel.h delete mode 100644 mobile/src/operators/kernel/conditional_block_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_add_bn_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_add_bn_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_add_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_add_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_bn_add_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_bn_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_bn_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_transpose_kernel.h delete mode 100644 mobile/src/operators/kernel/crf_kernel.h delete mode 100755 mobile/src/operators/kernel/deconv_add_bn_kernel.h delete mode 100755 mobile/src/operators/kernel/deconv_add_bn_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/deconv_add_kernel.h delete mode 100644 mobile/src/operators/kernel/deconv_add_relu_kernel.h delete mode 100755 mobile/src/operators/kernel/deconv_bn_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/deconv_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/dequant_bn_kernel.h delete mode 100644 mobile/src/operators/kernel/dequantize_kernel.h delete mode 100644 mobile/src/operators/kernel/detection_kernel.h delete mode 100644 mobile/src/operators/kernel/dropout_kernel.h delete mode 100644 mobile/src/operators/kernel/dwconv_bn_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/elementwise_add_kernel.h delete mode 100644 mobile/src/operators/kernel/elementwise_add_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/elementwise_mul_kernel.h delete mode 100644 mobile/src/operators/kernel/elementwise_sub_kernel.h delete mode 100644 mobile/src/operators/kernel/exp_kernel.h delete mode 100644 mobile/src/operators/kernel/fc_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/feed_kernel.h delete mode 100644 mobile/src/operators/kernel/fetch_kernel.h delete mode 100644 mobile/src/operators/kernel/flatten2_kernel.h delete mode 100644 mobile/src/operators/kernel/flatten_kernel.h delete mode 100644 mobile/src/operators/kernel/fpga/KD/conv_add_bn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/conv_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/conv_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/conv_bn_kernel.cpp delete 
mode 100644 mobile/src/operators/kernel/fpga/KD/conv_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/elementwise_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/feed_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/fetch_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/fusion_fc_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/softmax_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/concat_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp delete mode 100755 mobile/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp delete mode 100755 mobile/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/deconv_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/dropout_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/elementwise_mul_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/feed_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/fetch_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/pad2d_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/proposal_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/reshape2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/reshape_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/slice_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/softmax_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/split_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/tanh_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/transpose2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp delete mode 100644 
mobile/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/dropout_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/elementwise_mul_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/feed_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/fetch_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/proposal_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/reshape_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/roialign_pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp delete mode 100755 mobile/src/operators/kernel/fpga/V2/softmax_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/split_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/tanh_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/transpose2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fusion_fc_kernel.h delete mode 100644 mobile/src/operators/kernel/gru_kernel.h delete mode 100644 mobile/src/operators/kernel/gru_unit_kernel.h delete mode 100644 mobile/src/operators/kernel/im2sequence_kernel.h delete mode 100644 mobile/src/operators/kernel/increment_kernel.h delete mode 100644 mobile/src/operators/kernel/instancenorm_kernel.h delete mode 100644 mobile/src/operators/kernel/instancenorm_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/is_empty_kernel.h delete mode 100644 mobile/src/operators/kernel/kernels.h delete mode 100644 mobile/src/operators/kernel/logical_kernel.h delete mode 100644 mobile/src/operators/kernel/lookup_kernel.h delete mode 100644 mobile/src/operators/kernel/lrn_kernel.h delete mode 100644 mobile/src/operators/kernel/mul_kernel.h delete mode 100644 mobile/src/operators/kernel/multiclass_nms_kernel.h delete mode 100644 
mobile/src/operators/kernel/nearest_interp_kernel.h delete mode 100644 mobile/src/operators/kernel/norm_kernel.h delete mode 100644 mobile/src/operators/kernel/one_hot_kernel.h delete mode 100644 mobile/src/operators/kernel/pad2d_kernel.h delete mode 100644 mobile/src/operators/kernel/polygon_box_transform_kernel.h delete mode 100644 mobile/src/operators/kernel/pool_kernel.h delete mode 100644 mobile/src/operators/kernel/prelu_kernel.h delete mode 100644 mobile/src/operators/kernel/prior_box_kernel.h delete mode 100644 mobile/src/operators/kernel/quantize_kernel.h delete mode 100644 mobile/src/operators/kernel/range_kernel.cpp delete mode 100644 mobile/src/operators/kernel/range_kernel.h delete mode 100644 mobile/src/operators/kernel/reduce_prod_kernel.cpp delete mode 100644 mobile/src/operators/kernel/reduce_prod_kernel.h delete mode 100644 mobile/src/operators/kernel/reshape2_kernel.h delete mode 100644 mobile/src/operators/kernel/reshape_kernel.h delete mode 100644 mobile/src/operators/kernel/resize_kernel.h delete mode 100644 mobile/src/operators/kernel/scale_kernel.h delete mode 100644 mobile/src/operators/kernel/sequence_kernels.h delete mode 100644 mobile/src/operators/kernel/shape_kernel.h delete mode 100644 mobile/src/operators/kernel/slice_kernel.h delete mode 100644 mobile/src/operators/kernel/softmax_kernel.h delete mode 100644 mobile/src/operators/kernel/split_kernel.h delete mode 100644 mobile/src/operators/kernel/sum_kernel.h delete mode 100644 mobile/src/operators/kernel/tanh_kernel.h delete mode 100644 mobile/src/operators/kernel/tensor_array_read_write_kernel.h delete mode 100644 mobile/src/operators/kernel/transpose2_kernel.h delete mode 100644 mobile/src/operators/kernel/transpose_kernel.h delete mode 100644 mobile/src/operators/kernel/while_kernel.h delete mode 100644 mobile/src/operators/lod_reset_op.cpp delete mode 100644 mobile/src/operators/lod_reset_op.h delete mode 100644 mobile/src/operators/logical_op.cpp delete mode 100644 mobile/src/operators/logical_op.h delete mode 100644 mobile/src/operators/lookup_op.cpp delete mode 100644 mobile/src/operators/lookup_op.h delete mode 100644 mobile/src/operators/lrn_op.cpp delete mode 100644 mobile/src/operators/lrn_op.h delete mode 100644 mobile/src/operators/math/activation.h delete mode 100644 mobile/src/operators/math/depthwise/faster_depthwise_conv3x3.h delete mode 100644 mobile/src/operators/math/depthwise/faster_depthwise_conv3x3p1.cpp delete mode 100644 mobile/src/operators/math/depthwise_conv3x3.cpp delete mode 100644 mobile/src/operators/math/depthwise_conv3x3.h delete mode 100644 mobile/src/operators/math/depthwise_conv3x3_int8.cpp delete mode 100644 mobile/src/operators/math/depthwise_conv5x5.cpp delete mode 100644 mobile/src/operators/math/depthwise_conv5x5.h delete mode 100644 mobile/src/operators/math/depthwise_conv5x5_int8.cpp delete mode 100644 mobile/src/operators/math/element_wise.h delete mode 100644 mobile/src/operators/math/elementwise_op_function.h delete mode 100644 mobile/src/operators/math/gemm.cpp delete mode 100644 mobile/src/operators/math/gemm.h delete mode 100644 mobile/src/operators/math/gemm/cblas.cc delete mode 100644 mobile/src/operators/math/gemm/cblas.h delete mode 100644 mobile/src/operators/math/gemm/executor.h delete mode 100644 mobile/src/operators/math/gemm/gemm1x1s1.cpp delete mode 100644 mobile/src/operators/math/gemm/gemm1x1s1.h delete mode 100644 mobile/src/operators/math/gemm/gemm_kernel.h delete mode 100644 mobile/src/operators/math/gemm/pack_kernel.h delete mode 100644 
mobile/src/operators/math/gemm/strategy.h delete mode 100644 mobile/src/operators/math/gemm_int8.cpp delete mode 100644 mobile/src/operators/math/gemm_omp_int8.cpp delete mode 100644 mobile/src/operators/math/gpc.cpp delete mode 100644 mobile/src/operators/math/gpc.h delete mode 100644 mobile/src/operators/math/gru_compute.cpp delete mode 100644 mobile/src/operators/math/gru_compute.h delete mode 100644 mobile/src/operators/math/gru_cpu_kernel.h delete mode 100644 mobile/src/operators/math/im2col.cpp delete mode 100644 mobile/src/operators/math/im2col.h delete mode 100644 mobile/src/operators/math/math.h delete mode 100644 mobile/src/operators/math/math_function.cpp delete mode 100644 mobile/src/operators/math/math_function.h delete mode 100644 mobile/src/operators/math/math_function_int8.cpp delete mode 100644 mobile/src/operators/math/pad.cpp delete mode 100644 mobile/src/operators/math/pad.h delete mode 100644 mobile/src/operators/math/poly_util.cpp delete mode 100644 mobile/src/operators/math/poly_util.h delete mode 100644 mobile/src/operators/math/pooling.cpp delete mode 100644 mobile/src/operators/math/pooling.h delete mode 100644 mobile/src/operators/math/pooling2x2.cpp delete mode 100644 mobile/src/operators/math/pooling3x3.cpp delete mode 100644 mobile/src/operators/math/quantize.h delete mode 100644 mobile/src/operators/math/selected_rows_functor.h delete mode 100644 mobile/src/operators/math/sequence2batch.cpp delete mode 100644 mobile/src/operators/math/sequence2batch.h delete mode 100644 mobile/src/operators/math/slidingwindow_conv3x3.cpp delete mode 100644 mobile/src/operators/math/slidingwindow_conv3x3.h delete mode 100644 mobile/src/operators/math/slidingwindow_utils.cpp delete mode 100644 mobile/src/operators/math/slidingwindow_utils.h delete mode 100644 mobile/src/operators/math/softmax.cpp delete mode 100644 mobile/src/operators/math/softmax.h delete mode 100644 mobile/src/operators/math/transform.h delete mode 100644 mobile/src/operators/math/vol2col.cpp delete mode 100644 mobile/src/operators/math/vol2col.h delete mode 100644 mobile/src/operators/math/winograd/winograd_transform.h delete mode 100644 mobile/src/operators/math/winograd/winograd_transform_f6k3.cpp delete mode 100644 mobile/src/operators/mul_op.cpp delete mode 100644 mobile/src/operators/mul_op.h delete mode 100644 mobile/src/operators/multiclass_nms_op.cpp delete mode 100644 mobile/src/operators/multiclass_nms_op.h delete mode 100644 mobile/src/operators/nearest_interp_op.cpp delete mode 100644 mobile/src/operators/nearest_interp_op.h delete mode 100644 mobile/src/operators/norm_op.cpp delete mode 100644 mobile/src/operators/norm_op.h delete mode 100644 mobile/src/operators/one_hot_op.cpp delete mode 100644 mobile/src/operators/one_hot_op.h delete mode 100644 mobile/src/operators/op_param.cpp delete mode 100644 mobile/src/operators/op_param.h delete mode 100755 mobile/src/operators/pad2d_op.cpp delete mode 100644 mobile/src/operators/pad2d_op.h delete mode 100644 mobile/src/operators/polygon_box_transform_op.cpp delete mode 100644 mobile/src/operators/polygon_box_transform_op.h delete mode 100644 mobile/src/operators/pool_op.cpp delete mode 100644 mobile/src/operators/pool_op.h delete mode 100644 mobile/src/operators/prelu_op.cpp delete mode 100644 mobile/src/operators/prelu_op.h delete mode 100644 mobile/src/operators/prior_box_op.cpp delete mode 100644 mobile/src/operators/prior_box_op.h delete mode 100644 mobile/src/operators/quantize_op.cpp delete mode 100644 mobile/src/operators/quantize_op.h delete 
mode 100644 mobile/src/operators/range_op.cpp delete mode 100644 mobile/src/operators/range_op.h delete mode 100644 mobile/src/operators/reduce_prod_op.cpp delete mode 100644 mobile/src/operators/reduce_prod_op.h delete mode 100644 mobile/src/operators/reshape2_op.cpp delete mode 100644 mobile/src/operators/reshape2_op.h delete mode 100644 mobile/src/operators/reshape_op.cpp delete mode 100644 mobile/src/operators/reshape_op.h delete mode 100644 mobile/src/operators/resize_op.cpp delete mode 100644 mobile/src/operators/resize_op.h delete mode 100644 mobile/src/operators/scale_op.cpp delete mode 100644 mobile/src/operators/scale_op.h delete mode 100644 mobile/src/operators/sequence_ops/sequence_expand_op.cpp delete mode 100644 mobile/src/operators/sequence_ops/sequence_expand_op.h delete mode 100644 mobile/src/operators/sequence_ops/sequence_pool_op.cpp delete mode 100644 mobile/src/operators/sequence_ops/sequence_pool_op.h delete mode 100644 mobile/src/operators/sequence_ops/sequence_softmax_op.cpp delete mode 100644 mobile/src/operators/sequence_ops/sequence_softmax_op.h delete mode 100644 mobile/src/operators/shape_op.cpp delete mode 100644 mobile/src/operators/shape_op.h delete mode 100644 mobile/src/operators/slice_op.cpp delete mode 100644 mobile/src/operators/slice_op.h delete mode 100644 mobile/src/operators/softmax_op.cpp delete mode 100644 mobile/src/operators/softmax_op.h delete mode 100644 mobile/src/operators/split_op.cpp delete mode 100644 mobile/src/operators/split_op.h delete mode 100644 mobile/src/operators/sum_op.cpp delete mode 100644 mobile/src/operators/sum_op.h delete mode 100644 mobile/src/operators/top_k_op.cpp delete mode 100644 mobile/src/operators/top_k_op.h delete mode 100644 mobile/src/operators/transpose2_op.cpp delete mode 100644 mobile/src/operators/transpose2_op.h delete mode 100644 mobile/src/operators/transpose_op.cpp delete mode 100644 mobile/src/operators/transpose_op.h delete mode 100644 mobile/src/pass/memory_optimize.cpp delete mode 100644 mobile/src/pass/memory_optimize.h delete mode 100644 mobile/src/pass/memory_optimize_super.cpp delete mode 100644 mobile/src/pass/memory_optimize_super.h delete mode 100644 mobile/src/pass/model_obfuscate.cpp delete mode 100644 mobile/src/pass/model_obfuscate.h delete mode 100644 mobile/src/pass/pass_base.h delete mode 100644 mobile/src/protobuf-c/protobuf-c.cpp delete mode 100644 mobile/src/protobuf-c/protobuf-c.h delete mode 100644 mobile/test/CMakeLists.txt delete mode 100644 mobile/test/common/test_enforce.cpp delete mode 100644 mobile/test/common/test_gemm_accuracy.cpp delete mode 100644 mobile/test/common/test_gemm_int8_accuracy.cpp delete mode 100644 mobile/test/common/test_gemm_perf.cpp delete mode 100644 mobile/test/common/test_lib_size.cpp delete mode 100644 mobile/test/common/test_lib_size.h delete mode 100644 mobile/test/common/test_log.cpp delete mode 100644 mobile/test/common/test_openmp.cpp delete mode 100644 mobile/test/executor_for_test.h delete mode 100644 mobile/test/fpga/test_concat_op.cpp delete mode 100644 mobile/test/fpga/test_densebox_combine.cpp delete mode 100644 mobile/test/fpga/test_format_data.cpp delete mode 100644 mobile/test/fpga/test_marker.cpp delete mode 100644 mobile/test/fpga/test_marker2.cpp delete mode 100644 mobile/test/fpga/test_marker_api.cpp delete mode 100644 mobile/test/fpga/test_mobilenet_api.cpp delete mode 100644 mobile/test/fpga/test_pe.cpp delete mode 100644 mobile/test/fpga/test_resnet50.cpp delete mode 100644 mobile/test/fpga/test_rfcn.cpp delete mode 100644 
mobile/test/fpga/test_rfcn_api.cpp delete mode 100644 mobile/test/fpga/test_ssd.cpp delete mode 100644 mobile/test/fpga/test_tensor_quant.cpp delete mode 100644 mobile/test/fpga/test_yolo_api.cpp delete mode 100644 mobile/test/framework/test_inference_api.cpp delete mode 100644 mobile/test/framework/test_load.cpp delete mode 100644 mobile/test/framework/test_load_memory.cpp delete mode 100644 mobile/test/framework/test_load_memory_inference_api.cpp delete mode 100644 mobile/test/framework/test_optimize.cpp delete mode 100644 mobile/test/net/test_alexnet.cpp delete mode 100644 mobile/test/net/test_benchmark.cpp delete mode 100644 mobile/test/net/test_eng.cpp delete mode 100644 mobile/test/net/test_genet_combine.cpp delete mode 100644 mobile/test/net/test_gesture.cpp delete mode 100644 mobile/test/net/test_googlenet.cpp delete mode 100644 mobile/test/net/test_googlenet_quali.cpp delete mode 100644 mobile/test/net/test_googlenetv1_combine.cpp delete mode 100644 mobile/test/net/test_inceptionv4.cpp delete mode 100644 mobile/test/net/test_mobilenet+ssd.cpp delete mode 100644 mobile/test/net/test_mobilenet.cpp delete mode 100644 mobile/test/net/test_mobilenet_025_fssd.cpp delete mode 100644 mobile/test/net/test_mobilenet_GPU.cpp delete mode 100644 mobile/test/net/test_mobilenet_combine.cpp delete mode 100644 mobile/test/net/test_multi_inference_predict.cpp delete mode 100644 mobile/test/net/test_net.cpp delete mode 100644 mobile/test/net/test_net_benchmark.cpp delete mode 100644 mobile/test/net/test_nlp.cpp delete mode 100644 mobile/test/net/test_ocr.cpp delete mode 100644 mobile/test/net/test_op_in_net.cpp delete mode 100644 mobile/test/net/test_resnet.cpp delete mode 100644 mobile/test/net/test_squeezenet.cpp delete mode 100644 mobile/test/net/test_super.cpp delete mode 100644 mobile/test/net/test_vgg16ssd.cpp delete mode 100644 mobile/test/net/test_wrap.cpp delete mode 100644 mobile/test/net/test_yolo.cpp delete mode 100644 mobile/test/net/test_yolo_combined.cpp delete mode 100644 mobile/test/net/test_yologpu.cpp delete mode 100644 mobile/test/operators/test_batchnorm_op.cpp delete mode 100644 mobile/test/operators/test_box_coder_op.cpp delete mode 100644 mobile/test/operators/test_cast_op.cpp delete mode 100644 mobile/test/operators/test_concat_op.cpp delete mode 100644 mobile/test/operators/test_conv_add_relu_op.cpp delete mode 100644 mobile/test/operators/test_conv_bn_relu_op.cpp delete mode 100644 mobile/test/operators/test_conv_gpu.cpp delete mode 100644 mobile/test/operators/test_conv_op.cpp delete mode 100644 mobile/test/operators/test_depthwise_conv_op.cpp delete mode 100644 mobile/test/operators/test_dequantize_op.cpp delete mode 100644 mobile/test/operators/test_dwconv_bn_relu_op.cpp delete mode 100644 mobile/test/operators/test_elementwise_add_op.cpp delete mode 100644 mobile/test/operators/test_elementwise_sub_op.cpp delete mode 100644 mobile/test/operators/test_fill_constant_op.cpp delete mode 100644 mobile/test/operators/test_fusion_conv_add_bn_relu_op.cpp delete mode 100644 mobile/test/operators/test_fusion_fc_op.cpp delete mode 100644 mobile/test/operators/test_gru_op.cpp delete mode 100644 mobile/test/operators/test_im2sequence_op.cpp delete mode 100644 mobile/test/operators/test_increment_op.cpp delete mode 100644 mobile/test/operators/test_is_empty_op.cpp delete mode 100644 mobile/test/operators/test_leaky_relu_op.cpp delete mode 100644 mobile/test/operators/test_less_than_op.cpp delete mode 100644 mobile/test/operators/test_log_op.cpp delete mode 100644 
mobile/test/operators/test_logical_and_op.cpp delete mode 100644 mobile/test/operators/test_logical_not_op.cpp delete mode 100644 mobile/test/operators/test_logical_or_op.cpp delete mode 100644 mobile/test/operators/test_logical_xor_op.cpp delete mode 100644 mobile/test/operators/test_lrn_op.cpp delete mode 100644 mobile/test/operators/test_mul_op.cpp delete mode 100644 mobile/test/operators/test_multiclass_nms_op.cpp delete mode 100644 mobile/test/operators/test_polygon_box_transform_op.cpp delete mode 100644 mobile/test/operators/test_pool_op.cpp delete mode 100644 mobile/test/operators/test_prelu_op.cpp delete mode 100644 mobile/test/operators/test_prior_box_op.cpp delete mode 100644 mobile/test/operators/test_quantize_op.cpp delete mode 100644 mobile/test/operators/test_relu6_op.cpp delete mode 100644 mobile/test/operators/test_relu_op.cpp delete mode 100644 mobile/test/operators/test_reshape2_op.cpp delete mode 100644 mobile/test/operators/test_reshape_op.cpp delete mode 100644 mobile/test/operators/test_resize_op.cpp delete mode 100644 mobile/test/operators/test_scale_op.cpp delete mode 100644 mobile/test/operators/test_sequence_expand_op.cpp delete mode 100644 mobile/test/operators/test_sequence_pool_op.cpp delete mode 100644 mobile/test/operators/test_sequence_softmax_op.cpp delete mode 100644 mobile/test/operators/test_sigmoid_op.cpp delete mode 100644 mobile/test/operators/test_slice_op.cpp delete mode 100644 mobile/test/operators/test_softmax_op.cpp delete mode 100644 mobile/test/operators/test_sum_op.cpp delete mode 100644 mobile/test/operators/test_tanh_op.cpp delete mode 100644 mobile/test/operators/test_topk_op.cpp delete mode 100644 mobile/test/operators/test_transpose2_op.cpp delete mode 100644 mobile/test/operators/test_transpose_op.cpp delete mode 100644 mobile/test/test_helper.h delete mode 100644 mobile/test/test_include.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_d3d10.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_d3d11.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing_intel.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_egl.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_ext.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_ext_intel.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_gl.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_gl_ext.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_platform.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_va_api_media_sharing_intel.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_version.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/opencl.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/LICENSE delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/README.md delete mode 100644 mobile/tools/android-cmake/android.toolchain.cmake delete mode 100644 mobile/tools/android-debug-script/push2android.sh delete mode 100644 mobile/tools/android-debug-script/run_on_android.sh delete mode 100644 mobile/tools/arm-platform.cmake delete mode 100755 mobile/tools/build.sh delete mode 100755 mobile/tools/ci_build.sh delete mode 100644 mobile/tools/ci_run_test.sh delete mode 100644 mobile/tools/docker_build_fpga.sh delete mode 
100644 mobile/tools/ios-cmake/ios.toolchain.cmake delete mode 100644 mobile/tools/net-detail.awk delete mode 100644 mobile/tools/net.awk delete mode 100755 mobile/tools/op.cmake delete mode 100644 mobile/tools/pre-commit.hooks/clang-format.hook delete mode 100755 mobile/tools/pre-commit.hooks/clang-tidy.hook delete mode 100644 mobile/tools/pre-commit.hooks/copyright.hook delete mode 100644 mobile/tools/pre-commit.hooks/cpplint.hook delete mode 100755 mobile/tools/prepare_images_and_models.sh delete mode 100644 mobile/tools/profile_show.sh delete mode 100644 mobile/tools/python/caffetools/run.py delete mode 100644 mobile/tools/python/fluidtools/.gitignore delete mode 100644 mobile/tools/python/fluidtools/run.py delete mode 100644 mobile/tools/python/fluidtools/test_wrap.py delete mode 100644 mobile/tools/python/imagetools/README.md delete mode 100644 mobile/tools/python/imagetools/imagetools.py delete mode 100644 mobile/tools/python/imagetools/img2nchw.py delete mode 100644 mobile/tools/python/imagetools/img2nhwc.py delete mode 100644 mobile/tools/python/imagetools/numpy2binary.py delete mode 100644 mobile/tools/python/misc/.gitignore delete mode 100644 mobile/tools/python/misc/fluidtools.py delete mode 100644 mobile/tools/python/misc/ios-test-server.py delete mode 100644 mobile/tools/python/misc/restore-git.py delete mode 100644 mobile/tools/python/misc/test-fluid-op-feature.py delete mode 100644 mobile/tools/python/modeltools/.gitignore delete mode 100644 mobile/tools/python/modeltools/core/__init__.py delete mode 100644 mobile/tools/python/modeltools/core/framework.proto delete mode 100644 mobile/tools/python/modeltools/core/framework_pb2.py delete mode 100644 mobile/tools/python/modeltools/core/op_types.py delete mode 100644 mobile/tools/python/modeltools/mobilenet/__init__.py delete mode 100644 mobile/tools/python/modeltools/mobilenet/converter_mobilenet.py delete mode 100644 mobile/tools/python/modeltools/mobilenet/swicher.py delete mode 100644 mobile/tools/python/modeltools/tools/__init__.py delete mode 100644 mobile/tools/python/modeltools/tools/float2halffloat.py delete mode 100644 mobile/tools/python/modeltools/tools/loader.py delete mode 100644 mobile/tools/python/modeltools/tools/model_combine.py delete mode 100644 mobile/tools/python/modeltools/tools/model_reader.py delete mode 100644 mobile/tools/python/modeltools/yolo/__init__.py delete mode 100644 mobile/tools/python/modeltools/yolo/mdl2fluid.py delete mode 100644 mobile/tools/python/modeltools/yolo/swicher.py delete mode 100644 mobile/tools/quantification/CMakeLists.txt delete mode 100644 mobile/tools/quantification/README.md delete mode 100644 mobile/tools/quantification/convert.cpp delete mode 100644 mobile/tools/quantification/src/block_desc_local.cpp delete mode 100644 mobile/tools/quantification/src/block_desc_local.h delete mode 100644 mobile/tools/quantification/src/enforce.h delete mode 100644 mobile/tools/quantification/src/framework.pb-c.c delete mode 100644 mobile/tools/quantification/src/framework.pb-c.h delete mode 100644 mobile/tools/quantification/src/program_desc.cpp delete mode 100644 mobile/tools/quantification/src/program_desc.h delete mode 100644 mobile/tools/quantification/src/protobuf-c.c delete mode 100644 mobile/tools/quantification/src/protobuf-c.h delete mode 100644 mobile/tools/quantification/src/tensor_desc.h delete mode 100644 mobile/tools/quantification/src/var_desc.h delete mode 100755 mobile/tools/shell/change_mobile_namespace.sh delete mode 100644 mobile/tools/shell/check-bitcode.sh delete 
mode 100644 mobile/tools/shell/check-filename.sh delete mode 100644 mobile/tools/shell/generate-include/.gitignore delete mode 100644 mobile/tools/shell/generate-include/check_include_diff.sh delete mode 100644 mobile/tools/shell/generate-include/main.cpp delete mode 100644 mobile/tools/shell/generate-include/parse.py delete mode 100755 mobile/tools/shell/generate-include/run.sh delete mode 100644 mobile/tools/shell/merge.sh delete mode 100644 mobile/tools/shell/prune_static_library.sh delete mode 100644 mobile/tools/shell/restore-private-repo.sh delete mode 100644 mobile/tools/toolchains/arm-android-neon.cmake delete mode 100644 mobile/tools/toolchains/arm-linux-gnueabi.cmake delete mode 100644 mobile/tools/toolchains/arm-linux-gnueabihf.cmake create mode 100644 model_optimize_tool.md create mode 100644 model_quantization.md create mode 100644 npu.md create mode 100644 opencl.md create mode 100644 paddle-mobile.md create mode 100644 roadmap.md create mode 100644 source_compile.md create mode 100644 source_compile.md.toc.2019-08-29_160045 create mode 100644 support_operation_list.md create mode 100644 tech_highlights.md delete mode 160000 third-party/gflags delete mode 160000 third-party/googletest delete mode 160000 third-party/protobuf-host delete mode 160000 third-party/protobuf-mobile delete mode 100644 tools/codestyle/.gitignore delete mode 100755 tools/codestyle/clang_format.hook delete mode 100644 tools/codestyle/copyright.hook delete mode 100755 tools/codestyle/cpplint_pre_commit.hook delete mode 100644 tools/codestyle/docstring_checker.py delete mode 100755 tools/codestyle/pylint_pre_commit.hook delete mode 100644 tools/codestyle/test_docstring_checker.py delete mode 100755 tools/document_preview.sh create mode 100644 tutorial.md delete mode 100644 web/.editorconfig delete mode 100644 web/.gitignore delete mode 100644 web/.npmrc delete mode 100644 web/README.md delete mode 100644 web/README_cn.md delete mode 100644 web/demo/index.es6 delete mode 100644 web/demo/index.html delete mode 100644 web/demo/videoDemo.es6 delete mode 100644 web/demo/videoDemo.html delete mode 100644 web/package.json delete mode 100644 web/scripts/build.sh delete mode 100644 web/src/banana.jpeg delete mode 100644 web/src/executor/camera.es6 delete mode 100644 web/src/executor/executor.es6 delete mode 100644 web/src/executor/loader.es6 delete mode 100644 web/src/executor/postProcess.es6 delete mode 100644 web/src/executor/runner.es6 delete mode 100644 web/src/factory/fshader/factory.es6 delete mode 100644 web/src/factory/fshader/ops.es6 delete mode 100644 web/src/feed/ImageFeed.es6 delete mode 100644 web/src/feed/dataFeed.es6 delete mode 100644 web/src/feed/io.es6 delete mode 100644 web/src/gpu/gpu.es6 delete mode 100644 web/src/index.es6 delete mode 100644 web/src/index.html delete mode 100644 web/src/runtime/runtime.es6 delete mode 100644 web/src/shader/atom/common_func.es6 delete mode 100644 web/src/shader/atom/common_params.es6 delete mode 100644 web/src/shader/atom/getArrayIndexFromTensorPos.es6 delete mode 100644 web/src/shader/atom/getArrayIndexFromTexturePos.es6 delete mode 100644 web/src/shader/atom/getOutputTensorPos.es6 delete mode 100644 web/src/shader/atom/getPixelsFromTexturePos.es6 delete mode 100644 web/src/shader/atom/getRangePowSumFromArrayIndex.es6 delete mode 100644 web/src/shader/atom/getRangeSumFromArrayIndex.es6 delete mode 100644 web/src/shader/atom/getTensorPosFromArrayIndex.es6 delete mode 100644 web/src/shader/atom/getTexturePosFromArrayIndex.es6 delete mode 100644 
web/src/shader/atom/getValueFromTensorPos.es6 delete mode 100644 web/src/shader/atom/getValueFromTensorPosPacked.es6 delete mode 100644 web/src/shader/atom/getValueFromTexturePos.es6 delete mode 100644 web/src/shader/atom/moveTexture2PosToReal.es6 delete mode 100644 web/src/shader/atom/prefix.es6 delete mode 100644 web/src/shader/atom/prefix2.es6 delete mode 100644 web/src/shader/atom/prelu.es6 delete mode 100644 web/src/shader/atom/scale.es6 delete mode 100644 web/src/shader/atom/sigmoid.es6 delete mode 100644 web/src/shader/atom/softmax.es6 delete mode 100644 web/src/shader/atom/suffix.es6 delete mode 100644 web/src/shader/atom/type_ivec56.es6 delete mode 100644 web/src/shader/batchnorm/conf.es6 delete mode 100644 web/src/shader/batchnorm/main.es6 delete mode 100644 web/src/shader/batchnorm/params.es6 delete mode 100644 web/src/shader/conv2d/conf.es6 delete mode 100644 web/src/shader/conv2d/main.es6 delete mode 100644 web/src/shader/conv2d/params.es6 delete mode 100644 web/src/shader/conv2d_depthwise/conf.es6 delete mode 100644 web/src/shader/conv2d_depthwise/main.es6 delete mode 100644 web/src/shader/conv2d_depthwise/params.es6 delete mode 100644 web/src/shader/conv2d_elementwise_add/conf.es6 delete mode 100644 web/src/shader/conv2d_elementwise_add/main.es6 delete mode 100644 web/src/shader/conv2d_elementwise_add/params.es6 delete mode 100644 web/src/shader/conv2d_elementwise_add_winograd/conf.es6 delete mode 100644 web/src/shader/conv2d_elementwise_add_winograd/main.es6 delete mode 100644 web/src/shader/conv2d_elementwise_add_winograd/params.es6 delete mode 100644 web/src/shader/dynamic/conf.es6 delete mode 100644 web/src/shader/dynamic/main.es6 delete mode 100644 web/src/shader/dynamic/params.es6 delete mode 100644 web/src/shader/elementwise_add/conf.es6 delete mode 100644 web/src/shader/elementwise_add/main.es6 delete mode 100644 web/src/shader/elementwise_add/params.es6 delete mode 100644 web/src/shader/mul/conf.es6 delete mode 100644 web/src/shader/mul/main.es6 delete mode 100644 web/src/shader/mul/params.es6 delete mode 100644 web/src/shader/pool2d/conf.es6 delete mode 100644 web/src/shader/pool2d/main.es6 delete mode 100644 web/src/shader/pool2d/params.es6 delete mode 100644 web/src/shader/pool2d_avg/conf.es6 delete mode 100644 web/src/shader/pool2d_avg/main.es6 delete mode 100644 web/src/shader/pool2d_avg/params.es6 delete mode 100644 web/src/shader/pool2d_max/conf.es6 delete mode 100644 web/src/shader/pool2d_max/main.es6 delete mode 100644 web/src/shader/pool2d_max/params.es6 delete mode 100644 web/src/shader/pool2d_winograd/conf.es6 delete mode 100644 web/src/shader/pool2d_winograd/main.es6 delete mode 100644 web/src/shader/pool2d_winograd/params.es6 delete mode 100644 web/src/shader/softmax/conf.es6 delete mode 100644 web/src/shader/softmax/main.es6 delete mode 100644 web/src/shader/softmax/params.es6 delete mode 100644 web/src/shader/v_shader.es6 delete mode 100644 web/src/shader/v_shader2.es6 delete mode 100644 web/src/test/getMaxUniforms.es6 delete mode 100644 web/src/utils/models.es6 delete mode 100644 web/src/utils/opData.es6 delete mode 100644 web/src/utils/tensor.es6 delete mode 100644 web/src/utils/utils.es6 delete mode 100644 web/tools/logger.es6 delete mode 100644 web/tools/toBinaryFile.py delete mode 100644 web/webpack.config.js create mode 100644 x2paddle.md create mode 100644 "\345\246\202\344\275\225\345\234\250Android\346\211\213\346\234\272\344\270\212\350\277\220\350\241\214\345\215\225\346\265\213.md" diff --git a/.gitmodules b/.gitmodules index 
107036c702..e69de29bb2 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,12 +0,0 @@
-[submodule "third-party/gflags"]
-	path = third-party/gflags
-	url = https://github.com/gflags/gflags.git
-[submodule "third-party/googletest"]
-	path = third-party/googletest
-	url = https://github.com/google/googletest.git
-[submodule "third-party/protobuf-mobile"]
-	path = third-party/protobuf-mobile
-	url = https://github.com/tensor-tang/protobuf.git
-[submodule "third-party/protobuf-host"]
-	path = third-party/protobuf-host
-	url = https://github.com/protocolbuffers/protobuf.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
deleted file mode 100644
index 3643379acb..0000000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,183 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License
-
-cmake_minimum_required(VERSION 3.0)
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
-include(lite_utils)
-
-lite_option(WITH_PADDLE_MOBILE "Use the paddle-mobile legacy build" OFF)
-if (WITH_PADDLE_MOBILE)
-  add_subdirectory(mobile)
-  return()
-endif(WITH_PADDLE_MOBILE)
-
-set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
-set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
-set(CMAKE_CXX_STANDARD 11)
-
-include(system)
-include(cross_compiling/preproject)
-
-project(paddle CXX C)
-message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
-        "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
-message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
-        "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
-message(STATUS "AR tools: ${CMAKE_AR}")
-
-if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
-  find_package(CUDA QUIET)
-endif()
-find_package(Git REQUIRED)
-find_package(Threads REQUIRED)
-
-include(simd)
-
-################################ Exposed Configurations #######################################
-lite_option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
-lite_option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ON IF ${AVX_FOUND})
-lite_option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
-lite_option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
-lite_option(WITH_MKL "Compile PaddlePaddle with MKL support." ON IF ${AVX_FOUND})
-lite_option(WITH_ARM_DOTPROD "Compile PaddlePaddle with the ARM dot product instruction" ON)
-lite_option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
-# TODO(Superjomn) Remove the WITH_ANAKIN option if it is not needed later.
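# Aside (not from the original CMakeLists.txt): lite_option() comes from the
# cmake/ module directory pulled in by include(lite_utils) above, which this
# patch does not show. Judging from calls such as
# lite_option(WITH_AVX "..." ON IF ${AVX_FOUND}), it behaves like CMake's
# built-in option() plus an optional "IF <condition>" guard that downgrades
# the default to OFF when the condition is false. A minimal, hypothetical
# sketch of such a helper (demo_lite_option is an illustrative name):
macro(demo_lite_option variable description default)
  set(__value ${default})
  if(NOT "${ARGN}" STREQUAL "")   # trailing args, if any: IF <evaluated cond>
    set(__cond ${ARGN})
    list(REMOVE_AT __cond 0)      # drop the leading "IF" keyword
    if(NOT __cond)                # condition false or empty: default to OFF
      set(__value OFF)
    endif()
  endif()
  option(${variable} "${description}" ${__value})
endmacro()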
-if(ANDROID OR IOS OR ARMLINUX)
-  set(WITH_GPU OFF CACHE STRING
-      "Disable GPU when cross-compiling for Android and iOS" FORCE)
-  set(WITH_DSO OFF CACHE STRING
-      "Disable DSO when cross-compiling for Android and iOS" FORCE)
-  set(WITH_AVX OFF CACHE STRING
-      "Disable AVX when cross-compiling for Android and iOS" FORCE)
-  set(WITH_PYTHON OFF CACHE STRING
-      "Disable PYTHON when cross-compiling for Android and iOS" FORCE)
-  set(WITH_RDMA OFF CACHE STRING
-      "Disable RDMA when cross-compiling for Android and iOS" FORCE)
-  set(WITH_MKL OFF CACHE STRING
-      "Disable MKL when cross-compiling for Android and iOS" FORCE)
-endif()
-
-# for lite, both server and mobile framework.
-lite_option(LITE_WITH_JAVA "Enable Java JNI lib in lite mode" OFF)
-lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
-lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
-lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
-lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
-lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
-lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
-lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF)
-lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF)
-lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
-lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF IF LITE_WITH_PROFILE)
-lite_option(LITE_SHUTDOWN_LOG "Shutdown log system or not." OFF)
-lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
-lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
-# publish options
-lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kernels and operators" OFF)
-
-set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
-    "A path setting third party libraries download & build directories.")
-
-# CMAKE_BUILD_TYPE
-if(NOT CMAKE_BUILD_TYPE)
-  set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
-      "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
-      FORCE)
-endif()
-
-# check options
-if (LITE_ON_TINY_PUBLISH)
-  if (NOT (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND NOT WITH_TESTING)) #LITE_WITH_JAVA AND
-    message(FATAL_ERROR "LITE_ON_TINY_PUBLISH=ON must be used with WITH_LITE=ON LITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON LITE_WITH_JAVA=ON WITH_TESTING=OFF")
-    return()
-  endif()
-endif()
-
-include_directories("${PADDLE_SOURCE_DIR}")
-# the generated header files.
-set(LITE_GENERATED_INCLUDE_DIR "${CMAKE_BINARY_DIR}")
-include_directories("${LITE_GENERATED_INCLUDE_DIR}")
-
-# for mobile
-if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
-  message(STATUS "Building the mobile framework")
-  include(cross_compiling/postproject)
-  include(cross_compiling/npu)  # check and prepare NPU DDK
-
-  # We compile the mobile deployment library when LITE_ON_TINY_PUBLISH=ON
-  # So the following third party dependencies are not needed.
- if (NOT LITE_ON_TINY_PUBLISH) - # include the necessary thirdparty dependencies - include(external/gflags) # download, build, install gflags - # LITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON will disable glog - # TODO(sangoly): refine WITH_LITE and LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - include(external/gtest) # download, build, install gtest - include(ccache) # set ccache for compilation - include(external/protobuf) # download, build, install protobuf - endif() - - # for opencl - if (LITE_WITH_OPENCL) - include(external/opencl-headers) - include(external/opencl-clhpp) - endif() - - include(generic) # simplify cmake module - include(configure) # add paddle env configuration - - add_subdirectory(lite) - return() - endif() -################################# End of mobile compile ############################## - -set(WITH_MKLML ${WITH_MKL}) -if (NOT DEFINED WITH_MKLDNN) - if (WITH_MKL AND AVX2_FOUND) - set(WITH_MKLDNN ON) - else() - message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN") - set(WITH_MKLDNN OFF) - endif() -endif() - -######################################################################################## - -include(external/mklml) # download mklml package -include(external/xbyak) # download xbyak package -include(external/libxsmm) # download, build, install libxsmm -include(external/gflags) # download, build, install gflags -include(external/glog) # download, build, install glog -include(external/gtest) # download, build, install gtest -include(external/protobuf) # download, build, install protobuf -include(external/openblas) # download, build, install openblas -include(external/mkldnn) # download, build, install mkldnn -include(external/eigen) # download eigen3 -include(external/xxhash) # download install xxhash needed for x86 jit - -include(cudnn) -include(configure) # add paddle env configuration - -if(LITE_WITH_CUDA) - include(cuda) -endif() - -include(generic) # simplify cmake module -include(ccache) # set ccache for compilation -include(util) # set unittest and link libs -include(version) # set PADDLE_VERSION - -set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") -set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") - -add_subdirectory(lite)

diff --git a/Home.md b/Home.md
new file mode 100644
index 0000000000..f521dbf171
--- /dev/null
+++ b/Home.md
@@ -0,0 +1,54 @@

# Paddle Lite Documentation

## Overview

Paddle-Lite is the next-generation architecture of PaddleMobile, focused on on-device inference. Its defining traits are **high performance, multi-hardware support, and a lightweight footprint**. It can deploy models from PaddleFluid/TensorFlow/Caffe/ONNX, already supports ARM CPU, Mali GPU, Adreno GPU, and Huawei NPU, and is gradually adding X86 CPU, Nvidia GPU, and other targets, with industry-leading performance on the supported hardware.

## Introduction

- [Key Features](./tech_highlights)
- [Architecture Design](./architecture)
- [Road Map](./roadmap)

## Benchmark

- [Latest Performance](./benchmark)
- [Testing Method](./benchmark_tools)

## Installation

- [Build from Source](./source_compile)

## Usage

- [Workflow](./tutorial)
- [C++ Demo](./cpp_demo)
- [Java Demo](./java_demo)
- [Android/iOS APP demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)
- [Model Conversion](./model_optimize_tool)

## Advanced

- [Caffe and TensorFlow models via X2Paddle](x2paddle)
- [Model Quantization](./model_quantization)
- [Supported Op List](./support_operation_list)
- [Adding a New Op](./add_new_operation)
- [Testing Tools](./debug_tools)
- [Debugging](./debug_tools)
- [Using the Huawei NPU](./npu)
- [Using Android GPU](./opencl)
- [Using FPGA](./fpga)

## Developer Documentation

- [Developer Basics](./for-developer)
- [Architecture in Depth](./architecture-intro)

## FAQ

- Questions and suggestions are welcome as [GitHub Issues](https://github.com/PaddlePaddle/Paddle-Lite/issues); to speed up resolution, please first search for similar issues. We answer promptly!
- You are also welcome to join the official Baidu Paddle-Lite QQ group: 696965088

## paddle-mobile

- [Building paddle-mobile](./mobile)

diff --git a/README.md b/README.md
deleted file mode 100644
index e32840a21d..0000000000
--- a/README.md
+++ /dev/null
@@ -1,74 +0,0 @@

-[中文版](./README_cn.md) - -# Paddle Lite - - -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://github.com/PaddlePaddle/Paddle-Lite/wiki) -[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) - - - -Paddle Lite is an updated version of Paddle-Mobile, an open-open source deep learning framework designed to make it easy to perform inference on mobile, embeded, and IoT devices. It is compatible with PaddlePaddle and pre-trained models from other sources. - -For tutorials, please see [PaddleLite Wiki](https://github.com/PaddlePaddle/Paddle-Lite/wiki). - -## Key Features - -### Light Weight - -On mobile devices, execution module can be deployed without third-party libraries, because our excecution module and analysis module are decoupled. - -On ARM V7, only 800KB are taken up, while on ARM V8, 1.3MB are taken up with the 80 operators and 85 kernels in the dynamic libraries provided by Paddle Lite. - -Paddle Lite enables immediate inference without extra optimization. - -### High Performance - -Paddle Lite enables device-optimized kernels, maximizing ARM CPU performance. - -It also supports INT8 quantizations with [PaddleSlim model compression tools](https://github.com/PaddlePaddle/models/tree/v1.5/PaddleSlim), reducing the size of models and increasing the performance of models. - -On Huawei NPU and FPGA, the performance is also boosted. - -The latest benchmark is located at [benchmark](https://github.com/PaddlePaddle/Paddle-Lite/wiki/benchmark) - -### High Compatibility - -Hardware compatibility: Paddle Lite supports a diversity of hardwares — ARM CPU, Mali GPU, Adreno GPU, Huawei NPU and FPGA. In the near future, we will also support AI microchips from Cambricon and Bitmain. - -Model compatibility: The Op of Paddle Lite is fully compatible to that of PaddlePaddle. The accuracy and performance of 18 models (mostly CV models and OCR models) and 85 operators have been validated. In the future, we will also support other models. - -Framework compatibility: In addition to models trained on PaddlePaddle, those trained on Caffe and TensorFlow can also be converted to be used on Paddle Lite, via [X2Paddle](https://github.com/PaddlePaddle/X2Paddle). In the future to come, we will also support models of ONNX format. - -## Architecture - -Paddle Lite is designed to support a wide range of hardwares and devices, and it enables mixed execution of a single model on multiple devices, optimization on various phases, and leight-weighted applications on devices. - -![img](https://github.com/Superjomn/_tmp_images/raw/master/images/paddle-lite-architecture.png) - -As is shown in the figure above, analysis phase includes Machine IR module, and it enables optimizations like Op fusion and redundant computation pruning. Besides, excecution phase only involves Kernal exevution, so it can be deployed on its own to ensure maximized light-weighted deployment. - -## Key Info about the Update - -The earlier Paddle-Mobile was designed to be compatible with PaddlePaddle and multiple hardwares, including ARM CPU, Mali GPU, Adreno GPU, FPGA, ARM-Linux and Apple's GPU Metal. Within Baidu, inc, many product lines have been using Paddle-Mobile.
For more details, please see: [mobile/README](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/README.md). - -As an update of Paddle-Mobile, Paddle Lite has incorporated many older capabilities into the [new architecture](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite). For the time being, the code of Paddle-mobile will be kept under the directory `mobile/`, before complete transfer to Paddle Lite. - -For demands of Apple's GPU Metal and web front end inference, please see `./metal` and `./web` . These two modules will be further developed and maintained. - -## Special Thanks - -Paddle Lite has referenced the following open-source projects: - -- [ARM compute library](http://agroup.baidu.com/paddle-infer/md/article/%28https://github.com/ARM-software/ComputeLibrary%29) -- [Anakin](https://github.com/PaddlePaddle/Anakin). The optimizations under Anakin has been incorporated into Paddle Lite, and so there will not be any future updates of Anakin. As another high-performance inference project under PaddlePaddle, Anakin has been forward-looking and helpful to the making of Paddle Lite. - - -## Feedback and Community Support - -- Questions, reports, and suggestions are welcome through Github Issues! -- Forum: Opinions and questions are welcome at our [PaddlePaddle Forum](https://ai.baidu.com/forum/topic/list/168)! -- WeChat Official Account: PaddlePaddle -- QQ Group Chat: 696965088 -
- [image placeholders removed: WeChat Official Account QR code, QQ Group Chat QR code]
diff --git a/README_cn.md b/README_cn.md deleted file mode 100644 index d2111786b1..0000000000 --- a/README_cn.md +++ /dev/null @@ -1,62 +0,0 @@ -# Paddle Lite - - -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://github.com/PaddlePaddle/Paddle-Lite/wiki) -[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) - - -Paddle Lite为Paddle-Mobile的升级版,定位支持包括手机移动端在内更多场景的轻量化高效预测,支持更广泛的硬件和平台,是一个高性能、轻量级的深度学习预测引擎。在保持和PaddlePaddle无缝对接外,也兼容支持其他训练框架产出的模型。 - -完整使用文档位于 [PaddleLite Wiki](https://github.com/PaddlePaddle/Paddle-Lite/wiki) 。 - -## 特性 - -### 轻量级 -执行阶段和计算优化阶段实现良好解耦拆分,移动端可以直接部署执行阶段,无任何第三方依赖。 -包含完整的80个 Op+85个 Kernel 的动态库,对于ARMV7只有800K,ARMV8下为1.3M,并可以裁剪到更低。 -在应用部署时,载入模型即可直接预测,无需额外分析优化。 - -### 高性能 -极致的 ARM CPU 性能优化,针对不同微架构特点实现kernel的定制,最大发挥计算性能,在主流模型上展现出领先的速度优势。 -支持INT8量化计算,结合 [PaddleSlim 模型压缩工具](https://github.com/PaddlePaddle/models/tree/v1.5/PaddleSlim) 中 INT8量化训练功能,可以提供高精度高性能的预测能力。 -在Huawei NPU, FPGA上也具有有很好的性能表现。 - -最新 Benchmark 位于 [benchmark](https://github.com/PaddlePaddle/Paddle-Lite/wiki/benchmark)。 - -### 通用性 -硬件方面,Paddle Lite 的架构设计为多硬件兼容支持做了良好设计。除了支持ARM CPU、Mali GPU、Adreno GPU,还特别支持了华为 NPU,以及 FPGA 等边缘设备广泛使用的硬件。即将支持支持包括寒武纪、比特大陆等AI芯片,未来会增加对更多硬件的支持。 - -模型支持方面,Paddle Lite和PaddlePaddle训练框架的Op对齐,提供更广泛的模型支持能力。目前已严格验证18个模型85个OP的精度和性能,对视觉类模型做到了较为充分的支持,覆盖分类、检测和定位,包含了特色的OCR模型的支持。未来会持续增加更多模型的支持验证。 - -框架兼容方面:除了PaddlePaddle外,对其他训练框架也提供兼容支持。当前,支持Caffe 和 TensorFlow 训练出来的模型,通过X2Paddle (https://github.com/PaddlePaddle/X2Paddle) 转换工具实现。接下来将会对ONNX等格式模型提供兼容支持。 - -## 架构 - -PaddleLite 的架构设计着重考虑了对多硬件和平台的支持,并且强化了多个硬件在一个模型中混合执行的能力,多个层面的性能优化处理,以及对端侧应用的轻量化设计。 - -![](https://github.com/Superjomn/_tmp_images/raw/master/images/paddle-lite-architecture.png) - -其中,Analysis Phase 包括了 MIR(Machine IR) 相关模块,能够对原有的模型的计算图针对具体的硬件列表进行算子融合、计算裁剪 在内的多种优化。Execution Phase 只涉及到Kernel 的执行,且可以单独部署,以支持极致的轻量级部署。 - - -## Paddle-Mobile升级为Paddle Lite的说明 -原Paddle-Mobile作为一个致力于嵌入式平台的PaddlePaddle预测引擎,已支持多种硬件平台,包括ARM CPU、 Mali GPU、Adreno GPU,以及支持苹果设备的GPU Metal实现、ZU5、ZU9等FPGA开发板、树莓派等arm-linux开发板。在百度内已经过广泛业务场景应用验证。对应设计文档可参考: [mobile/README](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/README.md) - -Paddle-Mobile 整体升级重构并更名为Paddle Lite后,原paddle-mobile 的底层能力大部分已集成到[新架构 ](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite)下。作为过渡,暂时保留原Paddle-mobile代码。 主体代码位于 `mobile/` 目录中,后续一段时间会继续维护,并完成全部迁移。新功能会统一到[新架构 ](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite)下开发。 - -metal, web的模块相对独立,会继续在 `./metal` 和 `./web` 目录下开发和维护。对苹果设备的GPU Metal实现的需求及web前端预测需求,可以直接进入这两个目录。 - -## 致谢: -Paddle Lite 借鉴了以下开源项目: -- [ARM compute library]((https://github.com/ARM-software/ComputeLibrary)) -- [Anakin](https://github.com/PaddlePaddle/Anakin) ,Anakin对应底层的一些优化实现已被集成到Paddle Lite。Anakin作为PaddlePaddle组织下的一个高性能预测项目,极具前瞻性,对Paddle Lite有重要贡献。Anakin已和本项目实现整合。之后,Anakin不再升级。 - -## 交流与反馈 -* 欢迎您通过Github Issues来提交问题、报告与建议 -* 微信公众号:飞桨PaddlePaddle -* QQ群: 696965088 - -
- [image placeholders removed: 微信公众号 QR code, 官方技术交流QQ群 QR code]
- -* 论坛: 欢迎大家在[PaddlePaddle论坛](https://ai.baidu.com/forum/topic/list/168)分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围

diff --git a/add_new_operation.md b/add_new_operation.md
new file mode 100644
index 0000000000..a077a20696
--- /dev/null
+++ b/add_new_operation.md
@@ -0,0 +1,189 @@

# How to add a new op

The steps below walk through adding a new op, using argmax as a worked example.

## 1. Add an OpParam struct to carry the op's inputs and outputs

- Here it is named `ArgmaxParam`.

- Add the `ArgmaxParam` struct to `paddlelite/lite/operators/op_params.h`:
  ```c++
  struct ArgmaxParam {
    lite::Tensor* X{};
    lite::Tensor* Out{};
    int Axis{0};
  };
  ```
## 2. Add the Argmax op and register it

- Create an argmax_op.h file under `paddlelite/lite/operators/`; the key code:
  ```c++
  class ArgmaxOpLite : public OpLite {
   public:
    ArgmaxOpLite() {}
    explicit ArgmaxOpLite(const std::string &op_type) : OpLite(op_type) {}
    bool CheckShape() const override;
    bool InferShape() const override;
    bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
    void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
    std::string DebugString() const override { return "argmax"; }
   private:
    mutable ArgmaxParam param_;
  };
  ```
  `ArgmaxOpLite` inherits from `OpLite` and holds an `ArgmaxParam` member. The interfaces to implement are `CheckShape()`, `InferShape()`, `AttachImpl()`, `AttachKernel()`, and `DebugString()`. `AttachKernel()` and `DebugString()` are simple enough to implement inline here.

- Create an argmax_op.cc file under `paddlelite/lite/operators/` and implement `CheckShape()`, `InferShape()`, and `AttachImpl()`. `CheckShape()` validates that the inputs are well formed, `InferShape()` derives the output dimensions from the inputs, and `AttachImpl()` binds the op's inputs and outputs. Then register argmax in argmax_op.cc; the core code:
  ```c++
  bool ArgmaxOpLite::CheckShape() const {
    CHECK_OR_FALSE(param_.X);
    CHECK_OR_FALSE(param_.Out);
    CHECK_OR_FALSE(param_.Axis < (param_.X)->dims().size());
    return true;
  }

  bool ArgmaxOpLite::InferShape() const {
    auto x_dims = param_.X->dims();
    int x_rank = x_dims.size();
    int axis = param_.Axis;
    if (axis < 0) axis += x_rank;

    std::vector<int64_t> out_dims;
    for (int64_t i = 0; i < axis; i++) {
      out_dims.push_back(x_dims[i]);
    }
    for (int64_t i = axis + 1; i < x_rank; i++) {
      out_dims.push_back(x_dims[i]);
    }

    // Set output dims
    param_.Out->Resize(lite::DDim(out_dims));
    return true;
  }

  bool ArgmaxOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
    auto x = op_desc.Input("X").front();
    auto out = op_desc.Output("Out").front();

    param_.X = scope->FindVar(x)->GetMutable<lite::Tensor>();
    param_.Out = scope->FindVar(out)->GetMutable<lite::Tensor>();
    param_.Axis = op_desc.GetAttr<int>("Axis");

    return true;
  }
  REGISTER_LITE_OP(argmax, paddle::lite::operators::ArgmaxOpLite);
  ```
- In `paddlelite/lite/operators/CMakeLists.txt`, add ```lite_cc_library(argmax_op SRCS argmax_op.cc DEPS ${op_DEPS})```, and append `argmax_op` to the ops list (`set(ops ...)`);
- In `paddlelite/lite/api/paddle_use_ops.h`, add ```USE_LITE_OP(argmax)```.
## 3. Add the Argmax kernel and bind it

The ARM implementation of argmax illustrates the process.

- Create an argmax_compute.h file under `paddlelite/lite/kernels/arm/`, declaring an `ArgmaxCompute` class that inherits from `KernelLite`; the key code:
  ```c++
  class ArgmaxCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
   public:
    using param_t = operators::ArgmaxParam;
    void Run() override;
    virtual ~ArgmaxCompute() = default;
  };
  ```
- Create an argmax_compute.cc file under `paddlelite/lite/kernels/arm/`, implementing mainly the `Run()` function. `Run()` calls `argmax_func()` from paddlelite/lite/arm/math/argmax.h to compute the output from the input. Finally, in argmax_compute.cc, bind argmax's inputs and outputs (every tensor-typed input parameter must be bound):
  ```c++
  void ArgmaxCompute::Run() {
    auto& param = Param<operators::ArgmaxParam>();
    lite::Tensor* input = param.X;
    lite::Tensor* output = param.Out;
    int axis = param.Axis;
    lite::arm::math::argmax_func(input, axis, output);
    return;
  }

  REGISTER_LITE_KERNEL(
      argmax, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ArgmaxCompute, def)
      .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
      .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
      .Finalize();
  ```

- In `paddlelite/lite/kernels/arm/CMakeLists.txt`, add
  ```cmake
  lite_cc_library(argmax_compute_arm SRCS argmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
  ```
  and append `argmax_compute_arm` to the `arm_kernels` list (`set(arm_kernels ...)`);
- In `paddlelite/lite/api/paddle_use_kernels.h`, add ```USE_LITE_KERNEL(argmax, kARM, kFloat, kNCHW, def)```.

## 4. Add the Argmax implementation
- Create an argmax.h file under `paddlelite/lite/arm/math/`, declaring `argmax_func()`:
  ```c++
  void argmax_func(const lite::Tensor* input, const int axis, lite::Tensor* output);
  ```
- Create an argmax.cc file under `paddlelite/lite/arm/math/`, implementing `argmax_func()`:
  ```c++
  void argmax_func(const lite::Tensor *input,
                   const int axis,
                   lite::Tensor *output) {
    auto input_ddim = input->dims();
    auto output_ddim = output->dims();

    const int size = input_ddim[axis];
    const int in_channel = input_ddim.count(axis, input_ddim.size());
    const int out_channel = output_ddim.count(axis, output_ddim.size());
    const int in_stride = input_ddim.count(axis + 1, input_ddim.size());
    const int out_stride = input_ddim.count(0, axis);

    for (int n = 0; n < out_stride; n++) {
      for (int k = 0; k < in_stride; k++) {
        const float *in_ptr = input->data<float>() + n * in_channel + k;
        std::vector<std::pair<float, int>> vec;
        vec.resize(size);
        for (int i = 0; i < size; i++) {
          vec[i] = std::make_pair(in_ptr[i * in_stride], i);
        }
        // sort
        std::partial_sort(vec.begin(),
                          vec.begin() + 1,
                          vec.end(),
                          std::greater<std::pair<float, int>>());

        // out
        float *out_ptr = output->mutable_data<float>() + n * out_channel + k;
        *out_ptr = vec[0].second;
      }
    }
  }
  ```
- In `paddlelite/lite/arm/math/CMakeLists.txt`, add argmax.cc to the `math_arm` library, and add ```#include "lite/arm/math/argmax.h"``` to paddlelite/lite/arm/math/funcs.h.

## 5. Add an Argmax unit test
- Create an argmax_compute_test.cc file under paddlelite/lite/tests/kernels, declaring and implementing an `ArgmaxComputeTester` class;
- `ArgmaxComputeTester` consists mainly of the `PrepareOpDesc`, `PrepareData`, and `RunBaseline` functions. `PrepareOpDesc` sets the op type and input/output parameters for the test, `PrepareData` initializes the input tensors, and `RunBaseline` computes a reference output from the inputs to compare against the framework's output (a sketch follows at the end of this section);
- Add the test with gtest:
  ```c++
  TEST(Argmax, precision) {
  #ifdef LITE_WITH_ARM
    LOG(INFO) << "test argmax arm";
    Place place(TARGET(kARM));

    for (int axis : {0, 1, 2, 3}) {
      for (int n : {1, 3}) {
        for (int c : {3, 6}) {
          for (int h : {9, 18}) {
            for (int w : {9, 18}) {
              std::unique_ptr<arena::TestCase> tester(
                  new ArgmaxComputeTester(place, "def", axis, n, c, h, w));
              arena::Arena arena(std::move(tester), place, 2e-5);
              arena.TestPrecision();
            }
          }
        }
      }
    }
  #endif
  }
  ```
- In paddlelite/lite/tests/kernels/CMakeLists.txt, add
  ```cmake
  lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
  ```
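To make the role of `RunBaseline` concrete, here is a minimal sketch of the kind of reference computation it could perform. The test-suite types are not shown in this patch, so the helper name `argmax_baseline` and the plain `std::vector` buffers are illustrative assumptions, not the actual test code; the intent is only to show what output the framework's kernel is compared against.

```c++
#include <cstdint>
#include <vector>

// Hypothetical RunBaseline-style reference: for every position outside
// `axis`, write the index of the maximum element along `axis`. `dims` are
// the input dimensions, `in` is the contiguous input buffer, `out` receives
// one index (stored as float, like argmax_func above) per reduced position.
void argmax_baseline(const std::vector<float>& in,
                     const std::vector<int64_t>& dims,
                     int axis,
                     std::vector<float>& out) {
  int64_t outer = 1, inner = 1;
  for (int i = 0; i < axis; ++i) outer *= dims[i];
  for (size_t i = axis + 1; i < dims.size(); ++i) inner *= dims[i];
  const int64_t size = dims[axis];

  out.resize(outer * inner);
  for (int64_t n = 0; n < outer; ++n) {
    for (int64_t k = 0; k < inner; ++k) {
      // Element (n, i, k) lives at n * size * inner + i * inner + k.
      const float* ptr = in.data() + n * size * inner + k;
      int64_t max_idx = 0;
      float max_val = ptr[0];
      for (int64_t i = 1; i < size; ++i) {
        if (ptr[i * inner] > max_val) {
          max_val = ptr[i * inner];
          max_idx = i;
        }
      }
      out[n * inner + k] = static_cast<float>(max_idx);
    }
  }
}
```

The framework then checks the kernel's output against this reference within the tolerance passed to `arena::Arena` (2e-5 in the test above).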
## 6. Build and run
- From the paddlelite directory, run ```./lite/tools/ci_build.sh build_test_arm```. The script sets up a phone emulator and builds and runs every unit test (this takes a long time). If everything passes, argmax has been added successfully.

diff --git a/architecture-intro.md b/architecture-intro.md
new file mode 100644
index 0000000000..e7a705677c
--- /dev/null
+++ b/architecture-intro.md
@@ -0,0 +1,247 @@

# Paddle-Lite Developer Documentation

This document introduces, from a developer's perspective, the background needed to develop Paddle-Lite.

## Design and rationale

In recent years, deep learning inference hardware has proliferated. From phone apps to in-car devices to smart speakers, all of them need to deploy deep learning inference, and they share these requirements:

1. high performance
2. easy hardware support and extension
3. lightweight deployment

Paddle-Lite's architecture is designed directly against these requirements. Concretely:

- for high performance:
  - MIR (Machine IR) enables fine-grained analysis and optimization of complex computation graphs
  - execution-time kernels are deliberately simple, with almost no extra scheduling overhead
  - an appropriate hardware abstraction layer lets each backend implement its own scheduling within the framework
- for lightweight deployment:
  - the analysis and execution phases are split; the execution phase is a lightweight implementation that can be deployed on its own
  - lightweight Op and Kernel design
- for hardware support and extension:
  - MIR supports macro-level analysis and optimization that carries hardware and execution information
  - TypeSystem abstracts the representation of different hardware compute modes, enabling strongly typed inference over the whole computation graph and static analysis of the execution state machine

Paddle-Lite models multi-hardware, multi-mode computation (different quantization precisions, different data layouts, and so on) through strongly typed inference, so that heterogeneous hardware and compute modes can be mixed at the macro level.

The framework has already been hardened on heterogeneous hardware such as FPGA, GPU, and NPU, and its capabilities continue to mature.

## Key modules

### OpLite

[OpLite](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/op_lite.h#L52) is the Operator in Paddle-Lite. When extending a single hardware backend, Ops and Kernels are what you extend most often.

Key methods:

```c++
class OpLite : public Registry {
 public:
  // Check the shape.
  virtual bool CheckShape() const { return true; }
  // Inference the outputs' shape.
  virtual bool InferShape() const { return true; }
  // Link the external execution environ to internal context.
  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope);
};
```

At analysis time the framework calls

- `AttachImpl`

and at execution time

- `CheckShape`
- `InferShape`

Notes for extension:

1. `CheckShape` runs only on the first batch, so its cost is not critical

2. `InferShape` runs on every batch and must be strictly cheap

   1. part of the result can be cached in member variables, for example (pseudocode; `ComputeOutputShape` is an illustrative helper standing in for the full shape inference):

   ```c++
   class XXOp : public OpLite {
     void InferShape() {
       int batch_size = param().input.shape[0];
       if (shape_cache_.empty()) {
         // Full shape inference, performed once on the first batch.
         shape_cache_ = ComputeOutputShape(param());
       }
       // Only the batch dimension changes on subsequent batches.
       shape_cache_[0] = batch_size;
       param().output->Resize(shape_cache_);
     }

    private:
     shape_t shape_cache_;
   };
   ```
### OpParam

[OpParam](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/operators/op_params.h) stores the parameters a Kernel needs at execution time. Every field can be stored directly (for example a pointer or an `int`), so that no parameter lookup delays execution.

Since there has been no need for one, OpParam currently has no base class.

A real example:

```c++
// For Softmax op
struct SoftmaxParam {
  lite::Tensor* x{};
  lite::Tensor* output{};
  int axis{-1};
};
```

OpLite's `AttachImpl` method builds the `OpParam`, which is copied and handed to the `Kernel` for execution.

OpParam is a critical execution-time module, so performance must be guarded strictly. Extension requirements:

1. field access must be low-latency: use pointers directly, or copy values
2. keep execution-irrelevant information out, including debug information
3. naming must match the corresponding Paddle OpDesc exactly, to make alignment and comprehension easy

### Kernel

```c++
template <TargetType Target,
          PrecisionType Precision,
          DataLayoutType DataLayout = DataLayoutType::kNCHW>
class KernelLite : public KernelBase {
 public:
  // Run the kernel.
  virtual void Run() { CHECK(false) << "Not Implemented"; }

  TargetType target() const override { return Target; }
  PrecisionType precision() const override { return Precision; }
  DataLayoutType layout() const override { return DataLayout; }
  Place place() const override { return Place{Target, Precision, DataLayout}; }
  std::string name() const override;
};
```

Being a core execution-time concept, Kernel is designed to be extremely simple and efficient.

`Run` is its only significant interface; it contains the actual computation.

The template parameters mainly support multi-hardware compilation and serve as self-description:

- Target: the hardware it executes on
- Precision: its main computation precision
- DataLayout: its main data layout

This information helps select kernels; the exact values are not strict.

Registering a Kernel relies on the TypeSystem: the registration describes not only the kernel itself but also each of its inputs and outputs in full.

For example, the FullyConnected registration:

```c++
REGISTER_LITE_KERNEL(
    fc, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::FcCompute, def)
    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat), LAYOUT(kNCHW))})
    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
```

The kernel itself is declared as `kARM`, that is, an ARM kernel, with main computation precision `kFloat` and main data layout `kNCHW`.

Then every input and output is described precisely. For example, the `Input` slot is declared as `LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat), LAYOUT(kNCHW))`: its Target is `kARM`, its precision is `kFloat`, and its data layout is `kNCHW`.

The design mirrors C++ function overloading: the same kernel (name), overloaded with different input/output types, becomes a different kernel.

#### Notes for extension

1. pick the dominant compute mode for the template parameters
   1. for example, a scale kernel that accepts both `float` and `int` input but is not a quantized kernel should use `Precision=float`, marking it for use at regular precision
2. input/output declarations must be precise: declare exactly the types they are. The framework builds its state machine dynamically from these declarations; imprecise ones make the analysis-time and execution-time state machines disagree, causing undefined behavior

### MIR

MIR is similar to LLVM IR, but carries hardware and execution-time information that participates in analysis and optimization.

A Pass is MIR's unit of modular strategy; its input and output are both an SSA Graph.

The framework automatically builds an SSA Graph from the model's Program, then invokes a series of Passes in the order defined in [Optimizer](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/optimizer.h).

#### Op Fusion

The [PatternMatcher](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/mir/pattern_matcher.h) in MIR implements a simple, effective graph-based pattern-matching algorithm; op-fusion graph rewrites can be built on top of it.

For a real example see [fc_fuse_pass.h](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/mir/fusion/fc_fuse_pass.h); a minimal sketch of such a pass follows below.
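For orientation, the following sketches the overall shape of a fusion pass built on the pattern matcher. It is modeled loosely on fc_fuse_pass but abbreviated; the base-class name `ProgramPass`, the `REGISTER_MIR_PASS` macro, and the include paths are assumptions to be checked against the linked sources, and the pass body is comments rather than real matcher calls.

```c++
#include <memory>
#include "lite/core/mir/pass.h"           // ProgramPass (assumed location)
#include "lite/core/mir/pass_registry.h"  // REGISTER_MIR_PASS (assumed location)

namespace paddle {
namespace lite {
namespace mir {

// Sketch: rewrite every (mul -> elementwise_add) pair in the graph into a
// single fc op, in the spirit of fc_fuse_pass.
class MyFcFusePass : public ProgramPass {
 public:
  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
    // 1. Describe the subgraph to look for using the PatternMatcher API.
    // 2. For every match, create the fused op and rewire its inputs/outputs.
    // 3. Erase the now-dead nodes from the graph.
  }
};

}  // namespace mir
}  // namespace lite
}  // namespace paddle

// Registration makes the pass visible to the Optimizer's pass list.
REGISTER_MIR_PASS(my_fc_fuse_pass, paddle::lite::mir::MyFcFusePass);
```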
### TypeSystem

TypeSystem is the foundation on which Paddle-Lite builds complex computation graphs. The core idea is to help the SSA Graph form a state machine, where Types represent the different states.

A Type currently bundles four pieces of information, extensible as needed:

- TargetType
- Precision
- DataLayout
- device id, identifying the card/device index

The state machine looks like:

```python
Tensor0(kARM, kFloat, kNCHW) --pass--> Tensor1(kOpenCL, kFloat, kNCHW)
```

MIR detects that Tensor0 and Tensor1 live on different hardware, so the corresponding pass inserts a cast op to perform the type cast, for example:

```
Tensor0(kARM, kFloat, kNCHW) --pass-> IoCopyOp(kARM, kOpenCL) --pass-> Tensor1(kOpenCL, kFloat, kNCHW)
```

### KernelContext

KernelContext is the core encapsulation for hardware support; it provides Kernels with their execution-time hardware context.

KernelContext is designed like OpParam: neither has a base class. For KernelContext, the assumption is that interfaces and logic may differ completely between backends, for example kARM versus kCUDA, so no base class is imposed and no uniform interface is required to wrap the different hardware behaviors.

Each hardware's KernelContext talks directly to the Kernels of that hardware.

A KernelContext's behavior can be determined and scheduled by MIR during analysis.

Notes:

1. as an execution-time concept, KernelContext must also stay fast and lightweight
2. mobile deployments ship only the execution phase, so MIR and KernelContext are split apart; the relevant KernelContext settings must therefore be serializable into the ProgramDesc so the execution phase can load and run them

## Extending hardware backends

### Extending an existing backend

This is mostly a matter of adding Ops and Kernels; if fusion is needed, see the MIR section and add a corresponding fuse pass. Concretely, refer to

- [fc_op](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/operators/fc_op.h) to implement a similar Op
- [fc_compute](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/kernels/arm/fc_compute.h) to implement a similar Kernel
- [fc_fuse_pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/fusion/fc_fuse_pass.h) to implement the fuse logic, registered into [optimizer](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/optimizer.h)

### Adding a brand-new backend

The following modules must additionally be extended so the framework can drive the hardware:

- TypeSystem: extend the relevant types
  - the relevant [enum](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/api/paddle_place.h#L44)
- MIR: extend the type-cast passes
  - the [TargetType cast pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/type_target_cast_pass.cc) copies tensors across hardware
  - the [Data layout cast pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/type_target_cast_pass.h) converts between data layouts
  - the [Precision cast pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/type_precision_cast_pass.h) converts tensors between quantization precisions
- KernelContext; for reference see
  - the [ARM context](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/context.h#L91)
  - note that a hardware context's interface serves only that hardware's kernels
  - a context spans both the analysis phase and the execution phase; if the analysis phase performs no special optimization, it can be ignored. Otherwise, the analysis-time information must be gathered and serialized into the offline model so the execution phase can load it directly.
\ No newline at end of file

diff --git a/architecture.md b/architecture.md
new file mode 100644
index 0000000000..fbcd9b05eb
--- /dev/null
+++ b/architecture.md
@@ -0,0 +1,94 @@

# Architecture Design

In upgrading Mobile to the Lite architecture, the emphasis is on multi-hardware support and high performance. The main design ideas:

- introduce a Type system to strengthen mixed scheduling across hardware, quantization schemes, and data layouts
- isolate hardware details, so any supported hardware can be freely plugged in or out via build switches
- introduce the concept of MIR (Machine IR) to strengthen optimizations that are aware of the execution environment
- strictly separate the optimization phase from the execution phase, keeping inference lightweight and efficient

The architecture diagram:

![Paddle Inference Refactor1.0](./images/architecture.jpg)

## Strict separation of compile time and execution time

- once compile-time optimization finishes, the optimization results can be stored into the model; execution time loads and runs it
- two API sets with matching inference libs cover different scenarios
  - `CxxPredictor` packages both `Compile Time` and `Execution Time`, so analysis and optimization can happen at runtime on the concrete hardware for the best result
  - `MobilePredictor` packages only `Execution Time`, keeping deployment and execution lightweight

## Lightweight `Execution Time` design and implementation

- each batch executes only two steps (see the sketch below)
  - `Op.InferShape`
  - `Kernel.Run`; all kernel parameters are resolved to pointers ahead of time, so no lookup or argument-passing cost remains
  - design goal: at execution time, only the kernel computation itself costs anything
- lightweight `Op` and `Kernel` design avoids framework overhead
  - an `Op` has only two significant duties: `CreateKernels` and `InferShape`
  - a `Kernel` has only `Run`
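As a rough illustration of the two-step contract above, a per-batch run loop reduces to something like the following. The `Instruction` struct and member names are illustrative stand-ins, not the engine's actual types; the point is that nothing but shape inference and kernel compute remains on the hot path.

```c++
#include <vector>

// Illustrative stand-ins for the engine's op/kernel interfaces.
struct OpLite     { virtual void InferShape() = 0; virtual ~OpLite() = default; };
struct KernelLite { virtual void Run() = 0;        virtual ~KernelLite() = default; };

// One (op, kernel) pair selected and bound at optimization time.
struct Instruction {
  OpLite* op;          // responsible for shape inference only
  KernelLite* kernel;  // owns the computation; params already bound as pointers
};

// Per-batch execution: no scheduling, lookup, or argument passing remains,
// so the cost is essentially the kernels' compute time.
void RunOneBatch(std::vector<Instruction>& program) {
  for (auto& inst : program) {
    inst.op->InferShape();  // cheap: should reuse cached shapes where possible
    inst.kernel->Run();     // pure computation
  }
}
```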
## Multi-hardware backend support

- hardware-generic behavior is adapted through the `TargetWrapper` module, which presents a uniform interface to the upper layers of the framework
- upper-level framework strategies stay hardware-agnostic, such as memory optimization and computation pruning, so any newly added hardware reuses them directly
- the framework standardizes generic hardware behavior and imposes few constraints on hardware-specific details; each backend implements those itself and plugs into the framework
- two mainstream compute models are currently supported: non-heterogeneous devices such as X86 and ARM CPUs, and heterogeneous devices such as GPUs and FPGAs (with stream/event asynchronous execution and cross-device copies)

---
## Mixed scheduling across hardware and algorithms

`TensorTy` describes a Tensor's type:

```c++
struct TensorTy {
  TargetType target;
  PrecisionType precision;
  DataLayout layout;
  int deviceid;
};
```

```c++
enum class TargetType { kARM, kX86, kCUDA, kOpenCL };
enum class PrecisionType { kFP32, kFP16, kInt8, kInt16 };
enum class DataLayout { kNCHW, kNHWC };
```
---

Registering a Kernel declares the input/output characteristics of that particular kernel:

```c++
REGISTER_LITE_KERNEL(
    mul, kARM, kFloat, kNCHW, arm::MulCompute, def)
    .BindInput("X", {LiteType::GetTensorTy(kARM, kFloat, kNCHW)})
    .BindInput("Y", {LiteType::GetTensorTy(kARM, kFloat, kNCHW)})
    .BindOutput("Out", {LiteType::GetTensorTy(kARM, kFloat, kNCHW)})
    .Finalize();
```

---

Different kernels of the same Op behave like C++ function overloads.

This supports arbitrary mixed scheduling:

1. tag the Type of every tensor in the model
2. tag each Kernel's hardware, execution precision, data layout, and so on

Type inference then runs globally; whenever a type conflict appears along a tensor's path, a type cast is applied by inserting a special-purpose Op so that values propagate correctly.

![lite-7](images/lite1.png)



---

## MIR for graph analysis and optimization

SSA built on the Type System; IR Passes analyze and optimize the computation graph:

- whole-graph type inference; detected type conflicts are resolved by inserting type-cast ops, enabling generic mixed scheduling
- computation pruning (Compute prune), for example removing scale(1) and assign ops
- memory optimization (Memory optimize)
- operator fusion (Operator fuse; six fusion strategies supported so far, including fc, conv_bn, and ele_add+act)
- quantization support (Int8 inference already supported)
\ No newline at end of file

diff --git a/benchmark.md b/benchmark.md
new file mode 100644
index 0000000000..8125e02218
--- /dev/null
+++ b/benchmark.md
@@ -0,0 +1,162 @@

# Benchmark

See [benchmark_tools](https://github.com/PaddlePaddle/Paddle-Lite/wiki/benchmark_tools); the **one-click benchmark** is recommended.

## Test environment

* Test models
  * fp32 models
    * mobilenet_v1
    * mobilenet_v2
    * squeezenet_v1.1
    * mnasnet
    * shufflenet_v2

  * int8 models
    * mobilenet_v1
    * mobilenet_v2
    * resnet50

* Test devices (Android NDK ndk-r17c)
  * Snapdragon 855
    * Xiaomi Mi 9, Snapdragon 855
    * 4xA76 (1@2.84GHz + 3@2.4GHz) + 4xA55@1.78GHz

  * Snapdragon 845
    * Xiaomi Mi 8, Snapdragon 845
    * 2.8GHz (big quad-core), 1.7GHz (little quad-core)

  * Snapdragon 835
    * Xiaomi MIX 2, Snapdragon 835
    * 2.45GHz (big quad-core), 1.9GHz (little quad-core)

  * Snapdragon 625
    * OPPO R9s, Snapdragon 625
    * A53 x 8, big cores @2.0GHz

  * Snapdragon 653
    * 360 N5, Snapdragon 653
    * 4 x A73@2.0GHz + 4 x A53@1.4GHz

  * Kirin 970
    * HUAWEI Mate 10

* Test notes
  * commit id: 12c129affaacd476e27a0a82b235a9d547d33f0f
  * warmup=10, repeats=30; the reported time is the average, in ms
  * with one thread, ```DeviceInfo::Global().SetRunMode``` is set to LITE_POWER_HIGH; otherwise LITE_POWER_NO_BIND (a setup sketch follows below)
  * the input image has dimensions {1, 3, 224, 224}, with every value set to 1
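For reference, binding the power mode and thread count as described in the notes looks roughly like this in C++. The include paths and the surrounding predictor setup are assumptions (only the `DeviceInfo::Global().SetRunMode` call is named by the notes above), so treat this as a sketch rather than the benchmark binary's actual code.

```c++
#include "lite/core/device_info.h"  // DeviceInfo (assumed include path)

void ConfigureForBenchmark(int num_threads) {
  using paddle::lite_api::PowerMode;
  // Per the test notes: bind to the big core when single-threaded,
  // otherwise leave scheduling unbound.
  PowerMode mode = (num_threads == 1)
                       ? paddle::lite_api::LITE_POWER_HIGH
                       : paddle::lite_api::LITE_POWER_NO_BIND;
  paddle::lite::DeviceInfo::Global().SetRunMode(mode, num_threads);
  // ... create the predictor, fill the {1, 3, 224, 224} input with 1.f,
  // run warmup=10 untimed iterations, then average over repeats=30.
}
```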
## Test data

### fp32 model results

| Snapdragon 855 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1 | 31.64 | 18.98 | 10.67 | 33.17 | 19.55 | 11.43 |
| mobilenet_v2 | 25.54 | 13.80 | 8.75 | 29.25 | 15.19 | 9.65 |
| squeezenet_v1.1 | 26.81 | 14.39 | 8.92 | 28.63 | 15.37 | 9.53 |
| mnasnet | 25.39 | 13.89 | 9.63 | 28.97 | 15.54 | 10.10 |
| shufflenet_v2 | 13.85 | 7.81 | 5.87 | 14.64 | 8.35 | 6.14 |

| Snapdragon 845 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1 | 62.04 | 33.63 | 18.63 | 66.23 | 35.78 | 20.14 |
| mobilenet_v2 | 40.41 | 22.94 | 13.33 | 44.22 | 24.58 | 14.50 |
| squeezenet_v1.1 | 49.92 | 23.78 | 13.86 | 52.00 | 24.85 | 15.87 |
| mnasnet | 40.14 | 23.36 | 14.46 | 43.77 | 24.78 | 14.76 |
| shufflenet_v2 | 22.27 | 13.69 | 8.96 | 26.11 | 14.95 | 9.02 |

| Snapdragon 835 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1 | 89.57 | 50.88 | 27.62 | 96.11 | 53.18 | 31.99 |
| mobilenet_v2 | 59.92 | 33.93 | 20.91 | 64.04 | 36.85 | 23.10 |
| squeezenet_v1.1 | 65.25 | 37.92 | 23.40 | 74.87 | 40.96 | 23.69 |
| mnasnet | 60.97 | 35.04 | 22.40 | 64.88 | 37.90 | 24.53 |
| shufflenet_v2 | 30.87 | 19.33 | 12.78 | 31.71 | 19.52 | 13.25 |

| Snapdragon 625 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1 | 180.98 | 92.27 | 51.51 | 216.12 | 110.33 | 61.68 |
| mobilenet_v2 | 132.46 | 68.38 | 43.54 | 146.18 | 76.62 | 46.21 |
| squeezenet_v1.1 | 124.49 | 66.84 | 41.53 | 153.28 | 82.42 | 47.14 |
| mnasnet | 122.50 | 67.46 | 43.04 | 146.20 | 79.64 | 48.56 |
| shufflenet_v2 | 68.70 | 40.77 | 26.53 | 75.38 | 42.40 | 28.36 |

| Snapdragon 653 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1 | 121.27 | 59.36 | 34.06 | 126.55 | 64.96 | 39.23 |
| mobilenet_v2 | 79.48 | 46.17 | 27.81 | 87.93 | 48.28 | 31.87 |
| squeezenet_v1.1 | 81.10 | 42.66 | 42.07 | 82.29 | 45.88 | 28.84 |
| mnasnet | 75.60 | 44.22 | 30.16 | 82.99 | 49.07 | 32.34 |
| shufflenet_v2 | 39.18 | 23.54 | 16.73 | 40.12 | 24.76 | 17.68 |

| Kirin 970 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1 | 99.58 | 56.91 | 29.02 | 102.42 | 57.81 | 35.36 |
| mobilenet_v2 | 69.22 | 42.41 | 23.55 | 69.49 | 43.38 | 25.26 |
| squeezenet_v1.1 | 67.48 | 41.06 | 24.47 | 75.03 | 43.57 | 26.35 |
| mnasnet | 74.55 | 43.06 | 24.22 | 75.48 | 44.43 | 26.69 |
| shufflenet_v2 | 39.20 | 24.54 | 16.34 | 37.40 | 24.32 | 16.66 |

### int8 model results

| Snapdragon 855 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1_int8 | 16.77 | 8.38 | 4.59 | 43.42 | 20.80 | 10.89 |
| mobilenet_v2_int8 | 22.81 | 13.71 | 10.43 | 29.65 | 20.09 | 13.99 |
| resnet50_int8 | 258.83 | 157.22 | 85.83 | 424.99 | 209.37 | 112.32 |

| Snapdragon 845 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1_int8 | 44.08 | 23.75 | 12.52 | 49.19 | 26.77 | 13.82 |
| mobilenet_v2_int8 | 36.61 | 22.70 | 15.29 | 40.51 | 25.84 | 17.89 |
| resnet50_int8 | 399.64 | 217.74 | 112.86 | 408.80 | 224.72 | 122.15 |

| Snapdragon 835 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1_int8 | 59.99 | 31.59 | 16.55 | 62.92 | 33.33 | 17.38 |
| mobilenet_v2_int8 | 50.68 | 31.25 | 21.62 | 52.56 | 33.88 | 24.31 |
| resnet50_int8 | 498.85 | 267.65 | 146.03 | 510.54 | 278.77 | 155.05 |

| Snapdragon 625 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1_int8 | 122.86 | 63.52 | 33.91 | 125.77 | 64.78 | 34.25 |
| mobilenet_v2_int8 | 110.71 | 67.76 | 49.85 | 114.63 | 71.74 | 51.73 |
| resnet50_int8 | 954.67 | 505.78 | 286.64 | 1016.64 | 532.84 | 305.20 |

| Snapdragon 653 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1_int8 | 81.46 | 42.99 | 31.69 | 81.20 | 42.46 | 23.47 |
| mobilenet_v2_int8 | 68.39 | 43.47 | 32.03 | 69.40 | 44.47 | 33.46 |
| resnet50_int8 | 687.59 | 369.70 | 208.99 | 684.55 | 369.04 | 208.42 |

| Kirin 970 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1_int8 | 64.27 | 35.48 | 18.76 | 64.63 | 37.67 | 20.70 |
| mobilenet_v2_int8 | 64.54 | 36.76 | 22.17 | 68.80 | 38.85 | 24.30 |
| resnet50_int8 | 509.94 | 268.95 | 276.13 | 520.57 | 281.92 | 157.82 |

diff --git a/benchmark_tools.md b/benchmark_tools.md
new file mode 100644
index 0000000000..8148f712f0
--- /dev/null
+++ b/benchmark_tools.md
@@ -0,0 +1,196 @@

 * [Benchmark](#benchmark)
 * [Environment Setup](#environment-setup)
 * [1. One-click Benchmark](#1-one-click-benchmark)
 * [2. Step-by-step Benchmark](#2-step-by-step-benchmark)
   * [1. Get the benchmark binary](#1-get-the-benchmark-binary)
   * [2. Download the models](#2-download-the-models)
   * [3. The benchmark.sh script](#3-the-benchmarksh-script)
   * [4. Run the test](#4-run-the-test)


# Benchmark

This article describes how to benchmark Paddle-Lite on an Android phone from a terminal, in an **Ubuntu 16.04 cross-compilation environment**, using two approaches:

1. **One-click benchmark**: for users who want quick numbers for common models, using prebuilt benchmark binaries;
2. **Step-by-step benchmark**: the one-click flow broken down into individual steps.

# Environment Setup

1. Install [adb](https://developer.android.com/studio/command-line/adb) and the other prerequisites:
```shell
sudo apt update
sudo apt install -y wget adb
```
2. Check the phone-computer connection. Connect the Android phone over USB, then open Settings -> enable Developer Mode -> enable USB debugging -> allow (authorize) this computer to debug the phone;
3. Run the `adb devices` command in a terminal on the computer to list the connected devices:
```shell
adb devices
```
On success, the output looks like the following (serial numbers will differ):
```shell
List of devices attached
712QSDSEMMS7C device
```
## 1. One-click Benchmark

Run the following to complete the benchmark:

```shell
wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/run_benchmark.sh
sh run_benchmark.sh
```

The `run_benchmark.sh` script will:

1. download the models and push them to the phone: mobilenetv1/v2, shufflenetv2, squeezenetv1.1, and mnasnet;
2. download the prebuilt android-armv7 and android-armv8 binaries and push them to the phone: `benchmark_bin_v7` and `benchmark_bin_v8`;
3. automatically run a second script, `benchmark.sh` (with several phones connected over USB, append the target phone's `serial number` to the `adb` commands inside `benchmark.sh`);
4. pull the benchmark results `result_armv7.txt` and `result_armv8.txt` from the phone into the current directory and print them.

## 2. Step-by-step Benchmark

### 1. Get the benchmark binary

The benchmark_bin binary measures PaddleLite's performance; it can be obtained in either of two ways.

#### Option 1: download the prebuilt benchmark_bin

```shell
# Download benchmark_bin for android-armv7
wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_bin_v7

# Download benchmark_bin for android-armv8
wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_bin_v8
```

#### Option 2: build benchmark_bin from source

Set up a build environment per [source_compile](./source_compile), check out the latest PaddleLite release, and run the following from the repository root:

```shell
###########################################
#  Build benchmark_bin for android-armv7  #
###########################################
./lite/tools/ci_build.sh \
  --arm_os="android" \
  --arm_abi="armv7" \
  --arm_lang="gcc " \
  build_arm

# build result see: /build.lite.android.armv7.gcc/lite/api/benchmark_bin

###########################################
#  Build benchmark_bin for android-armv8  #
###########################################
./lite/tools/ci_build.sh \
  --arm_os="android" \
  --arm_abi="armv8" \
  --arm_lang="gcc " \
  build_arm

# build result see: /build.lite.android.armv8.gcc/lite/api/benchmark_bin
```

> **Note**: to avoid problems reaching the phone from inside docker, it is recommended to build benchmark_bin, leave docker, and copy the binary into a temporary directory. Then download the models, copy the script, and test from that temporary directory following the steps below.

### 2. Download the models

PaddleLite provides [common benchmark models](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_models.tar.gz).

Run the following to download and extract them:

```shell
wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_models.tar.gz
tar zxvf benchmark_models.tar.gz
```

| Model | Download link |
| --------------- | ------------------------------------------------------------ |
| MobilenetV1 | [download](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/mobilenet_v1.tar.gz) |
| MobilenetV2 | [download](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/mobilenet_v2.tar.gz) |
| ShufflenetV2 | [download](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/shufflenet_v2.tar.gz) |
| Squeezenet_V1.1 | [download](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/squeezenet_v11.tar.gz) |
| Mnasnet | [download](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/mnasnet.tar.gz) |

> Note: to use the test script **on a single model**, put just that model inside the `benchmark_models` folder, and keep the script and the `benchmark_models` folder in the same directory (a worked example follows below).

Note: all of the models above have already been converted with `model_optimize_tool`, and the Lite mobile runtime only loads converted models. To test other models, first see [model conversion](./model_optimize_tool).
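As a concrete example of the single-model note above, the following shell sketch benchmarks only mobilenet_v1. The directory layout is the one the note prescribes; the download URL and script invocation mirror the commands elsewhere in this article.

```shell
# Keep only one model inside benchmark_models, next to the script and binary.
wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/mobilenet_v1.tar.gz
rm -rf benchmark_models && mkdir benchmark_models
tar zxvf mobilenet_v1.tar.gz -C benchmark_models

# Run the armv8 benchmark against just this model (see sections 3 and 4 below).
sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt
cat result_armv8.txt
```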
测试 + +从终端进入benchmark.sh、可执行文件(benchmark_bin_v7、benchmark_bin_v8)和模型文件(benchmark_models)所在文件夹。 + +运行 benchmark.sh 脚本执行测试 + +```shell +# Benchmark for android-armv7 +sh benchmark.sh ./benchmark_bin_v7 ./benchmark_models result_armv7.txt + +# Benchmark for android-armv8 +sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt +``` +测试结束后,armv7和armv8的结果,分别保存在当前目录下的`result_armv7.txt`和`result_armv8.txt`文件中。 + +**查看测试结果** + +在当前目录的`result_armv7.txt`和`result_armv8.txt`文件,查看测试结果。 + +```shell +run benchmark armv7 +-------------------------------------- +PaddleLite Benchmark +Threads=1 Warmup=10 Repeats=30 +-- mnasnet avg = 159.8427 ms +-- mobilenet_v1 avg = 235.0072 ms +-- mobilenet_v2 avg = 173.0387 ms +-- shufflenet_v2 avg = 76.0040 ms +-- squeezenet_v11 avg = 164.2957 ms + +Threads=2 Warmup=10 Repeats=30 +-- mnasnet avg = 83.1287 ms +-- mobilenet_v1 avg = 121.6029 ms +-- mobilenet_v2 avg = 86.6175 ms +-- shufflenet_v2 avg = 41.5761 ms +-- squeezenet_v11 avg = 87.8678 ms + +Threads=4 Warmup=10 Repeats=30 +-- mnasnet avg = 73.3880 ms +-- mobilenet_v1 avg = 119.0739 ms +-- mobilenet_v2 avg = 85.3050 ms +-- shufflenet_v2 avg = 38.0762 ms +-- squeezenet_v11 avg = 64.2201 ms +-------------------------------------- + +run benchmark armv8 +-------------------------------------- +PaddleLite Benchmark +Threads=1 Warmup=10 Repeats=30 +-- mnasnet avg = 165.3073 ms +-- mobilenet_v1 avg = 306.0188 ms +-- mobilenet_v2 avg = 195.1884 ms +-- shufflenet_v2 avg = 99.3692 ms +-- squeezenet_v11 avg = 156.6971 ms + +Threads=2 Warmup=10 Repeats=30 +-- mnasnet avg = 90.2290 ms +-- mobilenet_v1 avg = 157.0007 ms +-- mobilenet_v2 avg = 118.1607 ms +-- shufflenet_v2 avg = 68.6804 ms +-- squeezenet_v11 avg = 91.3090 ms + +Threads=4 Warmup=10 Repeats=30 +-- mnasnet avg = 179.9730 ms +-- mobilenet_v1 avg = 204.0684 ms +-- mobilenet_v2 avg = 181.6486 ms +-- shufflenet_v2 avg = 123.2728 ms +-- squeezenet_v11 avg = 412.9046 ms +-------------------------------------- +``` \ No newline at end of file diff --git a/benchmark_tools.md.toc.2019-08-25_233116 b/benchmark_tools.md.toc.2019-08-25_233116 new file mode 100644 index 0000000000..6fbec144e8 --- /dev/null +++ b/benchmark_tools.md.toc.2019-08-25_233116 @@ -0,0 +1,11 @@ + * [Benchmark 测试方法](#benchmark-测试方法) + * [1. 一键Benchmark](#1-一键benchmark) + * [2. 逐步测试说明](#2-逐步测试说明) + * [1. benchmark可执行文件](#1-benchmark可执行文件) + * [2. 下载模型](#2-下载模型) + * [3. benchmark.sh 脚本](#3-benchmarksh-脚本) + * [4. 测试](#4-测试) + * [3. 完整实例](#3-完整实例) + + + diff --git a/benchmark_tools.md.toc.2019-08-25_233528 b/benchmark_tools.md.toc.2019-08-25_233528 new file mode 100644 index 0000000000..238a7cb053 --- /dev/null +++ b/benchmark_tools.md.toc.2019-08-25_233528 @@ -0,0 +1,11 @@ + * [Benchmark 测试方法](#benchmark-测试方法) + * [1. 一键Benchmark](#1-一键benchmark) + * [2. 逐步测试说明](#2-逐步测试说明) + * [1. benchmark可执行文件](#1-benchmark可执行文件) + * [2. 下载模型](#2-下载模型) + * [3. benchmark.sh 脚本](#3-benchmarksh-脚本) + * [4. 测试](#4-测试) + * [3. 完整实例](#3-完整实例) + + + diff --git a/cmake/FindGflags.cmake b/cmake/FindGflags.cmake deleted file mode 100644 index 6587089ba3..0000000000 --- a/cmake/FindGflags.cmake +++ /dev/null @@ -1,582 +0,0 @@ -# Ceres Solver - A fast non-linear least squares minimizer -# Copyright 2015 Google Inc. All rights reserved. 
-# http://ceres-solver.org/ -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# * Neither the name of Google Inc. nor the names of its contributors may be -# used to endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# -# Author: alexs.mac@gmail.com (Alex Stewart) -# - -# FindGflags.cmake - Find Google gflags logging library. -# -# This module will attempt to find gflags, either via an exported CMake -# configuration (generated by gflags >= 2.1 which are built with CMake), or -# by performing a standard search for all gflags components. The order of -# precedence for these two methods of finding gflags is controlled by: -# GFLAGS_PREFER_EXPORTED_GFLAGS_CMAKE_CONFIGURATION. -# -# This module defines the following variables: -# -# GFLAGS_FOUND: TRUE iff gflags is found. -# GFLAGS_INCLUDE_DIRS: Include directories for gflags. -# GFLAGS_LIBRARIES: Libraries required to link gflags. -# GFLAGS_NAMESPACE: The namespace in which gflags is defined. In versions of -# gflags < 2.1, this was google, for versions >= 2.1 it is -# by default gflags, although can be configured when building -# gflags to be something else (i.e. google for legacy -# compatibility). -# -# The following variables control the behaviour of this module when an exported -# gflags CMake configuration is not found. -# -# GFLAGS_PREFER_EXPORTED_GFLAGS_CMAKE_CONFIGURATION: TRUE/FALSE, iff TRUE then -# then prefer using an exported CMake configuration -# generated by gflags >= 2.1 over searching for the -# gflags components manually. Otherwise (FALSE) -# ignore any exported gflags CMake configurations and -# always perform a manual search for the components. -# Default: TRUE iff user does not define this variable -# before we are called, and does NOT specify either -# GFLAGS_INCLUDE_DIR_HINTS or GFLAGS_LIBRARY_DIR_HINTS -# otherwise FALSE. -# GFLAGS_INCLUDE_DIR_HINTS: List of additional directories in which to -# search for gflags includes, e.g: /timbuktu/include. -# GFLAGS_LIBRARY_DIR_HINTS: List of additional directories in which to -# search for gflags libraries, e.g: /timbuktu/lib. 
-# -# The following variables are also defined by this module, but in line with -# CMake recommended FindPackage() module style should NOT be referenced directly -# by callers (use the plural variables detailed above instead). These variables -# do however affect the behaviour of the module via FIND_[PATH/LIBRARY]() which -# are NOT re-called (i.e. search for library is not repeated) if these variables -# are set with valid values _in the CMake cache_. This means that if these -# variables are set directly in the cache, either by the user in the CMake GUI, -# or by the user passing -DVAR=VALUE directives to CMake when called (which -# explicitly defines a cache variable), then they will be used verbatim, -# bypassing the HINTS variables and other hard-coded search locations. -# -# GFLAGS_INCLUDE_DIR: Include directory for gflags, not including the -# include directory of any dependencies. -# GFLAGS_LIBRARY: gflags library, not including the libraries of any -# dependencies. - -# Reset CALLERS_CMAKE_FIND_LIBRARY_PREFIXES to its value when FindGflags was -# invoked, necessary for MSVC. -macro(GFLAGS_RESET_FIND_LIBRARY_PREFIX) - if (MSVC) - set(CMAKE_FIND_LIBRARY_PREFIXES "${CALLERS_CMAKE_FIND_LIBRARY_PREFIXES}") - endif (MSVC) -endmacro(GFLAGS_RESET_FIND_LIBRARY_PREFIX) - -# Called if we failed to find gflags or any of it's required dependencies, -# unsets all public (designed to be used externally) variables and reports -# error message at priority depending upon [REQUIRED/QUIET/] argument. -macro(GFLAGS_REPORT_NOT_FOUND REASON_MSG) - unset(GFLAGS_FOUND) - unset(GFLAGS_INCLUDE_DIRS) - unset(GFLAGS_LIBRARIES) - # Do not use unset, as we want to keep GFLAGS_NAMESPACE in the cache, - # but simply clear its value. - set(GFLAGS_NAMESPACE "" CACHE STRING - "gflags namespace (google or gflags)" FORCE) - - # Make results of search visible in the CMake GUI if gflags has not - # been found so that user does not have to toggle to advanced view. - mark_as_advanced(CLEAR GFLAGS_INCLUDE_DIR - GFLAGS_LIBRARY - GFLAGS_NAMESPACE) - - gflags_reset_find_library_prefix() - - # Note _FIND_[REQUIRED/QUIETLY] variables defined by FindPackage() - # use the camelcase library name, not uppercase. - if (Gflags_FIND_QUIETLY) - message(STATUS "Failed to find gflags - " ${REASON_MSG} ${ARGN}) - elseif (Gflags_FIND_REQUIRED) - message(FATAL_ERROR "Failed to find gflags - " ${REASON_MSG} ${ARGN}) - else() - # Neither QUIETLY nor REQUIRED, use no priority which emits a message - # but continues configuration and allows generation. - message("-- Failed to find gflags - " ${REASON_MSG} ${ARGN}) - endif () - return() -endmacro(GFLAGS_REPORT_NOT_FOUND) - -# Verify that all variable names passed as arguments are defined (can be empty -# but must be defined) or raise a fatal error. -macro(GFLAGS_CHECK_VARS_DEFINED) - foreach(CHECK_VAR ${ARGN}) - if (NOT DEFINED ${CHECK_VAR}) - message(FATAL_ERROR "Ceres Bug: ${CHECK_VAR} is not defined.") - endif() - endforeach() -endmacro(GFLAGS_CHECK_VARS_DEFINED) - -# Use check_cxx_source_compiles() to compile trivial test programs to determine -# the gflags namespace. This works on all OSs except Windows. If using Visual -# Studio, it fails because msbuild forces check_cxx_source_compiles() to use -# CMAKE_BUILD_TYPE=Debug for the test project, which usually breaks detection -# because MSVC requires that the test project use the same build type as gflags, -# which would normally be built in Release. 
-# -# Defines: GFLAGS_NAMESPACE in the caller's scope with the detected namespace, -# which is blank (empty string, will test FALSE is CMake conditionals) -# if detection failed. -function(GFLAGS_CHECK_GFLAGS_NAMESPACE_USING_TRY_COMPILE) - # Verify that all required variables are defined. - gflags_check_vars_defined( - GFLAGS_INCLUDE_DIR GFLAGS_LIBRARY) - # Ensure that GFLAGS_NAMESPACE is always unset on completion unless - # we explicitly set if after having the correct namespace. - set(GFLAGS_NAMESPACE "" PARENT_SCOPE) - - include(CheckCXXSourceCompiles) - # Setup include path & link library for gflags for CHECK_CXX_SOURCE_COMPILES. - set(CMAKE_REQUIRED_INCLUDES ${GFLAGS_INCLUDE_DIR}) - set(CMAKE_REQUIRED_LIBRARIES ${GFLAGS_LIBRARY} ${GFLAGS_LINK_LIBRARIES}) - # First try the (older) google namespace. Note that the output variable - # MUST be unique to the build type as otherwise the test is not repeated as - # it is assumed to have already been performed. - check_cxx_source_compiles( - "#include - int main(int argc, char * argv[]) { - google::ParseCommandLineFlags(&argc, &argv, true); - return 0; - }" - GFLAGS_IN_GOOGLE_NAMESPACE) - if (GFLAGS_IN_GOOGLE_NAMESPACE) - set(GFLAGS_NAMESPACE google PARENT_SCOPE) - return() - endif() - - # Try (newer) gflags namespace instead. Note that the output variable - # MUST be unique to the build type as otherwise the test is not repeated as - # it is assumed to have already been performed. - set(CMAKE_REQUIRED_INCLUDES ${GFLAGS_INCLUDE_DIR}) - set(CMAKE_REQUIRED_LIBRARIES ${GFLAGS_LIBRARY} ${GFLAGS_LINK_LIBRARIES}) - check_cxx_source_compiles( - "#include - int main(int argc, char * argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - return 0; - }" - GFLAGS_IN_GFLAGS_NAMESPACE) - if (GFLAGS_IN_GFLAGS_NAMESPACE) - set(GFLAGS_NAMESPACE gflags PARENT_SCOPE) - return() - endif (GFLAGS_IN_GFLAGS_NAMESPACE) -endfunction(GFLAGS_CHECK_GFLAGS_NAMESPACE_USING_TRY_COMPILE) - -# Use regex on the gflags headers to attempt to determine the gflags namespace. -# Checks both gflags.h (contained namespace on versions < 2.1.2) and -# gflags_declare.h, which contains the namespace on versions >= 2.1.2. -# In general, this method should only be used when -# GFLAGS_CHECK_GFLAGS_NAMESPACE_USING_TRY_COMPILE() cannot be used, or has -# failed. -# -# Defines: GFLAGS_NAMESPACE in the caller's scope with the detected namespace, -# which is blank (empty string, will test FALSE is CMake conditionals) -# if detection failed. -function(GFLAGS_CHECK_GFLAGS_NAMESPACE_USING_REGEX) - # Verify that all required variables are defined. - gflags_check_vars_defined(GFLAGS_INCLUDE_DIR) - # Ensure that GFLAGS_NAMESPACE is always undefined on completion unless - # we explicitly set if after having the correct namespace. - set(GFLAGS_NAMESPACE "" PARENT_SCOPE) - - # Scan gflags.h to identify what namespace gflags was built with. On - # versions of gflags < 2.1.2, gflags.h was configured with the namespace - # directly, on >= 2.1.2, gflags.h uses the GFLAGS_NAMESPACE #define which - # is defined in gflags_declare.h, we try each location in turn. 
- set(GFLAGS_HEADER_FILE ${GFLAGS_INCLUDE_DIR}/gflags/gflags.h) - if (NOT EXISTS ${GFLAGS_HEADER_FILE}) - gflags_report_not_found( - "Could not find file: ${GFLAGS_HEADER_FILE} " - "containing namespace information in gflags install located at: " - "${GFLAGS_INCLUDE_DIR}.") - endif() - file(READ ${GFLAGS_HEADER_FILE} GFLAGS_HEADER_FILE_CONTENTS) - - string(REGEX MATCH "namespace [A-Za-z]+" - GFLAGS_NAMESPACE "${GFLAGS_HEADER_FILE_CONTENTS}") - string(REGEX REPLACE "namespace ([A-Za-z]+)" "\\1" - GFLAGS_NAMESPACE "${GFLAGS_NAMESPACE}") - - if (NOT GFLAGS_NAMESPACE) - gflags_report_not_found( - "Failed to extract gflags namespace from header file: " - "${GFLAGS_HEADER_FILE}.") - endif (NOT GFLAGS_NAMESPACE) - - if (GFLAGS_NAMESPACE STREQUAL "google" OR - GFLAGS_NAMESPACE STREQUAL "gflags") - # Found valid gflags namespace from gflags.h. - set(GFLAGS_NAMESPACE "${GFLAGS_NAMESPACE}" PARENT_SCOPE) - return() - endif() - - # Failed to find gflags namespace from gflags.h, gflags is likely a new - # version, check gflags_declare.h, which in newer versions (>= 2.1.2) contains - # the GFLAGS_NAMESPACE #define, which is then referenced in gflags.h. - set(GFLAGS_DECLARE_FILE ${GFLAGS_INCLUDE_DIR}/gflags/gflags_declare.h) - if (NOT EXISTS ${GFLAGS_DECLARE_FILE}) - gflags_report_not_found( - "Could not find file: ${GFLAGS_DECLARE_FILE} " - "containing namespace information in gflags install located at: " - "${GFLAGS_INCLUDE_DIR}.") - endif() - file(READ ${GFLAGS_DECLARE_FILE} GFLAGS_DECLARE_FILE_CONTENTS) - - string(REGEX MATCH "#define GFLAGS_NAMESPACE [A-Za-z]+" - GFLAGS_NAMESPACE "${GFLAGS_DECLARE_FILE_CONTENTS}") - string(REGEX REPLACE "#define GFLAGS_NAMESPACE ([A-Za-z]+)" "\\1" - GFLAGS_NAMESPACE "${GFLAGS_NAMESPACE}") - - if (NOT GFLAGS_NAMESPACE) - gflags_report_not_found( - "Failed to extract gflags namespace from declare file: " - "${GFLAGS_DECLARE_FILE}.") - endif (NOT GFLAGS_NAMESPACE) - - if (GFLAGS_NAMESPACE STREQUAL "google" OR - GFLAGS_NAMESPACE STREQUAL "gflags") - # Found valid gflags namespace from gflags.h. - set(GFLAGS_NAMESPACE "${GFLAGS_NAMESPACE}" PARENT_SCOPE) - return() - endif() -endfunction(GFLAGS_CHECK_GFLAGS_NAMESPACE_USING_REGEX) - -# ----------------------------------------------------------------- -# By default, if the user has expressed no preference for using an exported -# gflags CMake configuration over performing a search for the installed -# components, and has not specified any hints for the search locations, then -# prefer a gflags exported configuration if available. -if (NOT DEFINED GFLAGS_PREFER_EXPORTED_GFLAGS_CMAKE_CONFIGURATION - AND NOT GFLAGS_INCLUDE_DIR_HINTS - AND NOT GFLAGS_LIBRARY_DIR_HINTS) - message(STATUS "No preference for use of exported gflags CMake configuration " - "set, and no hints for include/library directories provided. " - "Defaulting to preferring an installed/exported gflags CMake configuration " - "if available.") - set(GFLAGS_PREFER_EXPORTED_GFLAGS_CMAKE_CONFIGURATION TRUE) -endif() - -if (GFLAGS_PREFER_EXPORTED_GFLAGS_CMAKE_CONFIGURATION) - # Try to find an exported CMake configuration for gflags, as generated by - # gflags versions >= 2.1. - # - # We search twice, s/t we can invert the ordering of precedence used by - # find_package() for exported package build directories, and installed - # packages (found via CMAKE_SYSTEM_PREFIX_PATH), listed as items 6) and 7) - # respectively in [1]. - # - # By default, exported build directories are (in theory) detected first, and - # this is usually the case on Windows. 
However, on OS X & Linux, the install - # path (/usr/local) is typically present in the PATH environment variable - # which is checked in item 4) in [1] (i.e. before both of the above, unless - # NO_SYSTEM_ENVIRONMENT_PATH is passed). As such on those OSs installed - # packages are usually detected in preference to exported package build - # directories. - # - # To ensure a more consistent response across all OSs, and as users usually - # want to prefer an installed version of a package over a locally built one - # where both exist (esp. as the exported build directory might be removed - # after installation), we first search with NO_CMAKE_PACKAGE_REGISTRY which - # means any build directories exported by the user are ignored, and thus - # installed directories are preferred. If this fails to find the package - # we then research again, but without NO_CMAKE_PACKAGE_REGISTRY, so any - # exported build directories will now be detected. - # - # To prevent confusion on Windows, we also pass NO_CMAKE_BUILDS_PATH (which - # is item 5) in [1]), to not preferentially use projects that were built - # recently with the CMake GUI to ensure that we always prefer an installed - # version if available. - # - # [1] http://www.cmake.org/cmake/help/v2.8.11/cmake.html#command:find_package - find_package(gflags QUIET - NO_MODULE - NO_CMAKE_PACKAGE_REGISTRY - NO_CMAKE_BUILDS_PATH) - if (gflags_FOUND) - message(STATUS "Found installed version of gflags: ${gflags_DIR}") - else(gflags_FOUND) - # Failed to find an installed version of gflags, repeat search allowing - # exported build directories. - message(STATUS "Failed to find installed gflags CMake configuration, " - "searching for gflags build directories exported with CMake.") - # Again pass NO_CMAKE_BUILDS_PATH, as we know that gflags is exported and - # do not want to treat projects built with the CMake GUI preferentially. - find_package(gflags QUIET - NO_MODULE - NO_CMAKE_BUILDS_PATH) - if (gflags_FOUND) - message(STATUS "Found exported gflags build directory: ${gflags_DIR}") - endif(gflags_FOUND) - endif(gflags_FOUND) - - set(FOUND_INSTALLED_GFLAGS_CMAKE_CONFIGURATION ${gflags_FOUND}) - - # gflags v2.1 - 2.1.2 shipped with a bug in their gflags-config.cmake [1] - # whereby gflags_LIBRARIES = "gflags", but there was no imported target - # called "gflags", they were called: gflags[_nothreads]-[static/shared]. - # As this causes linker errors when gflags is not installed in a location - # on the current library paths, detect if this problem is present and - # fix it. - # - # [1] https://github.com/gflags/gflags/issues/110 - if (gflags_FOUND) - # NOTE: This is not written as additional conditions in the outer - # if (gflags_FOUND) as the NOT TARGET "${gflags_LIBRARIES}" - # condition causes problems if gflags is not found. - if (${gflags_VERSION} VERSION_LESS 2.1.3 AND - NOT TARGET "${gflags_LIBRARIES}") - message(STATUS "Detected broken gflags install in: ${gflags_DIR}, " - "version: ${gflags_VERSION} <= 2.1.2 which defines gflags_LIBRARIES = " - "${gflags_LIBRARIES} which is not an imported CMake target, see: " - "https://github.com/gflags/gflags/issues/110. Attempting to fix by " - "detecting correct gflags target.") - # Ordering here expresses preference for detection, specifically we do not - # want to use the _nothreads variants if the full library is available. 
- list(APPEND CHECK_GFLAGS_IMPORTED_TARGET_NAMES - gflags-shared gflags-static - gflags_nothreads-shared gflags_nothreads-static) - foreach(CHECK_GFLAGS_TARGET ${CHECK_GFLAGS_IMPORTED_TARGET_NAMES}) - if (TARGET ${CHECK_GFLAGS_TARGET}) - message(STATUS "Found valid gflags target: ${CHECK_GFLAGS_TARGET}, " - "updating gflags_LIBRARIES.") - set(gflags_LIBRARIES ${CHECK_GFLAGS_TARGET}) - break() - endif() - endforeach() - if (NOT TARGET ${gflags_LIBRARIES}) - message(STATUS "Failed to fix detected broken gflags install in: " - "${gflags_DIR}, version: ${gflags_VERSION} <= 2.1.2, none of the " - "imported targets for gflags: ${CHECK_GFLAGS_IMPORTED_TARGET_NAMES} " - "are defined. Will continue with a manual search for gflags " - "components. We recommend you build/install a version of gflags > " - "2.1.2 (or master).") - set(FOUND_INSTALLED_GFLAGS_CMAKE_CONFIGURATION FALSE) - endif() - endif() - endif() - - if (FOUND_INSTALLED_GFLAGS_CMAKE_CONFIGURATION) - message(STATUS "Detected gflags version: ${gflags_VERSION}") - set(GFLAGS_FOUND ${gflags_FOUND}) - set(GFLAGS_INCLUDE_DIR ${gflags_INCLUDE_DIR}) - set(GFLAGS_LIBRARY ${gflags_LIBRARIES}) - - # gflags does not export the namespace in their CMake configuration, so - # use our function to determine what it should be, as it can be either - # gflags or google dependent upon version & configuration. - # - # NOTE: We use the regex method to determine the namespace here, as - # check_cxx_source_compiles() will not use imported targets, which - # is what gflags will be in this case. - gflags_check_gflags_namespace_using_regex() - - if (NOT GFLAGS_NAMESPACE) - gflags_report_not_found( - "Failed to determine gflags namespace using regex for gflags " - "version: ${gflags_VERSION} exported here: ${gflags_DIR} using CMake.") - endif (NOT GFLAGS_NAMESPACE) - else (FOUND_INSTALLED_GFLAGS_CMAKE_CONFIGURATION) - message(STATUS "Failed to find an installed/exported CMake configuration " - "for gflags, will perform search for installed gflags components.") - endif (FOUND_INSTALLED_GFLAGS_CMAKE_CONFIGURATION) -endif(GFLAGS_PREFER_EXPORTED_GFLAGS_CMAKE_CONFIGURATION) - -if (NOT GFLAGS_FOUND) - # Either failed to find an exported gflags CMake configuration, or user - # told us not to use one. Perform a manual search for all gflags components. - - # Handle possible presence of lib prefix for libraries on MSVC, see - # also GFLAGS_RESET_FIND_LIBRARY_PREFIX(). - if (MSVC) - # Preserve the caller's original values for CMAKE_FIND_LIBRARY_PREFIXES - # s/t we can set it back before returning. - set(CALLERS_CMAKE_FIND_LIBRARY_PREFIXES "${CMAKE_FIND_LIBRARY_PREFIXES}") - # The empty string in this list is important, it represents the case when - # the libraries have no prefix (shared libraries / DLLs). - set(CMAKE_FIND_LIBRARY_PREFIXES "lib" "" "${CMAKE_FIND_LIBRARY_PREFIXES}") - endif (MSVC) - - # Search user-installed locations first, so that we prefer user installs - # to system installs where both exist. - list(APPEND GFLAGS_CHECK_INCLUDE_DIRS - /usr/local/include - /usr/local/homebrew/include # Mac OS X - /opt/local/var/macports/software # Mac OS X. - /opt/local/include - /usr/include) - list(APPEND GFLAGS_CHECK_PATH_SUFFIXES - gflags/include # Windows (for C:/Program Files prefix). - gflags/Include ) # Windows (for C:/Program Files prefix). - - list(APPEND GFLAGS_CHECK_LIBRARY_DIRS - /usr/local/lib - /usr/local/homebrew/lib # Mac OS X. - /opt/local/lib - /usr/lib) - list(APPEND GFLAGS_CHECK_LIBRARY_SUFFIXES - gflags/lib # Windows (for C:/Program Files prefix). 
- gflags/Lib ) # Windows (for C:/Program Files prefix). - - # Search supplied hint directories first if supplied. - find_path(GFLAGS_INCLUDE_DIR - NAMES gflags/gflags.h - PATHS ${GFLAGS_INCLUDE_DIR_HINTS} - ${GFLAGS_CHECK_INCLUDE_DIRS} - PATH_SUFFIXES ${GFLAGS_CHECK_PATH_SUFFIXES}) - if (NOT GFLAGS_INCLUDE_DIR OR - NOT EXISTS ${GFLAGS_INCLUDE_DIR}) - gflags_report_not_found( - "Could not find gflags include directory, set GFLAGS_INCLUDE_DIR " - "to directory containing gflags/gflags.h") - endif (NOT GFLAGS_INCLUDE_DIR OR - NOT EXISTS ${GFLAGS_INCLUDE_DIR}) - - find_library(GFLAGS_LIBRARY NAMES gflags - PATHS ${GFLAGS_LIBRARY_DIR_HINTS} - ${GFLAGS_CHECK_LIBRARY_DIRS} - PATH_SUFFIXES ${GFLAGS_CHECK_LIBRARY_SUFFIXES}) - if (NOT GFLAGS_LIBRARY OR - NOT EXISTS ${GFLAGS_LIBRARY}) - gflags_report_not_found( - "Could not find gflags library, set GFLAGS_LIBRARY " - "to full path to libgflags.") - endif (NOT GFLAGS_LIBRARY OR - NOT EXISTS ${GFLAGS_LIBRARY}) - - # gflags typically requires a threading library (which is OS dependent), note - # that this defines the CMAKE_THREAD_LIBS_INIT variable. If we are able to - # detect threads, we assume that gflags requires it. - find_package(Threads QUIET) - set(GFLAGS_LINK_LIBRARIES ${CMAKE_THREAD_LIBS_INIT}) - # On Windows (including MinGW), the Shlwapi library is used by gflags if - # available. - if (WIN32) - include(CheckIncludeFileCXX) - check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI) - if (HAVE_SHLWAPI) - list(APPEND GFLAGS_LINK_LIBRARIES shlwapi.lib) - endif(HAVE_SHLWAPI) - endif (WIN32) - - # Mark internally as found, then verify. GFLAGS_REPORT_NOT_FOUND() unsets - # if called. - set(GFLAGS_FOUND TRUE) - - # Identify what namespace gflags was built with. - if (GFLAGS_INCLUDE_DIR AND NOT GFLAGS_NAMESPACE) - # To handle Windows peculiarities / CMake bugs on MSVC we try two approaches - # to detect the gflags namespace: - # - # 1) Try to use check_cxx_source_compiles() to compile a trivial program - # with the two choices for the gflags namespace. - # - # 2) [In the event 1) fails] Use regex on the gflags headers to try to - # determine the gflags namespace. Whilst this is less robust than 1), - # it does avoid any interaction with msbuild. - gflags_check_gflags_namespace_using_try_compile() - - if (NOT GFLAGS_NAMESPACE) - # Failed to determine gflags namespace using check_cxx_source_compiles() - # method, try and obtain it using regex on the gflags headers instead. - message(STATUS "Failed to find gflags namespace using using " - "check_cxx_source_compiles(), trying namespace regex instead, " - "this is expected on Windows.") - gflags_check_gflags_namespace_using_regex() - - if (NOT GFLAGS_NAMESPACE) - gflags_report_not_found( - "Failed to determine gflags namespace either by " - "check_cxx_source_compiles(), or namespace regex.") - endif (NOT GFLAGS_NAMESPACE) - endif (NOT GFLAGS_NAMESPACE) - endif (GFLAGS_INCLUDE_DIR AND NOT GFLAGS_NAMESPACE) - - # Make the GFLAGS_NAMESPACE a cache variable s/t the user can view it, and could - # overwrite it in the CMake GUI. - set(GFLAGS_NAMESPACE "${GFLAGS_NAMESPACE}" CACHE STRING - "gflags namespace (google or gflags)" FORCE) - - # gflags does not seem to provide any record of the version in its - # source tree, thus cannot extract version. - - # Catch case when caller has set GFLAGS_NAMESPACE in the cache / GUI - # with an invalid value. 
- if (GFLAGS_NAMESPACE AND - NOT GFLAGS_NAMESPACE STREQUAL "google" AND - NOT GFLAGS_NAMESPACE STREQUAL "gflags") - gflags_report_not_found( - "Caller defined GFLAGS_NAMESPACE:" - " ${GFLAGS_NAMESPACE} is not valid, not google or gflags.") - endif () - # Catch case when caller has set GFLAGS_INCLUDE_DIR in the cache / GUI and - # thus FIND_[PATH/LIBRARY] are not called, but specified locations are - # invalid, otherwise we would report the library as found. - if (GFLAGS_INCLUDE_DIR AND - NOT EXISTS ${GFLAGS_INCLUDE_DIR}/gflags/gflags.h) - gflags_report_not_found( - "Caller defined GFLAGS_INCLUDE_DIR:" - " ${GFLAGS_INCLUDE_DIR} does not contain gflags/gflags.h header.") - endif (GFLAGS_INCLUDE_DIR AND - NOT EXISTS ${GFLAGS_INCLUDE_DIR}/gflags/gflags.h) - # TODO: This regex for gflags library is pretty primitive, we use lowercase - # for comparison to handle Windows using CamelCase library names, could - # this check be better? - string(TOLOWER "${GFLAGS_LIBRARY}" LOWERCASE_GFLAGS_LIBRARY) - if (GFLAGS_LIBRARY AND - NOT "${LOWERCASE_GFLAGS_LIBRARY}" MATCHES ".*gflags[^/]*") - gflags_report_not_found( - "Caller defined GFLAGS_LIBRARY: " - "${GFLAGS_LIBRARY} does not match gflags.") - endif (GFLAGS_LIBRARY AND - NOT "${LOWERCASE_GFLAGS_LIBRARY}" MATCHES ".*gflags[^/]*") - - gflags_reset_find_library_prefix() - -endif(NOT GFLAGS_FOUND) - -# Set standard CMake FindPackage variables if found. -if (GFLAGS_FOUND) - set(GFLAGS_INCLUDE_DIRS ${GFLAGS_INCLUDE_DIR}) - set(GFLAGS_LIBRARIES ${GFLAGS_LIBRARY} ${GFLAGS_LINK_LIBRARIES}) -endif (GFLAGS_FOUND) - -# Handle REQUIRED / QUIET optional arguments. -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(Gflags DEFAULT_MSG - GFLAGS_INCLUDE_DIRS GFLAGS_LIBRARIES GFLAGS_NAMESPACE) - -# Only mark internal variables as advanced if we found gflags, otherwise -# leave them visible in the standard GUI for the user to set manually. -if (GFLAGS_FOUND) - mark_as_advanced(FORCE GFLAGS_INCLUDE_DIR - GFLAGS_LIBRARY - GFLAGS_NAMESPACE - gflags_DIR) # Autogenerated by find_package(gflags) -endif (GFLAGS_FOUND) diff --git a/cmake/FindGlog.cmake b/cmake/FindGlog.cmake deleted file mode 100644 index 142e2ca96b..0000000000 --- a/cmake/FindGlog.cmake +++ /dev/null @@ -1,24 +0,0 @@ -# -# Find libglog -# -# LIBGLOG_INCLUDE_DIR - where to find glog/logging.h, etc. -# LIBGLOG_LIBRARY - List of libraries when using libglog. -# LIBGLOG_FOUND - True if libglog found. -# -# from https://github.com/facebook/hhvm/blob/master/CMake/FindGlog.cmake - -IF (LIBGLOG_INCLUDE_DIR) - # Already in cache, be silent - SET(LIBGLOG_FIND_QUIETLY TRUE) -ENDIF () - -FIND_PATH(LIBGLOG_INCLUDE_DIR glog/logging.h) - -FIND_LIBRARY(LIBGLOG_LIBRARY glog) - -# handle the QUIETLY and REQUIRED arguments and set LIBGLOG_FOUND to TRUE if -# all listed variables are TRUE -INCLUDE(FindPackageHandleStandardArgs) -FIND_PACKAGE_HANDLE_STANDARD_ARGS(LIBGLOG DEFAULT_MSG LIBGLOG_LIBRARY LIBGLOG_INCLUDE_DIR) - -MARK_AS_ADVANCED(LIBGLOG_LIBRARY LIBGLOG_INCLUDE_DIR) \ No newline at end of file diff --git a/cmake/FindGperftools.cmake b/cmake/FindGperftools.cmake deleted file mode 100644 index 928f573a4f..0000000000 --- a/cmake/FindGperftools.cmake +++ /dev/null @@ -1,63 +0,0 @@ -# Tries to find Gperftools. 
-# -# Usage of this module as follows: -# -# find_package(Gperftools) -# -# Variables used by this module, they can change the default behaviour and need -# to be set before calling find_package: -# -# Gperftools_ROOT_DIR Set this variable to the root installation of -# Gperftools if the module has problems finding -# the proper installation path. -# -# Variables defined by this module: -# -# GPERFTOOLS_FOUND System has Gperftools libs/headers -# GPERFTOOLS_LIBRARIES The Gperftools libraries (tcmalloc & profiler) -# GPERFTOOLS_INCLUDE_DIR The location of Gperftools headers - -find_library(GPERFTOOLS_TCMALLOC - NAMES tcmalloc - HINTS ${Gperftools_ROOT_DIR}/lib) - -find_library(GPERFTOOLS_PROFILER - NAMES profiler - HINTS ${Gperftools_ROOT_DIR}/lib) - -find_library(GPERFTOOLS_TCMALLOC_AND_PROFILER - NAMES tcmalloc_and_profiler - HINTS ${Gperftools_ROOT_DIR}/lib) - -find_path(GPERFTOOLS_INCLUDE_DIR - NAMES gperftools/heap-profiler.h - HINTS ${Gperftools_ROOT_DIR}/include) - -set(GPERFTOOLS_LIBRARIES ${GPERFTOOLS_TCMALLOC_AND_PROFILER}) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args( - Gperftools - DEFAULT_MSG - GPERFTOOLS_LIBRARIES - GPERFTOOLS_INCLUDE_DIR) - -mark_as_advanced( - Gperftools_ROOT_DIR - GPERFTOOLS_TCMALLOC - GPERFTOOLS_PROFILER - GPERFTOOLS_TCMALLOC_AND_PROFILER - GPERFTOOLS_LIBRARIES - GPERFTOOLS_INCLUDE_DIR) - -# create IMPORTED targets -if (Gperftools_FOUND AND NOT TARGET gperftools::tcmalloc) - add_library(gperftools::tcmalloc UNKNOWN IMPORTED) - set_target_properties(gperftools::tcmalloc PROPERTIES - IMPORTED_LOCATION ${GPERFTOOLS_TCMALLOC} - INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}") - add_library(gperftools::profiler UNKNOWN IMPORTED) - set_target_properties(gperftools::profiler PROPERTIES - IMPORTED_LOCATION ${GPERFTOOLS_PROFILER} - INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}") -endif() diff --git a/cmake/FindJeMalloc.cmake b/cmake/FindJeMalloc.cmake deleted file mode 100644 index b95287160b..0000000000 --- a/cmake/FindJeMalloc.cmake +++ /dev/null @@ -1,28 +0,0 @@ -# - Find JeMalloc library -# Find the native JeMalloc includes and library -# -# JEMALLOC_INCLUDE_DIR - where to find jemalloc.h, etc. -# JEMALLOC_LIBRARIES - List of libraries when using jemalloc. -# JEMALLOC_FOUND - True if jemalloc found. 
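These Find modules all follow the same shape: locate the header and library with find_path()/find_library(), let find_package_handle_standard_args() decide the *_FOUND result, and, where possible, wrap the result in an IMPORTED target. A minimal consumer sketch, assuming the modules live under cmake/ as in this tree (the my_app target is hypothetical):

    # Make the bundled Find modules visible, then resolve one of them.
    list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
    find_package(JeMalloc)  # dispatches to cmake/FindJeMalloc.cmake
    if(JEMALLOC_FOUND)
      # Prefer the imported target: it carries its include directories along.
      target_link_libraries(my_app PRIVATE jemalloc::jemalloc)
    endif()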
- -find_path(JEMALLOC_INCLUDE_DIR - NAMES jemalloc/jemalloc.h - HINTS ${JEMALLOC_ROOT_DIR}/include) - -find_library(JEMALLOC_LIBRARIES - NAMES jemalloc - HINTS ${JEMALLOC_ROOT_DIR}/lib) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIR) - -mark_as_advanced( - JEMALLOC_LIBRARIES - JEMALLOC_INCLUDE_DIR) - -if (JEMALLOC_FOUND) - add_library(jemalloc::jemalloc UNKNOWN IMPORTED) - set_target_properties(jemalloc::jemalloc PROPERTIES - IMPORTED_LOCATION ${JEMALLOC_LIBRARIES} - INTERFACE_INCLUDE_DIRECTORIES "${JEMALLOC_INCLUDE_DIR}") -endif() diff --git a/cmake/FindNumPy.cmake b/cmake/FindNumPy.cmake deleted file mode 100644 index 8cdd642ac0..0000000000 --- a/cmake/FindNumPy.cmake +++ /dev/null @@ -1,38 +0,0 @@ -# Find the Python NumPy package -# PYTHON_NUMPY_INCLUDE_DIR -# NUMPY_FOUND -# will be set by this script - -cmake_minimum_required(VERSION 2.6) - -if(NOT PYTHON_EXECUTABLE) - if(NumPy_FIND_QUIETLY) - find_package(PythonInterp QUIET) - else() - find_package(PythonInterp) - set(_numpy_out 1) - endif() -endif() - -if (PYTHON_EXECUTABLE) - # write a python script that finds the numpy path - file(WRITE ${PROJECT_BINARY_DIR}/FindNumpyPath.py - "try: import numpy; print(numpy.get_include())\nexcept:pass\n") - - # execute the find script - exec_program("${PYTHON_EXECUTABLE}" ${PROJECT_BINARY_DIR} - ARGS "FindNumpyPath.py" - OUTPUT_VARIABLE NUMPY_PATH) -elseif(_numpy_out) - message(STATUS "Python executable not found.") -endif(PYTHON_EXECUTABLE) - -find_path(PYTHON_NUMPY_INCLUDE_DIR numpy/arrayobject.h - HINTS "${NUMPY_PATH}" "${PYTHON_INCLUDE_PATH}") - -if(PYTHON_NUMPY_INCLUDE_DIR) - set(PYTHON_NUMPY_FOUND 1 CACHE INTERNAL "Python numpy found") -endif(PYTHON_NUMPY_INCLUDE_DIR) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(NumPy DEFAULT_MSG PYTHON_NUMPY_INCLUDE_DIR) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake deleted file mode 100644 index 52ac31d1d1..0000000000 --- a/cmake/cblas.cmake +++ /dev/null @@ -1,94 +0,0 @@ -# Find the CBlas and lapack libraries -# -# It will search MKLML, atlas, OpenBlas, reference-cblas in order. -# -# If any cblas implementation found, the following variable will be set. -# CBLAS_PROVIDER # one of MKLML, OPENBLAS, REFERENCE -# CBLAS_INC_DIR # the include directory for cblas. -# CBLAS_LIBS # a list of libraries should be linked by paddle. -# # Each library should be full path to object file. - -set(CBLAS_FOUND OFF) - -## Find MKLML First. -if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB) - set(CBLAS_FOUND ON) - set(CBLAS_PROVIDER MKLML) - set(CBLAS_INC_DIR ${MKLML_INC_DIR}) - set(CBLAS_LIBRARIES ${MKLML_LIB}) - - add_definitions(-DPADDLE_WITH_MKLML) - add_definitions(-DLAPACK_FOUND) - - message(STATUS "Found cblas and lapack in MKLML " - "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") - return() -endif() - -## Then find openblas. 
-set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas") -set(OPENBLAS_INCLUDE_SEARCH_PATHS - ${OPENBLAS_ROOT}/include - /usr/include - /usr/include/openblas - /usr/local/opt/openblas/include) -set(OPENBLAS_LIB_SEARCH_PATHS - ${OPENBLAS_ROOT}/lib - /usr/lib - /usr/lib/blas/openblas - /usr/lib/openblas - /usr/local/opt/openblas/lib) - -find_path(OPENBLAS_INC_DIR NAMES cblas.h - PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH) -find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h - PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) -find_library(OPENBLAS_LIB NAMES openblas - PATHS ${OPENBLAS_LIB_SEARCH_PATHS}) - -if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_LIB) - set(CBLAS_FOUND ON) - set(CBLAS_PROVIDER OPENBLAS) - set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR}) - set(CBLAS_LIBRARIES ${OPENBLAS_LIB}) - - add_definitions(-DPADDLE_USE_OPENBLAS) - add_definitions(-DLAPACK_FOUND) - - message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") - message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})") - return() -endif() - - -## Then find the reference-cblas. www.netlib.org/blas/ -set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH - "Folder contains reference-cblas") -set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS - ${REFERENCE_CBLAS_ROOT}/include - /usr/include - /usr/include/cblas -) - -set(REFERENCE_CBLAS_LIB_SEARCH_PATHS - ${REFERENCE_CBLAS_ROOT}/lib - /usr/lib - /usr/lib/blas/reference/ - /usr/lib/reference/ -) - -if(WITH_SYSTEM_BLAS) - find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS - ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS}) - find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS - ${REFERENCE_CBLAS_LIB_SEARCH_PATHS}) - - if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) - set(CBLAS_FOUND ON) - set(CBLAS_PROVIDER REFERENCE) - set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR}) - set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY}) - add_definitions(-DPADDLE_USE_REFERENCE_CBLAS) - message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") - endif() -endif() diff --git a/cmake/ccache.cmake b/cmake/ccache.cmake deleted file mode 100644 index 900f59d4cb..0000000000 --- a/cmake/ccache.cmake +++ /dev/null @@ -1,9 +0,0 @@ -# Use ccache if found ccache program - -find_program(CCACHE_PATH ccache) - -if(CCACHE_PATH) - message(STATUS "Ccache is founded, use ccache to speed up compile.") - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PATH}) - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_PATH}) -endif(CCACHE_PATH) diff --git a/cmake/configure.cmake b/cmake/configure.cmake deleted file mode 100644 index 67830fe2e0..0000000000 --- a/cmake/configure.cmake +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
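One note on the ccache hook above: setting the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK global properties is the pre-3.4 way of wiring in a compiler launcher. On CMake 3.4 and newer the same effect is usually achieved per language; a sketch of that variant, not part of this tree:

    find_program(CCACHE_PATH ccache)
    if(CCACHE_PATH)
      # Launcher variables only wrap compile rules; routing the link step
      # through ccache gains nothing, so the modern form simply omits it.
      set(CMAKE_C_COMPILER_LAUNCHER   ${CCACHE_PATH})
      set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PATH})
    endif()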
- -if(NOT WITH_PYTHON) - add_definitions(-DPADDLE_NO_PYTHON) -endif(NOT WITH_PYTHON) - -if(WITH_DSO) - add_definitions(-DPADDLE_USE_DSO) -endif(WITH_DSO) - -if(WITH_TESTING) - add_definitions(-DPADDLE_WITH_TESTING) -endif(WITH_TESTING) - -if(NOT WITH_PROFILER) - add_definitions(-DPADDLE_DISABLE_PROFILER) -endif(NOT WITH_PROFILER) - -if(WITH_AVX AND AVX_FOUND) - set(SIMD_FLAG ${AVX_FLAG}) -elseif(SSE3_FOUND) - set(SIMD_FLAG ${SSE3_FLAG}) -endif() - -if(LITE_WITH_CUDA) - add_definitions(-DLITE_WITH_CUDA) - add_definitions(-DEIGEN_USE_GPU) - - FIND_PACKAGE(CUDA REQUIRED) - - if(${CUDA_VERSION_MAJOR} VERSION_LESS 7) - message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile") - endif() - - if(NOT CUDNN_FOUND) - message(FATAL_ERROR "Paddle needs cudnn to compile") - endif() - if(CUPTI_FOUND) - include_directories(${CUPTI_INCLUDE_DIR}) - add_definitions(-DPADDLE_WITH_CUPTI) - else() - message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.") - endif() - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}") - - # Include cuda and cudnn - include_directories(${CUDNN_INCLUDE_DIR}) - include_directories(${CUDA_TOOLKIT_INCLUDE}) - -elseif(WITH_AMD_GPU) - add_definitions(-DPADDLE_WITH_HIP) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__") -else() - add_definitions(-DHPPL_STUB_FUNC) - list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) -endif() - -if (WITH_MKLML AND MKLML_IOMP_LIB) - message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") - if(WIN32) - # openmp not support well for now on windows - set(OPENMP_FLAGS "") - else(WIN32) - set(OPENMP_FLAGS "-fopenmp") - endif(WIN32) - set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) - set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") -endif() - -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}") - -if(WITH_DISTRIBUTE) - add_definitions(-DPADDLE_WITH_DISTRIBUTE) -endif() - -if(WITH_GRPC) - add_definitions(-DPADDLE_WITH_GRPC) -endif(WITH_GRPC) - -if(WITH_BRPC_RDMA) - add_definitions(-DPADDLE_WITH_BRPC_RDMA) -endif(WITH_BRPC_RDMA) - -if(ON_INFER) - add_definitions(-DPADDLE_ON_INFERENCE) -endif(ON_INFER) - -if(WITH_WBAES) - add_definitions(-DPADDLE_WITH_WBAES) -endif(WITH_WBAES) - -if (REPLACE_ENFORCE_GLOG) - add_definitions("-DREPLACE_ENFORCE_GLOG") -endif() - -# for lite -# TODO(Superjomn) not work fine with the option -if (LITE_WITH_X86) - add_definitions("-DLITE_WITH_X86") -endif() - -if (LITE_WITH_ARM) - add_definitions("-DLITE_WITH_ARM") -endif() - -if (WITH_ARM_DOTPROD) - add_definitions("-DWITH_ARM_DOTPROD") -endif() - -if (LITE_WITH_NPU) - add_definitions("-DLITE_WITH_NPU") -endif() - -if (LITE_WITH_OPENCL) - add_definitions("-DLITE_WITH_OPENCL") -endif() - -if (LITE_WITH_FPGA) -add_definitions("-DLITE_WITH_FPGA") -endif() - -if (LITE_WITH_PROFILE) - add_definitions("-DLITE_WITH_PROFILE") - if (LITE_WITH_PRECISION_PROFILE) - add_definitions("-DLITE_WITH_PRECISION_PROFILE") - endif() -endif() - -if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - add_definitions("-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK") -endif() - -if (LITE_SHUTDOWN_LOG) - add_definitions("-DLITE_SHUTDOWN_LOG") -endif() - -if (LITE_ON_TINY_PUBLISH) - add_definitions("-DLITE_ON_TINY_PUBLISH") -endif() - -if (LITE_ON_MODEL_OPTIMIZE_TOOL) - add_definitions("-DLITE_ON_MODEL_OPTIMIZE_TOOL") 
-endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
-
diff --git a/cmake/coveralls.cmake b/cmake/coveralls.cmake
deleted file mode 100644
index ca1471cabb..0000000000
--- a/cmake/coveralls.cmake
+++ /dev/null
@@ -1,103 +0,0 @@
-# CMake script for code coverage.
-# If _COVERALLS_UPLOAD is ON, it will upload the JSON files to coveralls.io automatically.
-
-# Param _COVERAGE_SRCS A list of coverage source files.
-# Param _COVERALLS_UPLOAD Upload the result to coveralls.
-# Param _CMAKE_SCRIPT_PATH CMake script path.
-function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH)
-    # clean previous gcov data.
-    file(REMOVE_RECURSE ${PROJECT_BINARY_DIR}/*.gcda)
-
-    # find curl, which is needed later to upload the JSON.
-    if (_COVERALLS_UPLOAD)
-        find_program(CURL_EXECUTABLE curl)
-        if (NOT CURL_EXECUTABLE)
-            message(FATAL_ERROR "Coveralls: curl not found!")
-        endif()
-    endif()
-
-    # When passing a CMake list to an external process, the list
-    # will be converted from the format "1;2;3" to "1 2 3".
-    set(COVERAGE_SRCS "")
-    foreach (SINGLE_SRC ${_COVERAGE_SRCS})
-        set(COVERAGE_SRCS "${COVERAGE_SRCS}*${SINGLE_SRC}")
-    endforeach()
-
-    # query the number of logical cores
-    cmake_host_system_information(RESULT core_size QUERY NUMBER_OF_LOGICAL_CORES)
-    # coveralls json file.
-    set(COVERALLS_FILE ${PROJECT_BINARY_DIR}/coveralls.json)
-    add_custom_target(coveralls_generate
-        # Run the regression tests.
-        COMMAND ${CMAKE_CTEST_COMMAND}
-        -j ${core_size}
-        --output-on-failure
-        # Generate gcov output and translate it into coveralls JSON.
-        COMMAND ${CMAKE_COMMAND}
-        -DCOVERAGE_SRCS="${COVERAGE_SRCS}"
-        -DCOVERALLS_OUTPUT_FILE="${COVERALLS_FILE}"
-        -DCOV_PATH="${PROJECT_BINARY_DIR}"
-        -DPROJECT_ROOT="${PROJECT_SOURCE_DIR}"
-        -P "${_CMAKE_SCRIPT_PATH}/coverallsGcovJsons.cmake"
-        WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
-        COMMENT "Coveralls: generating coveralls output..."
-        )
-
-    if (_COVERALLS_UPLOAD)
-        message("COVERALLS UPLOAD: ON")
-        # Upload the JSON to coveralls.
-        add_custom_target(coveralls_upload
-            COMMAND ${CURL_EXECUTABLE}
-            -S -F json_file=@${COVERALLS_FILE}
-            https://coveralls.io/api/v1/jobs
-            DEPENDS coveralls_generate
-            WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
-            COMMENT "Coveralls: uploading coveralls output...")
-
-        add_custom_target(coveralls DEPENDS coveralls_upload)
-    else()
-        message("COVERALLS UPLOAD: OFF")
-        add_custom_target(coveralls DEPENDS coveralls_generate)
-    endif()
-endfunction()
-
-if(WITH_COVERAGE)
-    set(CMAKE_BUILD_TYPE "Debug")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
-
-    set(EXCLUDE_DIRS
-        "demo/"
-        "build/"
-        "tests/"
-        ".test_env/"
-    )
-
-    if(WITH_GPU)
-        file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" "*.c" "*.cu")
-    else()
-        file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" "*.c")
-    endif()
-
-    # exclude trivial files in PADDLE_SOURCES
-    foreach(EXCLUDE_DIR ${EXCLUDE_DIRS})
-        foreach(TMP_PATH ${PADDLE_SOURCES})
-            string(FIND ${TMP_PATH} ${EXCLUDE_DIR} EXCLUDE_DIR_FOUND)
-            if(NOT ${EXCLUDE_DIR_FOUND} EQUAL -1)
-                list(REMOVE_ITEM PADDLE_SOURCES ${TMP_PATH})
-            endif()
-        endforeach(TMP_PATH)
-    endforeach()
-
-    # convert to absolute paths
-    set(PADDLE_SRCS "")
-    foreach(PADDLE_SRC ${PADDLE_SOURCES})
-        set(PADDLE_SRCS "${PADDLE_SRCS};${PROJECT_SOURCE_DIR}/${PADDLE_SRC}")
-    endforeach()
-
-    code_coverage(
-        "${PADDLE_SRCS}"
-        ${COVERALLS_UPLOAD}
-        "${PROJECT_SOURCE_DIR}/cmake"
-    )
-endif()
diff --git a/cmake/coverallsGcovJsons.cmake b/cmake/coverallsGcovJsons.cmake
deleted file mode 100644
index 4641184fcf..0000000000
--- a/cmake/coverallsGcovJsons.cmake
+++ /dev/null
@@ -1,401 +0,0 @@
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-# Copyright (C) 2014 Joakim Söderberg
-#
-# This is intended to be run by a custom target in a CMake project like this.
-# 0. Compile the program with coverage support.
-# 1. Clear the coverage data. (Recursively delete *.gcda in the build dir)
-# 2. Run the unit tests.
-# 3. Run this script, specifying which source files the coverage should be performed on.
-#
-# This script will then use gcov to generate .gcov files in the directory specified
-# via the COV_PATH var. This should probably be the same as your cmake build dir.
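Concretely, the coveralls_generate target defined above drives this script in CMake script mode. Stripped of the variable plumbing, that invocation reduces to roughly the following (paths illustrative; note the '*' separators standing in for CMake's ';' list delimiter, which the script converts back):

    cmake -DCOVERAGE_SRCS="/src/lite/a.cc*/src/lite/b.cc" \
          -DCOVERALLS_OUTPUT_FILE=/build/coveralls.json \
          -DCOV_PATH=/build \
          -DPROJECT_ROOT=/src \
          -P cmake/coverallsGcovJsons.cmake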
-#
-# It then parses the .gcov files to convert them into the Coveralls JSON format:
-# https://coveralls.io/docs/api
-#
-
-CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
-
-# Since it's not possible to pass a CMake list properly in the
-# "1;2;3" format to an external process, we have replaced the
-# ";" with "*", so reverse that here so we get it back into the
-# CMake list format.
-string(REGEX REPLACE "\\*" ";" COVERAGE_SRCS ${COVERAGE_SRCS})
-
-find_program(GCOV_EXECUTABLE gcov)
-if (NOT GCOV_EXECUTABLE)
-    message(FATAL_ERROR "gcov not found! Aborting...")
-endif()
-
-find_package(Git)
-
-# TODO: Add these git things to the coveralls json.
-if (GIT_FOUND)
-    # Branch.
-    execute_process(
-        COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD
-        WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
-        OUTPUT_VARIABLE GIT_BRANCH
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-    )
-
-    macro (git_log_format FORMAT_CHARS VAR_NAME)
-        execute_process(
-            COMMAND ${GIT_EXECUTABLE} log -1 --pretty=format:%${FORMAT_CHARS}
-            WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
-            OUTPUT_VARIABLE ${VAR_NAME}
-            OUTPUT_STRIP_TRAILING_WHITESPACE
-        )
-    endmacro()
-
-    git_log_format(an GIT_AUTHOR_NAME)
-    git_log_format(ae GIT_AUTHOR_EMAIL)
-    git_log_format(cn GIT_COMMITTER_NAME)
-    git_log_format(ce GIT_COMMITTER_EMAIL)
-    git_log_format(B GIT_COMMIT_MESSAGE)
-
-    message("Git exe: ${GIT_EXECUTABLE}")
-    message("Git branch: ${GIT_BRANCH}")
-    message("Git author: ${GIT_AUTHOR_NAME}")
-    message("Git e-mail: ${GIT_AUTHOR_EMAIL}")
-    message("Git committer name: ${GIT_COMMITTER_NAME}")
-    message("Git committer e-mail: ${GIT_COMMITTER_EMAIL}")
-    message("Git commit message: ${GIT_COMMIT_MESSAGE}")
-
-endif()
-
-############################# Macros #########################################
-
-#
-# This macro converts from the full path format gcov outputs:
-#
-#    /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
-#
-# to the original source file path the .gcov is for:
-#
-#   /path/to/project/root/subdir/the_file.c
-#
-macro(get_source_path_from_gcov_filename _SRC_FILENAME _GCOV_FILENAME)
-
-    # /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
-    # ->
-    # #path#to#project#root#subdir#the_file.c.gcov
-    get_filename_component(_GCOV_FILENAME_WEXT ${_GCOV_FILENAME} NAME)
-
-    # #path#to#project#root#subdir#the_file.c.gcov -> /path/to/project/root/subdir/the_file.c
-    string(REGEX REPLACE "\\.gcov$" "" SRC_FILENAME_TMP ${_GCOV_FILENAME_WEXT})
-    string(REGEX REPLACE "\#" "/" SRC_FILENAME_TMP ${SRC_FILENAME_TMP})
-    set(${_SRC_FILENAME} "${SRC_FILENAME_TMP}")
-endmacro()
-
-##############################################################################
-
-# Get the coverage data.
-file(GLOB_RECURSE GCDA_FILES "${COV_PATH}" "*.gcda")
-message("Process GCDA files:")
-message("===============================")
-
-# Get a list of all the object directories needed by gcov
-# (The directories the .gcda files and .o files are found in)
-# and run gcov on those.
-foreach(GCDA ${GCDA_FILES})
-    get_filename_component(GCDA_DIR ${GCDA} PATH)
-
-    #
-    # The -p below refers to "Preserve path components".
-    # This means that the generated gcov filename of a source file will
-    # keep the original file's entire filepath, but / is replaced with #.
- # Example: - # - # /path/to/project/root/build/CMakeFiles/the_file.dir/subdir/the_file.c.gcda - # ------------------------------------------------------------------------------ - # File '/path/to/project/root/subdir/the_file.c' - # Lines executed:68.34% of 199 - # /path/to/project/root/subdir/the_file.c:creating '#path#to#project#root#subdir#the_file.c.gcov' - # - # If -p is not specified then the file is named only "the_file.c.gcov" - # - execute_process( - COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} >/dev/null - WORKING_DIRECTORY ${GCDA_DIR} - ) -endforeach() - -# TODO: Make these be absolute path -file(GLOB_RECURSE ALL_GCOV_FILES "${COV_PATH}" "*.gcov") - -# Get only the filenames to use for filtering. -#set(COVERAGE_SRCS_NAMES "") -#foreach (COVSRC ${COVERAGE_SRCS}) -# get_filename_component(COVSRC_NAME ${COVSRC} NAME) -# message("${COVSRC} -> ${COVSRC_NAME}") -# list(APPEND COVERAGE_SRCS_NAMES "${COVSRC_NAME}") -#endforeach() - -# -# Filter out all but the gcov files we want. -# -# We do this by comparing the list of COVERAGE_SRCS filepaths that the -# user wants the coverage data for with the paths of the generated .gcov files, -# so that we only keep the relevant gcov files. -# -# Example: -# COVERAGE_SRCS = -# /path/to/project/root/subdir/the_file.c -# -# ALL_GCOV_FILES = -# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov -# /path/to/project/root/build/#path#to#project#root#subdir#other_file.c.gcov -# -# Result should be: -# GCOV_FILES = -# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov -# -set(GCOV_FILES "") -#message("Look in coverage sources: ${COVERAGE_SRCS}") -message("\nFilter out unwanted GCOV files:") -message("===============================") - -set(COVERAGE_SRCS_REMAINING ${COVERAGE_SRCS}) - -foreach (GCOV_FILE ${ALL_GCOV_FILES}) - - # - # /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov - # -> - # /path/to/project/root/subdir/the_file.c - get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE}) - - # Is this in the list of source files? - # TODO: We want to match against relative path filenames from the source file root... - list(FIND COVERAGE_SRCS ${GCOV_SRC_PATH} WAS_FOUND) - - if (NOT WAS_FOUND EQUAL -1) - message("YES: ${GCOV_FILE}") - list(APPEND GCOV_FILES ${GCOV_FILE}) - - # We remove it from the list, so we don't bother searching for it again. - # Also files left in COVERAGE_SRCS_REMAINING after this loop ends should - # have coverage data generated from them (no lines are covered). - list(REMOVE_ITEM COVERAGE_SRCS_REMAINING ${GCOV_SRC_PATH}) - else() - message("NO: ${GCOV_FILE}") - endif() -endforeach() - -# TODO: Enable setting these -set(JSON_SERVICE_NAME "travis-ci") -set(JSON_SERVICE_JOB_ID $ENV{TRAVIS_JOB_ID}) - -set(JSON_TEMPLATE -"{ - \"service_name\": \"\@JSON_SERVICE_NAME\@\", - \"service_job_id\": \"\@JSON_SERVICE_JOB_ID\@\", - \"source_files\": \@JSON_GCOV_FILES\@ -}" -) - -set(SRC_FILE_TEMPLATE -"{ - \"name\": \"\@GCOV_SRC_REL_PATH\@\", - \"source_digest\": \"\@GCOV_CONTENTS_MD5\@\", - \"coverage\": \@GCOV_FILE_COVERAGE\@ - }" -) - -message("\nGenerate JSON for files:") -message("=========================") - -set(JSON_GCOV_FILES "[") - -# Read the GCOV files line by line and get the coverage data. -foreach (GCOV_FILE ${GCOV_FILES}) - - get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE}) - file(RELATIVE_PATH GCOV_SRC_REL_PATH "${PROJECT_ROOT}" "${GCOV_SRC_PATH}") - - # The new coveralls API doesn't need the entire source (Yay!) 
- # However, still keeping that part for now. Will cleanup in the future. - file(MD5 "${GCOV_SRC_PATH}" GCOV_CONTENTS_MD5) - message("MD5: ${GCOV_SRC_PATH} = ${GCOV_CONTENTS_MD5}") - - # Loads the gcov file as a list of lines. - # (We first open the file and replace all occurences of [] with _ - # because CMake will fail to parse a line containing unmatched brackets... - # also the \ to escaped \n in macros screws up things.) - # https://public.kitware.com/Bug/view.php?id=15369 - file(READ ${GCOV_FILE} GCOV_CONTENTS) - string(REPLACE "[" "_" GCOV_CONTENTS "${GCOV_CONTENTS}") - string(REPLACE "]" "_" GCOV_CONTENTS "${GCOV_CONTENTS}") - string(REPLACE "\\" "_" GCOV_CONTENTS "${GCOV_CONTENTS}") - file(WRITE ${GCOV_FILE}_tmp "${GCOV_CONTENTS}") - - file(STRINGS ${GCOV_FILE}_tmp GCOV_LINES) - list(LENGTH GCOV_LINES LINE_COUNT) - - # Instead of trying to parse the source from the - # gcov file, simply read the file contents from the source file. - # (Parsing it from the gcov is hard because C-code uses ; in many places - # which also happens to be the same as the CMake list delimeter). - file(READ ${GCOV_SRC_PATH} GCOV_FILE_SOURCE) - - string(REPLACE "\\" "\\\\" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - string(REGEX REPLACE "\"" "\\\\\"" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - string(REPLACE "\t" "\\\\t" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - string(REPLACE "\r" "\\\\r" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - string(REPLACE "\n" "\\\\n" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - # According to http://json.org/ these should be escaped as well. - # Don't know how to do that in CMake however... - #string(REPLACE "\b" "\\\\b" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - #string(REPLACE "\f" "\\\\f" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - #string(REGEX REPLACE "\u([a-fA-F0-9]{4})" "\\\\u\\1" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - - # We want a json array of coverage data as a single string - # start building them from the contents of the .gcov - set(GCOV_FILE_COVERAGE "[") - - set(GCOV_LINE_COUNT 1) # Line number for the .gcov. - set(DO_SKIP 0) - foreach (GCOV_LINE ${GCOV_LINES}) - #message("${GCOV_LINE}") - # Example of what we're parsing: - # Hitcount |Line | Source - # " 8: 26: if (!allowed || (strlen(allowed) == 0))" - string(REGEX REPLACE - "^([^:]*):([^:]*):(.*)$" - "\\1;\\2;\\3" - RES - "${GCOV_LINE}") - - # Check if we should exclude lines using the Lcov syntax. - string(REGEX MATCH "LCOV_EXCL_START" START_SKIP "${GCOV_LINE}") - string(REGEX MATCH "LCOV_EXCL_END" END_SKIP "${GCOV_LINE}") - string(REGEX MATCH "LCOV_EXCL_LINE" LINE_SKIP "${GCOV_LINE}") - - set(RESET_SKIP 0) - if (LINE_SKIP AND NOT DO_SKIP) - set(DO_SKIP 1) - set(RESET_SKIP 1) - endif() - - if (START_SKIP) - set(DO_SKIP 1) - message("${GCOV_LINE_COUNT}: Start skip") - endif() - - if (END_SKIP) - set(DO_SKIP 0) - endif() - - list(LENGTH RES RES_COUNT) - - if (RES_COUNT GREATER 2) - list(GET RES 0 HITCOUNT) - list(GET RES 1 LINE) - list(GET RES 2 SOURCE) - - string(STRIP ${HITCOUNT} HITCOUNT) - string(STRIP ${LINE} LINE) - - # Lines with 0 line numbers are metadata and can be ignored. - if (NOT ${LINE} EQUAL 0) - - if (DO_SKIP) - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ") - else() - # Translate the hitcount into valid JSON values. 
- if (${HITCOUNT} STREQUAL "#####") - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ") - elseif (${HITCOUNT} STREQUAL "-") - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ") - else() - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}${HITCOUNT}, ") - endif() - endif() - endif() - else() - message(WARNING "Failed to properly parse line (RES_COUNT = ${RES_COUNT}) ${GCOV_FILE}:${GCOV_LINE_COUNT}\n-->${GCOV_LINE}") - endif() - - if (RESET_SKIP) - set(DO_SKIP 0) - endif() - math(EXPR GCOV_LINE_COUNT "${GCOV_LINE_COUNT}+1") - endforeach() - - message("${GCOV_LINE_COUNT} of ${LINE_COUNT} lines read!") - - # Advanced way of removing the trailing comma in the JSON array. - # "[1, 2, 3, " -> "[1, 2, 3" - string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE}) - - # Append the trailing ] to complete the JSON array. - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]") - - # Generate the final JSON for this file. - message("Generate JSON for file: ${GCOV_SRC_REL_PATH}...") - string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON) - - set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ") -endforeach() - -# Loop through all files we couldn't find any coverage for -# as well, and generate JSON for those as well with 0% coverage. -foreach(NOT_COVERED_SRC ${COVERAGE_SRCS_REMAINING}) - - # Loads the source file as a list of lines. - file(STRINGS ${NOT_COVERED_SRC} SRC_LINES) - - set(GCOV_FILE_COVERAGE "[") - set(GCOV_FILE_SOURCE "") - - foreach (SOURCE ${SRC_LINES}) - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ") - - string(REPLACE "\\" "\\\\" SOURCE "${SOURCE}") - string(REGEX REPLACE "\"" "\\\\\"" SOURCE "${SOURCE}") - string(REPLACE "\t" "\\\\t" SOURCE "${SOURCE}") - string(REPLACE "\r" "\\\\r" SOURCE "${SOURCE}") - set(GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}${SOURCE}\\n") - endforeach() - - # Remove trailing comma, and complete JSON array with ] - string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE}) - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]") - - # Generate the final JSON for this file. - string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON) - set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ") -endforeach() - -# Get rid of trailing comma. -string(REGEX REPLACE ",[ ]*$" "" JSON_GCOV_FILES ${JSON_GCOV_FILES}) -set(JSON_GCOV_FILES "${JSON_GCOV_FILES}]") - -# Generate the final complete JSON! -message("Generate final JSON...") -string(CONFIGURE ${JSON_TEMPLATE} JSON) - -file(WRITE "${COVERALLS_OUTPUT_FILE}" "${JSON}") -message("###########################################################################") -message("Generated coveralls JSON containing coverage data:") -message("${COVERALLS_OUTPUT_FILE}") -message("###########################################################################") diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake deleted file mode 100644 index 11a803ff03..0000000000 --- a/cmake/cross_compiling/android.cmake +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -if(NOT ARM_TARGET_OS STREQUAL "android") - return() -endif() - -set(ANDROID TRUE) -add_definitions(-DLITE_WITH_LINUX) - -if(NOT DEFINED ANDROID_NDK) - set(ANDROID_NDK $ENV{NDK_ROOT}) - if(NOT ANDROID_NDK) - message(FATAL_ERROR "Must set ANDROID_NDK or env NDK_ROOT") - endif() -endif() - -if(ARM_TARGET_LANG STREQUAL "gcc") - # gcc do not need set lang on android - set(ARM_TARGET_LANG "") -endif() - -if(NOT DEFINED ANDROID_API_LEVEL) - set(ANDROID_API_LEVEL "22") -endif() - -# then check input arm abi -if(ARM_TARGET_ARCH_ABI STREQUAL "armv7hf") - message(FATAL_ERROR "ANDROID does not support hardfp on v7 use armv7 instead.") -endif() - -set(ANDROID_ARCH_ABI ${ARM_TARGET_ARCH_ABI} CACHE STRING "Choose Android Arch ABI") -if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") - set(ANDROID_ARCH_ABI "arm64-v8a") -endif() - -if(ARM_TARGET_ARCH_ABI STREQUAL "armv7") - set(ANDROID_ARCH_ABI "armeabi-v7a") -endif() - -check_input_var(ANDROID_ARCH_ABI DEFAULT ${ANDROID_ARCH_ABI} LIST "arm64-v8a" "armeabi-v7a" - "armeabi-v6" "armeabi" "mips" "mips64" "x86" "x86_64") -check_input_var(ANDROID_STL_TYPE DEFAULT "c++_static" LIST "c++_static" "gnustl_static" "c++_shared") - -if(ANDROID_ARCH_ABI STREQUAL "armeabi-v7a") - message(STATUS "armeabi-v7a use softfp by default.") - set(CMAKE_ANDROID_ARM_NEON ON) - message(STATUS "NEON is enabled on arm-v7a with softfp.") -endif() - -set(CMAKE_SYSTEM_NAME Android) -set(CMAKE_SYSTEM_VERSION ${ANDROID_API_LEVEL}) -set(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ARCH_ABI}) -set(CMAKE_ANDROID_NDK ${ANDROID_NDK}) -set(CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION ${ARM_TARGET_LANG}) -set(CMAKE_ANDROID_STL_TYPE ${ANDROID_STL_TYPE}) - -if (ARM_TARGET_LANG STREQUAL "clang") - if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") - set(triple aarch64-v8a-linux-android) - elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7") - set(triple arm-v7a-linux-android) - set(LITE_WITH_OPENMP OFF CACHE STRING "Due to libomp's bug(For ARM64, it has been fixed by https://reviews.llvm.org/D19879, but still exists on ARM32), disable OpenMP on armv7 when cross-compiling using Clang" FORCE) - else() - message(FATAL_ERROR "Clang do not support this ${ARM_TARGET_ARCH_ABI}, use armv8 or armv7") - endif() - - set(CMAKE_C_COMPILER clang) - set(CMAKE_C_COMPILER_TARGET ${triple}) - set(CMAKE_CXX_COMPILER clang++) - set(CMAKE_CXX_COMPILER_TARGET ${triple}) - - message(STATUS "CMAKE_CXX_COMPILER_TARGET: ${CMAKE_CXX_COMPILER_TARGET}") -endif() diff --git a/cmake/cross_compiling/armlinux.cmake b/cmake/cross_compiling/armlinux.cmake deleted file mode 100644 index 98f23d4300..0000000000 --- a/cmake/cross_compiling/armlinux.cmake +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
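Both toolchain fragments are keyed off ARM_TARGET_OS: android.cmake above, armlinux.cmake below. A sketch of the configure lines that would select each of them, with ABI values taken from the check_input_var() whitelists (the build directory is illustrative):

    # Android, 64-bit ARM, clang (needs ANDROID_NDK set, or the NDK_ROOT env var):
    cmake .. -DARM_TARGET_OS=android -DARM_TARGET_ARCH_ABI=armv8 \
             -DARM_TARGET_LANG=clang -DANDROID_STL_TYPE=c++_static

    # ARM Linux with the hard-float ABI (expects the arm-linux-gnueabihf-* cross tools):
    cmake .. -DARM_TARGET_OS=armlinux -DARM_TARGET_ARCH_ABI=armv7hf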
- -if(NOT ARM_TARGET_OS STREQUAL "armlinux") - return() -endif() - -set(ARMLINUX TRUE) -add_definitions(-DLITE_WITH_LINUX) -set(CMAKE_SYSTEM_NAME Linux) - -check_input_var(ARMLINUX_ARCH_ABI DEFAULT ${ARM_TARGET_ARCH_ABI} LIST "armv8" "armv7" "armv7hf") - -if(ARMLINUX_ARCH_ABI STREQUAL "armv8") - set(CMAKE_SYSTEM_PROCESSOR aarch64) - set(CMAKE_C_COMPILER "aarch64-linux-gnu-gcc") - set(CMAKE_CXX_COMPILER "aarch64-linux-gnu-g++") -endif() - -if(ARMLINUX_ARCH_ABI STREQUAL "armv7") - set(CMAKE_SYSTEM_PROCESSOR arm) - set(CMAKE_C_COMPILER "arm-linux-gnueabi-gcc") - set(CMAKE_CXX_COMPILER "arm-linux-gnueabi-g++") -endif() - -if(ARMLINUX_ARCH_ABI STREQUAL "armv7hf") - set(CMAKE_SYSTEM_PROCESSOR arm) - set(CMAKE_C_COMPILER "arm-linux-gnueabihf-gcc") - set(CMAKE_CXX_COMPILER "arm-linux-gnueabihf-g++") -endif() diff --git a/cmake/cross_compiling/findar.cmake b/cmake/cross_compiling/findar.cmake deleted file mode 100644 index bcb0dc70fd..0000000000 --- a/cmake/cross_compiling/findar.cmake +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if(NOT ARM_TARGET_LANG STREQUAL "clang") - # only clang need find ar tool - return() -endif() - -if(NOT EXISTS "${CMAKE_CXX_COMPILER}") - message(ERROR "Can not find CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER}") -endif() - -get_filename_component(AR_PATH ${CMAKE_CXX_COMPILER} PATH) - -find_file(AR_TOOL NAMES llvm-ar PATHS ${AR_PATH}) - -if(NOT AR_TOOL) - message(ERROR "Failed to find AR_TOOL in ${AR_PATH}") -else() - set(CMAKE_AR ${AR_TOOL}) - message(STATUS "Found CMAKE_AR : " ${CMAKE_AR}) -endif() diff --git a/cmake/cross_compiling/host.cmake b/cmake/cross_compiling/host.cmake deleted file mode 100644 index b76dd60046..0000000000 --- a/cmake/cross_compiling/host.cmake +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set(HOST_C_COMPILER $ENV{CC}) -set(HOST_CXX_COMPILER $ENV{CXX}) - -if(IOS) - set(default_cc clang) - set(default_cxx clang++) -else() - set(default_cc gcc) - set(default_cxx g++) -endif() - -if(NOT HOST_C_COMPILER) - find_program(HOST_C_COMPILER NAMES ${default_cc} PATH - /usr/bin - /usr/local/bin) -endif() - -if(NOT HOST_CXX_COMPILER) - find_program(HOST_CXX_COMPILER NAMES ${default_cxx} PATH - /usr/bin - /usr/local/bin) -endif() - -if(NOT HOST_C_COMPILER OR NOT EXISTS ${HOST_C_COMPILER}) - MESSAGE(FATAL_ERROR "Cannot find host C compiler. 
export CC=/path/to/cc")
-ENDIF()
-
-if(NOT HOST_CXX_COMPILER OR NOT EXISTS ${HOST_CXX_COMPILER})
-    MESSAGE(FATAL_ERROR "Cannot find host CXX compiler. export CXX=/path/to/c++")
-ENDIF()
-
-MESSAGE(STATUS "Found host C compiler: " ${HOST_C_COMPILER})
-MESSAGE(STATUS "Found host CXX compiler: " ${HOST_CXX_COMPILER})
-
diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake
deleted file mode 100644
index 76f62765af..0000000000
--- a/cmake/cross_compiling/ios.cmake
+++ /dev/null
@@ -1,692 +0,0 @@
-# This file is part of the ios-cmake project. It was retrieved from
-# https://github.com/cristeab/ios-cmake.git, which is a fork of
-# https://code.google.com/p/ios-cmake/. Which in turn is based off of
-# the Platform/Darwin.cmake and Platform/UnixPaths.cmake files which
-# are included with CMake 2.8.4
-#
-# The ios-cmake project is licensed under the new BSD license.
-#
-# Copyright (c) 2014, Bogdan Cristea and LTE Engineering Software,
-# Kitware, Inc., Insight Software Consortium. All rights reserved.
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-# 1. Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-# This file is based off of the Platform/Darwin.cmake and
-# Platform/UnixPaths.cmake files which are included with CMake 2.8.4
-# It has been altered for iOS development.
-#
-# Updated by Alex Stewart (alexs.mac@gmail.com)
-#
-# *****************************************************************************
-#      Now maintained by Alexander Widerberg (widerbergaren [at] gmail.com)
-#                      under the BSD-3-Clause license
-#                   https://github.com/leetal/ios-cmake
-# *****************************************************************************
-#
-#                           INFORMATION / HELP
-#
-# The following arguments control the behaviour of this toolchain:
-#
-# PLATFORM: (default "OS")
-#    OS = Build for iPhoneOS.
-#    OS64 = Build for arm64 iphoneOS.
-#    OS64COMBINED = Build for arm64 x86_64 iphoneOS. Combined into FAT STATIC lib (supported on 3.14+ of CMake with "-G Xcode" argument ONLY)
-#    SIMULATOR = Build for x86 i386 iphoneOS Simulator.
-#    SIMULATOR64 = Build for x86_64 iphoneOS Simulator.
-#    TVOS = Build for arm64 tvOS.
-# TVOSCOMBINED = Build for arm64 x86_64 tvOS. Combined into FAT STATIC lib (supported on 3.14+ of CMake with "-G Xcode" argument ONLY) -# SIMULATOR_TVOS = Build for x86_64 tvOS Simulator. -# WATCHOS = Build for armv7k arm64_32 for watchOS. -# WATCHOSCOMBINED = Build for armv7k arm64_32 x86_64 watchOS. Combined into FAT STATIC lib (supported on 3.14+ of CMake with "-G Xcode" argument ONLY) -# SIMULATOR_WATCHOS = Build for x86_64 for watchOS Simulator. -# -# CMAKE_OSX_SYSROOT: Path to the SDK to use. By default this is -# automatically determined from PLATFORM and xcodebuild, but -# can also be manually specified (although this should not be required). -# -# CMAKE_DEVELOPER_ROOT: Path to the Developer directory for the platform -# being compiled for. By default this is automatically determined from -# CMAKE_OSX_SYSROOT, but can also be manually specified (although this should -# not be required). -# -# DEPLOYMENT_TARGET: Minimum SDK version to target. Default 2.0 on watchOS and 9.0 on tvOS+iOS -# -# ENABLE_BITCODE: (1|0) Enables or disables bitcode support. Default 1 (true) -# -# ENABLE_ARC: (1|0) Enables or disables ARC support. Default 1 (true, ARC enabled by default) -# -# ENABLE_VISIBILITY: (1|0) Enables or disables symbol visibility support. Default 0 (false, visibility hidden by default) -# -# ARCHS: (armv7 armv7s armv7k arm64 arm64_32 i386 x86_64) If specified, will override the default architectures for the given PLATFORM -# OS = armv7 armv7s arm64 (if applicable) -# OS64 = arm64 (if applicable) -# SIMULATOR = i386 -# SIMULATOR64 = x86_64 -# TVOS = arm64 -# SIMULATOR_TVOS = x86_64 (i386 has since long been deprecated) -# WATCHOS = armv7k arm64_32 (if applicable) -# SIMULATOR_WATCHOS = x86_64 (i386 has since long been deprecated) -# -# This toolchain defines the following variables for use externally: -# -# XCODE_VERSION: Version number (not including Build version) of Xcode detected. -# SDK_VERSION: Version of SDK being used. -# CMAKE_OSX_ARCHITECTURES: Architectures being compiled for (generated from PLATFORM). -# -# This toolchain defines the following macros for use externally: -# -# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE XCODE_VARIANT) -# A convenience macro for setting xcode specific properties on targets. -# Available variants are: All, Release, RelWithDebInfo, Debug, MinSizeRel -# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1" "all"). -# -# find_host_package (PROGRAM ARGS) -# A macro used to find executable programs on the host system, not within the -# environment. Thanks to the android-cmake project for providing the -# command. 
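Pulling the PLATFORM, ARCHS and DEPLOYMENT_TARGET knobs together, a direct invocation of this toolchain might look like the sketch below. In this tree the file is normally selected indirectly, via ARM_TARGET_OS=ios or ios64 in the Lite settings further down, so treat this as illustrative only:

    cmake .. -DCMAKE_TOOLCHAIN_FILE=cmake/cross_compiling/ios.cmake \
             -DPLATFORM=OS64 -DDEPLOYMENT_TARGET=9.0 \
             -DENABLE_BITCODE=1 -DARCHS=arm64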
-# -# ******************************** DEPRECATIONS ******************************* -# -# IOS_DEPLOYMENT_TARGET: (Deprecated) Alias to DEPLOYMENT_TARGET -# CMAKE_IOS_DEVELOPER_ROOT: (Deprecated) Alias to CMAKE_DEVELOPER_ROOT -# IOS_PLATFORM: (Deprecated) Alias to PLATFORM -# IOS_ARCH: (Deprecated) Alias to ARCHS -# -# ***************************************************************************** -# - -## Lite settings -if (ARM_TARGET_OS STREQUAL "ios") - set(PLATFORM "OS") -elseif(ARM_TARGET_OS STREQUAL "ios64") - set(PLATFORM "OS64") -else() - return() -endif() -add_definitions(-DTARGET_IOS) - -# if do not specify the ARM_TARGET_ARCH_ABI then use default all supported -if(ARM_TARGET_ARCH_ABI STREQUAL "armv7" - OR ARM_TARGET_ARCH_ABI STREQUAL "armv7hf" - OR ARM_TARGET_ARCH_ABI STREQUAL "armeabi-v7a") - set(ARCHS "armv7") -elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv8" - OR ARM_TARGET_ARCH_ABI STREQUAL "arm64-v8a") - set(ARCHS "arm64") -# else() all default choice: armv7 armv7s arm64 -endif() - -if(PLATFORM STREQUAL "OS64" AND ARCHS STREQUAL "armv7") - message(FATAL_ERROR "Can not build IOS64 with armv7") -endif() - -# TODO(xxx): enable omp on ios -set(LITE_WITH_OPENMP OFF CACHE STRING "Disable OpenMP when cross-compiling for Android and iOS" FORCE) -set(ARM_TARGET_LANG "clang" CACHE STRING "Force use clang on IOS" FORCE) - -add_definitions(-DLITE_WITH_IPHONE) -## End lite settings - -# Fix for PThread library not in path -set(CMAKE_THREAD_LIBS_INIT "-lpthread") -set(CMAKE_HAVE_THREADS_LIBRARY 1) -set(CMAKE_USE_WIN32_THREADS_INIT 0) -set(CMAKE_USE_PTHREADS_INIT 1) - -# Cache what generator is used -set(USED_CMAKE_GENERATOR "${CMAKE_GENERATOR}" CACHE STRING "Expose CMAKE_GENERATOR" FORCE) - -if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.14") - set(MODERN_CMAKE YES) - message(STATUS "Merging integrated CMake 3.14+ iOS,tvOS,watchOS,macOS toolchain(s) with this toolchain!") -endif() - -# Get the Xcode version being used. -execute_process(COMMAND xcodebuild -version - OUTPUT_VARIABLE XCODE_VERSION - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}") -string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}") -message(STATUS "Building with Xcode version: ${XCODE_VERSION}") - -######## ALIASES (DEPRECATION WARNINGS) - -if(DEFINED IOS_PLATFORM) - set(PLATFORM ${IOS_PLATFORM}) - message(DEPRECATION "IOS_PLATFORM argument is DEPRECATED. Consider using the new PLATFORM argument instead.") -endif() - -if(DEFINED IOS_DEPLOYMENT_TARGET) - set(DEPLOYMENT_TARGET ${IOS_DEPLOYMENT_TARGET}) - message(DEPRECATION "IOS_DEPLOYMENT_TARGET argument is DEPRECATED. Consider using the new DEPLOYMENT_TARGET argument instead.") -endif() - -if(DEFINED CMAKE_IOS_DEVELOPER_ROOT) - set(CMAKE_DEVELOPER_ROOT ${CMAKE_IOS_DEVELOPER_ROOT}) - message(DEPRECATION "CMAKE_IOS_DEVELOPER_ROOT argument is DEPRECATED. Consider using the new CMAKE_DEVELOPER_ROOT argument instead.") -endif() - -if(DEFINED IOS_ARCH) - set(ARCHS ${IOS_ARCH}) - message(DEPRECATION "IOS_ARCH argument is DEPRECATED. Consider using the new ARCHS argument instead.") -endif() - -######## END ALIASES - -# Unset the FORCE on cache variables if in try_compile() -set(FORCE_CACHE FORCE) -get_property(_CMAKE_IN_TRY_COMPILE GLOBAL PROPERTY IN_TRY_COMPILE) -if(_CMAKE_IN_TRY_COMPILE) - unset(FORCE_CACHE) -endif() - -# Default to building for iPhoneOS if not specified otherwise, and we cannot -# determine the platform from the CMAKE_OSX_ARCHITECTURES variable. 
The use -# of CMAKE_OSX_ARCHITECTURES is such that try_compile() projects can correctly -# determine the value of PLATFORM from the root project, as -# CMAKE_OSX_ARCHITECTURES is propagated to them by CMake. -if(NOT DEFINED PLATFORM) - if (CMAKE_OSX_ARCHITECTURES) - if(CMAKE_OSX_ARCHITECTURES MATCHES ".*arm.*" AND CMAKE_OSX_SYSROOT MATCHES ".*iphoneos.*") - set(PLATFORM "OS") - elseif(CMAKE_OSX_ARCHITECTURES MATCHES "i386" AND CMAKE_OSX_SYSROOT MATCHES ".*iphonesimulator.*") - set(PLATFORM "SIMULATOR") - elseif(CMAKE_OSX_ARCHITECTURES MATCHES "x86_64" AND CMAKE_OSX_SYSROOT MATCHES ".*iphonesimulator.*") - set(PLATFORM "SIMULATOR64") - elseif(CMAKE_OSX_ARCHITECTURES MATCHES "arm64" AND CMAKE_OSX_SYSROOT MATCHES ".*appletvos.*") - set(PLATFORM "TVOS") - elseif(CMAKE_OSX_ARCHITECTURES MATCHES "x86_64" AND CMAKE_OSX_SYSROOT MATCHES ".*appletvsimulator.*") - set(PLATFORM "SIMULATOR_TVOS") - elseif(CMAKE_OSX_ARCHITECTURES MATCHES ".*armv7k.*" AND CMAKE_OSX_SYSROOT MATCHES ".*watchos.*") - set(PLATFORM "WATCHOS") - elseif(CMAKE_OSX_ARCHITECTURES MATCHES "i386" AND CMAKE_OSX_SYSROOT MATCHES ".*watchsimulator.*") - set(PLATFORM "SIMULATOR_WATCHOS") - endif() - endif() - if (NOT PLATFORM) - set(PLATFORM "OS") - endif() -endif() - -set(PLATFORM_INT "${PLATFORM}" CACHE STRING "Type of platform for which the build targets.") - -# Handle the case where we are targeting iOS with a deployment target of 10.0 or above (32-bit support officially dropped) -if(PLATFORM_INT STREQUAL "OS" AND DEPLOYMENT_TARGET VERSION_GREATER_EQUAL 10.0) - set(PLATFORM_INT "OS64") - message(STATUS "Targeting minimum SDK version ${DEPLOYMENT_TARGET}. Dropping 32-bit support.") -elseif(PLATFORM_INT STREQUAL "SIMULATOR" AND DEPLOYMENT_TARGET VERSION_GREATER_EQUAL 10.0) - set(PLATFORM_INT "SIMULATOR64") - message(STATUS "Targeting minimum SDK version ${DEPLOYMENT_TARGET}. Dropping 32-bit support.") -endif() - -# Determine the platform name and architectures for use in xcodebuild commands -# from the specified PLATFORM name. -if(PLATFORM_INT STREQUAL "OS") - set(SDK_NAME iphoneos) - if(NOT ARCHS) - set(ARCHS armv7 armv7s arm64) - endif() -elseif(PLATFORM_INT STREQUAL "OS64") - set(SDK_NAME iphoneos) - if(NOT ARCHS) - if (XCODE_VERSION VERSION_GREATER 10.0) - set(ARCHS arm64) # Add arm64e once Apple has fixed the integration issues with it; libarclite_iphoneos.a is currently missing bitcode markers, for example - else() - set(ARCHS arm64) - endif() - endif() -elseif(PLATFORM_INT STREQUAL "OS64COMBINED") - set(SDK_NAME iphoneos) - if(MODERN_CMAKE) - if(NOT ARCHS) - if (XCODE_VERSION VERSION_GREATER 10.0) - set(ARCHS arm64 x86_64) # Add arm64e once Apple has fixed the integration issues with it; libarclite_iphoneos.a is currently missing bitcode markers, for example - else() - set(ARCHS arm64 x86_64) - endif() - endif() - else() - message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the OS64COMBINED setting work") - endif() -elseif(PLATFORM_INT STREQUAL "SIMULATOR") - set(SDK_NAME iphonesimulator) - if(NOT ARCHS) - set(ARCHS i386) - endif() - message(DEPRECATION "SIMULATOR IS DEPRECATED.
Consider using SIMULATOR64 instead.") -elseif(PLATFORM_INT STREQUAL "SIMULATOR64") - set(SDK_NAME iphonesimulator) - if(NOT ARCHS) - set(ARCHS x86_64) - endif() -elseif(PLATFORM_INT STREQUAL "TVOS") - set(SDK_NAME appletvos) - if(NOT ARCHS) - set(ARCHS arm64) - endif() -elseif (PLATFORM_INT STREQUAL "TVOSCOMBINED") - set(SDK_NAME appletvos) - if(MODERN_CMAKE) - if(NOT ARCHS) - set(ARCHS arm64 x86_64) - endif() - else() - message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the TVOSCOMBINED setting work") - endif() -elseif(PLATFORM_INT STREQUAL "SIMULATOR_TVOS") - set(SDK_NAME appletvsimulator) - if(NOT ARCHS) - set(ARCHS x86_64) - endif() -elseif(PLATFORM_INT STREQUAL "WATCHOS") - set(SDK_NAME watchos) - if(NOT ARCHS) - if (XCODE_VERSION VERSION_GREATER 10.0) - set(ARCHS armv7k arm64_32) - else() - set(ARCHS armv7k) - endif() - endif() -elseif(PLATFORM_INT STREQUAL "WATCHOSCOMBINED") - set(SDK_NAME watchos) - if(MODERN_CMAKE) - if(NOT ARCHS) - if (XCODE_VERSION VERSION_GREATER 10.0) - set(ARCHS armv7k arm64_32 i386) - else() - set(ARCHS armv7k i386) - endif() - endif() - else() - message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the WATCHOSCOMBINED setting work") - endif() -elseif(PLATFORM_INT STREQUAL "SIMULATOR_WATCHOS") - set(SDK_NAME watchsimulator) - if(NOT ARCHS) - set(ARCHS i386) - endif() -else() - message(FATAL_ERROR "Invalid PLATFORM: ${PLATFORM_INT}") -endif() -message(STATUS "Configuring ${SDK_NAME} build for platform: ${PLATFORM_INT}, architecture(s): ${ARCHS}") - -if(MODERN_CMAKE AND PLATFORM_INT MATCHES ".*COMBINED" AND NOT USED_CMAKE_GENERATOR MATCHES "Xcode") - message(FATAL_ERROR "The COMBINED options only work with the Xcode generator, -G Xcode") -endif() - -# If the user did not specify the SDK root to use, then query xcodebuild for it. -execute_process(COMMAND xcodebuild -version -sdk ${SDK_NAME} Path - OUTPUT_VARIABLE CMAKE_OSX_SYSROOT_INT - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -if (NOT DEFINED CMAKE_OSX_SYSROOT_INT AND NOT DEFINED CMAKE_OSX_SYSROOT) - message(SEND_ERROR "Please make sure that Xcode is installed and that the toolchain " - "is pointing to the correct path. Please run: " - "sudo xcode-select -s /Applications/Xcode.app/Contents/Developer " - "and see if that fixes the problem for you.") - message(FATAL_ERROR "Invalid CMAKE_OSX_SYSROOT: ${CMAKE_OSX_SYSROOT} " - "does not exist.") -elseif(DEFINED CMAKE_OSX_SYSROOT) - message(STATUS "Using SDK: ${CMAKE_OSX_SYSROOT} for platform: ${PLATFORM_INT} when checking compatibility") -elseif(DEFINED CMAKE_OSX_SYSROOT_INT) - message(STATUS "Using SDK: ${CMAKE_OSX_SYSROOT_INT} for platform: ${PLATFORM_INT}") - set(CMAKE_OSX_SYSROOT "${CMAKE_OSX_SYSROOT_INT}" CACHE INTERNAL "") -endif() - -# Set the Xcode property for SDKROOT as well if the Xcode generator is used -if(USED_CMAKE_GENERATOR MATCHES "Xcode") - set(CMAKE_OSX_SYSROOT "${SDK_NAME}" CACHE INTERNAL "") - if(NOT DEFINED CMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM) - set(CMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM 123456789A CACHE INTERNAL "") - endif() -endif() - -# Specify the minimum version of the deployment target. -if(NOT DEFINED DEPLOYMENT_TARGET) - if (PLATFORM_INT STREQUAL "WATCHOS" OR PLATFORM_INT STREQUAL "SIMULATOR_WATCHOS") - # Unless specified, SDK version 2.0 is used by default as the minimum target version (watchOS). - set(DEPLOYMENT_TARGET "2.0" - CACHE STRING "Minimum SDK version to build for." ) - else() - # Unless specified, SDK version 9.0 is used by default as the minimum target version (iOS, tvOS).
- set(DEPLOYMENT_TARGET "9.0" - CACHE STRING "Minimum SDK version to build for." ) - endif() - message(STATUS "Using the default min-version since DEPLOYMENT_TARGET not provided!") -endif() -# Use bitcode or not -if(NOT DEFINED ENABLE_BITCODE AND NOT ARCHS MATCHES "((^|, )(i386|x86_64))+") - # Unless specified, enable bitcode support by default - message(STATUS "Enabling bitcode support by default. ENABLE_BITCODE not provided!") - set(ENABLE_BITCODE TRUE) -elseif(NOT DEFINED ENABLE_BITCODE) - message(STATUS "Disabling bitcode support by default on simulators. ENABLE_BITCODE not provided for override!") - set(ENABLE_BITCODE FALSE) -endif() -set(ENABLE_BITCODE_INT ${ENABLE_BITCODE} CACHE BOOL "Whether or not to enable bitcode" ${FORCE_CACHE}) -# Use ARC or not -if(NOT DEFINED ENABLE_ARC) - # Unless specified, enable ARC support by default - set(ENABLE_ARC TRUE) - message(STATUS "Enabling ARC support by default. ENABLE_ARC not provided!") -endif() -set(ENABLE_ARC_INT ${ENABLE_ARC} CACHE BOOL "Whether or not to enable ARC" ${FORCE_CACHE}) -# Use hidden visibility or not -if(NOT DEFINED ENABLE_VISIBILITY) - # Unless specified, disable symbol visibility by default - set(ENABLE_VISIBILITY FALSE) - message(STATUS "Hiding symbols by default. ENABLE_VISIBILITY not provided!") -endif() -set(ENABLE_VISIBILITY_INT ${ENABLE_VISIBILITY} CACHE BOOL "Whether or not to hide symbols (-fvisibility=hidden)" ${FORCE_CACHE}) -# Get the SDK version information. -execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion - OUTPUT_VARIABLE SDK_VERSION - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) - -# Find the Developer root for the specific iOS platform being compiled for -# from CMAKE_OSX_SYSROOT. Should be ../../ from the SDK specified in -# CMAKE_OSX_SYSROOT. There does not appear to be a direct way to obtain -# this information from xcrun or xcodebuild. -if (NOT DEFINED CMAKE_DEVELOPER_ROOT AND NOT USED_CMAKE_GENERATOR MATCHES "Xcode") - get_filename_component(PLATFORM_SDK_DIR ${CMAKE_OSX_SYSROOT} PATH) - get_filename_component(CMAKE_DEVELOPER_ROOT ${PLATFORM_SDK_DIR} PATH) - - if (NOT DEFINED CMAKE_DEVELOPER_ROOT) - message(FATAL_ERROR "Invalid CMAKE_DEVELOPER_ROOT: " - "${CMAKE_DEVELOPER_ROOT} does not exist.") - endif() -endif() -# Find the C & C++ compilers for the specified SDK. -if(NOT CMAKE_C_COMPILER) - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang - OUTPUT_VARIABLE CMAKE_C_COMPILER - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) - message(STATUS "Using C compiler: ${CMAKE_C_COMPILER}") -endif() -if(NOT CMAKE_CXX_COMPILER) - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++ - OUTPUT_VARIABLE CMAKE_CXX_COMPILER - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) - message(STATUS "Using CXX compiler: ${CMAKE_CXX_COMPILER}") -endif() -# Find (Apple's) libtool. -execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find libtool - OUTPUT_VARIABLE BUILD_LIBTOOL - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -message(STATUS "Using libtool: ${BUILD_LIBTOOL}") -# Configure libtool to be used instead of ar + ranlib to build static libraries. -# This is required on Xcode 7+, but should also work on previous versions of -# Xcode. The <TARGET> <LINK_FLAGS> <OBJECTS> placeholders are expanded by CMake -# when the archive rule runs. -set(CMAKE_C_CREATE_STATIC_LIBRARY - "${BUILD_LIBTOOL} -static -o <TARGET> <LINK_FLAGS> <OBJECTS> ") -set(CMAKE_CXX_CREATE_STATIC_LIBRARY - "${BUILD_LIBTOOL} -static -o <TARGET> <LINK_FLAGS> <OBJECTS> ") -# Get the version of Darwin (OS X) of the host.
-execute_process(COMMAND uname -r - OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_VERSION - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -# CMake 3.14+ supports building for iOS, watchOS and tvOS out of the box. -if(MODERN_CMAKE) - if(SDK_NAME MATCHES "iphone") - set(CMAKE_SYSTEM_NAME iOS CACHE INTERNAL "" ${FORCE_CACHE}) - elseif(SDK_NAME MATCHES "appletv") - set(CMAKE_SYSTEM_NAME tvOS CACHE INTERNAL "" ${FORCE_CACHE}) - elseif(SDK_NAME MATCHES "watch") - set(CMAKE_SYSTEM_NAME watchOS CACHE INTERNAL "" ${FORCE_CACHE}) - endif() - - # Provide flags for a combined FAT library build on newer CMake versions - if(PLATFORM_INT MATCHES ".*COMBINED") - set(CMAKE_XCODE_ATTRIBUTE_ONLY_ACTIVE_ARCH NO CACHE INTERNAL "") - set(CMAKE_IOS_INSTALL_COMBINED YES CACHE INTERNAL "") - message(STATUS "Will combine built (static) artifacts into FAT lib...") - endif() -else() - # Legacy code path prior to CMake 3.14 - set(CMAKE_SYSTEM_NAME Darwin CACHE INTERNAL "" ${FORCE_CACHE}) -endif() -# Standard settings. -set(CMAKE_SYSTEM_VERSION ${SDK_VERSION} CACHE INTERNAL "") -set(UNIX TRUE CACHE BOOL "") -set(APPLE TRUE CACHE BOOL "") -set(IOS TRUE CACHE BOOL "") -set(CMAKE_AR ar CACHE FILEPATH "" FORCE) -set(CMAKE_RANLIB ranlib CACHE FILEPATH "" FORCE) -set(CMAKE_STRIP strip CACHE FILEPATH "" FORCE) -# Set the architectures for which to build. -set(CMAKE_OSX_ARCHITECTURES ${ARCHS} CACHE STRING "Build architecture for iOS") -# Change the type of target generated for try_compile() so it'll work when cross-compiling -set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) -# All iOS/Darwin specific settings - some may be redundant. -set(CMAKE_SHARED_LIBRARY_PREFIX "lib") -set(CMAKE_SHARED_LIBRARY_SUFFIX ".dylib") -set(CMAKE_SHARED_MODULE_PREFIX "lib") -set(CMAKE_SHARED_MODULE_SUFFIX ".so") -set(CMAKE_C_COMPILER_ABI ELF) -set(CMAKE_CXX_COMPILER_ABI ELF) -set(CMAKE_C_HAS_ISYSROOT 1) -set(CMAKE_CXX_HAS_ISYSROOT 1) -set(CMAKE_MODULE_EXISTS 1) -set(CMAKE_DL_LIBS "") -set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ") -set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ") -set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}") -set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}") - -if(ARCHS MATCHES "((^|, )(arm64|arm64e|x86_64))+") - set(CMAKE_C_SIZEOF_DATA_PTR 8) - set(CMAKE_CXX_SIZEOF_DATA_PTR 8) - if(ARCHS MATCHES "((^|, )(arm64|arm64e))+") - set(CMAKE_SYSTEM_PROCESSOR "arm64") - else() - set(CMAKE_SYSTEM_PROCESSOR "x86_64") - endif() - message(STATUS "Using a data_ptr size of 8") -else() - set(CMAKE_C_SIZEOF_DATA_PTR 4) - set(CMAKE_CXX_SIZEOF_DATA_PTR 4) - set(CMAKE_SYSTEM_PROCESSOR "arm") - message(STATUS "Using a data_ptr size of 4") -endif() - -message(STATUS "Building for minimum ${SDK_NAME} version: ${DEPLOYMENT_TARGET}" " (SDK version: ${SDK_VERSION})") -# Note that only Xcode 7+ supports the newer, more specific -# -m${SDK_NAME}-version-min flags; older versions of Xcode use -# -m(ios/ios-simulator)-version-min instead. -if(PLATFORM_INT STREQUAL "OS" OR PLATFORM_INT STREQUAL "OS64") - if(XCODE_VERSION VERSION_LESS 7.0) - set(SDK_NAME_VERSION_FLAGS - "-mios-version-min=${DEPLOYMENT_TARGET}") - else() - # Xcode 7.0+ uses flags we can build directly from SDK_NAME.
- set(SDK_NAME_VERSION_FLAGS - "-m${SDK_NAME}-version-min=${DEPLOYMENT_TARGET}") - endif() -elseif(PLATFORM_INT STREQUAL "TVOS") - set(SDK_NAME_VERSION_FLAGS - "-mtvos-version-min=${DEPLOYMENT_TARGET}") -elseif(PLATFORM_INT STREQUAL "SIMULATOR_TVOS") - set(SDK_NAME_VERSION_FLAGS - "-mtvos-simulator-version-min=${DEPLOYMENT_TARGET}") -elseif(PLATFORM_INT STREQUAL "WATCHOS") - set(SDK_NAME_VERSION_FLAGS - "-mwatchos-version-min=${DEPLOYMENT_TARGET}") -elseif(PLATFORM_INT STREQUAL "SIMULATOR_WATCHOS") - set(SDK_NAME_VERSION_FLAGS - "-mwatchos-simulator-version-min=${DEPLOYMENT_TARGET}") -else() - # SIMULATOR or SIMULATOR64 both use -mios-simulator-version-min. - set(SDK_NAME_VERSION_FLAGS - "-mios-simulator-version-min=${DEPLOYMENT_TARGET}") -endif() -message(STATUS "Version flags set to: ${SDK_NAME_VERSION_FLAGS}") -set(CMAKE_OSX_DEPLOYMENT_TARGET ${DEPLOYMENT_TARGET} CACHE STRING - "Set CMake deployment target" ${FORCE_CACHE}) - -if(ENABLE_BITCODE_INT) - set(BITCODE "-fembed-bitcode") - set(CMAKE_XCODE_ATTRIBUTE_BITCODE_GENERATION_MODE bitcode CACHE INTERNAL "") - message(STATUS "Enabling bitcode support.") -else() - set(BITCODE "") - set(CMAKE_XCODE_ATTRIBUTE_ENABLE_BITCODE NO CACHE INTERNAL "") - message(STATUS "Disabling bitcode support.") -endif() - -if(ENABLE_ARC_INT) - set(FOBJC_ARC "-fobjc-arc") - set(CMAKE_XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC YES CACHE INTERNAL "") - message(STATUS "Enabling ARC support.") -else() - set(FOBJC_ARC "-fno-objc-arc") - set(CMAKE_XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC NO CACHE INTERNAL "") - message(STATUS "Disabling ARC support.") -endif() - -if(NOT ENABLE_VISIBILITY_INT) - set(VISIBILITY "-fvisibility=hidden") - set(CMAKE_XCODE_ATTRIBUTE_GCC_SYMBOLS_PRIVATE_EXTERN YES CACHE INTERNAL "") - message(STATUS "Hiding symbols (-fvisibility=hidden).") -else() - set(VISIBILITY "") - set(CMAKE_XCODE_ATTRIBUTE_GCC_SYMBOLS_PRIVATE_EXTERN NO CACHE INTERNAL "") -endif() - -# Check if the Xcode generator is used, since it will handle these flags automagically -if(USED_CMAKE_GENERATOR MATCHES "Xcode") - message(STATUS "Not setting any manual command-line build flags, since Xcode is selected as generator.") -else() - set(CMAKE_C_FLAGS - "${SDK_NAME_VERSION_FLAGS} ${BITCODE} -fobjc-abi-version=2 ${FOBJC_ARC} ${CMAKE_C_FLAGS}") - # Hidden visibility is required for C++ on iOS. - set(CMAKE_CXX_FLAGS - "${SDK_NAME_VERSION_FLAGS} ${BITCODE} ${VISIBILITY} -fvisibility-inlines-hidden -fobjc-abi-version=2 ${FOBJC_ARC} ${CMAKE_CXX_FLAGS}") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -O0 -g ${CMAKE_CXX_FLAGS_DEBUG}") - set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS} -DNDEBUG -Os -ffast-math ${CMAKE_CXX_FLAGS_MINSIZEREL}") - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} -DNDEBUG -O2 -g -ffast-math ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -DNDEBUG -O3 -ffast-math ${CMAKE_CXX_FLAGS_RELEASE}") - set(CMAKE_C_LINK_FLAGS "${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}") - set(CMAKE_CXX_LINK_FLAGS "${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}") - - # In order to ensure that the updated compiler flags are used in try_compile() - # tests, we have to forcibly set them in the CMake cache, not merely set them - # in the local scope.
- list(APPEND VARS_TO_FORCE_IN_CACHE - CMAKE_C_FLAGS - CMAKE_CXX_FLAGS - CMAKE_CXX_FLAGS_DEBUG - CMAKE_CXX_FLAGS_RELWITHDEBINFO - CMAKE_CXX_FLAGS_MINSIZEREL - CMAKE_CXX_FLAGS_RELEASE - CMAKE_C_LINK_FLAGS - CMAKE_CXX_LINK_FLAGS) - foreach(VAR_TO_FORCE ${VARS_TO_FORCE_IN_CACHE}) - set(${VAR_TO_FORCE} "${${VAR_TO_FORCE}}" CACHE STRING "") - endforeach() -endif() - -set(CMAKE_PLATFORM_HAS_INSTALLNAME 1) -set(CMAKE_SHARED_LINKER_FLAGS "-rpath @executable_path/Frameworks -rpath @loader_path/Frameworks") -set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -Wl,-headerpad_max_install_names") -set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -Wl,-headerpad_max_install_names") -set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,") -set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,") -set(CMAKE_FIND_LIBRARY_SUFFIXES ".tbd" ".dylib" ".so" ".a") -set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG "-install_name") - -# Hack: if a new CMake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old -# build tree (where install_name_tool was hardcoded), where -# CMAKE_INSTALL_NAME_TOOL isn't in the cache, and CMake still didn't fail in -# CMakeFindBinUtils.cmake (because it isn't rerun), hardcode -# CMAKE_INSTALL_NAME_TOOL here to install_name_tool so it behaves as it did -# before, Alex. -if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL) - find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool) -endif(NOT DEFINED CMAKE_INSTALL_NAME_TOOL) - -# Set the find root to the iOS developer roots and to user-defined paths. -set(CMAKE_FIND_ROOT_PATH ${CMAKE_DEVELOPER_ROOT} ${CMAKE_OSX_SYSROOT_INT} - ${CMAKE_PREFIX_PATH} CACHE STRING "Root path that will be prepended to all search paths") -# Default to searching for frameworks first. -set(CMAKE_FIND_FRAMEWORK FIRST) -# Set up the default search directories for frameworks. -set(CMAKE_FRAMEWORK_PATH - ${CMAKE_DEVELOPER_ROOT}/Library/Frameworks - ${CMAKE_DEVELOPER_ROOT}/Library/PrivateFrameworks - ${CMAKE_OSX_SYSROOT_INT}/System/Library/Frameworks - ${CMAKE_FRAMEWORK_PATH} CACHE STRING "Frameworks search paths") - -# By default, search both the specified iOS SDK and the remainder of the host filesystem. -if(NOT CMAKE_FIND_ROOT_PATH_MODE_PROGRAM) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM BOTH CACHE STRING "" ${FORCE_CACHE}) -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_LIBRARY) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH CACHE STRING "" ${FORCE_CACHE}) -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_INCLUDE) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH CACHE STRING "" ${FORCE_CACHE}) -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH CACHE STRING "" ${FORCE_CACHE}) -endif() - -# -# Some helper macros below to simplify and beautify the CMakeFile -# - -# This little macro lets you set any Xcode-specific property. -macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE XCODE_RELVERSION) - set(XCODE_RELVERSION_I "${XCODE_RELVERSION}") - if(XCODE_RELVERSION_I STREQUAL "All") - set_property(TARGET ${TARGET} PROPERTY - XCODE_ATTRIBUTE_${XCODE_PROPERTY} "${XCODE_VALUE}") - else() - set_property(TARGET ${TARGET} PROPERTY - XCODE_ATTRIBUTE_${XCODE_PROPERTY}[variant=${XCODE_RELVERSION_I}] "${XCODE_VALUE}") - endif() -endmacro(set_xcode_property) -# This macro lets you find executable programs on the host system.
-macro(find_host_package) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE NEVER) - set(IOS FALSE) - find_package(${ARGN}) - set(IOS TRUE) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM BOTH) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH) -endmacro(find_host_package) diff --git a/cmake/cross_compiling/npu.cmake b/cmake/cross_compiling/npu.cmake deleted file mode 100644 index 863200986c..0000000000 --- a/cmake/cross_compiling/npu.cmake +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if(NOT LITE_WITH_NPU) - return() -endif() - -if(NOT DEFINED NPU_DDK_ROOT) - set(NPU_DDK_ROOT $ENV{NPU_DDK_ROOT}) - if(NOT NPU_DDK_ROOT) - message(FATAL_ERROR "Must set NPU_DDK_ROOT or the env var NPU_DDK_ROOT when LITE_WITH_NPU=ON") - endif() -endif() - -message(STATUS "NPU_DDK_ROOT: ${NPU_DDK_ROOT}") -find_path(NPU_DDK_INC NAMES HiAiModelManagerService.h - PATHS ${NPU_DDK_ROOT}/include NO_DEFAULT_PATH) -if(NOT NPU_DDK_INC) - message(FATAL_ERROR "Cannot find HiAiModelManagerService.h in ${NPU_DDK_ROOT}/include") -endif() - -include_directories("${NPU_DDK_ROOT}") - -set(NPU_SUB_LIB_PATH "lib64") -if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") - set(NPU_SUB_LIB_PATH "lib64") -endif() - -if(ARM_TARGET_ARCH_ABI STREQUAL "armv7") - set(NPU_SUB_LIB_PATH "lib") -endif() - -find_library(NPU_DDK_HIAI_FILE NAMES hiai - PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) - -find_library(NPU_DDK_IR_FILE NAMES hiai_ir - PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) - -find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build - PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) - -find_library(NPU_DDK_PROTO_FILE NAMES protobuf-lite - PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) - -if(NOT NPU_DDK_HIAI_FILE) - message(FATAL_ERROR "Cannot find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}") -else() - message(STATUS "Found NPU_DDK HIAI Library: ${NPU_DDK_HIAI_FILE}") - add_library(npu_ddk_hiai SHARED IMPORTED GLOBAL) - set_property(TARGET npu_ddk_hiai PROPERTY IMPORTED_LOCATION ${NPU_DDK_HIAI_FILE}) -endif() - -if(NOT NPU_DDK_IR_FILE) - message(FATAL_ERROR "Cannot find NPU_DDK_IR_FILE in ${NPU_DDK_ROOT}") -else() - message(STATUS "Found NPU_DDK IR Library: ${NPU_DDK_IR_FILE}") - add_library(npu_ddk_ir SHARED IMPORTED GLOBAL) - set_property(TARGET npu_ddk_ir PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_FILE}) -endif() - -if(NOT NPU_DDK_IR_BUILD_FILE) - message(FATAL_ERROR "Cannot find NPU_DDK_IR_BUILD_FILE in ${NPU_DDK_ROOT}") -else() - message(STATUS "Found NPU_DDK IR_BUILD Library: ${NPU_DDK_IR_BUILD_FILE}") - add_library(npu_ddk_ir_build SHARED IMPORTED GLOBAL) - set_property(TARGET npu_ddk_ir_build PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_BUILD_FILE}) -endif() - -if(NOT NPU_DDK_PROTO_FILE) - message(FATAL_ERROR "Cannot find NPU_DDK_PROTO_FILE in ${NPU_DDK_ROOT}")
-else() - message(STATUS "Found NPU_DDK Protobuf Library: ${NPU_DDK_PROTO_FILE}") - add_library(npu_ddk_proto SHARED IMPORTED GLOBAL) - set_property(TARGET npu_ddk_proto PROPERTY IMPORTED_LOCATION ${NPU_DDK_PROTO_FILE}) -endif() - -set(npu_ddk_libs npu_ddk_hiai npu_ddk_ir npu_ddk_ir_build npu_ddk_proto CACHE INTERNAL "npu ddk libs") - - diff --git a/cmake/cross_compiling/postproject.cmake b/cmake/cross_compiling/postproject.cmake deleted file mode 100644 index 33254df03c..0000000000 --- a/cmake/cross_compiling/postproject.cmake +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - return() -endif() - -include(CheckCXXCompilerFlag) - -if(ANDROID) - include(cross_compiling/findar) - - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -llog -fPIC") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog -fPIC") -endif() - -if(ARMLINUX) - if(ARMLINUX_ARCH_ABI STREQUAL "armv8") - set(CMAKE_CXX_FLAGS "-march=armv8-a ${CMAKE_CXX_FLAGS}") - set(CMAKE_C_FLAGS "-march=armv8-a ${CMAKE_C_FLAGS}") - message(STATUS "NEON is enabled on arm64-v8a") - endif() - - if(ARMLINUX_ARCH_ABI STREQUAL "armv7") - set(CMAKE_CXX_FLAGS "-march=armv7-a -mfloat-abi=softfp -mfpu=neon-vfpv4 ${CMAKE_CXX_FLAGS}") - set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=softfp -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}") - message(STATUS "NEON is enabled on arm-v7a with softfp") - endif() - - if(ARMLINUX_ARCH_ABI STREQUAL "armv7hf") - set(CMAKE_CXX_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_CXX_FLAGS}") - set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}" ) - message(STATUS "NEON is enabled on arm-v7a with hard float") - endif() -endif() - -function(check_linker_flag) - foreach(flag ${ARGN}) - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${flag}") - check_cxx_compiler_flag("" out_var) - if(${out_var}) - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${flag}") - endif() - endforeach() - set(CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS} PARENT_SCOPE) -endfunction() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -if (LITE_ON_TINY_PUBLISH) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fno-exceptions -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flto -fvisibility=hidden -fvisibility-inlines-hidden -fdata-sections -ffunction-sections") - check_linker_flag(-Wl,--gc-sections) -endif() - -if(LITE_WITH_OPENMP) - find_package(OpenMP REQUIRED) - if(OPENMP_FOUND OR OpenMP_CXX_FOUND) - add_definitions(-DARM_WITH_OMP) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - message(STATUS "Found OpenMP ${OpenMP_VERSION} ${OpenMP_CXX_VERSION}") - message(STATUS "OpenMP C flags: ${OpenMP_C_FLAGS}") - message(STATUS "OpenMP CXX flags: ${OpenMP_CXX_FLAGS}") - message(STATUS "OpenMP OpenMP_CXX_LIB_NAMES: 
${OpenMP_CXX_LIB_NAMES}") - message(STATUS "OpenMP OpenMP_CXX_LIBRARIES: ${OpenMP_CXX_LIBRARIES}") - else() - message(FATAL_ERROR "Could not found OpenMP!") - endif() -endif() - -# third party cmake args -set(CROSS_COMPILE_CMAKE_ARGS - "-DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}" - "-DCMAKE_SYSTEM_VERSION=${CMAKE_SYSTEM_VERSION}") - -if(ANDROID) - set(CROSS_COMPILE_CMAKE_ARGS ${CROSS_COMPILE_CMAKE_ARGS} - "-DCMAKE_ANDROID_ARCH_ABI=${CMAKE_ANDROID_ARCH_ABI}" - "-DCMAKE_ANDROID_NDK=${CMAKE_ANDROID_NDK}" - "-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}" - "-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}") -endif() - -if(IOS) - set(CROSS_COMPILE_CMAKE_ARGS ${CROSS_COMPILE_CMAKE_ARGS} - "-DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}" - "-DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}" - "-DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT}") -endif() diff --git a/cmake/cross_compiling/preproject.cmake b/cmake/cross_compiling/preproject.cmake deleted file mode 100644 index 813d1910fc..0000000000 --- a/cmake/cross_compiling/preproject.cmake +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - return() -endif() - -cmake_minimum_required(VERSION 3.10) - -# define check function -function(check_input_var VAR_NAME) - set(options "") - set(oneValueArgs "") - set(multiValueArgs DEFAULT LIST) - cmake_parse_arguments(check_input_var "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - set(var_out "") - if(NOT DEFINED ${VAR_NAME}) - set(var_out ${check_input_var_DEFAULT}) - else() - set(var_out ${${VAR_NAME}}) - endif() - - if(NOT var_out IN_LIST check_input_var_LIST) - message(FATAL_ERROR "${VAR_NAME}:${var_out} must be in one of ${check_input_var_LIST}") - endif() - set(${VAR_NAME} ${var_out} PARENT_SCOPE) -endfunction(check_input_var) - -check_input_var(ARM_TARGET_OS DEFAULT "android" LIST "android" "armlinux" "ios" "ios64") -check_input_var(ARM_TARGET_ARCH_ABI DEFAULT "armv8" LIST "armv8" "armv7" "armv7hf" "arm64-v8a" "armeabi-v7a") -check_input_var(ARM_TARGET_LANG DEFAULT "gcc" LIST "gcc" "clang") -check_input_var(ARM_TARGET_LIB_TYPE DEFAULT "static" LIST "static" "shared") - -include(cross_compiling/armlinux) -include(cross_compiling/android) -include(cross_compiling/ios) -include(cross_compiling/host) - -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Default use Release in android" FORCE) -endif() - -if(NOT THIRD_PARTY_BUILD_TYPE) - set(THIRD_PARTY_BUILD_TYPE "MinSizeRel" CACHE STRING "Default use MinSizeRel in android" FORCE) -endif() - -message(STATUS "Lite ARM Compile ${ARM_TARGET_OS} with ${ARM_TARGET_ARCH_ABI} ${ARM_TARGET_LANG}") diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake deleted file mode 100644 index 1e6f34a621..0000000000 --- a/cmake/cuda.cmake +++ /dev/null @@ -1,228 +0,0 @@ -if(NOT LITE_WITH_CUDA) - return() -endif() - -set(paddle_known_gpu_archs "30 35 50 52 60 61 70") 
-set(paddle_known_gpu_archs7 "30 35 50 52") -set(paddle_known_gpu_archs8 "30 35 50 52 60 61") -set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70") -set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75") - -###################################################################################### -# A function for automatic detection of installed GPUs (if autodetection is enabled) -# Usage: -# detect_installed_gpus(out_variable) -function(detect_installed_gpus out_variable) - if(NOT CUDA_gpu_detect_output) - set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) - - file(WRITE ${cufile} "" - "#include <cstdio>\n" - "int main() {\n" - " int count = 0;\n" - " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n" - " if (count == 0) return -1;\n" - " for (int device = 0; device < count; ++device) {\n" - " cudaDeviceProp prop;\n" - " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" - " std::printf(\"%d.%d \", prop.major, prop.minor);\n" - " }\n" - " return 0;\n" - "}\n") - - execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}" - "--run" "${cufile}" - WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" - RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - - if(nvcc_res EQUAL 0) - # only keep the last line of nvcc_out - STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}") - STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}") - list(GET nvcc_out -1 nvcc_out) - string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}") - set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architectures from detect_installed_gpus tool" FORCE) - endif() - endif() - - if(NOT CUDA_gpu_detect_output) - message(STATUS "Automatic GPU detection failed. Building for all known architectures.") - set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE) - else() - set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) - endif() -endfunction() - - -######################################################################## -# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME -# Usage: -# select_nvcc_arch_flags(out_variable) -function(select_nvcc_arch_flags out_variable) - # List of arch names - set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual") - set(archs_name_default "All") - list(APPEND archs_names "Auto") - - # set CUDA_ARCH_NAME strings (so it will be seen as a drop-down list in cmake-gui) - set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU architecture.") - set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} ) - mark_as_advanced(CUDA_ARCH_NAME) - - # verify CUDA_ARCH_NAME value - if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};") - string(REPLACE ";" ", " archs_names "${archs_names}") - message(FATAL_ERROR "Only ${archs_names} architecture names are supported.") - endif() - - if(${CUDA_ARCH_NAME} STREQUAL "Manual") - set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") - set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") - mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) - else() - unset(CUDA_ARCH_BIN CACHE) - unset(CUDA_ARCH_PTX CACHE) - endif() - - if(${CUDA_ARCH_NAME} STREQUAL "Kepler") - set(cuda_arch_bin "30 35") - elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") - set(cuda_arch_bin "50") - elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") - set(cuda_arch_bin "60 61") -
elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") - set(cuda_arch_bin "70") - elseif(${CUDA_ARCH_NAME} STREQUAL "Turing") - set(cuda_arch_bin "75") - elseif(${CUDA_ARCH_NAME} STREQUAL "All") - set(cuda_arch_bin ${paddle_known_gpu_archs}) - elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") - detect_installed_gpus(cuda_arch_bin) - else() # (${CUDA_ARCH_NAME} STREQUAL "Manual") - set(cuda_arch_bin ${CUDA_ARCH_BIN}) - endif() - - # remove dots and convert to lists - string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}") - string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}") - string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}") - string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}") - list(REMOVE_DUPLICATES cuda_arch_bin) - list(REMOVE_DUPLICATES cuda_arch_ptx) - - set(nvcc_flags "") - set(nvcc_archs_readable "") - - # Tell NVCC to add binaries for the specified GPUs - foreach(arch ${cuda_arch_bin}) - if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)") - # User explicitly specified PTX for the concrete BIN - list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) - list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1}) - else() - # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN - list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch}) - list(APPEND nvcc_archs_readable sm_${arch}) - endif() - endforeach() - - # Tell NVCC to add PTX intermediate code for the specified architectures - foreach(arch ${cuda_arch_ptx}) - list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch}) - list(APPEND nvcc_archs_readable compute_${arch}) - endforeach() - - string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}") - set(${out_variable} ${nvcc_flags} PARENT_SCOPE) - set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE) -endfunction() - -message(STATUS "CUDA detected: " ${CUDA_VERSION}) -if (${CUDA_VERSION} LESS 7.0) - set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) - add_definitions("-DPADDLE_CUDA_BINVER=\"60\"") -elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) - list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") - list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") - add_definitions("-DPADDLE_CUDA_BINVER=\"70\"") -elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) - list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") - list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") - # CUDA 8 may complain that sm_20 is no longer supported. Suppress the - # warning for now. 
- list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") - add_definitions("-DPADDLE_CUDA_BINVER=\"80\"") -elseif (${CUDA_VERSION} LESS 10.0) # CUDA 9.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs9}) - list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") - list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") - add_definitions("-DPADDLE_CUDA_BINVER=\"90\"") -elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs10}) - list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") - list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") - add_definitions("-DPADDLE_CUDA_BINVER=\"100\"") -endif() - -include_directories(${CUDA_INCLUDE_DIRS}) -if(NOT WITH_DSO) - if(WIN32) - set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) - endif(WIN32) -endif(NOT WITH_DSO) - -# setting nvcc arch flags -select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) -list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) -message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}") - -# Set C++11 support -set(CUDA_PROPAGATE_HOST_FLAGS OFF) - -# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. -# So, don't set these flags here. -if (NOT WIN32) # windows msvc2015 support c++11 natively. -# -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake. -list(APPEND CUDA_NVCC_FLAGS "-std=c++11") -list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC") -endif(NOT WIN32) - -if(WITH_FAST_MATH) - # Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - list(APPEND CUDA_NVCC_FLAGS "--use_fast_math") -endif() -# in cuda9, suppress cuda warning on eigen -list(APPEND CUDA_NVCC_FLAGS "-w") -# Set :expt-relaxed-constexpr to suppress Eigen warnings -list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") - -if (NOT WIN32) - if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) - elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) - elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) - elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") - # nvcc 9 does not support -Os. Use Release flags instead - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) - endif() -else(NOT WIN32) - list(APPEND CUDA_NVCC_FLAGS "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"") - list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj") - if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS "-g -G") - # match the cl's _ITERATOR_DEBUG_LEVEL - list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG") - elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") - else() - message(FATAL "Windows only support Release or Debug build now. 
Please set the Visual Studio build type to Release/Debug, x64 build.") -endif() -endif(NOT WIN32) - -mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) -mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake deleted file mode 100644 index 3775d6cc2b..0000000000 --- a/cmake/cudnn.cmake +++ /dev/null @@ -1,99 +0,0 @@ -if(NOT LITE_WITH_CUDA) - return() -endif() - -if(WIN32) - set(CUDNN_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) -else(WIN32) - set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT") -endif(WIN32) - -find_path(CUDNN_INCLUDE_DIR cudnn.h - PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include - $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE} - NO_DEFAULT_PATH -) - -get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) - -set(TARGET_ARCH "x86_64") -if(CMAKE_SYSTEM_PROCESSOR) # use the detected processor when CMake reports one - set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) -endif() - -list(APPEND CUDNN_CHECK_LIBRARY_DIRS - ${CUDNN_ROOT} - ${CUDNN_ROOT}/lib64 - ${CUDNN_ROOT}/lib - ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu - ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/ - $ENV{CUDNN_ROOT} - $ENV{CUDNN_ROOT}/lib64 - $ENV{CUDNN_ROOT}/lib - /usr/lib - ${CUDA_TOOLKIT_ROOT_DIR} - ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 - ) -set(CUDNN_LIB_NAME "libcudnn.so") - -if(WIN32) -# only cuDNN 7 is supported -set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll") -endif(WIN32) - -if(APPLE) -set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so") -endif(APPLE) - -find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a - PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist} - NO_DEFAULT_PATH - DOC "Path to cuDNN library.") - - -if(CUDNN_INCLUDE_DIR AND CUDNN_LIBRARY) - set(CUDNN_FOUND ON) -else() - set(CUDNN_FOUND OFF) -endif() - -if(CUDNN_FOUND) - file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS) - - get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY) - - string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)" - CUDNN_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define CUDNN_VERSION +([0-9]+)" "\\1" - CUDNN_VERSION "${CUDNN_VERSION}") - - if("${CUDNN_VERSION}" STREQUAL "2000") - message(STATUS "Current cuDNN version is v2. ") - else() - string(REGEX MATCH "define CUDNN_MAJOR +([0-9]+)" CUDNN_MAJOR_VERSION - "${CUDNN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define CUDNN_MAJOR +([0-9]+)" "\\1" - CUDNN_MAJOR_VERSION "${CUDNN_MAJOR_VERSION}") - string(REGEX MATCH "define CUDNN_MINOR +([0-9]+)" CUDNN_MINOR_VERSION - "${CUDNN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define CUDNN_MINOR +([0-9]+)" "\\1" - CUDNN_MINOR_VERSION "${CUDNN_MINOR_VERSION}") - string(REGEX MATCH "define CUDNN_PATCHLEVEL +([0-9]+)" - CUDNN_PATCHLEVEL_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define CUDNN_PATCHLEVEL +([0-9]+)" "\\1" - CUDNN_PATCHLEVEL_VERSION "${CUDNN_PATCHLEVEL_VERSION}") - - if(NOT CUDNN_MAJOR_VERSION) - set(CUDNN_VERSION "???") - else() - add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"") - math(EXPR CUDNN_VERSION - "${CUDNN_MAJOR_VERSION} * 1000 + - ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") - endif() - - message(STATUS "Current cuDNN header is ${CUDNN_INCLUDE_DIR}/cudnn.h. " - "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.
") - - endif() -endif() diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake deleted file mode 100644 index 72ed0f1e58..0000000000 --- a/cmake/cupti.cmake +++ /dev/null @@ -1,41 +0,0 @@ -if(NOT WITH_GPU) - return() -endif() - - -set(CUPTI_ROOT "/usr" CACHE PATH "CUPTI ROOT") -find_path(CUPTI_INCLUDE_DIR cupti.h - PATHS ${CUPTI_ROOT} ${CUPTI_ROOT}/include - $ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include - ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include - NO_DEFAULT_PATH - ) - -get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) - -set(TARGET_ARCH "x86_64") -if(NOT ${CMAKE_SYSTEM_PROCESSOR}) - set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) -endif() - -list(APPEND CUPTI_CHECK_LIBRARY_DIRS - ${CUPTI_ROOT} - ${CUPTI_ROOT}/lib64 - ${CUPTI_ROOT}/lib - ${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu - $ENV{CUPTI_ROOT} - $ENV{CUPTI_ROOT}/lib64 - $ENV{CUPTI_ROOT}/lib - /usr/lib - ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64) -find_library(CUPTI_LIBRARY NAMES libcupti.so libcupti.dylib # libcupti_static.a - PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist} - NO_DEFAULT_PATH - DOC "Path to cuPTI library.") - -get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY) -if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY) - set(CUPTI_FOUND ON) -else() - set(CUPTI_FOUND OFF) -endif() diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake deleted file mode 100644 index bd0d117a63..0000000000 --- a/cmake/external/eigen.cmake +++ /dev/null @@ -1,54 +0,0 @@ -INCLUDE(ExternalProject) - -SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) -SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3) -INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) -if(NOT WITH_FAST_MATH) - # EIGEN_FAST_MATH: https://eigen.tuxfamily.org/dox/TopicPreprocessorDirectives.html - # enables some optimizations which might affect the accuracy of the result. - # This currently enables the SSE vectorization of sin() and cos(), - # and speedups sqrt() for single precision. - # Defined to 1 by default. Define it to 0 to disable. - add_definitions(-DEIGEN_FAST_MATH=0) -endif() - -if(WITH_AMD_GPU) - ExternalProject_Add( - extern_eigen3 - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" - GIT_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e - PREFIX ${EIGEN_SOURCE_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" - ) -else() - ExternalProject_Add( - extern_eigen3 - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" - # eigen on cuda9.1 missing header of math_funtions.hpp - # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen - GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c - PREFIX ${EIGEN_SOURCE_DIR} - DOWNLOAD_NAME "eigen" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" - ) -endif() - -if (${CMAKE_VERSION} VERSION_LESS "3.3.0") - set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c) - file(WRITE ${dummyfile} "const char *dummy_eigen3 = \"${dummyfile}\";") - add_library(eigen3 STATIC ${dummyfile}) -else() - add_library(eigen3 INTERFACE) -endif() - -add_dependencies(eigen3 extern_eigen3) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake deleted file mode 100644 index 44ede96171..0000000000 --- a/cmake/external/gflags.cmake +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -INCLUDE(ExternalProject) - -SET(GFLAGS_SOURCES_DIR ${CMAKE_SOURCE_DIR}/third-party/gflags) -SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags) -SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE) -IF(WIN32) - set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) -ELSE(WIN32) - set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) -ENDIF(WIN32) - -INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) - -SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" - "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" - "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" - "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}") - -ExternalProject_Add( - extern_gflags - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "" - GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a - SOURCE_DIR ${GFLAGS_SOURCES_DIR} - PREFIX ${GFLAGS_INCLUDE_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DBUILD_STATIC_LIBS=ON - -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DBUILD_TESTING=OFF - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${CROSS_COMPILE_CMAKE_ARGS} - ${OPTIONAL_ARGS} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -) -IF(WIN32) - IF(NOT EXISTS "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib") - add_custom_command(TARGET extern_gflags POST_BUILD - COMMAND cmake -E copy ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib - ) - ENDIF() -ENDIF(WIN32) -ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES}) -ADD_DEPENDENCIES(gflags extern_gflags) - -# On Windows (including MinGW), the Shlwapi library is used by gflags if available. -if (WIN32) - include(CheckIncludeFileCXX) - check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI) - if (HAVE_SHLWAPI) - set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib) - endif(HAVE_SHLWAPI) -endif (WIN32) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake deleted file mode 100644 index 970020d784..0000000000 --- a/cmake/external/glog.cmake +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -INCLUDE(ExternalProject) - -SET(GLOG_SOURCES_DIR ${THIRD_PARTY_PATH}/glog) -SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog) -SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE) - -IF(WIN32) - SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE) - SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530") -ELSE(WIN32) - SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE) - SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -ENDIF(WIN32) - -INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) - -SET(GLOG_REPOSITORY "https://github.com/google/glog.git") -SET(GLOG_TAG "v0.3.5") - -SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" - "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" - "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" - "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}") - -ExternalProject_Add( - extern_glog - ${EXTERNAL_PROJECT_LOG_ARGS} - DEPENDS gflags - GIT_REPOSITORY ${GLOG_REPOSITORY} - GIT_TAG ${GLOG_TAG} - PREFIX ${GLOG_SOURCES_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS ${CROSS_COMPILE_CMAKE_ARGS} - ${OPTIONAL_ARGS} - -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DWITH_GFLAGS=ON - -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags - -DBUILD_TESTING=OFF - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -) -IF(WIN32) - IF(NOT EXISTS "${GLOG_INSTALL_DIR}/lib/libglog.lib") - add_custom_command(TARGET extern_glog POST_BUILD - COMMAND cmake -E copy ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib - ) - ENDIF() -ENDIF(WIN32) - -ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) -ADD_DEPENDENCIES(glog extern_glog gflags) -LINK_LIBRARIES(glog gflags) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake deleted file mode 100644 index 0df39138dd..0000000000 --- a/cmake/external/gtest.cmake +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# gtest is only used when WITH_TESTING=ON -IF(WITH_TESTING) - ENABLE_TESTING() - - INCLUDE(ExternalProject) - - SET(GTEST_SOURCES_DIR ${CMAKE_SOURCE_DIR}/third-party/googletest) - SET(GTEST_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gtest) - SET(GTEST_INCLUDE_DIR "${GTEST_INSTALL_DIR}/include" CACHE PATH "gtest include directory." FORCE) - - INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIR}) - - IF(WIN32) - set(GTEST_LIBRARIES - "${GTEST_INSTALL_DIR}/lib/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE) - set(GTEST_MAIN_LIBRARIES - "${GTEST_INSTALL_DIR}/lib/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE) - ELSE(WIN32) - set(GTEST_LIBRARIES - "${GTEST_INSTALL_DIR}/lib/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE) - set(GTEST_MAIN_LIBRARIES - "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE) - ENDIF(WIN32) - - IF(WITH_MKLML) - # wait for the MKLML download to complete - SET(GTEST_DEPENDS ${MKLML_PROJECT}) - ENDIF() - - SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" - "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" - "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" - "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}") - - ExternalProject_Add( - extern_gtest - ${EXTERNAL_PROJECT_LOG_ARGS} - DEPENDS ${GTEST_DEPENDS} - GIT_REPOSITORY "" - SOURCE_DIR ${GTEST_SOURCES_DIR} - GIT_TAG "release-1.8.0" - PREFIX ${GTEST_INSTALL_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS ${CROSS_COMPILE_CMAKE_ARGS} - ${OPTIONAL_ARGS} - -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DBUILD_GMOCK=ON - -Dgtest_disable_pthreads=ON - -Dgtest_force_shared_crt=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - ) - - ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL) - SET_PROPERTY(TARGET gtest PROPERTY IMPORTED_LOCATION ${GTEST_LIBRARIES}) - ADD_DEPENDENCIES(gtest extern_gtest) - - ADD_LIBRARY(gtest_main STATIC IMPORTED GLOBAL) - SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES}) - ADD_DEPENDENCIES(gtest_main extern_gtest) - -ENDIF() diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake deleted file mode 100644 index 69cdba7c59..0000000000 --- a/cmake/external/libxsmm.cmake +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
-# - -OPTION(WITH_LIBXSMM "Compile with libxsmm" OFF) - -IF(NOT WITH_LIBXSMM) - return() -ENDIF() - -IF(WIN32 OR APPLE) - MESSAGE(WARNING "Windows and Mac are not supported with libxsmm in Paddle yet.") - SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE) - return() -ENDIF() - -INCLUDE (ExternalProject) - -SET(LIBXSMM_SOURCES_DIR ${THIRD_PARTY_PATH}/libxsmm) -SET(LIBXSMM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/libxsmm) -SET(LIBXSMM_INCLUDE_DIR "${LIBXSMM_INSTALL_DIR}/include" CACHE PATH "LIBXSMM include directory." FORCE) -SET(LIBXSMM_LIBRARY_DIR "${LIBXSMM_INSTALL_DIR}/lib" CACHE PATH "LIBXSMM library directory." FORCE) -SET(LIBXSMM_LIBS "${LIBXSMM_LIBRARY_DIR}/libxsmm.a" - "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") - -ExternalProject_Add( - extern_libxsmm - GIT_REPOSITORY "https://github.com/hfp/libxsmm.git" - GIT_TAG "7cc03b5b342fdbc6b6d990b190671c5dbb8489a2" - PREFIX ${LIBXSMM_SOURCES_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_IN_SOURCE 1 - BUILD_COMMAND $(MAKE) --silent PREFIX=${LIBXSMM_INSTALL_DIR} CXX=g++ CC=gcc WARP=0 install - INSTALL_COMMAND "" -) -ADD_LIBRARY(libxsmm STATIC IMPORTED GLOBAL) -# NOTE: the second SET_PROPERTY below overwrites the first, so the imported -# target effectively points at libxsmmnoblas.a only. -SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmm.a") -SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") - -MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}") -include_directories(${LIBXSMM_INCLUDE_DIR}) -ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM) -ADD_DEPENDENCIES(libxsmm extern_libxsmm) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake deleted file mode 100644 index b1e437a900..0000000000 --- a/cmake/external/mkldnn.cmake +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -IF(NOT ${WITH_MKLDNN}) - return() -ENDIF(NOT ${WITH_MKLDNN}) - -INCLUDE(ExternalProject) - -SET(MKLDNN_PROJECT "extern_mkldnn") -SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn) -SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) -SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) - -IF(APPLE) - MESSAGE(WARNING - "Mac is not supported with MKLDNN in Paddle yet. " - "Force WITH_MKLDNN=OFF") - SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in MacOS" FORCE) - return() -ENDIF() - -# Introduce variables: -# * CMAKE_INSTALL_LIBDIR -INCLUDE(GNUInstallDirs) -SET(LIBDIR "lib") -if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$") - SET(LIBDIR "lib64") -endif() - -MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/${LIBDIR} to runtime path") -SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/${LIBDIR}") - -INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
-
-IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
-  SET(MKLDNN_DEPENDS ${MKLML_PROJECT})
-  MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}")
-ELSE()
-  MESSAGE(FATAL_ERROR "MKLML must be enabled when building MKLDNN")
-ENDIF()
-
-IF(NOT WIN32)
-  SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds")
-  SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
-  SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
-  SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
-ELSE()
-  SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc")
-ENDIF(NOT WIN32)
-
-ExternalProject_Add(
-    ${MKLDNN_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    DEPENDS ${MKLDNN_DEPENDS}
-    GIT_REPOSITORY "https://github.com/intel/mkl-dnn.git"
-    GIT_TAG "863ff6e7042cec7d2e29897fe9f0872e0888b0fc"
-    PREFIX ${MKLDNN_SOURCES_DIR}
-    UPDATE_COMMAND ""
-    CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-    CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-    CMAKE_ARGS -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-    CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-    CMAKE_ARGS -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-    CMAKE_ARGS -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-    CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
-    CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-    CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    CMAKE_ARGS -DMKLROOT=${MKLML_ROOT}
-    CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
-    CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
-    CMAKE_ARGS -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF
-    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-                     -DMKLROOT:PATH=${MKLML_ROOT}
-)
-if(WIN32)
-  SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE)
-else(WIN32)
-  SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
-endif(WIN32)
-
-ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
-ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT})
-MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
-add_definitions(-DPADDLE_WITH_MKLDNN)
-
-# generate a static dummy target to track mkldnn dependencies
-# for cc_library(xxx SRCS xxx.c DEPS mkldnn)
-SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/mkldnn_dummy.c)
-FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-ADD_LIBRARY(mkldnn STATIC ${dummyfile})
-TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_LIB} ${MKLML_IOMP_LIB})
-ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
-
-# Copy the real .so.0 library to the install dir so that it can be shipped
-# directly in the wheel or the C API package.
-if(WIN32)
-  SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll)
-else(WIN32)
-  SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
-  ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
-    COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
-    DEPENDS mkldnn shared_mkldnn)
-endif(WIN32)
-ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
-ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn)
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
deleted file mode 100644
index 142fce816d..0000000000
--- a/cmake/external/mklml.cmake
+++ /dev/null
@@ -1,77 +0,0 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-IF(NOT ${WITH_MKLML})
-  return()
-ENDIF(NOT ${WITH_MKLML})
-
-IF(APPLE)
-  MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.")
-  SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE)
-  return()
-ENDIF()
-
-INCLUDE(ExternalProject)
-SET(MKLML_DST_DIR "mklml")
-SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
-SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
-SET(MKLML_ROOT ${MKLML_INSTALL_DIR})
-SET(MKLML_INC_DIR ${MKLML_ROOT}/include)
-SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib)
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
-
-SET(TIME_VERSION "2019.0.1.20181227")
-IF(WIN32)
-  SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE)
-  SET(MKLML_URL "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
-  SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
-  SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
-  SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
-  SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
-ELSE()
-  # TODO(intel-huying):
-  # The Erf function is enabled in the mklml library temporarily;
-  # it will be replaced by the official version later.
-  SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
-  SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
-  SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
-  SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
-  SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
-  SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
-ENDIF()
-
-SET(MKLML_PROJECT "extern_mklml")
-MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
-SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
-SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
-
-ExternalProject_Add(
-    ${MKLML_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX ${MKLML_SOURCE_DIR}
-    URL ${MKLML_URL}
-    DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR}
-    DOWNLOAD_NO_PROGRESS 1
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND ""
-    UPDATE_COMMAND ""
-    INSTALL_COMMAND
-      ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/include ${MKLML_INC_DIR} &&
-      ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/lib ${MKLML_LIB_DIR}
-)
-
-INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
-
-ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
-ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
deleted file mode 100644
index d8a4a0be6f..0000000000
--- a/cmake/external/openblas.cmake
+++ /dev/null
@@ -1,93 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-INCLUDE(cblas)
-
-IF(NOT ${CBLAS_FOUND})
-  INCLUDE(ExternalProject)
-
-  SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
-  SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
-  SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
-
-  SET(CBLAS_LIBRARIES
-      "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
-      CACHE FILEPATH "openblas library." FORCE)
-
-  ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
-
-  IF (WIN32)
-    SET(CBLAS_FOUND true)
-    MESSAGE(WARNING "On Windows, openblas only supports the MSVC build; please build it manually and put it at " ${CBLAS_INSTALL_DIR})
-  ENDIF(WIN32)
-
-  IF (NOT WIN32)
-    SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
-    SET(OPENBLAS_COMMIT "v0.2.20")
-
-    IF(APPLE)
-      SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
-    ENDIF()
-    SET(OPTIONAL_ARGS "")
-    IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
-      SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
-    ENDIF()
-
-    SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
-    ExternalProject_Add(
-        extern_openblas
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git
-        GIT_TAG ${OPENBLAS_COMMIT}
-        PREFIX ${CBLAS_SOURCES_DIR}
-        INSTALL_DIR ${CBLAS_INSTALL_DIR}
-        BUILD_IN_SOURCE 1
-        BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS}
-        INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR>
-                        && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig
-        UPDATE_COMMAND ""
-        CONFIGURE_COMMAND ""
-    )
-  ENDIF(NOT WIN32)
-  SET(CBLAS_PROVIDER openblas)
-ENDIF(NOT ${CBLAS_FOUND})
-
-MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
-MESSAGE(STATUS "BLAS Include: ${CBLAS_INC_DIR}")
-INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
-
-# FIXME(gangliao): generate cblas target to track all high performance
-# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
-SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
-FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";")
-ADD_LIBRARY(cblas STATIC ${dummyfile})
-
-IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
-  TARGET_LINK_LIBRARIES(cblas dynload_mklml)
-ELSE()
-  TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
-ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
-
-IF(WITH_LIBXSMM)
-  TARGET_LINK_LIBRARIES(cblas ${LIBXSMM_LIBS})
-  ADD_DEPENDENCIES(cblas extern_libxsmm)
-ENDIF()
-
-IF(NOT ${CBLAS_FOUND})
-  ADD_DEPENDENCIES(cblas extern_openblas)
-ELSE()
-  IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
-    ADD_DEPENDENCIES(cblas mklml)
-  ENDIF()
-ENDIF(NOT ${CBLAS_FOUND})
diff --git a/cmake/external/opencl-clhpp.cmake b/cmake/external/opencl-clhpp.cmake
deleted file mode 100644
index ea724860d9..0000000000
--- a/cmake/external/opencl-clhpp.cmake
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-INCLUDE(ExternalProject)
-
-SET(OPENCL_CLHPP_SRCS_DIR ${THIRD_PARTY_PATH}/opencl-clhpp)
-SET(OPENCL_CLHPP_INSTALL_DIR ${THIRD_PARTY_PATH}/install/opencl-clhpp)
-SET(OPENCL_CLHPP_INCLUDE_DIR "${OPENCL_CLHPP_INSTALL_DIR}" CACHE PATH "opencl-clhpp include directory." FORCE)
-
-INCLUDE_DIRECTORIES(${OPENCL_CLHPP_INCLUDE_DIR})
-
-ExternalProject_Add(
-    opencl_clhpp
-    GIT_REPOSITORY "https://github.com/KhronosGroup/OpenCL-CLHPP.git"
-    GIT_TAG "v2.0.10"
-    PREFIX "${OPENCL_CLHPP_SRCS_DIR}"
-    CMAKE_ARGS -DBUILD_DOCS=OFF
-               -DBUILD_EXAMPLES=OFF
-               -DBUILD_TESTS=OFF
-               -DCMAKE_INSTALL_PREFIX=${OPENCL_CLHPP_INSTALL_DIR}
-    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${OPENCL_CLHPP_INSTALL_DIR}
-                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-)
-
-ADD_DEPENDENCIES(opencl_clhpp opencl_headers)
diff --git a/cmake/external/opencl-headers.cmake b/cmake/external/opencl-headers.cmake
deleted file mode 100644
index 68c9c5251c..0000000000
--- a/cmake/external/opencl-headers.cmake
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-INCLUDE(ExternalProject)
-
-SET(OPENCL_HEADERS_SRCS_DIR ${THIRD_PARTY_PATH}/opencl-headers)
-SET(OPENCL_HEADERS_INCLUDE_DIR "${OPENCL_HEADERS_SRCS_DIR}/src/opencl_headers" CACHE PATH "opencl-headers include directory." FORCE)
-
-INCLUDE_DIRECTORIES(${OPENCL_HEADERS_INCLUDE_DIR})
-
-ExternalProject_Add(
-    opencl_headers
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY "https://github.com/KhronosGroup/OpenCL-Headers.git"
-    GIT_TAG "c5a4bbeabb10d8ed3d1c651b93aa31737bc473dd"
-    PREFIX ${OPENCL_HEADERS_SRCS_DIR}
-    DOWNLOAD_NAME "OpenCL-Headers"
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND ""
-    INSTALL_COMMAND ""
-    TEST_COMMAND ""
-)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
deleted file mode 100644
index 2a88cf0321..0000000000
--- a/cmake/external/protobuf.cmake
+++ /dev/null
@@ -1,308 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
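-#
-# This module prefers a pre-built protobuf when PROTOBUF_ROOT is set and only
-# falls back to building protobuf from source. Usage sketch: pass
-# -DPROTOBUF_ROOT=/opt/protobuf (a hypothetical path) on the configure
-# command line and the find_* calls below will pick it up.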
-
-INCLUDE(ExternalProject)
-# Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
-IF(NOT WIN32)
-FIND_PACKAGE(Protobuf QUIET)
-ENDIF(NOT WIN32)
-macro(UNSET_VAR VAR_NAME)
-  UNSET(${VAR_NAME} CACHE)
-  UNSET(${VAR_NAME})
-endmacro()
-
-UNSET_VAR(PROTOBUF_INCLUDE_DIR)
-UNSET_VAR(PROTOBUF_FOUND)
-UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
-UNSET_VAR(PROTOBUF_PROTOC_LIBRARY)
-UNSET_VAR(PROTOBUF_LITE_LIBRARY)
-UNSET_VAR(PROTOBUF_LIBRARY)
-UNSET_VAR(Protobuf_PROTOC_EXECUTABLE)
-function(protobuf_generate_python SRCS)
-  # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
-  if(NOT ARGN)
-    message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files")
-    return()
-  endif()
-
-  if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
-    # Create an include path for each file specified
-    foreach(FIL ${ARGN})
-      get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-      get_filename_component(ABS_PATH ${ABS_FIL} PATH)
-      list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
-      if(${_contains_already} EQUAL -1)
-        list(APPEND _protobuf_include_path -I ${ABS_PATH})
-      endif()
-    endforeach()
-  else()
-    set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
-  endif()
-  if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
-    set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
-  endif()
-
-  if(DEFINED Protobuf_IMPORT_DIRS)
-    foreach(DIR ${Protobuf_IMPORT_DIRS})
-      get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
-      list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
-      if(${_contains_already} EQUAL -1)
-        list(APPEND _protobuf_include_path -I ${ABS_PATH})
-      endif()
-    endforeach()
-  endif()
-
-  set(${SRCS})
-  foreach(FIL ${ARGN})
-    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-    get_filename_component(FIL_WE ${FIL} NAME_WE)
-    if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
-      get_filename_component(FIL_DIR ${FIL} DIRECTORY)
-      if(FIL_DIR)
-        set(FIL_WE "${FIL_DIR}/${FIL_WE}")
-      endif()
-    endif()
-    list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
-    add_custom_command(
-      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
-      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL}
-      DEPENDS ${ABS_FIL} ${PROTOBUF_PROTOC_EXECUTABLE}
-      COMMENT "Running Python protocol buffer compiler on ${FIL}"
-      VERBATIM )
-  endforeach()
-
-  set(${SRCS} ${${SRCS}} PARENT_SCOPE)
-endfunction()
-
-# Print and set the protobuf library information, then finish this CMake
-# process and return from this file.
-macro(PROMPT_PROTOBUF_LIB)
-  SET(protobuf_DEPS ${ARGN})
-
-  MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}")
-  MESSAGE(STATUS "Protobuf-lite library: ${PROTOBUF_LITE_LIBRARY}")
-  MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}")
-  MESSAGE(STATUS "Protoc library: ${PROTOBUF_PROTOC_LIBRARY}")
-  MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}")
-  INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
-
-  # Assuming that all the protobuf libraries are of the same type.
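-  # For example, a path ending in ${CMAKE_STATIC_LIBRARY_SUFFIX} (".a" on
-  # Linux) selects STATIC below, while a shared-library suffix selects SHARED.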
-  IF(${PROTOBUF_LIBRARY} MATCHES ${CMAKE_STATIC_LIBRARY_SUFFIX})
-    SET(protobuf_LIBTYPE STATIC)
-  ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$")
-    SET(protobuf_LIBTYPE SHARED)
-  ELSE()
-    MESSAGE(FATAL_ERROR "Unknown library type: ${PROTOBUF_LIBRARY}")
-  ENDIF()
-
-  ADD_LIBRARY(protobuf ${protobuf_LIBTYPE} IMPORTED GLOBAL)
-  SET_PROPERTY(TARGET protobuf PROPERTY IMPORTED_LOCATION ${PROTOBUF_LIBRARY})
-
-  ADD_LIBRARY(protobuf_lite ${protobuf_LIBTYPE} IMPORTED GLOBAL)
-  SET_PROPERTY(TARGET protobuf_lite PROPERTY IMPORTED_LOCATION ${PROTOBUF_LITE_LIBRARY})
-
-  ADD_LIBRARY(libprotoc ${protobuf_LIBTYPE} IMPORTED GLOBAL)
-  SET_PROPERTY(TARGET libprotoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_LIBRARY})
-
-  ADD_EXECUTABLE(protoc IMPORTED GLOBAL)
-  SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_EXECUTABLE})
-  # FindProtobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`;
-  # set it to make `protobuf_generate_cpp` happy.
-  SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
-
-  FOREACH(dep ${protobuf_DEPS})
-    ADD_DEPENDENCIES(protobuf ${dep})
-    ADD_DEPENDENCIES(protobuf_lite ${dep})
-    ADD_DEPENDENCIES(libprotoc ${dep})
-    ADD_DEPENDENCIES(protoc ${dep})
-  ENDFOREACH()
-
-  RETURN()
-endmacro()
-macro(SET_PROTOBUF_VERSION)
-  EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION)
-  STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}")
-endmacro()
-
-set(PROTOBUF_ROOT "" CACHE PATH "Folder containing protobuf")
-IF (WIN32)
-  SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf)
-ENDIF(WIN32)
-
-if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
-  find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
-  find_library(PROTOBUF_LIBRARY protobuf libprotobuf.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-  find_library(PROTOBUF_LITE_LIBRARY protobuf-lite libprotobuf-lite.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-  find_library(PROTOBUF_PROTOC_LIBRARY protoc libprotoc.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-  find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
-  if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
-    message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
-    SET(PROTOBUF_FOUND true)
-    SET_PROTOBUF_VERSION()
-    PROMPT_PROTOBUF_LIB()
-  else()
-    message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}")
-  endif()
-endif()
-
-FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
-  STRING(REPLACE "extern_" "" TARGET_DIR_NAME "${TARGET_NAME}")
-  SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME})
-  SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_DIR_NAME})
-
-  SET(${TARGET_NAME}_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE)
-  SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE)
-  SET(${TARGET_NAME}_LITE_LIBRARY
-      "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}"
-      PARENT_SCOPE)
-  SET(${TARGET_NAME}_LIBRARY
-      "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}"
-      PARENT_SCOPE)
-  SET(${TARGET_NAME}_PROTOC_LIBRARY
-      "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}"
-      PARENT_SCOPE)
-  SET(${TARGET_NAME}_PROTOC_EXECUTABLE
-      "${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}"
-      PARENT_SCOPE)
-
-  # https://github.com/protocolbuffers/protobuf.git
-  SET(PROTOBUF_REPO "")
-  SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
"9f75c5aa851cd877fb0d93ccc31b8567a6706546") - SET(OPTIONAL_CACHE_ARGS "") - SET(OPTIONAL_ARGS "") - SET(SOURCE_DIR "${CMAKE_SOURCE_DIR}/third-party/protobuf-host") - - IF(BUILD_FOR_HOST) - # set for server compile. - if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - set(HOST_C_COMPILER "${CMAKE_C_COMPILER}") - set(HOST_CXX_COMPILER "${CMAKE_CXX_COMPILER}") - endif() - - SET(OPTIONAL_ARGS - "-DCMAKE_C_COMPILER=${HOST_C_COMPILER}" - "-DCMAKE_CXX_COMPILER=${HOST_CXX_COMPILER}" - "-Dprotobuf_WITH_ZLIB=OFF" - "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}") - SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") - ELSE() - # protobuf have compile issue when use android stl c++_static - # https://github.com/tensor-tang/protobuf.git - SET(PROTOBUF_REPO "") - SET(PROTOBUF_TAG "mobile") - SET(SOURCE_DIR "${CMAKE_SOURCE_DIR}/third-party/protobuf-mobile") - SET(OPTIONAL_ARGS "-Dprotobuf_WITH_ZLIB=OFF" - ${CROSS_COMPILE_CMAKE_ARGS} - "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" - "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" - "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}") - ENDIF() - IF(WIN32) - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64") - ENDIF() - - if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - ExternalProject_Add( - ${TARGET_NAME} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${PROTOBUF_SOURCES_DIR} - SOURCE_SUBDIR cmake - UPDATE_COMMAND "" - GIT_REPOSITORY "" - GIT_TAG ${PROTOBUF_TAG} - SOURCE_DIR ${SOURCE_DIR} - CMAKE_ARGS - ${OPTIONAL_ARGS} - -Dprotobuf_BUILD_TESTS=OFF - -DCMAKE_SKIP_RPATH=ON - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=lib - -DBUILD_SHARED_LIBS=OFF - CMAKE_CACHE_ARGS - -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - ${OPTIONAL_CACHE_ARGS} - ) - else() - ExternalProject_Add( - ${TARGET_NAME} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${SOURCE_DIR} - UPDATE_COMMAND "" - GIT_REPOSITORY "" - GIT_TAG ${PROTOBUF_TAG} - SOURCE_DIR ${SOURCE_DIR} - CONFIGURE_COMMAND ${CMAKE_COMMAND} ${SOURCE_DIR}/cmake - ${OPTIONAL_ARGS} - -Dprotobuf_BUILD_TESTS=OFF - -DCMAKE_SKIP_RPATH=ON - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=lib - -DBUILD_SHARED_LIBS=OFF - CMAKE_CACHE_ARGS - -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - ${OPTIONAL_CACHE_ARGS} - ) - endif() -ENDFUNCTION() - -SET(PROTOBUF_VERSION 3.1.0) - -IF(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - build_protobuf(protobuf_host TRUE) - LIST(APPEND external_project_dependencies protobuf_host) - SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_host_PROTOC_EXECUTABLE} - CACHE FILEPATH "protobuf executable." FORCE) -ENDIF() - -IF(NOT PROTOBUF_FOUND) - if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - build_protobuf(extern_protobuf FALSE) - else() - build_protobuf(extern_protobuf TRUE) - endif() - - SET(PROTOBUF_INCLUDE_DIR ${extern_protobuf_INCLUDE_DIR} - CACHE PATH "protobuf include directory." 
-  SET(PROTOBUF_LITE_LIBRARY ${extern_protobuf_LITE_LIBRARY}
-      CACHE FILEPATH "protobuf lite library." FORCE)
-  SET(PROTOBUF_LIBRARY ${extern_protobuf_LIBRARY}
-      CACHE FILEPATH "protobuf library." FORCE)
-  SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY}
-      CACHE FILEPATH "protoc library." FORCE)
-
-  IF(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
-    PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf)
-  ELSE()
-    SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE}
-        CACHE FILEPATH "protobuf executable." FORCE)
-    PROMPT_PROTOBUF_LIB(extern_protobuf)
-  ENDIF()
-
-ENDIF(NOT PROTOBUF_FOUND)
diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake
deleted file mode 100644
index 1d61154c0d..0000000000
--- a/cmake/external/xbyak.cmake
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set(WITH_XBYAK ON)
-if(WIN32 OR APPLE)
-  SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE)
-  return()
-endif()
-
-include(ExternalProject)
-
-set(XBYAK_PROJECT extern_xbyak)
-set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak)
-set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak)
-set(XBYAK_INC_DIR ${XBYAK_INSTALL_ROOT}/include)
-
-include_directories(${XBYAK_INC_DIR})
-include_directories(${XBYAK_INC_DIR}/xbyak)
-
-add_definitions(-DPADDLE_WITH_XBYAK)
-
-# xbyak options
-add_definitions(-DXBYAK64)
-add_definitions(-DXBYAK_NO_OP_NAMES)
-
-ExternalProject_Add(
-    ${XBYAK_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    DEPENDS ""
-    GIT_REPOSITORY "https://github.com/herumi/xbyak.git"
-    GIT_TAG "v5.661" # Jul 26th
-    PREFIX ${XBYAK_PREFIX_DIR}
-    UPDATE_COMMAND ""
-    CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
-    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT}
-)
-
-if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
-  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/xbyak_dummy.c)
-  file(WRITE ${dummyfile} "const char *dummy_xbyak = \"${dummyfile}\";")
-  add_library(xbyak STATIC ${dummyfile})
-else()
-  add_library(xbyak INTERFACE)
-endif()
-
-add_dependencies(xbyak ${XBYAK_PROJECT})
diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake
deleted file mode 100644
index 23b1e02108..0000000000
--- a/cmake/external/xxhash.cmake
+++ /dev/null
@@ -1,73 +0,0 @@
-INCLUDE(ExternalProject)
-
-set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash)
-set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash)
-set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include")
-
-IF(WITH_STATIC_LIB)
-  SET(BUILD_CMD make lib)
-ELSE()
-  IF(APPLE)
-    SET(BUILD_CMD sed -i \"\" "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
-  ELSE(APPLE)
-    SET(BUILD_CMD sed -i "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
-  ENDIF(APPLE)
-ENDIF()
-
-if(WIN32)
-  ExternalProject_Add(
-      extern_xxhash
-      ${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" - GIT_TAG "v0.6.5" - PREFIX ${XXHASH_SOURCE_DIR} - DOWNLOAD_NAME "xxhash" - UPDATE_COMMAND "" - BUILD_IN_SOURCE 1 - PATCH_COMMAND - CONFIGURE_COMMAND - ${CMAKE_COMMAND} ${XXHASH_SOURCE_DIR}/src/extern_xxhash/cmake_unofficial - -DCMAKE_INSTALL_PREFIX:PATH=${XXHASH_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} - -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DBUILD_XXHSUM=OFF - -DCMAKE_GENERATOR_PLATFORM=x64 - -DBUILD_SHARED_LIBS=OFF - ${OPTIONAL_CACHE_ARGS} - TEST_COMMAND "" - ) -else() - ExternalProject_Add( - extern_xxhash - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" - GIT_TAG "v0.6.5" - PREFIX ${XXHASH_SOURCE_DIR} - DOWNLOAD_NAME "xxhash" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_IN_SOURCE 1 - PATCH_COMMAND - BUILD_COMMAND ${BUILD_CMD} - INSTALL_COMMAND export PREFIX=${XXHASH_INSTALL_DIR}/ && make install - TEST_COMMAND "" - ) -endif() - -if (WIN32) - IF(NOT EXISTS "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib") - add_custom_command(TARGET extern_xxhash POST_BUILD - COMMAND cmake -E copy ${XXHASH_INSTALL_DIR}/lib/xxhash.lib ${XXHASH_INSTALL_DIR}/lib/libxxhash.lib - ) - ENDIF() - set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib") -else() - set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") -endif () -INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR}) - -add_library(xxhash STATIC IMPORTED GLOBAL) -set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES}) -include_directories(${XXHASH_INCLUDE_DIR}) -add_dependencies(xxhash extern_xxhash) diff --git a/cmake/flags.cmake b/cmake/flags.cmake deleted file mode 100644 index 36b533aa4f..0000000000 --- a/cmake/flags.cmake +++ /dev/null @@ -1,194 +0,0 @@ -# Setting Paddle Compile Flags -include(CheckCXXCompilerFlag) -include(CheckCCompilerFlag) -include(CheckCXXSymbolExists) -include(CheckTypeSize) - -function(CheckCompilerCXX11Flag) - if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8) - message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.") - endif() - elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang" - # Apple Clang is a different compiler than upstream Clang which havs different version numbers. - # https://gist.github.com/yamaya/2924292 - if(APPLE) # cmake < 3.0 compiler id "Clang" on Mac OS X - if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.1) - message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.") - endif() - else() - if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3) - message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.") - endif() - endif() - endif() -endfunction() - -CheckCompilerCXX11Flag() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -# safe_set_flag -# -# Set a compile flag only if compiler is support -# is_c: is C flag or C++ flag, bool type. -# src_list: The list name which the flag name will be append to. -# flag_name: the flag name for compiler, such as '-Werror' '-Wall' etc -# rest arguments: not used. 
-function(safe_set_flag is_c src_list flag_name)
-  string(REPLACE "-" "_" safe_name ${flag_name})
-  string(REPLACE "=" "_" safe_name ${safe_name})
-  if(is_c)
-    CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name})
-    set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name})
-  else()
-    CHECK_CXX_COMPILER_FLAG(${flag_name} CXX_COMPILER_SUPPORT_FLAG_${safe_name})
-    set(safe_name CXX_COMPILER_SUPPORT_FLAG_${safe_name})
-  endif()
-  if(${safe_name})
-    set(${src_list} "${${src_list}} ${flag_name}" PARENT_SCOPE)
-  endif()
-endfunction()
-
-# helper macro to set cflag
-macro(safe_set_cflag src_list flag_name)
-  safe_set_flag(ON ${src_list} ${flag_name})
-endmacro()
-
-# helper macro to set cxxflag
-macro(safe_set_cxxflag src_list flag_name)
-  safe_set_flag(OFF ${src_list} ${flag_name})
-endmacro()
-
-# helper macro to set nvcc flag
-macro(safe_set_nvflag flag_name)
-  string(REPLACE "-" "_" safe_name ${flag_name})
-  string(REPLACE "=" "_" safe_name ${safe_name})
-  CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name})
-  set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name})
-  if(${safe_name})
-    LIST(APPEND CUDA_NVCC_FLAGS -Xcompiler ${flag_name})
-  endif()
-endmacro()
-
-macro(safe_set_static_flag)  # set c_flags and cxx_flags to static or shared
-  if (BUILD_SHARED_LIBS)
-    return()  # if building shared libs, keep the flags consistent with '/MD'
-  endif(BUILD_SHARED_LIBS)
-  foreach(flag_var
-      CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-      CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
-      CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-      CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-    if(${flag_var} MATCHES "/MD")
-      string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-    endif(${flag_var} MATCHES "/MD")
-  endforeach(flag_var)
-endmacro()
-
-CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
-if(NOT UINT64_MAX_EXISTS)
-  set(CMAKE_REQUIRED_DEFINITIONS -D__STDC_LIMIT_MACROS)
-  CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS_HERE)
-  if(UINT64_MAX_EXISTS_HERE)
-    set(CMAKE_REQUIRED_DEFINITIONS)
-    add_definitions(-D__STDC_LIMIT_MACROS)
-  else()
-    message(FATAL_ERROR "Cannot find symbol UINT64_MAX")
-  endif()
-endif()
-
-SET(CMAKE_EXTRA_INCLUDE_FILES "pthread.h")
-CHECK_TYPE_SIZE(pthread_spinlock_t SPINLOCK_FOUND)
-CHECK_TYPE_SIZE(pthread_barrier_t BARRIER_FOUND)
-if(SPINLOCK_FOUND)
-  add_definitions(-DPADDLE_USE_PTHREAD_SPINLOCK)
-endif(SPINLOCK_FOUND)
-if(BARRIER_FOUND)
-  add_definitions(-DPADDLE_USE_PTHREAD_BARRIER)
-endif(BARRIER_FOUND)
-SET(CMAKE_EXTRA_INCLUDE_FILES "")
-
-# Common flags: the compiler flags used for C/C++ sources in both release and
-# debug builds. Whether gcc actually supports each flag is checked before the
-# flag is applied, so no need to worry about that here.
-
-# https://github.com/PaddlePaddle/Paddle/issues/12773
-if (NOT WIN32)
-set(COMMON_FLAGS
-    -fPIC
-    -fno-omit-frame-pointer
-    -Werror
-    -Wall
-    -Wextra
-    -Wnon-virtual-dtor
-    -Wdelete-non-virtual-dtor
-    -Wno-unused-parameter
-    -Wno-unused-function
-    -Wno-error=literal-suffix
-    -Wno-error=sign-compare
-    -Wno-error=unused-local-typedefs
-    -Wno-error=parentheses-equality # Warnings in pybind11
-    -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3
-    -Wno-error=terminate # Warning in PADDLE_ENFORCE
-    -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
-    -Wimplicit-fallthrough=0 # Warning in tinyformat.h
-    -Wno-error=maybe-uninitialized # Warning in boost gcc 7.2
-)
-
-set(GPU_COMMON_FLAGS
-    -fPIC
-    -fno-omit-frame-pointer
-    -Wnon-virtual-dtor
-    -Wdelete-non-virtual-dtor
-    -Wno-unused-parameter
-    -Wno-unused-function
-    -Wno-error=sign-compare
-    -Wno-error=literal-suffix
-    -Wno-error=unused-local-typedefs
-    -Wno-error=unused-function # Warnings in Numpy Header.
-    -Wno-error=array-bounds # Warnings in Eigen::array
-)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
-endif(NOT WIN32)
-
-if (APPLE)
-  # On Mac OS X build fat binaries with x86_64 architectures by default.
-  set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
-  # On Mac OS X the register class specifier is deprecated and causes a warning error on the latest clang 10.0
-  set (COMMON_FLAGS -Wno-deprecated-register)
-endif(APPLE)
-
-if(UNIX AND NOT APPLE)
-  # treat every *nix-family OS except Apple as Linux
-  set(LINUX TRUE)
-endif(UNIX AND NOT APPLE)
-
-# LINUX must be set (above) before it is tested here.
-if(LINUX)
-  set(GPU_COMMON_FLAGS
-      -Wall
-      -Wextra
-      -Werror
-      ${GPU_COMMON_FLAGS})
-endif(LINUX)
-
-foreach(flag ${COMMON_FLAGS})
-  safe_set_cflag(CMAKE_C_FLAGS ${flag})
-  safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
-
-endforeach()
-
-foreach(flag ${GPU_COMMON_FLAGS})
-  safe_set_nvflag(${flag})
-endforeach()
-
-if(WIN32)
-# windows build: turn off warnings.
-safe_set_static_flag()
-  foreach(flag_var
-      CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-      CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
-      CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-      CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-    string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
-    set(flag_var "${flag_var} /w")
-  endforeach(flag_var)
-endif(WIN32)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
deleted file mode 100644
index a87c64cbe9..0000000000
--- a/cmake/generic.cmake
+++ /dev/null
@@ -1,567 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-
-# generic.cmake defines CMake functions that look like Bazel's
-# building rules (https://bazel.build/).
-#
-#
-# -------------------------------------------
-#     C++          CUDA C++       Go
-# -------------------------------------------
-# cc_library     nv_library     go_library
-# cc_binary      nv_binary      go_binary
-# cc_test        nv_test        go_test
-# -------------------------------------------
-#
-# To build a static library example.a from example.cc using the system
-# compiler (like GCC):
-#
-#   cc_library(example SRCS example.cc)
-#
-# To build a static library example.a from multiple source files
-# example{1,2,3}.cc:
-#
-#   cc_library(example SRCS example1.cc example2.cc example3.cc)
-#
-# To build a shared library example.so from example.cc:
-#
-#   cc_library(example SHARED SRCS example.cc)
-#
-# To build a library using Nvidia's NVCC from .cu file(s), use the nv_
-# prefixed version:
-#
-#   nv_library(example SRCS example.cu)
-#
-# To specify that a library new_example.a depends on other libraries:
-#
-#   cc_library(new_example SRCS new_example.cc DEPS example)
-#
-# Static libraries can be composed of other static libraries:
-#
-#   cc_library(composed DEPS dependent1 dependent2 dependent3)
-#
-# To build an executable binary file from some source files and
-# dependent libraries:
-#
-#   cc_binary(example SRCS main.cc something.cc DEPS example1 example2)
-#
-# To build an executable binary file using NVCC, use the nv_ prefixed
-# version:
-#
-#   nv_binary(example SRCS main.cc something.cu DEPS example1 example2)
-#
-# To build a unit test binary, which is an executable binary with
-# GoogleTest linked:
-#
-#   cc_test(example_test SRCS example_test.cc DEPS example)
-#
-# To build a unit test binary using NVCC, use the nv_ prefixed version:
-#
-#   nv_test(example_test SRCS example_test.cu DEPS example)
-#
-# It is pretty often that executable and test binaries depend on
-# pre-defined external libraries like glog and gflags defined in
-# /cmake/external/*.cmake:
-#
-#   cc_test(example_test SRCS example_test.cc DEPS example glog gflags)
-#
-# To build a go static library using Golang, use the go_ prefixed version:
-#
-#   go_library(example STATIC)
-#
-# To build a go shared library using Golang, use the go_ prefixed version:
-#
-#   go_library(example SHARED)
-#
-
-# including binary directory for generated headers.
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
-
-if(NOT APPLE)
-  find_package(Threads REQUIRED)
-  link_libraries(${CMAKE_THREAD_LIBS_INIT})
-  set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl")
-  if (NOT ANDROID)
-    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -lrt")
-  endif()
-endif(NOT APPLE)
-
-set_property(GLOBAL PROPERTY FLUID_MODULES "")
-# find_fluid_modules collects all fluid modules; the list is used when
-# building the paddle fluid static library for inference.
-function(find_fluid_modules TARGET_NAME)
-  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
-  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
-  string(FIND "${__target_path}" "fluid" pos)
-  if(pos GREATER 1)
-    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
-    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
-  endif()
-endfunction(find_fluid_modules)
-
-
-function(common_link TARGET_NAME)
-  if (WITH_PROFILER)
-    target_link_libraries(${TARGET_NAME} gperftools::profiler)
-  endif()
-
-  if (WITH_JEMALLOC)
-    target_link_libraries(${TARGET_NAME} jemalloc::jemalloc)
-  endif()
-endfunction()
-
-
-# find_fluid_thirdparties collects third_party modules for the paddle static
-# library, to reduce dependencies when building the inference libs.
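-# For example (illustrative target name), find_fluid_thirdparties(my_lib)
-# records my_lib in the FLUID_THIRD_PARTY global property when its source
-# path contains "third_party".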
-set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY)
-function(find_fluid_thirdparties TARGET_NAME)
-  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
-  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
-  string(FIND "${__target_path}" "third_party" pos)
-  if(pos GREATER 1)
-    get_property(fluid_third_partys GLOBAL PROPERTY FLUID_THIRD_PARTY)
-    set(fluid_third_partys ${fluid_third_partys} ${TARGET_NAME})
-    set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY "${fluid_third_partys}")
-  endif()
-endfunction(find_fluid_thirdparties)
-
-function(merge_static_libs TARGET_NAME)
-  set(libs ${ARGN})
-  list(REMOVE_DUPLICATES libs)
-
-  # Get all propagation dependencies from the merged libraries
-  foreach(lib ${libs})
-    list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
-  endforeach()
-  if(libs_deps)
-    list(REMOVE_DUPLICATES libs_deps)
-  endif()
-
-  # To produce a library we need at least one source file.
-  # It is created by add_custom_command below and also helps
-  # to track dependencies.
-  set(target_SRCS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
-
-  if(APPLE) # Use OSX's libtool to merge archives
-    # Make the generated dummy source file depend on all static input
-    # libs. If an input lib changes, the source file is touched,
-    # which causes the desired effect (relink).
-    add_custom_command(OUTPUT ${target_SRCS}
-      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
-      DEPENDS ${libs})
-
-    # Generate dummy static lib
-    file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
-    add_library(${TARGET_NAME} STATIC ${target_SRCS})
-    target_link_libraries(${TARGET_NAME} ${libs_deps})
-
-    foreach(lib ${libs})
-      # Get the file names of the libraries to be merged
-      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
-    endforeach()
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
-      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
-      )
-  endif(APPLE)
-  if(LINUX) # general UNIX: use "ar" to extract objects and re-add to a common lib
-    set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir)
-
-    foreach(lib ${libs})
-      set(objlistfile ${target_DIR}/${lib}.objlist) # list of objects in the input library
-      set(objdir ${target_DIR}/${lib}.objdir)
-
-      add_custom_command(OUTPUT ${objdir}
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
-        DEPENDS ${lib})
-
-      add_custom_command(OUTPUT ${objlistfile}
-        COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
-        COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ${objlistfile}
-        DEPENDS ${lib} ${objdir}
-        WORKING_DIRECTORY ${objdir})
-
-      list(APPEND target_OBJS "${objlistfile}")
-    endforeach()
-
-    # Make the generated dummy source file depend on all static input
-    # libs. If an input lib changes, the source file is touched,
-    # which causes the desired effect (relink).
-    add_custom_command(OUTPUT ${target_SRCS}
-      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
-      DEPENDS ${libs} ${target_OBJS})
-
-    # Generate dummy static lib
-    file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
-    add_library(${TARGET_NAME} STATIC ${target_SRCS})
-    target_link_libraries(${TARGET_NAME} ${libs_deps})
-
-    # Get the file name of the generated library
-    set(target_LIBNAME "$<TARGET_FILE:${TARGET_NAME}>")
-
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'`
-      COMMAND ${CMAKE_RANLIB} ${target_LIBNAME}
-      WORKING_DIRECTORY ${target_DIR})
-  endif(LINUX)
-  if(WIN32) # Windows does not support gcc/nvcc combined compiling. Use MSVC lib.exe to merge libs.
-    # Make the generated dummy source file depend on all static input
-    # libs. If an input lib changes, the source file is touched,
-    # which causes the desired effect (relink).
-    add_custom_command(OUTPUT ${target_SRCS}
-      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
-      DEPENDS ${libs})
-
-    # Generate dummy static lib
-    file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
-    add_library(${TARGET_NAME} STATIC ${target_SRCS})
-    target_link_libraries(${TARGET_NAME} ${libs_deps})
-
-    foreach(lib ${libs})
-      # Get the file names of the libraries to be merged
-      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
-    endforeach()
-    # MSVC puts the library in a "/Release/xxxlib" directory by default
-    # COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND cmake -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}"
-      COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/lib${TARGET_NAME}.lib ${libfiles}
-      )
-  endif(WIN32)
-endfunction(merge_static_libs)
-
-function(cc_library TARGET_NAME)
-  set(options STATIC static SHARED shared)
-  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  if(WIN32)
-    # add the libxxx.lib prefix on Windows
-    set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}")
-  endif(WIN32)
-  if(cc_library_SRCS)
-    if(cc_library_SHARED OR cc_library_shared) # build *.so
-      add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
-    else()
-      add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
-      find_fluid_modules(${TARGET_NAME})
-    endif()
-
-    if(cc_library_DEPS)
-      # No need to link libwarpctc.so
-      if("${cc_library_DEPS};" MATCHES "warpctc;")
-        list(REMOVE_ITEM cc_library_DEPS warpctc)
-        add_dependencies(${TARGET_NAME} warpctc)
-      endif()
-      # Only depend on libmklml.so; do not link it
-      if("${cc_library_DEPS};" MATCHES "mklml;")
-        list(REMOVE_ITEM cc_library_DEPS mklml)
-        if(NOT "${TARGET_NAME}" MATCHES "dynload_mklml")
-          list(APPEND cc_library_DEPS dynload_mklml)
-        endif()
-        add_dependencies(${TARGET_NAME} mklml)
-        if(WIN32)
-          target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB})
-        else(WIN32)
-          target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
-        endif(WIN32)
-      endif()
-      # remove link to python, see notes at:
-      # https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually
-      if("${cc_library_DEPS};" MATCHES "python;")
-        list(REMOVE_ITEM cc_library_DEPS python)
-        add_dependencies(${TARGET_NAME} python)
-        if(WIN32)
-          target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES})
-        else()
-          target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup")
-        endif(WIN32)
-      endif()
-      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
-      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
-      common_link(${TARGET_NAME})
-    endif()
-
-    set(full_path_src "")
-    # cpplint code style
-    foreach(source_file ${cc_library_SRCS})
-      string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
-      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-        list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-      endif()
-      if(${source_file} MATCHES "framework.pb.cc")
-        list(APPEND full_path_src ${source_file})
-      else()
-        list(APPEND full_path_src ${CMAKE_CURRENT_SOURCE_DIR}/${source_file})
-      endif()
-    endforeach()
-    set(__lite_cc_files ${__lite_cc_files} ${full_path_src} CACHE INTERNAL "")
-  else(cc_library_SRCS)
-    if(cc_library_DEPS)
-      merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
-    else()
-      message(FATAL_ERROR "Please specify source files or libraries in cc_library(${TARGET_NAME} ...).")
-    endif()
-  endif(cc_library_SRCS)
-endfunction(cc_library)
-
-# The link operation on Windows may exceed the maximum command-length limit;
-# breaking the link command into multiple link operations fixes that, e.g.
-# original:
-#   lib /out:target.lib a.lib b.lib c.lib d.lib
-# after:
-#   1. lib /out:dummy_lib_1.lib a.lib b.lib
-#   2. lib /out:dummy_lib_2.lib c.lib d.lib
-#   3. lib /out:target.lib dummy_lib_1.lib dummy_lib_2.lib
-function(sep_library TARGET_NAME)
-  set(options STATIC static SHARED shared)
-  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(sep_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  set(dummy_index 1)
-  set(dummy_offset 1)
-  # each dummy target is composed of at most dummy_limit libraries
-  set(dummy_limit 50)
-  list(LENGTH sep_library_DEPS sep_all_len)
-  foreach(v ${sep_library_DEPS})
-    list(APPEND dummy_list ${v})
-    list(LENGTH dummy_list listlen )
-    if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${sep_all_len}))
-      message("create dummy library ${TARGET_NAME}_dummy_lib_${dummy_index} for ${TARGET_NAME}")
-      cc_library(${TARGET_NAME}_dummy_lib_${dummy_index} STATIC DEPS ${dummy_list})
-      foreach(i ${dummy_list})
-        list(REMOVE_AT dummy_list 0)
-      endforeach()
-      list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_lib_${dummy_index})
-      MATH(EXPR dummy_index "${dummy_index}+1")
-    endif()
-    MATH(EXPR dummy_offset "${dummy_offset}+1")
-  endforeach()
-  if(${sep_library_SHARED})
-    cc_library(${TARGET_NAME} SHARED SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list})
-  else(${sep_library_SHARED})
-    cc_library(${TARGET_NAME} STATIC SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list})
-  endif(${sep_library_SHARED})
-endfunction(sep_library)
-
-function(cc_binary TARGET_NAME)
-  set(options "")
-  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(cc_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  add_executable(${TARGET_NAME} ${cc_binary_SRCS})
-  if(cc_binary_DEPS)
-    target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS})
-    add_dependencies(${TARGET_NAME} ${cc_binary_DEPS})
-    common_link(${TARGET_NAME})
-  endif()
-  get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-  target_link_libraries(${TARGET_NAME} ${os_dependency_modules})
-endfunction(cc_binary)
-
-function(cc_test TARGET_NAME)
-  if(WITH_TESTING)
-    set(options SERIAL)
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS ARGS)
-    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
"${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - add_executable(${TARGET_NAME} ${cc_test_SRCS}) - if(WIN32) - if("${cc_test_DEPS};" MATCHES "python;") - list(REMOVE_ITEM cc_test_DEPS python) - target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES}) - endif() - endif(WIN32) - get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} paddle_gtest_main memory gtest gflags glog) - add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog) - common_link(${TARGET_NAME}) - add_test(NAME ${TARGET_NAME} - COMMAND ${TARGET_NAME} ${cc_test_ARGS} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - if (${cc_test_SERIAL}) - set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) - endif() - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) - # No unit test should exceed 10 minutes. - set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) - endif() -endfunction(cc_test) - -# cc_test without default dependencies -function(raw_cc_test TARGET_NAME) - if(WITH_TESTING) - set(options SERIAL) - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS ARGS) - cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - add_executable(${TARGET_NAME} ${cc_test_SRCS}) - if(WIN32) - if("${cc_test_DEPS};" MATCHES "python;") - list(REMOVE_ITEM cc_test_DEPS python) - target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES}) - endif() - endif(WIN32) - get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) - - if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} lite_gtest_main gtest gflags logging) - add_dependencies(${TARGET_NAME} ${cc_test_DEPS} lite_gtest_main gtest gflags logging) - else() - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} lite_gtest_main gtest gflags glog) - add_dependencies(${TARGET_NAME} ${cc_test_DEPS} lite_gtest_main gtest gflags glog) - endif(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - - common_link(${TARGET_NAME}) - add_test(NAME ${TARGET_NAME} - COMMAND ${TARGET_NAME} ${cc_test_ARGS} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - if (${cc_test_SERIAL}) - set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) - endif() - # No unit test should exceed 10 minutes. 
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
-  endif()
-endfunction(raw_cc_test)
-
-function(_lite_cc_test args)
-  message(STATUS "building lite raw test: ${args}")
-  raw_cc_test(${args} ${ARGN})
-endfunction()
-
-function(nv_library TARGET_NAME)
-  if (LITE_WITH_CUDA)
-    set(options STATIC static SHARED shared)
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    if(nv_library_SRCS)
-      if (nv_library_SHARED OR nv_library_shared) # build *.so
-        cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
-      else()
-        cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
-        find_fluid_modules(${TARGET_NAME})
-      endif()
-      if (nv_library_DEPS)
-        add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
-        target_link_libraries(${TARGET_NAME} ${nv_library_DEPS})
-      endif()
-      # cpplint code style
-      foreach(source_file ${nv_library_SRCS})
-        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
-        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-          list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-        endif()
-      endforeach()
-    else(nv_library_SRCS)
-      if (nv_library_DEPS)
-        merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
-      else()
-        message(FATAL_ERROR "Please specify source file or library in nv_library.")
-      endif()
-    endif(nv_library_SRCS)
-  endif()
-endfunction(nv_library)
-
-function(nv_binary TARGET_NAME)
-  if (LITE_WITH_CUDA)
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS})
-    if(nv_binary_DEPS)
-      target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
-      add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
-      common_link(${TARGET_NAME})
-    endif()
-  endif()
-endfunction(nv_binary)
-
-function(nv_test TARGET_NAME)
-  if (LITE_WITH_CUDA AND WITH_TESTING)
-    set(options SERIAL)
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest
-                          gflags glog ${os_dependency_modules} ${CUDNN_LIBRARY})
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest gflags glog)
-    common_link(${TARGET_NAME})
-    add_test(${TARGET_NAME} ${TARGET_NAME})
-    if (nv_test_SERIAL)
-      set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
-    endif()
-  endif()
-endfunction(nv_test)
-
-
-# Modification of the standard 'protobuf_generate_cpp()' with protobuf-lite support
-# Usage:
-#   paddle_protobuf_generate_cpp(<SRCS> <HDRS> <proto_files...>)
-function(paddle_protobuf_generate_cpp SRCS HDRS)
-  if(NOT ARGN)
-    message(SEND_ERROR "Error: paddle_protobuf_generate_cpp() called without any proto files")
-    return()
-  endif()
-
-  set(${SRCS})
-  set(${HDRS})
-
-  foreach(FIL ${ARGN})
-    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-    get_filename_component(FIL_WE ${FIL} NAME_WE)
-
-    set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc")
-    set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h")
-    list(APPEND ${SRCS} "${_protobuf_protoc_src}")
-    list(APPEND ${HDRS} "${_protobuf_protoc_hdr}")
-
-    add_custom_command(
-      OUTPUT "${_protobuf_protoc_src}"
-             "${_protobuf_protoc_hdr}"
-
-      COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}"
"${CMAKE_CURRENT_BINARY_DIR}" - COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} - -I${CMAKE_CURRENT_SOURCE_DIR} - --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL} - DEPENDS ${ABS_FIL} protoc - COMMENT "Running C++ protocol buffer compiler on ${FIL}" - VERBATIM ) - endforeach() - - set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) - set(${SRCS} ${${SRCS}} PARENT_SCOPE) - set(${HDRS} ${${HDRS}} PARENT_SCOPE) -endfunction() - - -function(proto_library TARGET_NAME) - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(proto_srcs) - set(proto_hdrs) - paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS}) - cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf) -endfunction() diff --git a/cmake/hip.cmake b/cmake/hip.cmake deleted file mode 100644 index c3a748db50..0000000000 --- a/cmake/hip.cmake +++ /dev/null @@ -1,53 +0,0 @@ -if(NOT WITH_AMD_GPU) - return() -endif() - -include_directories("/opt/rocm/include") -include_directories("/opt/rocm/hip/include") -include_directories("/opt/rocm/miopen/include") -include_directories("/opt/rocm/hipblas/include") -include_directories("/opt/rocm/hiprand/include") -include_directories("/opt/rocm/rocrand/include") -include_directories("/opt/rocm/rccl/include") -include_directories("/opt/rocm/thrust") - -set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" ) - -if(WITH_DSO) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO") -endif(WITH_DSO) - -if(WITH_TESTING) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING") -endif(WITH_TESTING) - -if(WITH_DISTRIBUTE) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE") -endif(WITH_DISTRIBUTE) - -if(WITH_GRPC) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC") -endif(WITH_GRPC) - -if(WITH_MKLDNN) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN") -endif(WITH_MKLDNN) - -set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE") - -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) -elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") - list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) -elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") - list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) -endif() - -if("x${HCC_HOME}" STREQUAL "x") - set(HCC_HOME "/opt/rocm/hcc") -endif() - -set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o ") -set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared") -set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared") - diff --git a/cmake/lite.cmake b/cmake/lite.cmake deleted file mode 100644 index 707982a3e7..0000000000 --- a/cmake/lite.cmake +++ /dev/null @@ -1,435 +0,0 @@ -set(LITE_URL "http://paddle-inference-dist.bj.bcebos.com" CACHE STRING "inference download url") - -function(lite_download_and_uncompress INSTALL_DIR URL FILENAME) - message(STATUS "Download inference test stuff from ${URL}/${FILENAME}") - string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME}) - set(EXTERNAL_PROJECT_NAME "extern_lite_download_${FILENAME_EX}") - set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}") - ExternalProject_Add( - ${EXTERNAL_PROJECT_NAME} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${INSTALL_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && 
${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME} - DOWNLOAD_DIR ${INSTALL_DIR} - DOWNLOAD_NO_PROGRESS 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - UPDATE_COMMAND "" - INSTALL_COMMAND "" - ) -endfunction() - -function (lite_deps TARGET) - set(options "") - set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS ARGS) - cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - set(deps ${lite_deps_DEPS}) - - if(LITE_WITH_X86) - foreach(var ${lite_deps_X86_DEPS}) - set(deps ${deps} ${var}) - endforeach(var) - endif() - - if(LITE_WITH_CUDA) - foreach(var ${lite_deps_CUDA_DEPS}) - set(deps ${deps} ${var}) - endforeach(var) - endif() - - if(LITE_WITH_ARM) - foreach(var ${lite_deps_ARM_DEPS}) - set(deps ${deps} ${var}) - endforeach(var) - endif() - - if(LITE_WITH_PROFILE) - foreach(var ${lite_deps_PROFILE_DEPS}) - set(deps ${deps} ${var}) - endforeach(var) - endif() - - if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - foreach(var ${lite_deps_LIGHT_DEPS}) - set(deps ${deps} ${var}) - endforeach(var) - endif() - - - - if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - foreach(var ${lite_deps_HVY_DEPS}) - set(deps ${deps} ${var}) - endforeach(var) - endif() - - if (LITE_WITH_OPENCL) - foreach(var ${lite_deps_CL_DEPS}) - set(deps ${deps} ${var}) - endforeach(var) - endif() - - if (LITE_WITH_FPGA) - foreach(var ${lite_deps_FPGA_DEPS}) - set(deps ${deps} ${var}) - endforeach(var) - endif() - - if (LITE_WITH_NPU) - foreach(var ${lite_deps_NPU_DEPS}) - set(deps ${deps} ${var}) - endforeach(var) - endif() - - set(${TARGET} ${deps} PARENT_SCOPE) -endfunction() - - -# A fake target to include all the libraries and tests the lite module depends. -add_custom_target(lite_compile_deps COMMAND echo 1) - -# Add names for lite libraries for latter compile. We use this name list to avoid compiling -# the whole fluid project to accelerate the compile speed. -set(offline_lib_registry_file "${CMAKE_BINARY_DIR}/lite_libs.txt") -file(WRITE ${offline_lib_registry_file} "") # clean - -# cc_library with branch support. -# The branches: -# X86_DEPS: works only when LITE_WITH_X86 is ON. 
-# CUDA_DEPS: LITE_WITH_CUDA -# ARM_DEPS: LITE_WITH_ARM -# PROFILE_DEPS: LITE_WITH_PROFILE -# LIGHT_DEPS: LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -# HVY_DEPS: NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -# EXCLUDE_COMPILE_DEPS: TARGET will not be included in lite_compile_deps if this is not None -function(lite_cc_library TARGET) - set(options SHARED shared STATIC static MODULE module) - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS NPU_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS LIGHT_DEPS - HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) - cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - set(deps "") - lite_deps(deps - DEPS ${args_DEPS} - X86_DEPS ${args_X86_DEPS} - CUDA_DEPS ${args_CUDA_DEPS} - CL_DEPS ${args_CL_DEPS} - NPU_DEPS ${args_NPU_DEPS} - ARM_DEPS ${args_ARM_DEPS} - FPGA_DEPS ${args_FPGA_DEPS} - PROFILE_DEPS ${args_PROFILE_DEPS} - LIGHT_DEPS ${args_LIGHT_DEPS} - HVY_DEPS ${args_HVY_DEPS} - ) - - if (args_SHARED OR ARGS_shared) - cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS} SHARED) - elseif (args_MODULE OR ARGS_module) - add_library(${TARGET} MODULE ${args_SRCS}) - add_dependencies(${TARGET} ${deps} ${args_DEPS}) - else() - cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS}) - endif() - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) - - # collect targets need to compile for lite - if (args_SRCS AND NOT args_EXCLUDE_COMPILE_DEPS) - add_dependencies(lite_compile_deps ${TARGET}) - endif() - - # register a library name. - file(APPEND ${offline_lib_registry_file} "${TARGET}\n") -endfunction() - -function(lite_cc_binary TARGET) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS - LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) - cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - set(deps "") - lite_deps(deps - DEPS ${args_DEPS} - X86_DEPS ${args_X86_DEPS} - CUDA_DEPS ${args_CUDA_DEPS} - CL_DEPS ${args_CL_DEPS} - ARM_DEPS ${args_ARM_DEPS} - FPGA_DEPS ${args_FPGA_DEPS} - PROFILE_DEPS ${args_PROFILE_DEPS} - LIGHT_DEPS ${args_LIGHT_DEPS} - HVY_DEPS ${args_HVY_DEPS} - ) - cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS}) - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) - # collect targets need to compile for lite - if (NOT args_EXCLUDE_COMPILE_DEPS) - add_dependencies(lite_compile_deps ${TARGET}) - endif() -endfunction() - -# Add a unit-test name to file for latter offline manual test. -set(offline_test_registry_file "${CMAKE_BINARY_DIR}/lite_tests.txt") -file(WRITE ${offline_test_registry_file} "") # clean -# Test lite modules. 
- -function(lite_cc_test TARGET) - if(NOT WITH_TESTING) - return() - endif() - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS - LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS - ARGS - COMPILE_LEVEL # (basic|extra) - ) - cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - if (args_COMPILE_LEVEL STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA)) - MESSAGE(STATUS "Ignore test ${TARGET} due to compile level ${args_COMPILE_LEVEL}") - return() - endif() - - set(deps "") - lite_deps(deps - DEPS ${args_DEPS} - X86_DEPS ${args_X86_DEPS} - CUDA_DEPS ${args_CUDA_DEPS} - CL_DEPS ${args_CL_DEPS} - ARM_DEPS ${args_ARM_DEPS} - FPGA_DEPS ${args_FPGA_DEPS} - PROFILE_DEPS ${args_PROFILE_DEPS} - LIGHT_DEPS ${args_LIGHT_DEPS} - HVY_DEPS ${args_HVY_DEPS} - ) - _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS}) - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) - file(APPEND ${offline_test_registry_file} "${TARGET}\n") - - # collect targets need to compile for lite - if (NOT args_EXCLUDE_COMPILE_DEPS) - add_dependencies(lite_compile_deps ${TARGET}) - endif() -endfunction() - -set(arm_kernels CACHE INTERNAL "arm kernels") -set(x86_kernels CACHE INTERNAL "x86 kernels") -set(fpga_kernels CACHE INTERNAL "fpga kernels") -set(npu_kernels CACHE INTERNAL "npu kernels") -set(opencl_kernels CACHE INTERNAL "opencl kernels") -set(host_kernels CACHE INTERNAL "host kernels") - -set(kernels_src_list "${CMAKE_BINARY_DIR}/kernels_src_list.txt") -file(WRITE ${kernels_src_list} "") # clean -# add a kernel for some specific device -# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA) -# level: one of (basic, extra) -function(add_kernel TARGET device level) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS - LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS - ARGS) - cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA)) - return() - endif() - - if (LITE_ON_MODEL_OPTIMIZE_TOOL) - # the source list will collect for model_optimize_tool to fake kernel generation. - foreach(src ${args_SRCS}) - file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") - endforeach() - return() - endif() - - # when compiling the model_optimize_tool, a source file with all the fake kernel definitions will be generated, - # no need to continue the compilation of the true kernel source. 
- if (LITE_ON_MODEL_OPTIMIZE_TOOL) - return() - endif(LITE_ON_MODEL_OPTIMIZE_TOOL) - - - if ("${device}" STREQUAL "Host") - set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "") - endif() - if ("${device}" STREQUAL "ARM") - if (NOT LITE_WITH_ARM) - return() - endif() - set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "") - endif() - if ("${device}" STREQUAL "X86") - if (NOT LITE_WITH_X86) - return() - endif() - set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "") - endif() - if ("${device}" STREQUAL "NPU") - if (NOT LITE_WITH_NPU) - return() - endif() - set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "") - endif() - if ("${device}" STREQUAL "FPGA") - if (NOT LITE_WITH_FPGA) - return() - endif() - set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "") - endif() - if ("${device}" STREQUAL "OPENCL") - if (NOT LITE_WITH_OPENCL) - return() - endif() - set(opencl_kernels "${opencl_kernels};${TARGET}" CACHE INTERNAL "") - endif() - - # the source list will collect for paddle_use_kernel.h code generation. - foreach(src ${args_SRCS}) - file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") - endforeach() - - lite_cc_library(${TARGET} SRCS ${args_SRCS} - DEPS ${args_DEPS} - X86_DEPS ${args_X86_DEPS} - CUDA_DEPS ${args_CUDA_DEPS} - CL_DEPS ${args_CL_DEPS} - ARM_DEPS ${args_ARM_DEPS} - FPGA_DEPS ${args_FPGA_DEPS} - PROFILE_DEPS ${args_PROFILE_DEPS} - LIGHT_DEPS ${args_LIGHT_DEPS} - HVY_DEPS ${args_HVY_DEPS} - ) -endfunction() - -set(ops CACHE INTERNAL "ops") -set(ops_src_list "${CMAKE_BINARY_DIR}/ops_src_list.txt") -file(WRITE ${ops_src_list} "") # clean -# add an operator -# level: one of (basic, extra) -function(add_operator TARGET level) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS - LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS - ARGS) - cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA)) - return() - endif() - - set(ops "${ops};${TARGET}" CACHE INTERNAL "source") - - foreach(src ${args_SRCS}) - file(APPEND ${ops_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") - endforeach() - - lite_cc_library(${TARGET} SRCS ${args_SRCS} - DEPS ${args_DEPS} - X86_DEPS ${args_X86_DEPS} - CUDA_DEPS ${args_CUDA_DEPS} - CL_DEPS ${args_CL_DEPS} - ARM_DEPS ${args_ARM_DEPS} - FPGA_DEPS ${args_FPGA_DEPS} - PROFILE_DEPS ${args_PROFILE_DEPS} - LIGHT_DEPS ${args_LIGHT_DEPS} - HVY_DEPS ${args_HVY_DEPS} - ) -endfunction() - - -# Bundle several static libraries into one. 
-function(bundle_static_library tgt_name bundled_tgt_name fake_target)
-  list(APPEND static_libs ${tgt_name})
-
-  function(_recursively_collect_dependencies input_target)
-    set(_input_link_libraries LINK_LIBRARIES)
-    get_target_property(_input_type ${input_target} TYPE)
-    if (${_input_type} STREQUAL "INTERFACE_LIBRARY")
-      set(_input_link_libraries INTERFACE_LINK_LIBRARIES)
-    endif()
-    get_target_property(public_dependencies ${input_target} ${_input_link_libraries})
-    foreach(dependency IN LISTS public_dependencies)
-      if(TARGET ${dependency})
-        get_target_property(alias ${dependency} ALIASED_TARGET)
-        if (TARGET ${alias})
-          set(dependency ${alias})
-        endif()
-        get_target_property(_type ${dependency} TYPE)
-        if (${_type} STREQUAL "STATIC_LIBRARY")
-          list(APPEND static_libs ${dependency})
-        endif()
-
-        get_property(library_already_added
-          GLOBAL PROPERTY _${tgt_name}_static_bundle_${dependency})
-        if (NOT library_already_added)
-          set_property(GLOBAL PROPERTY _${tgt_name}_static_bundle_${dependency} ON)
-          _recursively_collect_dependencies(${dependency})
-        endif()
-      endif()
-    endforeach()
-    set(static_libs ${static_libs} PARENT_SCOPE)
-  endfunction()
-
-  _recursively_collect_dependencies(${tgt_name})
-
-  list(REMOVE_DUPLICATES static_libs)
-
-  set(bundled_tgt_full_name
-    ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX})
-
-  #message(STATUS "bundled_tgt_full_name: ${bundled_tgt_full_name}")
-
-  if(NOT IOS)
-    file(WRITE ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in
-      "CREATE ${bundled_tgt_full_name}\n" )
-
-    foreach(tgt IN LISTS static_libs)
-      file(APPEND ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in
-        "ADDLIB $<TARGET_FILE:${tgt}>\n")
-    endforeach()
-
-    file(APPEND ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in "SAVE\n")
-    file(APPEND ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in "END\n")
-
-    file(GENERATE
-      OUTPUT ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar
-      INPUT ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in)
-
-    set(ar_tool ${CMAKE_AR})
-    if (CMAKE_INTERPROCEDURAL_OPTIMIZATION)
-      set(ar_tool ${CMAKE_CXX_COMPILER_AR})
-    endif()
-
-    add_custom_command(
-      COMMAND ${ar_tool} -M < ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar
-      OUTPUT ${bundled_tgt_full_name}
-      COMMENT "Bundling ${bundled_tgt_name}"
-      VERBATIM)
-  else()
-    foreach(lib ${static_libs})
-      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
-    endforeach()
-    add_custom_command(
-      COMMAND /usr/bin/libtool -static -o ${bundled_tgt_full_name} ${libfiles}
-      OUTPUT ${bundled_tgt_full_name}
-      )
-  endif()
-
-  add_custom_target(${fake_target} ALL DEPENDS ${bundled_tgt_full_name})
-  add_dependencies(${fake_target} ${tgt_name})
-
-  add_library(${bundled_tgt_name} STATIC IMPORTED)
-  set_target_properties(${bundled_tgt_name}
-    PROPERTIES
-      IMPORTED_LOCATION ${bundled_tgt_full_name}
-      INTERFACE_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:${tgt_name},INTERFACE_INCLUDE_DIRECTORIES>)
-  add_dependencies(${bundled_tgt_name} ${fake_target})
-
-endfunction()
diff --git a/cmake/lite_utils.cmake b/cmake/lite_utils.cmake
deleted file mode 100644
index f07ea85936..0000000000
--- a/cmake/lite_utils.cmake
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# ---------------------------------------------------------------------------- -# section: Provides an paddle lite config option macro -# usage: lite_option(var "help string to describe the var" [if or IF (condition)]) -# ---------------------------------------------------------------------------- -macro(lite_option variable description value) - set(__value ${value}) - set(__condition "") - set(__varname "__value") - foreach(arg ${ARGN}) - if(arg STREQUAL "IF" OR arg STREQUAL "if") - set(__varname "__condition") - else() - list(APPEND ${__varname} ${arg}) - endif() - endforeach() - unset(__varname) - if(__condition STREQUAL "") - set(__condition 2 GREATER 1) - endif() - - if(${__condition}) - if(__value MATCHES ";") - if(${__value}) - option(${variable} "${description}" ON) - else() - option(${variable} "${description}" OFF) - endif() - elseif(DEFINED ${__value}) - if(${__value}) - option(${variable} "${description}" ON) - else() - option(${variable} "${description}" OFF) - endif() - else() - option(${variable} "${description}" ${__value}) - endif() - else() - unset(${variable} CACHE) - endif() - unset(__condition) - unset(__value) -endmacro() diff --git a/cmake/make_resource.py b/cmake/make_resource.py deleted file mode 100644 index 09a2ca877d..0000000000 --- a/cmake/make_resource.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import re -import sys - -res = sys.argv[1] -out = sys.argv[2] -var = re.sub(r'[ .-]', '_', os.path.basename(res)) - -open(out, "w").write("const unsigned char " + var + "[] = {" + ",".join([ - "0x%02x" % ord(c) for c in open(res).read() -]) + ",0};\n" + "const unsigned " + var + "_size = sizeof(" + var + ");\n") diff --git a/cmake/operators.cmake b/cmake/operators.cmake deleted file mode 100644 index c17e718f42..0000000000 --- a/cmake/operators.cmake +++ /dev/null @@ -1,227 +0,0 @@ -set(PART_CUDA_KERNEL_FILES) -function(op_library TARGET) - # op_library is a function to create op library. The interface is same as - # cc_library. But it handle split GPU/CPU code and link some common library - # for ops. 
- set(cc_srcs) - set(cu_srcs) - set(hip_cu_srcs) - set(miopen_hip_cc_srcs) - set(cu_cc_srcs) - set(cudnn_cu_cc_srcs) - set(CUDNN_FILE) - set(mkldnn_cc_srcs) - set(MKLDNN_FILE) - set(op_common_deps operator op_registry math_function) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - set(pybind_flag 0) - cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) - - list(LENGTH op_library_SRCS op_library_SRCS_len) - if (${op_library_SRCS_len} EQUAL 0) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) - list(APPEND cc_srcs ${TARGET}.cc) - endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc) - list(APPEND cu_cc_srcs ${TARGET}.cu.cc) - endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) - list(APPEND cu_srcs ${TARGET}.cu) - endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) - set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu - ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE) - list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) - endif() - - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu) - list(APPEND hip_cu_srcs ${TARGET}.hip.cu) - endif() - string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}") - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc) - list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc) - endif() - if(WITH_AMD_GPU) - string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}") - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc) - list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc) - endif() - endif() - if(WITH_MKLDNN) - string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}") - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc) - list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc) - endif() - endif() - else() - foreach(src ${op_library_SRCS}) - if (${src} MATCHES ".*\\.hip.cu$") - list(APPEND hip_cu_srcs ${src}) - elseif (${src} MATCHES ".*\\.cu$") - list(APPEND cu_srcs ${src}) - elseif(${src} MATCHES ".*_cudnn_op.cu.cc$") - list(APPEND cudnn_cu_cc_srcs ${src}) - elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$") - list(APPEND miopen_hip_cc_srcs ${src}) - elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$") - list(APPEND mkldnn_cc_srcs ${src}) - elseif(${src} MATCHES ".*\\.cu.cc$") - list(APPEND cu_cc_srcs ${src}) - elseif(${src} MATCHES ".*\\.cc$") - list(APPEND cc_srcs ${src}) - else() - message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu") - endif() - endforeach() - endif() - - list(LENGTH cc_srcs cc_srcs_len) - if (${cc_srcs_len} EQUAL 0) - message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") - endif() - if (WIN32) - # remove windows unsupported op, because windows has no nccl, no warpctc such ops. 
- foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") - if ("${TARGET}" STREQUAL "${windows_unsupport_op}") - return() - endif() - endforeach() - endif(WIN32) - set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} CACHE INTERNAL "op libs") - - list(LENGTH op_library_DEPS op_library_DEPS_len) - if (${op_library_DEPS_len} GREATER 0) - set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) - endif() - if (WITH_GPU) - nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} - ${op_common_deps}) - elseif (WITH_AMD_GPU) - hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} - ${op_common_deps}) - else() - cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} - ${op_common_deps}) - endif() - - # Define operators that don't need pybind here. - foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" -"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" -"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op") - if ("${TARGET}" STREQUAL "${manual_pybind_op}") - set(pybind_flag 1) - endif() - endforeach() - - # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. - # Note that it's enough to just adding one operator to pybind in a *_op.cc file. - # And for detail pybind information, please see generated paddle/pybind/pybind.h. - file(READ ${TARGET}.cc TARGET_CONTENT) - string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}") - string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}") - if (one_register STREQUAL "") - string(REPLACE "_op" "" TARGET "${TARGET}") - else () - string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}") - string(REPLACE "," "" TARGET "${TARGET}") - endif() - - # pybind USE_NO_KERNEL_OP - # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel - string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}") - string(REPLACE "_op" "" TARGET "${TARGET}") - if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "") - file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n") - set(pybind_flag 1) - endif() - - # pybind USE_CPU_ONLY_OP - list(LENGTH cu_srcs cu_srcs_len) - list(LENGTH cu_cc_srcs cu_cc_srcs_len) - list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) - list(LENGTH hip_cu_srcs hip_cu_srcs_len) - list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len) - if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND - ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0) - file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") - set(pybind_flag 1) - endif() - - # pybind USE_OP_DEVICE_KERNEL for CUDNN - list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len) - if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0) - if(${TARGET} STREQUAL "activation") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n") - else() - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") - endif() - endif() - - # pybind USE_OP_DEVICE_KERNEL for MIOPEN - if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0) - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n") - endif() - - # pybind USE_OP_DEVICE_KERNEL for MKLDNN - if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) - # Append first implemented MKLDNN activation operator - if (${MKLDNN_FILE} 
STREQUAL "activation_mkldnn_op") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") - elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, S8);\n") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, U8);\n") - - else() - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") - endif() - endif() - - # pybind USE_OP - if (${pybind_flag} EQUAL 0) - # NOTE(*): activation use macro to regist the kernels, set use_op manually. - if(${TARGET} STREQUAL "activation") - file(APPEND ${pybind_file} "USE_OP(relu);\n") - elseif(${TARGET} STREQUAL "fake_dequantize") - file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") - elseif(${TARGET} STREQUAL "fake_quantize") - file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n") - elseif(${TARGET} STREQUAL "tensorrt_engine_op") - message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference") - elseif(${TARGET} STREQUAL "fc") - # HACK: fc only have mkldnn and cpu, which would mismatch the cpu only condition - file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") - else() - file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") - endif() - endif() -endfunction() - - -function(register_operators) - set(options "") - set(oneValueArgs "") - set(multiValueArgs EXCLUDES DEPS) - cmake_parse_arguments(register_operators "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) - - file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") - string(REPLACE "_mkldnn" "" OPS "${OPS}") - string(REPLACE ".cc" "" OPS "${OPS}") - list(REMOVE_DUPLICATES OPS) - list(LENGTH register_operators_DEPS register_operators_DEPS_len) - - foreach(src ${OPS}) - list(FIND register_operators_EXCLUDES ${src} _index) - if (${_index} EQUAL -1) - if (${register_operators_DEPS_len} GREATER 0) - op_library(${src} DEPS ${register_operators_DEPS}) - else() - op_library(${src}) - endif() - endif() - endforeach() -endfunction() diff --git a/cmake/package.cmake b/cmake/package.cmake deleted file mode 100644 index 79e02147f3..0000000000 --- a/cmake/package.cmake +++ /dev/null @@ -1,21 +0,0 @@ -set(CPACK_PACKAGE_NAME paddle) -set(CPACK_PACKAGE_VERSION_MAJOR ${PADDLE_MAJOR_VERSION}) -set(CPACK_PACKAGE_VERSION_MINOR ${PADDLE_MINOR_VERSION}) -set(CPACK_PACKAGE_VERSION_PATCH ${PADDLE_PATCH_VERSION}) -set(CPACK_PACKAGE_VERSION ${PADDLE_VERSION}) -## DEB Settings -set(CPACK_DEBIAN_PACKAGE_NAME paddle) -set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE amd64) -set(CPACK_DEBIAN_PACKAGE_MAINTAINER PaddlePaddle Dev ) -set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Paddle") -set(CPACK_PACKAGE_DESCRIPTION "") -set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl") -set(CPACK_DEBIAN_PACKAGE_SECTION Devel) -set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION}) -set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PADDLE_SOURCE_DIR}/paddle/scripts/deb/postinst") -#set(CPACK_GENERATOR "DEB") -# Start cpack -include (CMakePackageConfigHelpers) -include (CPack) - - diff --git a/cmake/simd.cmake b/cmake/simd.cmake deleted file mode 100644 index 566dc75fda..0000000000 --- a/cmake/simd.cmake +++ /dev/null @@ -1,99 +0,0 @@ -# This file is use to check all support level of AVX on your machine -# so that PaddlePaddle can unleash the vectorization power of muticore. 
- -include(CheckCXXSourceRuns) -include(CheckCXXSourceCompiles) - -if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") - set(MMX_FLAG "-mmmx") - set(SSE2_FLAG "-msse2") - set(SSE3_FLAG "-msse3") - set(AVX_FLAG "-mavx") - set(AVX2_FLAG "-mavx2") - set(AVX512F_FLAG "-mavx512f") -elseif(MSVC) - set(MMX_FLAG "/arch:MMX") - set(SSE2_FLAG "/arch:SSE2") - set(SSE3_FLAG "/arch:SSE3") - SET(AVX_FLAG "/arch:AVX") - SET(AVX2_FLAG "/arch:AVX2") -endif() - -set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS}) - -# Check MMX -set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG}) -set(MMX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - _mm_setzero_si64(); - return 0; -}" MMX_FOUND) - -# Check SSE2 -set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG}) -set(SSE2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - _mm_setzero_si128(); - return 0; -}" SSE2_FOUND) - -# Check SSE3 -set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG}) -set(SSE3_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m128d a = _mm_set1_pd(6.28); - __m128d b = _mm_set1_pd(3.14); - __m128d result = _mm_addsub_pd(a, b); - result = _mm_movedup_pd(result); - return 0; -}" SSE3_FOUND) - -# Check AVX -set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) -set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; -}" AVX_FOUND) - -# Check AVX 2 -set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) -set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); - __m256i result = _mm256_abs_epi32 (a); - return 0; -}" AVX2_FOUND) - -# Check AVX512F -set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) -set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, - 13, -5, 6, -7, 9, 2, -6, 3); - __m512i result = _mm512_abs_epi32 (a); - return 0; -}" AVX512F_FOUND) - -set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) -mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/cmake/system.cmake b/cmake/system.cmake deleted file mode 100644 index ba00df928a..0000000000 --- a/cmake/system.cmake +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Detects the OS and sets appropriate variables. 
-# CMAKE_SYSTEM_NAME only give us a coarse-grained name of the OS CMake is -# building for, but the host processor name like centos is necessary -# in some scenes to distinguish system for customization. -# -# for instance, protobuf libs path is /lib64 -# on CentOS, but /lib on other systems. - -IF(WIN32) - SET(HOST_SYSTEM "win32") -ELSE(WIN32) - IF(APPLE) - SET(HOST_SYSTEM "macosx") - EXEC_PROGRAM(sw_vers ARGS -productVersion OUTPUT_VARIABLE HOST_SYSTEM_VERSION) - STRING(REGEX MATCH "[0-9]+.[0-9]+" MACOS_VERSION "${HOST_SYSTEM_VERSION}") - IF(NOT DEFINED $ENV{MACOSX_DEPLOYMENT_TARGET}) - # Set cache variable - end user may change this during ccmake or cmake-gui configure. - SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING - "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.") - ENDIF() - IF(ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux" - OR ARM_TARGET_OS STREQUAL "ios" OR ARM_TARGET_OS STREQUAL "ios64") - ELSE() - set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security") - ENDIF() - ELSE(APPLE) - - IF(EXISTS "/etc/issue") - FILE(READ "/etc/issue" LINUX_ISSUE) - IF(LINUX_ISSUE MATCHES "CentOS") - SET(HOST_SYSTEM "centos") - ELSEIF(LINUX_ISSUE MATCHES "Debian") - SET(HOST_SYSTEM "debian") - ELSEIF(LINUX_ISSUE MATCHES "Ubuntu") - SET(HOST_SYSTEM "ubuntu") - ELSEIF(LINUX_ISSUE MATCHES "Red Hat") - SET(HOST_SYSTEM "redhat") - ELSEIF(LINUX_ISSUE MATCHES "Fedora") - SET(HOST_SYSTEM "fedora") - ENDIF() - - STRING(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION "${LINUX_ISSUE}") - ENDIF(EXISTS "/etc/issue") - - IF(EXISTS "/etc/redhat-release") - FILE(READ "/etc/redhat-release" LINUX_ISSUE) - IF(LINUX_ISSUE MATCHES "CentOS") - SET(HOST_SYSTEM "centos") - ENDIF() - ENDIF(EXISTS "/etc/redhat-release") - - IF(NOT HOST_SYSTEM) - SET(HOST_SYSTEM ${CMAKE_SYSTEM_NAME}) - ENDIF() - - ENDIF(APPLE) -ENDIF(WIN32) - -# query number of logical cores -CMAKE_HOST_SYSTEM_INFORMATION(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES) - -MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES) - -MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}") -MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores") - -# external dependencies log output -SET(EXTERNAL_PROJECT_LOG_ARGS - LOG_DOWNLOAD 0 # Wrap download in script to log output - LOG_UPDATE 1 # Wrap update in script to log output - LOG_CONFIGURE 1 # Wrap configure in script to log output - LOG_BUILD 0 # Wrap build in script to log output - LOG_TEST 1 # Wrap test in script to log output - LOG_INSTALL 0 # Wrap install in script to log output -) diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake deleted file mode 100644 index 3bf12094e4..0000000000 --- a/cmake/tensorrt.cmake +++ /dev/null @@ -1,38 +0,0 @@ -if(NOT WITH_GPU) - return() -endif() - -set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT") -find_path(TENSORRT_INCLUDE_DIR NvInfer.h - PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include - $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include - NO_DEFAULT_PATH -) - -find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a - PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib - $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib - NO_DEFAULT_PATH - DOC "Path to TensorRT library.") - -if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY) - if(WITH_DSO) - set(TENSORRT_FOUND ON) - endif(WITH_DSO) -else() - set(TENSORRT_FOUND OFF) -endif() - -if(TENSORRT_FOUND) - file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h 
TENSORRT_VERSION_FILE_CONTENTS) - string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1" - TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}") - - message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " - "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ") - include_directories(${TENSORRT_INCLUDE_DIR}) - link_directories(${TENSORRT_LIBRARY}) - add_definitions(-DPADDLE_WITH_TENSORRT) -endif() diff --git a/cmake/util.cmake b/cmake/util.cmake deleted file mode 100644 index 02667dbce6..0000000000 --- a/cmake/util.cmake +++ /dev/null @@ -1,55 +0,0 @@ -# Some common routine for paddle compile. - -# target_circle_link_libraries -# Link libraries to target which has circle dependencies. -# -# First Argument: target name want to be linked with libraries -# Rest Arguments: libraries which link together. -function(target_circle_link_libraries TARGET_NAME) - if(APPLE) - set(LIBS) - set(inArchive OFF) - set(libsInArgn) - - foreach(arg ${ARGN}) - if(${arg} STREQUAL "ARCHIVE_START") - set(inArchive ON) - elseif(${arg} STREQUAL "ARCHIVE_END") - set(inArchive OFF) - else() - if(inArchive) - list(APPEND LIBS "-Wl,-force_load") - endif() - list(APPEND LIBS ${arg}) - list(APPEND libsInArgn ${arg}) - endif() - endforeach() - if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") - if(NOT IOS_ENABLE_BITCODE) - list(APPEND LIBS "-undefined dynamic_lookup") - endif() - endif() - list(REVERSE libsInArgn) - target_link_libraries(${TARGET_NAME} - ${LIBS} - ${libsInArgn}) - - else() # LINUX - set(LIBS) - - foreach(arg ${ARGN}) - if(${arg} STREQUAL "ARCHIVE_START") - list(APPEND LIBS "-Wl,--whole-archive") - elseif(${arg} STREQUAL "ARCHIVE_END") - list(APPEND LIBS "-Wl,--no-whole-archive") - else() - list(APPEND LIBS ${arg}) - endif() - endforeach() - - target_link_libraries(${TARGET_NAME} - "-Wl,--start-group" - ${LIBS} - "-Wl,--end-group") - endif() -endfunction() diff --git a/cmake/version.cmake b/cmake/version.cmake deleted file mode 100644 index 8bcc4ffe72..0000000000 --- a/cmake/version.cmake +++ /dev/null @@ -1,66 +0,0 @@ -# Get the latest git tag. 
-set(PADDLE_VERSION $ENV{PADDLE_VERSION})
-set(tmp_version "HEAD")
-set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
-set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+")
-# set(LATEST_PADDLE_VERSION "latest")
-set(LATEST_PADDLE_VERSION "0.0.0")
-
-while ("${PADDLE_VERSION}" STREQUAL "")
-  # Check current branch name
-  execute_process(
-    COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref ${tmp_version}
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-    OUTPUT_VARIABLE GIT_BRANCH_NAME
-    RESULT_VARIABLE GIT_BRANCH_RESULT
-    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-  if (NOT ${GIT_BRANCH_RESULT})
-    execute_process(
-      COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version}
-      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-      OUTPUT_VARIABLE GIT_TAG_NAME
-      RESULT_VARIABLE GIT_RESULT
-      ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-    if (NOT ${GIT_RESULT})
-      # Check if current branch is release branch
-      if (${GIT_BRANCH_NAME} MATCHES "release/${TAG_VERSION_REGEX}")
-        # Check the tag is a correct version
-        if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}")
-          # if no tag was found, set PADDLE_VERSION to "latest"
-          set(PADDLE_VERSION "${LATEST_PADDLE_VERSION}")
-        elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
-          string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
-        else() # otherwise, get the previous git tag name.
-          set(tmp_version "${GIT_TAG_NAME}~1")
-        endif()
-      else()
-        execute_process(
-          COMMAND ${GIT_EXECUTABLE} describe --exact-match --tags ${tmp_version}
-          WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-          OUTPUT_VARIABLE GIT_EXACT_TAG_NAME
-          RESULT_VARIABLE GIT_EXACT_TAG_RESULT
-          ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-        if (NOT ${GIT_EXACT_TAG_NAME})
-          # Check if current branch is tag branch
-          if (${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
-            string(REPLACE "v" "" PADDLE_VERSION ${GIT_EXACT_TAG_NAME})
-          else()
-            set(PADDLE_VERSION "${LATEST_PADDLE_VERSION}")
-          endif()
-        else()
-          # otherwise, we always set PADDLE_VERSION to "latest"
-          set(PADDLE_VERSION "${LATEST_PADDLE_VERSION}")
-        endif()
-      endif()
-    else()
-      set(PADDLE_VERSION "${LATEST_PADDLE_VERSION}")
-      message(WARNING "Cannot add paddle version from git tag")
-    endif()
-  else()
-    set(PADDLE_VERSION "${LATEST_PADDLE_VERSION}")
-    message(WARNING "Cannot add paddle version for wrong git branch result")
-  endif()
-endwhile()
-
-add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION})
-message(STATUS "Paddle version is ${PADDLE_VERSION}")
diff --git a/cpp_demo.md b/cpp_demo.md
new file mode 100644
index 0000000000..bfb3439998
--- /dev/null
+++ b/cpp_demo.md
@@ -0,0 +1,271 @@
+
+* [C++ Demo](#c-demo)
+  * [Compile](#compile)
+  * [Prepare the runtime environment](#prepare-the-runtime-environment)
+    * [Use an Android phone](#use-an-android-phone)
+    * [Use an Android emulator](#use-an-android-emulator)
+  * [Download the model and run the demo](#download-the-model-and-run-the-demo)
+  * [Demo output](#demo-output)
+  * [How to use the API in code](#how-to-use-the-api-in-code)
+  * [CxxConfig example: running the OCR model](#cxxconfig-example-running-the-ocr-model)
+
+
+
+# `C++` Demo
+
+## Compile
+
+First prepare the cross-compilation environment following [PaddleLite source compilation](https://github.com/PaddlePaddle/Paddle-Lite/wiki/source_compile), then pull the latest [PaddleLite release code](https://github.com/PaddlePaddle/Paddle-Lite). The walkthrough below uses the Android-ARMv8 architecture as an example and ends with the MobileNetV1 model running on a phone.
+
+Enter the Paddle-Lite directory and build with the following command (**the option `--build_extra=ON` is required for a complete build**):
+
+```shell
+./lite/tools/build.sh \
+  --arm_os=android \
+  --arm_abi=armv8 \
+  --arm_lang=gcc \
+  --android_stl=c++_static \
+  --build_extra=ON \
+  full_publish
+```
+
+After the build finishes, the folder `./build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/` contains:
+
+```bash
+cxx/include/
+cxx/lib/libpaddle_api_full_bundled.a
+cxx/lib/libpaddle_api_light_bundled.a
+demo/cxx/  # contains {include Makefile.def mobile_light}
+third_party/gflags/
+```
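+
+Before moving on, it can be worth a quick sanity check that the bundled static libraries really target 64-bit ARM. A small sketch using standard binutils (the paths follow the layout above; `readelf` may live in your NDK toolchain instead of your PATH):
+
+```bash
+cd build.lite.android.armv8.gcc/inference_lite_lib.android.armv8
+# every object in the archive should report "Machine: AArch64"
+readelf -h cxx/lib/libpaddle_api_full_bundled.a | grep Machine | sort -u
+```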
+
+## Prepare the runtime environment
+
+There are two ways to run the demo: on an Android phone, or, if no phone is at hand, in an Android emulator.
+
+### Option 1: use an Android phone
+
+Connect the phone to your computer, then on the phone enable Settings -> Developer options -> USB debugging. Make sure the device shows up in `adb devices`.
+
+### Option 2: use an Android emulator
+
+The commands below create Android armv8 and armv7 emulators respectively. To test on a real device instead, replace the emulator with a real device of the matching architecture.
+
+```shell
+# android-armv8
+adb kill-server
+adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
+echo n | avdmanager create avd -f -n paddle-armv8 -k "system-images;android-24;google_apis;arm64-v8a"
+echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -port 5554 &
+sleep 1m
+```
+
+```shell
+# android-armv7
+adb kill-server
+adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
+echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a"
+echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -port 5554 &
+sleep 1m
+```
+
+## Download the model and run the demo
+
+```bash
+cd inference_lite_lib.android.armv8/demo/cxx/mobile_full
+wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
+tar zxvf mobilenet_v1.tar.gz
+
+make
+
+adb -s emulator-5554 push mobilenet_v1 /data/local/tmp/
+adb -s emulator-5554 push mobilenetv1_full_api /data/local/tmp/
+adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_full_api
+adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt"
+```
+Note: a demo of the light-weight API is also provided; run it with the following commands.
+
+```bash
+cd ../mobile_light
+make
+adb -s emulator-5554 push mobilenetv1_light_api /data/local/tmp/
+adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_light_api
+adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_light_api --model_dir=/data/local/tmp/mobilenet_v1.opt --threads=1"
+```
+## Demo output
+On success the demo prints a sample of the predicted class probabilities (every 100th of the 1000 classes) to the console:
+
+```bash
+Output dim: 1000
+Output[0]: 0.000191
+Output[100]: 0.000160
+Output[200]: 0.000264
+Output[300]: 0.000211
+Output[400]: 0.001032
+Output[500]: 0.000110
+Output[600]: 0.004829
+Output[700]: 0.001845
+Output[800]: 0.000202
+Output[900]: 0.000586
+```
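+
+If you want the single most likely class instead of sampled scores, the `output_tensor` obtained in the API walkthrough below can be scanned for its maximum. A minimal sketch (assuming the 1000-float output shown above; mapping the class id to a label is up to you):
+
+```cpp
+// Scan the 1000 class scores for the most probable class.
+const float* scores = output_tensor->data<float>();
+int top1 = 0;
+for (int i = 1; i < 1000; ++i) {
+  if (scores[i] > scores[top1]) top1 = i;
+}
+printf("top-1 class id: %d, prob: %f\n", top1, scores[top1]);
+```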
+
+## How to use the API in code
+
+Using the PaddleLite API in C++ is simple and needs very little extra code:
+
+- Include the headers
+
+```cpp
+ #include <memory>
+ #include <utility>
+ #include "paddle_api.h"
+ #include "paddle_use_kernels.h"
+ #include "paddle_use_ops.h"
+ #include "paddle_use_passes.h"
+```
+
+- Configure a MobileConfig with the model location (model_dir), the number of threads, and the power mode; set the input data, create a PaddlePredictor from the MobileConfig, and run the prediction. (Note: Lite can also load the model straight from memory, via MobileConfig::set_model_buffer.)
+
+Example code:
+```cpp
+  // 1. Create MobileConfig
+  MobileConfig config;
+
+  // 2. Load model
+  config.set_model_dir("path to your model directory"); // model dir
+  /* load model: Lite supports loading the model from a file or from memory
+     (a naive buffer built from the optimized model)
+     Method one: load the model from memory:
+       void set_model_buffer(const char* model_buffer,
+                             size_t model_buffer_size,
+                             const char* param_buffer,
+                             size_t param_buffer_size)
+     Method two: load the model from a file:
+       void set_model_dir(const std::string& model_dir) */
+
+  // 3. Set MobileConfig (or you can skip this step to use default values):
+  config.set_power_mode(LITE_POWER_HIGH); // power mode
+  /* power modes: Lite supports the following power modes
+       LITE_POWER_HIGH
+       LITE_POWER_LOW
+       LITE_POWER_FULL
+       LITE_POWER_NO_BIND
+       LITE_POWER_RAND_HIGH
+       LITE_POWER_RAND_LOW */
+  config.set_threads(1); // number of threads
+
+  // 4. Create PaddlePredictor by MobileConfig
+  std::shared_ptr<PaddlePredictor> predictor =
+      CreatePaddlePredictor<MobileConfig>(config);
+
+  // 5. Prepare input data
+  std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
+  input_tensor->Resize({1, 3, 224, 224});
+  auto* data = input_tensor->mutable_data<float>();
+  for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+    data[i] = 1;
+  }
+
+  // 6. Run predictor
+  predictor->Run();
+
+  // 7. Get output
+  std::unique_ptr<const Tensor> output_tensor(
+      std::move(predictor->GetOutput(0)));
+```
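+
+Step 5 above uses a small `ShapeProduction` helper that is defined in the OCR sample below; for reference, this is all it does (multiply the dimensions of a shape to get the element count):
+
+```cpp
+int64_t ShapeProduction(const shape_t& shape) {
+  int64_t res = 1;
+  for (auto i : shape) res *= i;
+  return res;
+}
+```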
+
+## CxxConfig example: running the OCR model
+
+1. OCR model files:
+   - a trained [ocr_attention_model](https://paddle-inference-dist.cdn.bcebos.com/ocr_attention.tar.gz) in Pb format is available for download
+   - alternatively, train the model yourself from the [Paddle/models project](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/ocr_recognition)
+
+2. Example code:
+```c++
+#include <gflags/gflags.h>
+#include <stdio.h>
+#include <vector>
+#include "paddle_api.h"          // NOLINT
+#include "paddle_use_kernels.h"  // NOLINT
+#include "paddle_use_ops.h"      // NOLINT
+#include "paddle_use_passes.h"   // NOLINT
+using namespace paddle::lite_api;  // NOLINT
+
+DEFINE_string(model_dir, "", "Model dir path.");
+DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels");
+
+int64_t ShapeProduction(const shape_t& shape) {
+  int64_t res = 1;
+  for (auto i : shape) res *= i;
+  return res;
+}
+
+void RunModel() {
+  // 1. Set CxxConfig
+  CxxConfig config;
+  config.set_model_dir(FLAGS_model_dir);
+  std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kFloat)},
+                                  Place{TARGET(kHost), PRECISION(kFloat)}};
+  config.set_preferred_place(Place{TARGET(kARM), PRECISION(kFloat)});
+  config.set_valid_places(valid_places);
+
+  // 2. Create PaddlePredictor by CxxConfig
+  std::shared_ptr<PaddlePredictor> predictor =
+      CreatePaddlePredictor<CxxConfig>(config);
+
+  // 3. Prepare input data
+  // input 0: the image
+  std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
+  input_tensor->Resize(shape_t({1, 1, 48, 512}));
+  auto* data = input_tensor->mutable_data<float>();
+  for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+    data[i] = 1;
+  }
+  // input 1: the initial ids of the beam search
+  std::unique_ptr<Tensor> init_ids(std::move(predictor->GetInput(1)));
+  init_ids->Resize(shape_t({1, 1}));
+  auto* data_ids = init_ids->mutable_data<float>();
+  for (int i = 0; i < ShapeProduction(init_ids->shape()); ++i) {
+    data_ids[i] = 0;
+  }
+  lod_t lod_i{{0, 1}, {0, 1}};
+  init_ids->SetLoD(lod_i);
+  // input 2: the initial scores of the beam search
+  std::unique_ptr<Tensor> init_scores(std::move(predictor->GetInput(2)));
+  init_scores->Resize(shape_t({1, 1}));
+  auto* data_scores = init_scores->mutable_data<float>();
+  for (int i = 0; i < ShapeProduction(init_scores->shape()); ++i) {
+    data_scores[i] = 0;
+  }
+  lod_t lod_s{{0, 1}, {0, 1}};
+  init_scores->SetLoD(lod_s);
+
+  // 4. Run predictor
+  predictor->Run();
+
+  // 5. Get output
+  std::unique_ptr<const Tensor> output_tensor(
+      std::move(predictor->GetOutput(0)));
+  for (int i = 0; i < ShapeProduction(output_tensor->shape()); i++) {
+    printf("Output[%d]: %f\n", i, output_tensor->data<float>()[i]);
+  }
+}
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  RunModel();
+  return 0;
+}
+```
+3. How to run:
+Build the code above into an executable, here called `OCR_DEMO`; the model folder is `ocr_attention`. Connect the phone to the computer in USB-debugging / file-transfer mode,
+then run the OCR model test with the following commands in a terminal:
+```
+# OCR_DEMO is the executable built above; ocr_attention is the folder of the ocr_attention model
+adb push OCR_DEMO /data/local/tmp
+adb push ocr_attention /data/local/tmp
+adb shell 'cd /data/local/tmp && ./OCR_DEMO --model_dir=./ocr_attention'
+```
+4. Result:
+
+ 
\ No newline at end of file
diff --git a/cxx_api.md b/cxx_api.md
new file mode 100644
index 0000000000..a05b2d3d69
--- /dev/null
+++ b/cxx_api.md
@@ -0,0 +1,63 @@
+# C++ API usage guide
+
+First make sure Lite builds correctly by following [source compilation](./source_compile); the walkthrough below uses Lite's C++ interface to load and run the MobileNetV1 model.
+
+## Prepare the model
+
+Lite runs models trained with PaddlePaddle. A MobileNetV1 model can be obtained in three ways:
+
+- download the pre-trained [MobileNetV1 model](https://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz) directly
+- build and train the MobileNetV1 network with [PaddlePaddle](https://paddlepaddle.org.cn/)
+- convert a Caffe or TensorFlow MobileNetV1 model with [X2Paddle](./x2paddle)
+
+## Optimize the model
+
+Optimize the model with the Model Optimize Tool so that prediction performs well. See the [documentation](./model_optimize_tool) for the details of the tool:
+
+- prepare model_optimize_tool
+- optimize the model with model_optimize_tool
+- obtain the optimized model, consisting of a __model__.nb file and a param.nb file
+
+## Load the model
+
+Load the MobileNetV1 model and create the predictor; see ```paddlelite/lite/api/model_test.cc``` for a complete reference.
+```c++
+lite::DeviceInfo::Init();
+lite::DeviceInfo::Global().SetRunMode(lite::LITE_POWER_HIGH, thread_num);
+lite_api::MobileConfig config;
+config.set_model_dir(model_dir);
+
+auto predictor = lite_api::CreatePaddlePredictor<lite_api::MobileConfig>(config);
+```
+
+## Set the input
+
+Get the input tensor and fill in the input values; here every element is set to 1.
+
+```cpp
+// get a handle to the j-th input tensor
+auto input_tensor = predictor->GetInput(j);
+input_tensor->Resize(input_shapes[j]);
+
+// get the data pointer to fill in the data
+auto input_data = input_tensor->mutable_data<float>();
+int input_num = 1;
+for (int i = 0; i < input_shapes[j].size(); ++i) {
+  input_num *= input_shapes[j][i];
+}
+for (int i = 0; i < input_num; ++i) {
+  input_data[i] = 1.f;
+}
+```
+
+## Run and get the output
+
+```cpp
+predictor->Run();
+auto out = predictor->GetOutput(0);
+LOG(INFO) << "dims " << out->dims();
+LOG(INFO) << "out data size: " << out->data_size();
+```
+
+The output is ```dims dims{1000,}, out data size: 1000```.
+
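+
+To inspect the actual output values rather than just the shape, the data pointer can be read the same way as in the OCR sample above — a short sketch (assuming a float output tensor):
+
+```cpp
+// Print the first ten of the 1000 class scores.
+const float* out_data = out->data<float>();
+for (int i = 0; i < 10; ++i) {
+  LOG(INFO) << "out[" << i << "] = " << out_data[i];
+}
+```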
diff --git a/debug_tools.md b/debug_tools.md
new file mode 100644
index 0000000000..b904fdcd71
--- /dev/null
+++ b/debug_tools.md
@@ -0,0 +1,77 @@
+# Debug tools
+
+The **Lite Model Debug Tool** is the basic tool for checking diffs between the runtime tensors (both variables and weights) of the Paddle-Lite framework and the Paddle-Fluid framework.
+
+## Workflow
+
+1. Run `/bin/bash check_model.sh --model_dir=<model_dir> --build_root_dir=<build_root_dir> debug_cpp_stage` to collect the model's runtime topology, variables, and weights under the Paddle-Lite framework. The topology is stored in a file named `topo_file.txt` by default; the variable and weight data in a file named `tensor_cpp.txt` by default.
+2. Run `/bin/bash check_model.sh --model_dir=<model_dir> --build_root_dir=<build_root_dir> debug_py_stage` to run inference with the Fluid framework and collect the same model's variable and weight data under Fluid (note: the Fluid model is executed through Fluid's Python API, so make sure that API is installed correctly before this step). The debug tool then automatically compares the Paddle-Lite output with the Paddle-Fluid output to check for runtime diffs. The Fluid-side data is stored in a file named `tensor_py.txt` by default, and the diff information in a file named `diff.txt` by default (by default only the first variable in topological order that shows a diff is reported).
+
+## Notes
+
+1. The reported values are the final values of each variable/weight **after one full prediction**, so if optimizations such as variable reuse or subgraph fusion are applied during prediction, the corresponding output may deviate.
+2. By default the debug tool compares using all-ones input.
+3. By default, to keep the results comparable with Paddle-Fluid, the debug tool disables all of Paddle-Lite's optimization strategies.
+4. The execution environment of Paddle-Lite depends on your compile options; for example, if you enabled LITE_WITH_ARM, the tool's `debug_cpp_stage` also has to run on an ARM platform.
+
+## Diff output
+
+If the debug tool detects a diff, `diff.txt` contains information of the following form:
+
+```c++
+>>>>>>>>>>>>>>>>>>DIFF VARIABLE: dropout_0.tmp_0<<<<<<<<<<<<<<<<<<<
+dropout (X:pool2d_7.tmp_0) (Mask:dropout_0.tmp_1 Out:dropout_0.tmp_0)
+--------------- Tensor File info ---------------
+pool2d_7.tmp_0 {1,1536,1,1} 0.749892 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0150336 0.621641 0.147099 0.636727 0.0 0.0 0.00410917 0.784708 0.0 0.0704846 0.233599 0.840123 0.239201 0.112878 0.0 0.155352 0.306906 0.0 0.0 0.860938 0.221037 0.787316 0.256585 ...
+dropout_0.tmp_0 {1,1536,1,1} 0.749892 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0150336 0.621641 0.147099 0.636727 0.0 0.0 0.00410917 0.784708 0.0 0.0704846 0.233599 0.840123 0.239201 0.112878 0.0 0.155352 0.306906 0.0 0.0 0.860938 0.221037 0.787316 0.256585 ...
+--------------- Fluid Tensor info ---------------
+pool2d_7.tmp_0 {1,1536,1,1} 0.7498912 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.015033395 0.6216395 0.14709876 0.63672537 0.0 0.0 0.0041093696 0.7847073 0.0 0.07048465 0.23359808 0.8401219 0.23919891 0.1128789 0.0 0.1553514 0.3069055 0.0 0.0 0.8609365 0.22103554 ...
+dropout_0.tmp_0 {1,1536,1,1} 0.599913 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.012026716 0.4973116 0.117679015 0.5093803 0.0 0.0 0.0032874958 0.62776583 0.0 0.056387722 0.18687847 0.67209756 0.19135913 0.090303116 0.0 0.12428112 0.2455244 0.0 0.0 0.68874925 ...
+```
+
+The second line names the op where the diff occurred together with its input/output variables. "Tensor File info" is the Paddle-Lite output, and "Fluid Tensor info" is the corresponding Paddle-Fluid output.
+`dropout_0.tmp_1` in the example has no tensor info because the tool detected that it is not used later in the prediction; it therefore cannot affect the result and is filtered out automatically to keep the output compact.
+
+## Other options
+
+| Option | Description |
+| --------------------------- | ------------------------------------------------------------ |
+| --input_file | Input file name; fields are separated by commas and values within a field by spaces, and only the first line of the file is used. If input_file is not given, all inputs are set to 1. Note: `debug_py_stage` does not support multi-field input yet. |
+| --cpp_topo_file | Stores the runtime topology; written by `debug_cpp_stage` and read by `debug_py_stage`. Defaults to `topo_file.txt`. |
+| --cpp_tensor_file | Stores the output of `debug_cpp_stage` in topological order. Defaults to `tensor_cpp.txt`. |
+| --tensor_names | If non-empty, only the variables/weights whose names appear in this comma-separated list are dumped. |
+| --tensor_output_length | Length of the dumped data. Defaults to dumping everything. |
+| --py_threshold | Threshold for deciding that a diff occurred. Defaults to `1e-5`. |
+| --py_tensor_file | Stores the output of `debug_py_stage` in topological order. Defaults to `tensor_py.txt`. |
+| --py_output_file | File that stores the diff information. Defaults to `diff.txt`. |
+| --py_only_output_first_diff | Whether to report only the first var/op in topological order that has a diff. Defaults to true. |
+
+See the code of the `check_model.sh` script for more details.
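+
+Putting the options together, a typical invocation that dumps only two named tensors might look like this (all paths here are placeholders):
+
+```bash
+# stage 1: run the model under Paddle-Lite and dump the named tensors
+/bin/bash check_model.sh \
+  --model_dir=/path/to/model \
+  --build_root_dir=/path/to/build.lite.android.armv8.gcc \
+  --tensor_names=pool2d_7.tmp_0,dropout_0.tmp_0 \
+  --tensor_output_length=32 \
+  debug_cpp_stage
+
+# stage 2: run the same model under Fluid and diff the results against stage 1
+/bin/bash check_model.sh \
+  --model_dir=/path/to/model \
+  --build_root_dir=/path/to/build.lite.android.armv8.gcc \
+  debug_py_stage
+```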
+
+## Basic Profiler
+
+The basic profiler collects per-kernel timing statistics on the CPU; pass `-DLITE_WITH_PROFILER=ON` at cmake time to enable it.
+
+After the model finishes executing, a profiler log like the following is printed automatically:
+
+```
+ kernel                       average    min    max    count
+ feed/def/1/4/2                     0      0      0        1
+ conv2d/def/4/1/1                1175   1175   1175        1
+ conv2d/def/4/1/1                1253   1253   1253        1
+ depthwise_conv2d/def/4/1/1       519    519    519        1
+ conv2d/def/4/1/1                 721    721    721        1
+ elementwise_add/def/4/1/1         18     18     18        1
+ conv2d/def/4/1/1                2174   2174   2174        1
+ depthwise_conv2d/def/4/1/1       380    380    380        1
+ conv2d/def/4/1/1                 773    773    773        1
+ elementwise_add/def/4/1/1          2      2      2        1
+ conv2d/def/4/1/1                1248   1248   1248        1
+ depthwise_conv2d/def/4/1/1       492    492    492        1
+ conv2d/def/4/1/1                1150   1150   1150        1
+ elementwise_add/def/4/1/1         33     33     33        1
+ elementwise_add/def/4/1/1          3      3      3        1
+ conv2d/def/4/1/1                1254   1254   1254        1
+ depthwise_conv2d/def/4/1/1       126    126    126        1
+```
+
diff --git a/for-developer.md b/for-developer.md
new file mode 100644
index 0000000000..8c01f6e1e0
--- /dev/null
+++ b/for-developer.md
@@ -0,0 +1,15 @@
+# Basics
+
+See the [Paddle developer documentation](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/advanced_usage/development/contribute_to_paddle/local_dev_guide.html).
+
+# Submitting a PR
+
+The commit message must contain `test=develop` to trigger CI (see the commit sketch at the end of this page).
+
+# Release checklist
+
+1. Go through every feature and confirm its status
+2. Go through all QA test results and confirm the release is reliable
+3. Confirm the release note has passed review
+4. Confirm the binaries to be released have been built
+
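+For example, a hypothetical commit and push might look like:
+
+```bash
+git checkout -b my-fix
+git commit -am "Fix conv2d output shape check. test=develop"  # the marker triggers CI
+git push origin my-fix  # then open the pull request on GitHub
+```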
diff --git a/fpga.md b/fpga.md
new file mode 100644
index 0000000000..fdb48a26bf
--- /dev/null
+++ b/fpga.md
@@ -0,0 +1,107 @@
+# Lite model prediction on FPGA
+
+Paddle Lite supports model prediction on ARM-based FPGA parts (ZU3/ZU5/ZU9) and provides an armv8 cross-compilation build.
+
+Running models on the FPGA requires the matching FPGA driver; currently only the Baidu Edgeboard development boards are supported.
+
+**Overview of FPGA support in Lite**
+
+Lite can use the FPGA as a backend for model inference, with the following properties:
+
+- Lite's FPGA kernels (except feed and fetch) take FP16 NHWC tensors as input and output, while all weights and biases stay FP32 NCHW; the feed input and fetch output are FP32 NCHW data, so the speedup comes without the user having to care about data formats
+
+- kernels the FPGA does not support yet fall back to the ARM side, giving mixed ARM+FPGA deployment
+
+- FPGA cost and power consumption are currently low, and Lite's FPGA model performance is far better than the ARM side, making it a good first choice for edge devices
+# Compile
+
+You need an FPGA board with fpgadrv.ko available (e.g. an Edgeboard) and the Lite source code.
+
+CMake build options:
+
+- set `LITE_WITH_FPGA=ON` and `LITE_WITH_ARM=ON`
+
+The other build options are the same as for an ARM build; see ["ARM compilation of Paddle Lite under Docker"](./source_compile).
+An example:
+```shell
+    cmake .. \
+        -DWITH_GPU=OFF \
+        -DWITH_MKL=OFF \
+        -DWITH_LITE=ON \
+        -DLITE_WITH_CUDA=OFF \
+        -DLITE_WITH_X86=OFF \
+        -DLITE_WITH_ARM=ON \
+        -DLITE_WITH_OPENMP=ON \
+        -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
+        -DWITH_TESTING=ON \
+        -DLITE_WITH_FPGA=ON \
+        -DARM_TARGET_OS=armlinux
+    make -j2
+```
+Lite also ships an FPGA build script at lite/tools/build_fpga.sh; running it from the Lite root performs the build.
+
+# Run the example
+
+- **Prepare the files**
+
+The steps below use the ResNet50 model as an example of running a model on an Edgeboard.
+
+```bash
+# connect to the board and start a session with screen [run on the host]
+screen /dev/cu.SLAB_USBtoUART 115200
+# find the board's IP and log in via ssh; assume the IP is 192.0.1.1 [run on the host]
+ssh root@192.0.1.1
+
+# on the board, create a workspace directory and copy the FPGA driver fpgadrv.ko into it [run on the board]
+mkdir workspace && scp $DRIVER_PATH/fpgadrv.ko workspace
+
+# copy the test binary built by Lite to the board's workspace directory [run on the host]
+scp $LITE_ROOT/build_fpga/lite/api/test_resnet50_fpga root@$EDGEBOARD_IP:workspace/
+# copy the ResNet50 model and parameters to the board's workspace directory [run on the host]
+scp -r $LITE_ROOT/build_fpga/lite/third_party/install/resnet50/ root@$EDGEBOARD_IP:workspace/
+
+# load the FPGA driver before running the model [run on the board]
+insmod fpgadrv.ko
+# make the test binary executable [run on the board]
+chmod +x test_resnet50_fpga
+```
+
+- **Run model prediction on the FPGA**
+
+```bash
+# all commands below run on the board
+# run the single test directly
+./test_resnet50_fpga --model_dir=resnet50
+# to measure performance, set the number of runs with repeats (e.g. 1000)
+# and use warmup runs (e.g. 10) to let the hardware reach a steady state first
+./test_resnet50_fpga --model_dir=resnet50 --repeats=1000 --warmup=10
+```
+
+# How to use it in code
+
+Using the FPGA in Lite is similar to ARM, with the following differences:
+
+- FPGA kernels run in FP16 precision with NHWC layout, so `valid_place` and `preferred_place` must be set accordingly
+- the FPGA needs no device initialization or run-mode setup
+
+Example code:
+```cpp
+lite::Predictor predictor;
+std::vector<Place> valid_places(
+    {Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
+     Place{TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNHWC)}});
+Place preferred_place = Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)};
+
+predictor.Build(model_dir, preferred_place, valid_places);
+
+auto* input_tensor = predictor.GetInput(0);
+input_tensor->Resize(DDim(std::vector<int64_t>({1, 3, 224, 224})));
+auto* data = input_tensor->mutable_data<float>();
+auto item_size = input_tensor->dims().production();
+// fill the input with all ones
+for (int i = 0; i < item_size; i++) {
+  data[i] = 1;
+}
+
+predictor.Run();
+auto* out = predictor.GetOutput(0);
+```
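+
+The snippet above uses the internal `lite::Predictor` class. If you prefer the stable `CxxConfig` API from the C++ demo, the same FPGA placement can presumably be expressed as follows (a sketch under that assumption):
+
+```cpp
+CxxConfig config;
+config.set_model_dir("path to your fpga model");
+// prefer FP16/NHWC kernels on the FPGA and fall back to the host elsewhere
+config.set_valid_places({Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
+                         Place{TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNHWC)}});
+config.set_preferred_place(Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)});
+auto predictor = CreatePaddlePredictor<CxxConfig>(config);
+```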
diff --git a/images/architecture.jpg b/images/architecture.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0e6caa88a932553e212cbd899515ef4f5366839a
GIT binary patch
[binary image data omitted]

diff --git a/images/benchmark_result.png b/images/benchmark_result.png
new file mode 100644
index 0000000000000000000000000000000000000000..d991fefc7ec3436381eeacc515aace2d688ff13f
GIT binary patch
[binary image data omitted]
zU9F5bXGG6Gbqp2)oYvkVttWJ`eE*Q@y-f&({lTPanMq6PyU8L1 z7U|+VsehjfeF37-)g3{i9|Wa?kxx8cI9LYT z`aH&P@n9Gq=|2dvRlL_qWOkKSs;ir$X_17(X#3QH6BnTc>a8I*U)K*Y!0D#|kk>T5 zPd{KiPj7bRU2pm}`4Pi!Hg=g5KsMAekVL}9sM0V3S0@Un=1mf+)s{@%*3P+U2Od`N z<{hjDvu1?0R=sBkXB1o)Dg<~AJ_^X^)y7M2%ev;pPOQ#j@bJ|bzc6gFEc95+wxlDM z$4$pGixmrXx;a!yq8nop?02uCx+8kJR#ME_FEgG85^OZ4y6x`#X4?98&Dvq$@yRA; zzL$rq&$py*PHW`Hj|`0TR_8>eGy4ymPFCnr9)D9`>xx?lp8@0)3f)MIndJ zdJVH=#q!yF7qO|6(T`fEmT-UZ$MZsr?r+es5?9i`8SVN zPEch+zxN(Fr*||>)10*Bd$}OaSerPEV(dTqZi%wzLSW+iK`fG+Bf)_~$vxZb3dT3C zMH89EE15PuIuPOYCho^=i4CK9=%ea95IoC3AGS{K5G7P8UqB;c##J09fRV&biQf)i zexDE-go2q4YH5^XQpy*6Te2c(G^{PRFwMu}Ld^9-Bj3Ex$Uh0PQ{xW6f$etA$E8}2 z>!Sh6gNM1aTDyHbHM(*-aJ)gAB&BFE3wA&2%VrLY>tco2k1XS&mO6AoEV@Gd*luG^S zN#(7`J88a_UjP_vpUfikj-6k=gy-(jf4mrI>o{0W4QfJ^7O9Qp&<>mUXa}%pVgy_& z%bVq+9zd6s$F1`Q(u5kUX)(2iESPy-N3K^JT)%c|xdhm@Z}e~$ihzA-m(#3>0RdIL z^y9Yt6c4dln_2pdfK2@*$h0zgVMZ(92?%?I1jh?Omz(aC!h zsz=b9O-$H(vkN`c6z$4yA5sTW;8?{S&OF$HVf#RrG3xze=<@=TP~VX0t0>yujz9=i zhJxQjui@+QV!E8JkuPl?Cr8T_>cia)anV5ymbhK-tenTW4DotbBzDQF6St;(Z7~>4 z;poh;q%-&?1`LSqo`*d!or9dor7l=ph|*_^zI{i_(W?WhB4 z+Cew94-pmbdf?UiHnW!Ety!sWUHx?qJLZnnGEUhz=-0P-pxMsK1h(ID+l1tUTyc)@ zz+>j)ua`cFqdcI*?#72+%u(AE3t5oVZcH-zE(cL+mH~(G@rA(s?gpapOBHk$=wmth zX!KDlr&e-kpm_>!r(@F$NWfxB>32!(Q-S`rgYd`Rz3(WOxo8o6BI|eJy^2+zz=0VD~M#(;IK$ENvCyh~9p@PMSXI z`V*SU>fPb-6xG)2%~?IaiPcw8w~Q}5m58`@e+9%2PNrq$))?`9kR)^4UD_0#Xule9hi%3}LlYRlLiZ zG5JlE^NHQ&Jkk>Pby@Ja@`Jae(P1mK)Ku>v#zH2#`)uEnuFt?lU;N=W`dS|IbQ7_w zWB-9SS_eX~t(Q{2TZw;qz`o@68cAu&79GIW$R9VEaJ9Z5kiQXFRdaAhZltyn4QL%N z$F@6e!-RdT(0kgdm~!i4&4&7DPjYUehN#|PO%U=@>aJvIrMI#q9o7vLilhStw4Ert z53qx1UZSw}5SdMOq8+~AL^9$(;wMY<1@;LLR4tA z+v%?)>H{a3v5M=l*5^mZq`F*xcc8_-dM2bGX!mCFdSqp!T#++CW9}9G$8t!bEXzZG zOVhnoDwBENeV9?Qr}k+LOkGCl0gBWnV+0pk$3fhYd4oPU$8c3kn{WV&Oz~&~OYL$< zYpX$B`R5}rvoko{`be@sL@vI+A%!_<&y6$F(G#s7YP>$2_g2=j$d{0Q>Y(>2*kptA z-1Dd{N-?lfHbGZw+NI0e)0@UyxFZi}rJMCHT_$7^PL$5{ec}pKsR|`L5uct0k^N4W zEU3|+GU=(!pB`Al;q|Z}X>at$jdSWd8+73GGFCjz8ZwVMiH`aBl78)l zg1#h=yv-M!U6XJ&HXQuvrmpky6GP% zNhKYFXhEVziV@HLH2bwjO8P?u?t&$Omz?L&NfQDTzqJ}o@KQPLKpq8z9A=c59v$wm zL(La1k7iP|i6}+9R&ilLJa)4hoyj{e(E8|(&PsyDPMVfz?iO|-0t4_`uY?hJd6m2) zUc>Ts;dAJ%04dOh5McXUSxy0{@N%bYg1qy9jM{n?~8czfjG6+?0N>0 zYSy!RxTYRCB+;w<857pg5&k0_zu#pYhHukx0>gabYJy?D`qer77EHLS0@BEFAinLV zLJ1t7xdTpC0!DHTA0k9v!=`qOj78ss+!3}}OAp);27XTtmya_|)$>mald82_wtM

8i+YQe9-dEt~84^MB>30nN@ySCuj*E8<)U`<)J3 z^RgRiCwNrhaSprH%K4sjlsPG+Q{@sv%%Fh~tb>>IvFG!ef!%J8NCP3Ke#a!IXz)FP zm%lMG(%cv{khY1vPv880d=nbfo3VvLB&ifWkT1lbtBEv9b_3)5I`4Afde+eBvq*UW zk6k72bVn!2&CR<<(|-0G5Icy8f53&J53~4fJ~qKA8-lJ@pasc|lCZuuDlEZkb^}Mt zDFu#;`+@G{U5WZ*u9OF_>(&oqh`Mnj8;U$F!8i^%#Axo|Z&x0(fjm!3)%M}dN>N!G zvGT$=@X^BI+42p7E}Ju#SYFa^(ELUTdm^=o9U(b>atnUV;(#pWaoplAS*%`S#Tx|3 zB}YF_qdb{|?xOiRt7(;QK6BQHauP3#^w1G2b{c|*uf$&ZIppr!I#LK21UGEvx@|rB zjuQUH&mlU{`gs1#107(fUh3 zt;Y9>Xzr;vx3_Ra?-2{~GRk9kl+cxdkVNBbllcXmK*Yf~R(U8J}5G=4#UAdY4X>jUrn83(e)Do~>8Z*H$)f z4wR1U_kws5d^N;6olOVWFcS$_>W(ydHXc&X0pcgV-J*7+F~{N`P%NIGf7@3g`PuDW zL7wlE@l9?*jAS-zLA>8mg&KLE*5ZIqv4DuHD~UbaN((}v?EKO?{Tnae;vDS~nuOT{ z?%)x1>Xp2Um#z;A1|BU1=sFMW-E0+8jj@cj<{iCo3Y^Yr5wMTUsKO)P)p7?T*NV7xvO97hiC9Q??_M3X6)C(~dm%C%yS>965vJFNhr&{JbbQB4WU9bA)Em zS>!i{%MeV+<-1K4y62Abou|*8Xe2kQ8RP~~uiYSVr-~%A0Ky1n6~*!aVTPmL7xR zp85M-=xsGL55=!$_EHY2a090UAfypuT!Bz&gmg?mPO>G#eNdhgs8n-wcQM}dom(|I zw&D`~8Vx4mvJ2m`tHh+l8Hn_}=)@j8TH)qngXJxKj;D^UoRBv1zx=d-Ioya1xiWC2 zvOq`+VoS|K6ZO*SS95^tTCf@`Z#^^cPBp4er1A%=AIAW~=PGFi%h3&N=h5x-VN-2v z0xv%Z){}XJ%e?$KsU)=}E>Zy4RmXVY?E!OES62vlyPJwL{JRP=@xXC1{vJBIsRE}R z1*J+W)<6#T+@#HkY$6_L#zkJ{Dhn_y;9IbOdY`PjO+h+NK*l`c{*3%4o}fone90E5 z%8Sa@gMi8O8#7$Jy@%s~3YzmZk)FQcTp}@GFwgo=eu8UV=(F#yM@zXWOiAx;XbqnjNAQVoz9gzFVJC|jETX!e ze;1A#ol*?VJLVU376NvgBD@&FBuK-k`t*Ezd+k}pWZ0my|C9h@NNadp6U_YTbZv2!aExJeg)wCPF!=ApT^_5EWQzpwMMWEfZ zRQ8tVzH1&(88`*jn%2ovS$KrAj#VU8RvVB1qtOr!iXiNus#WAiZI^6 zt_=R0prIbgePm6#q#RWD&-Ca%RuptLwuK#GjKR}oDfz}FqmR!y&Qk{S!o zrf;`qZJLo~GRYNK|3}32j(OF0|CH*DzTgSN)1T8rE2*>KK*&T)bjSzP_DN3WEn<2Q z^gKIq6>JxEU_IJ{ahlV)$GU!~CIzE+Cx7^*h-NSrC~z$g){K-iGLM^~0^tEvUk$rwj^gL7`|FltKkYe$E$=!%xJ$==mDQ=Y&_rh3$ooN% z$+`rcbBAw-Q-I`|eS^6z(MMFtLGKQ2sc$&0%(R_6bXsB-3YUQ8(a(XPixx(a2gl3F z=4!2Ns?wDEDZaxzbmIFGjBvsSz?)M31(uij95%CHecX%9NtXXD>+Gx6;Usf*6`jcl z#K384tq-6TyN!1xV1_Iu#ypLZNY7$}Aq#rIz~|;GPe;1;3f~XSYwqy1?+`w(D|rZ( ze64$*+aO{Fv{EXr%+*{A{X0>pJ~>+IOT2tpWz(E=nhWiCS# zWt-73Ffe6(o7>oIszYR;qepgA)|P*Cy@p`_!4yOq92p1Phnikm>Tg}1I7EN>3BU>m_>Y-bZ51zFcQr-}+R?c&Q$5g@iR(? z=To*20PPn=1SiusI7N*{e)|SO5}{ z1bIL~b(U5@j-s)`vz95-`IW_d<29;*ZND+E=I={9oY$AtTf21va}!puiqJ4McGz^c zykD$wKWLYJWZ8Uobwez`R2jr{lDLKi5>3^r?ls4ojvf%w`=nu_1h#Go)|=8cG@^@! 
zB~GFKB?g5-fR2GILlqkMd{V&l+Ie>Bb;oFUmSKWi6+G9aeqLmt$?TlC@=#~B-LZhd z%5k0iixs*Z<@GrM3fEi=lDfMXuCZcMwZ-pDb38P3gl~leYoAYKDU!B5U`02L=cooI z1=~W)G{p0|#^EM^*?7cqJaT$Ppf2Ocjl%K@`YC;-lfI8g)h#)pZFD;3B|k-URzz-_ zEj1y1~+BH#C=8x%4H z1l{AccZTc|6{OW#=q?+4g7iH`UJ1wVFbizD>$b$Do3Q#o9m>I{5=ZNJCq%6cMQX5((&U{w-1;!GvO_wR{NU}MyoFKmbVESBjA*`k%+hs1XAL_ zLPH`Ok|@K6x#M-N`2JfOw=_8BKHloBWXFTcu*Jc36*jmhL#dLM>ruwqfxRqzg`vuu zpuuf24J8bsDv6J9t*uhx)8Cm5EIfhG^JG3bE+DepJI+mJWLP0K3waktiKr8OK0wcC zc;cJXmHpDE!yR(a%*>Bmo<`*TN;tfg6Pb6ALl@0Q+JP12snZ-t!AKM|HkIdpXLVM$ z_3ED4VjHn6lbw0Z?eevmu6TTZf5sh4V9yU`!v<_fqDKK950|)K_#>nGV=G>l=xBy= z>^*JCaEEQfC)&U+if+Luf_W9cfI9+{ z8&JHlo89Pm{)M#xO1r%njU@IPugy-j-Is3nd5_%}+lxNHkZXIw|FuW(rx1THZaDFs z_c|a;p%JD53DEMGjzYT>*-5`ywU-mW5^W&=axyR7-h^al9WqS+nY%a zPWyvwvqy`hHvUv~mMYr1(1q*$W#8IfeWN2%nHY@a7`jycmrX+@q}s};eArKacaM@vo8(q8wp#c#3bm16qiXoZ}P78M=*94Z6-5wK{4$vGNSfeiYc zhV8Vhg@#4KrJ81+2+N3%1#%*YIq?a;q1Dvr%8H1ux8>+3 zl$4Y-8>6-stTaLQEuy`(7lJMGp8t=tw+gFk>$-FkAi>>&y9W>M4nc!UaA)D}?hpvU z-GaLZ*Wm8%t_vsVnPl&;>ietCxjA?6AY-mEZH(UD-rCvhlFQlT4rmT2{kJ-jkxjX$ z>@9_r)|#ISLl)~4k*e(6dNHf6CtchPpmTeX!~^1_GzY6qatOriS_T-6Pxr3T_sN;Z zJFOw^RKQiwMe-zqg8wytHjM1O(s_>9Ox>t&h?xUcLy`#Ji`JLue zwqM84PM)_$4FbT@(P|7ai`**)XtEu+tUk79`keVNLD##xn9HYYE{@eFCHh=k(Ui?U z(h=~wcP-N&{qtFQpt5#O7mDb|L^+Tb8%CNnit4*{id3tFxp3JkIVdP7xo6M6V^TXP z&O#B8b7tW8`ib*)d|=CJIKmn8_7MkZylrq;Bk8MtIUg;#;srg6#K`B5<~Q>$=~Gqp znhHjb0v=R`YkADR$u*AgHd?C70L63`^+iUyCuKdL^!8{`=z3#*;nMM{C%N)LiJ=Ig zmB%}dTOOuOz!UH_gT6VIu}U#&g&L~~-ckg@632c?xwzqS7s;4g%=~(*QkGPT`?OSJ z`pK!2r70!hbPaX^?KSX$IFnc*^nuyppGE)SL~dU+>>9t>GAoOX65zB@1mQEqbOc+Y z4utI_LBmu;6UYJ0WlBz8f7!>gpYeAr-ltmPD*ZT|D+ihc>&fG~EjvE7s5L&Thf&_m zv9f9?`Gm1puc-Fmr9AWmLb>~y+8oUlXEVG!oV8Dt8kg+jC)S|Lg3p$lB?=f96DRh{ zmYQOZP4W%D$LquJ>)sRzJx&bd%r<)J)|SA|OrsH1x_;?e`j7{8Y(@1B}n za?pN9Ow{v1(>#fb3yPfE1E_Y5`<4^TY@{~WOkoQ=8E7e^dtQNNH{$tjM#No9k!F?{hSWF#8QTCg~%O&XJJT`ip*zV)dXA0rD|@C?kQ8(#1+xGaT8t9 zCiA=63g_j#yEMx=p6h+aSINJuUNmYFrvb`Lv8Grnc6%C^JXdzGN0<~Dw9*J|{ET_h zwPW?v+>Rc^pMmI_{fRBA4V}TMi!*s#S>qbn@aN|z1f*&(b(S|HOxr03?JaKR=!vFc z#f=qA4du~-%kfN%h%ymCWi5Z6Blrg)$+egj@c{2jS#&8mBD&SSgx2aP4E@IoprBDM zqg9YKy%}P1*aHdpTEW;;bw3=*mH9#W%TOE1|@O{+b#nlJPN!m+5Y06jANKFA& zgrgbq6f(J7VFaD>44I;J#cDf=%EozYzZ~h)3H-~)>#p$RX6|khrDU>bk3$WqY~NQ; zT`b14-MVFWR>XbvD^`_LRm#{IqAaEn4;uSDB$V#IZU!kq)6dp>3749P!KfN(p;{ut zhLaf{IfIzt+}aVPC?TK5o=V`z_v6M_hQsRXTE(jK2(Hp^dyT1IL+sYqJoy}YSjNxq z!8}D>QHtLkmS40Q&h~TarOj*(XN$|D^3H9De-y2&>EH^p(W)sr4S#K#w7Z|Je_O5e z>e8;4Q0hE#sQfLRO@H(4$eES1rhFey{0KXPPSfD_=;{E!>KG3M_4LM>GMjfI#*WQQ zP|4hWS03FVL2}QTddn6u_T3Wd@D~<=mBsLZ9dlrXS1b)JpOH^d!XZB0imZ!QkY*vY z{IYjs9_5szd6|b-Kki4?c(nD&!j{NFxjDh&tTt$f`r1AVvI-kYK=qE?E-t4*hcFvL z&6C3g6;q#TZ%(hc8x7=7MCTovoC72YR|Yt~qwRiK9CE|Fvos~5Xjhihe3kkpNPj%m@7k+Up zrIHWTFk|FB2z?DIrOL2m5+dNB`T_!Y?a}gR6^0j+CjQkJSe0 z06{X;v*{KxX{df?T|fGRI7LAm;doYxZKvS(5avKbJ)MB{?cwt1p87*~3x!1`y0rlF z)}J$*)*QL+TfCY#a*FLhyBAMk1$WEKos?$1bB)pj{=&;*5_XY=eZs{)ymf=NXu27! 
zcPZE}?x@>QQ5;QA#k)U(}(%SxgRX`yE|zxb=a~RE zp^q~u!&NB)rDC-Wt@EB3B~L*K_`M<`$UotXhl+1H2rqa+7}j>7RrWH=Ahi-<3fE8Y z3-xEjWoz})WvMSR{>VHMW+k5!X~l|U6gJG1qX?vcUpvG@DO+snwnM?N@&GIX;)qn0 zQ0iquba}iPK=1o`qpstev>-Pp+Ij3S8d?!nwor^RpE*;(-+2Z+cu&G5kzCH!3dZtB zVKryVk<3t@S&V<&BW^U@y|g_I%#`+(6E0A}V2DDkrgw}1$O!oO>sImt!YE^m+g=ph znwnqTzH79Qt!F1-R?gxT2^-M^*{x=QvUhJq(>|6x$ysSF8_cbpFUP z;dYjHlG_8Z1&-f293kL_xc9NW zH6yQIs@X1}Ew!eQ@mAvrlx7)VHV_2M@@ts>*~^>RP{()e4Q?^)bE9;k zIf2Bolt~HXd~3JBk$5lt+(L%>#?)g_@RdPD`C%#$Oe?p(oT>%*>)Y2pqIhn}-cXD` zzFBk!V+EGW86iQQDyEyxbWY^c%WNzp2zu||b;F+v0`3E#hO-L_OJUA3hO!#Tb1`oX zxy*K%PE!k=-JhlTQ+o{eo!l8jmQQH!gjPDBW`3G4CLmDx0axu1TKi>{KANZH*6x^Q z_2d`=wG8H}es@K+U{!aAv6H7Tx&DxESCr z6E^+69rU0&ovRH}nURlBs|+5|q0csVSnv($&Uz`a7Uw`xK=zWP2L) zo{O&?!$+j+h<`FKLi~;cDu9nwA`f4lQZ!<;uIhxfQ`Qy~#$fZ(*u{EW)@#fiiVTl- zr>l&Whz2H|zDl*rMa)|Mj7+^@-pcL_$;Zvdc)9b==cm^zlA=M%8YS}9!B?Q18|DIRm*BO$}Cg_c7IWcqfKPMwm&@b$UZTYVaX2DEoD!fs?y?MR2^=hj>s> zxl-#vWVLIAiekq9a(`pQ))eFxiKB&;YkczP2M%P#Vo=*te@SMytKo8-!KokM_zg74 z5H)Cwdd6@9bjXD}-(NaJm*_gjq8CWh+sCHdafZ-RHS1r016MXREcVQ)3Hyvd_?XrJ zdt$u#$mOF~Hu1j2BDxMa=9La<;~*KjGFz)ckAmgBw@rJ9TCh89e=ugw3&IIUOu>;^ zQl-_^VdIssUhoadl`jH&&JD#Wq0dgY)Km^!twX7kHdyE*>FuCRns;7azSx*V>? zYQEVwocMT2+WE2`&uqE(Rda#;Y_If$8x6^11GPp6q%>i}wpAT&`js;h4Um1lS<>%krmVh*NI?g~j)( zFqK&2Utu0(&T%t0O7}sU?d452Q(TP+s&TtmozV$I5OgDmV4jEl)PwrZ66jOvrP%8J zFF2sim`xmme}R#ggX{CdJUuj@G($@Lq_L^$KDoud#ER}HjjbA@UXkOgn)%HuLo`p$ z2k$|~*gjq_b4)YzaV*|pfEkICWfOj7MsnX*)ySuRG$rEcvZ^P@CR}3k5K7K?{6++& zGz6F~L~2!dRmwzGT>O*QSm69n?+{Uk=&qca<}@lBz;lK%+E{YV1(xx zDC0-eOQfy^9ZSxkrEtmXu!2v#{SNO-ja4-R%r-XE@OUIms=Ag7wSv~(@5|@=Bz@(e zbW`T7u;j1JDA7z(ej1rk`M@Jk*AEiEL!SxFZ5U}l2lVm04azM@5z>e!TNxIZ{J@$_ zf3UxqkkWFi2%YY}SQ4N20368?hni39cmutRV+`yvEsQt#`6u(CJ0}pg^-xkYYmi}+ z>iN6$@qR=`b1=n)iW) z5^qu$kv6M`B_IsAL0(Ub0PEzqTFu^`WUryu+SO#j`j+vG?+a;sJYIbf_II^w(M6;q zxBQb$03si#H$hfGjWq&tZ(^eG1x|QWKv1<)A^c#L*c*45RT2~J`(HYn#+U7DUb0q; zveGGY;yoD}mgO7d_t20mun)p?B(*$@4KQu>T)(r72bxCO@;9~&G4DS3u8R!Eu+{^4 zBbJnJNAkf_1W(aEs;>=*$ioVsbzGY>z!&6skh-Y)&2n~a^lq#q)*&di$ z>nai+eDzE7dxJqwECTx&a^SQZ_j#~-_7^t5n9M#krPiNuWzCNG=-b9e+lu)=yFIuyjJ8s~W8FSr8#C?Kpe8Q@hfxYfNafemj@)^q%B{UUL2NJhX1* zZ(i1ot8!>LXUbAe?7r{3U-o2$c%N)c8DR>C=sdx^t$f};KhM5nU5&RKH24f4H?dR6 z`1H!&q5Mgo^=uO{mmBojh)mjBH5(}Iw3ix89@alDWlJ(#=`L(fmAI3d2dr>WcJ{@E zA!(}zpI{2Chv4{2a`(w+DjnV3vM&X7N=;jcpJTmH$HxAMYhCj3irE?7;u)yU(S7Ai zF8j8yX5lCBb@ZS~`vrXN7uZ@z2mj$rD$Zn7NBPY}IIEMOAwyg(R3fCnE(F|zsHXKt zBHq%hDSm8UlM_$Qo2hL?MZo7j9=A%ilpUi)cp+<(O#Nk;Yo?n<{FCZzgq3N=|ArsFr|t4NB;nNA;$9^K$%q$mshsJC^jY8@G_y7u|d#F&DebJm%a1 z3=IG1=B)NQbphK)40`Qm*pj#Y3n2}=)|412-r4Ko#55<7a4N12UDpKB6J!wYG|xIXP@ZHdz=Q7JQdeF?Yn zMPURVOw~a;SxzYHM=ZDvo235Gf)daK&VjF5+X$xxI_;hK*Bz!gn1gSngSN#^EKe-V zp(ZN{>mpO**>16+M`G$UktnUjRGY!-a)8iauwet~4gu}M<~udK=5zpM!!z?u|WuDSG&nGcuJectkIg$_LFQ~r_5v6?^~K%o5L3_%x|jlb}_ zc140~*@gNkp) z(&_EaRHoULFJ^1d09^QwW`D9zp%(>8owqrg zE$ShOP`XRI0fHofCQN?mggLk!;%7cd>!x04)uT+XZ7Ue@8fWXP+FnYgYowbpFkuz; z0qY{E;DWi{l6)R9WR?d^%N6oH;BCveWM~Dd1#;ZAm~3X;xxEidn*#qgCnADM4wpR4 z=;ryINj<5PQrDV2IeVJU7;q#F70h+7GoSWdEj|JV2vxp=`aIsFV`#vA4o!A9!F#sr zaf|?sn>6wkIt}oLKn2#m7Z6s?e$31Zp+S`K_BY28qao*;ALCQV`uiLYbES+8 zD5e>Gl(?S|aGnw$F5ZX^j#F zKi3C(+*!4@kD0b;ZuvBaL+{4YoX2s6I6WOv-{FoWJ#BeicN zw2ocSjo`gw2{K7GL>nARI3~C94^TIk$G&bKPhlV9(x-g75+km@Cwk8_=Kk>mf6BVn z7_9M<#TV2;E^A{0F>u*j5Uer*(?Ok8r3|maAJmS)52e&*%G(iEV2L0iucj0egOqg>^En8`s+8pE3j1L zq>UDa_{o1F$@vRBKzp%@h3xf|4*PgNx2_#WYB0<9n+B+gurmI9;wU6PnUFq_OL{EwQW=;_j}EBHvLFQNIYI0%f%#E^fDHh^H3IZ%26> z;3@v4=kB10?t(LkvBp3N8{cUK~Sko_X2AefLR1sCK1DMloW1;Cl?3oKV8;` zcyG|-=ePzGxNd%hK(n#nYZkbc0rAz=qiOba;`bACHCpaM;4f!*Y4$iV5bMN|cSF`2 
z8J{a1sQKOdWuTY1jEZLTrA`*ZY;}`#Y8Q>&#lWq=Zsdf3Q^{aW?qu4!8JsqI;k-l) zLv-;GxzgW?9>yE6JDVva#PuRL894nabNg*8a&WQAf@|sM3~@qdt(+gI9Kb-l2)_}9 zw(HygUB8o%bOS7g+mTEvoHpk-S^<{iTh-A22TNkW^U9K#fBz3lLV9VNemH|+e(!uL z{J9${{TxE~eS@>c+Q1AMuz%UoD=`OR*h*ff zV$H02I5~LkoIcg|PS}|9+n3QDOQ}G1tH zLDK{u8zZ0Z9Mf21(slOPamJpmo3Xgs84^!aIkD?mz@QBxj>jx|@f z9W8z%9N8&!vK%DchK!chJD5l(Ez1?m$1fR**&MocoATdui3W&22wLE-(jUrlQ~f$| z0d|NSqJh(^RcTpYI}<4k_7XSlcehWH?!GFoqgyL}uTtB{2LTxVJ7~`2l+>>+X;@6( z?Rg~rTFUS_SR_#tvq6R0bza*Fkcw_5sNNR=Up%;p6 z-A@-f)5lL>Y^0OC`?k)p4iypl1-X-lD@aym^uh%?98GV!RRpxN7Pxhn;mr#2qxTal zoS?;zv3#jgatv52yFFc@f?_Tl6m25t%MPhP-?n$>9xG&Y@ah9fpD$w%HseY+kX&68 zv7@?8#lrTaU-m9}VWhT`bFmL+auO_06@MutT;?mqQHVai4U>TR;#t75P_WG)5lH(R z5{nwLnQ6`!2M9pOA>wq`(!6)7vwGTNY|lJSL{(gqTx0(@~*Xp4V6 zaAN=bppRBZoHmyh$L1JMB*Vi@Mm0~+u;lVDb8;H=%AA<9|7A{&{xT=t0awnu$a);l z0kfas4Oad?n3Ko2sl;Yk3AGZ%FdgHQ2pZcTeBI7*o!gks#bsX_6)Q>K_6d^^zoi1i zo2)mazBtwY!Y6I30mA|02QI9`th3f@VnA4;qOq`B+v|CJi|{OTAxqy{Vb|gh{$@VM zYNpp9XW=45cDMiOxEcC(KPP8d4{yyVkTh5~7VEoD;P1Bnzy-PCuMp0CRcF-Dz3NRS zu_;zgw7Itu(%^GMB(X{Lg_|jVjM*FLQTBuH8P?aneR?K_FZ!lS9u?80xnik)r>c?7xSOcuIXz5a=0}+A#32*J0vp)_E^+BoEq%kuJ_bBTDXkHBkcTaDV zjo*1L+|)n%D~6=M>Niu*tKskk2?QqnvZ}p~i7b;}yIdRP*wlb?3+F~0^hBTF+fg3Z z~Ku<7wEpNM>Q9Pab9M|q?w`B2v(w^WYRBZNz_&O0@0rWFHrc@ zCpWopS>)}_4Ku=#)@?+J1}8(`md3?6CqsORVl;^Xq*J~cod~WvAJqQCVF>b)j{nr> z!<}ddvEWVv+t|+=1S-oJ5HZcGbVDPw>B2}`Vz2QLK76!#Ef#`b4L$fx>|f*K{z4qz z>0p#AG?N`l9lyY0oZ$MVeUzTH${tdySRhF70#-^209fI#D`;KosPwPN$qTWs89!ih z5>5uN=^GZfEN_Sjyo) z^w7Tp{ogbG?;&g=04^$a&fPy7fzSG1Blrt40fR$}Q1w?$!seMxXRQ|ZWTTs!-TILU zecn6Gwl^M9xa>vbyd)`L+1sWLf+I`jn%Ian?fz^`XwCN@a^>W)T%>P*t`*uUdSG-} zB8!@8lt!D|FC2?TI5|ylud_R(agEyC@9)4QqzZ)!B(wuk9iQVr3XSpI;;*zhB%ZGL zplmMuX-`^LdUI;``=-^0qxlc8w{=jOB|xFpqKsM12m9yi_9xv3fN7SZT<(GTdTJc%kLv@hE-8iwxHlbCt?*uxAYF;-V<6 zwvk_F$3!ZV6&Tg}d`kp{WYW?_rQRKZNiU2pfsnr`7Ex@l$Do@H$XEFC=Q+Aj@z`L< zoX%)4Gzv0QyNrRkJJo~Ob*Qdoh+JC!FptarLQN^N_Tn|3NXREO z8*9H_hkjorvc9w~ezFjp9d@tm<$ZXx4JeJEwA5Pm6YJy_ zMqeSR!1h2_;@7fX4&jMr3E^|CE0?xpE7}*E*5ZP%ia72rkJK~oN$#E-NE=|EJ2KLX zWMfMcFWhE-ef``lcYwojz#wlH$VR`?j$ZvYxI~bF zGz<3ldRS(5S~n+Bs{wN&XqTUv#7}2^Vc@>bq7du(`QhvnXn~fyZ>QA%Hap`>9U*H& z$@xbid&M7%b*?KWuSAyH{FIYy;6H@P`}&2J4xE`tf&jlGAAQtWO<4Gl*ogb9NTT|e=L?BCgp$~Wpw?mum^RjX zfV%-!p(uJ*EoUpLUl8`5{7<1lsOdFo){P9-)24kB@l+R7wA_HtIKq zTFv~FGDt~Ttor!$fP*i({$-xtzy6-htVH96NI+_6)*7if%2uvliWDD~KYgk0r_RaJ zco3bMIp2dtE4O{)r&D- zlxwtxb36YzWOGKjm{aJe%Cie-^4=r$mqlyu(c@^fI6`VSoR@63i&aRZ(U~!vEt+d3 zzKj1sr!X@(pj_n1MXSsHMYc0vdn~I4n-SV6h2_EN+mNpQbW*hDeu;&N$u#Spmqp4y!rq8Kcidu$4=%MK6K zHSZCd4#8SAd|V!Tbc~4ExbbvFWy#WNi*h#5Jh}juc^>-4q`_jYRBnb=RUxJyC3%OP zC$#|ZMbEbc8@pf4uW|vbh`HM={#=p9H$6(lxRc+E;O56yR%9ii=S;jMm6&NH&hOVt zh4u70Kd?QSb#YnjGLCcS8fh1V(Gt0L%uV=)Tlgbp7PZURQ_TLydPsykn#7^U(GKch z=|OX2L?I0HxAA~XSCYGRDz>C%Atb=oMwWA3w0n^a<~f3DOeYK7bhN}!7H_7&{c*Wd zV$1T-c+mq~c#*CIZS^eH&1zK)4^ji9#}8i7dgxeu;SC~)`SePAj4F&WM|MCD$Oy-u zx-TM-InnDaxu&BNlWeIN6V4@lG`R9l+-&B@S#5_`{{Sq{a#1FLdS$yeDCj)$N^iI`PJZT(LtG+$acpZ9eFEZO zt;{%Vs0(fgGU%%uaVL<{hGnWj^NmS$Q6D4SENUFIYS?cFIOEaE)fLNhhI21pG<{$n zQA{&&r>pVsL$_WR?~)W)sw+l$GCSxf?0nbGWRCc=0ouYC2uL1FwA>AN16viB26URQ zLrOouF4R1xC?(b4oeP7t*v&;n+?~HPpx}4nIvT7FIvqzRNVf9aOT3@E8(!E7X@3V~uj|X>8daooe@T#n_eS+=m4$-4Oh| zUUDw7-xWaLGdsmTjI8HNlKn6kAgzQlEBitiojEj#k6@BNnrQTR;rC*9k{hp{1sUR{gtp5h|IC0#D12UP5NJ%fo}VFmfbw${ zUI%K!kyXKH?SDYWnLj$+NKn2Z=)ckT`F4v8)rVcWDo9ns=4C?E>M18n3PGWKIP|{*g6GV4p_>zU<3zvBM_Ti?nGAl zf9))Ml>K%0$tRnnb-twJ{=L`l01D1qpuA$Z!pJc^vIjSIkcG0dYhXQy+n%Tj#6y^MjX#HPubHsib46d2F{0IE08(S! 
z(|pD!dnUha2~fb93SSHw@b|)#KcnUeZhLY2(!s}S z*Xst{S7}FL9tzPkF^lUD5}cZyQ|C9De?FI<_^@g%wQqIkYt~SV8x2`Tuei%nA^Q{q_Bx3g+txSd&?p|E52pE>Z<(D)cIoJ7P@E+vwhtDWnZHy*P zj!1@3n!*hWh1}>L`~td`7z4mBHnS;Ip?r?Vy_MTd5WSKs6R_PBSiGLAD@ViXnSSVW z9z)b5DR}gBz${ITm)4JRI;Y@V{tlb+cdFb1F%qDJmv z++>{vVzt1(huuEY zyjCg!<4t@~$?W4b9dX$47oQ*$U4S(q%rXZO32W6@vFUH=Og1ASfr+kI+?wND9dDN! z_qW|#@1DJBh1Gza0|LM{k`#U-_cfj+5$ruT>tgZTyQ?qdHDuFt)mr6U?97yBV(}nT zUU5}pK*7O5@Nw5?ms?d2uK`{gYGXw>Z%ww!y=sm3R^Nn{)DWB-$$k3yyPWS{{nJp3 z9C+h_DGJjP#S?q@*Uew_0XH3kw9~dI>1#%QBu;s_DEb8DJlk;D3fZ~RloP57!#%ip~ZYh{~n z|5W~tWWS(TU;5>~H0qlMcb`g#IOb>pDtv$^@0`Om_r01q{=?fEk3*}|W>2X<{L3k# zZQGom$9%4u=Gt`0E;PL{S_ANw6NHjLG%zjXuA6?_{Pu_2dtttM(2YL}-yb~WWf}VR zFo#dKLnSv!C%sMHIDyh&yHil^LPnh7w15$MO*B_EVm^YbQ0S0Aec!63Xc$`CU?cqt{+}aaRxQTP7KSc7axYo#fe+9$noz_>7{Xa7S~XFjZ~Q z;S)1Bh`59?a8aBgxwPS!@yAz~uEl@E@wmE;D?R#&QXe5-BbYUAbhSxuM z5vo5?=&yS1AgTv6Ir5x$M9d|`?x?TM7=DIJJzOad08TPasgy%9Hb zO?CD$h-}w}^Q0Fsr8gyoO5O*)0FH(ZW*Z&~>zV3t#FvcpN7VEBipM<2|nuPEg(*)#qVwS{E{zQ@W%D`gL&MnwIR6a zoHIW!M$~T|VuA-tIF|=yLSoQnR<5Ccs^uaX|DYx7dF^C}!0*Z}4~y>uB&-pet>tt= zk995q-iq7$OsU!f_=9n_Fl}>BEK8`PKv4~Vr)6%;T?^$ZIv@*QyCRN&6!)KGeew^* zPXJAH{Gga}jBnMtp4DlIZak{+s;{pRvW4_gD!>VpRN>qf^OIS zH|3%HVjrD^exnqsV9_J3lQk(R&~7d}7r)b}*~*S%PrZf^*zuqk93H(Q2uKXi63Y#x z8t-J&$Xa!E>iZT0fD-?3?DO3rC+5x;3MdwuWrH zA=KI_{|4dJ(QHi0bV)8!rhdr4g0$)aX5C5%N?5lW7^;e_zMg7ld^86?<$`{fjX3T5 zL4>06-K;GG5iP%0lXF%SeM)iVW~I(+NCYsGZW$|0*iY3Zb4_F9cT6j@ay9D_L^8O! z;bcEX&!_f03))7Wp<$l&O$m?Qja_L|cVTMzBg=|}?8=gs{)mHpBc1|Gpq+?+S}4&v zQSgQ9^nN&&i5iG)m2*oh&zqXoO;Be%`Gu%8 zr8b|?@Bst@7#%3swC~tnNo!xrtucx3V?}4HTAl` zao#!C`%^AN#4&Ah>&)aXSu$*;{|5|U&^EEZH~g?Y-ShTHXZ_gNqx2*qVV%QVIa>Vc zAiFrd!Ge!Kz)7`iZeyEwtA=1i)pAwNgHu<8+P{Y)_77R;c)BZ+Nu`q2Pxg}RukwR1 z5N@b`;@1{ag>PF`rqa>)zOji7HeRgPPTQpXG31ErU zFAdS2OBVBx#x=|E=t%A)W*=ql{A;hDS+mpRx3vLMWtQcAu&mxi%KrCwGquWOiw0-t zZV4@u4rC~jg0eA)L%H>{_pX>wup6U4P2LeVy6)f-a62oWAn#?WJ;6DTr~@gujJO#% zrLEccbmLHKa47lyhv4XH>{M!!UPh?oK@;Z2s&MQ;RHtA~`I=rtWdrmy!^C~KhcYz4 zDgH@l*-2E;*R9`lYFD%Q!-aDpXLGVL0B3=^0h5h!;;s#76Vpoi^u%gaZHV(5txu&D zo(6i1xSz$P=S|uMaX{$kXmLd=$HpaAWTF}m#6Y5~w}>EUnYsTd1(IdCDSgM)^rS#4 z)c>`SU+IqE={j9ag>w{ygJWsfI=+aob|WY^KNZ}~D*`C>m+lw@L?77cY*D-)c0M*+ z^yp?bQiJhq_(swO>;^%Gd{hMuTA$1%fm}M3#JU=I$f%ECQbH znk(f+5yP4E@=l3BHbr~Q!Wm@f%$lKh>*iRxH?5=nU z05nMlb;_PH)r!9txbGNdfBU#Rp66Evif7h3qlsKJuveV?s%ij*J~fTX?~(b)NkwiP zreO-8=H^E28M_Qk15(Td_Tvy0J@|nimpV{&qwu2(5B2VREgKAi44G&KpGldt{W{WA zQk^rlTZEUKgEBFOOT6EK9OO_vSArE>^BM3C6k*yR?Q@}T^MdO@#Rmn^ZGL>SHr^Di zq>+UfRm$0Ji1|`UN$GI7J6nKD8g60i9^?4f&}<%HO*e>SuhI-GcU&c2&Yxa4P;LYHHwK$ z?g(r6iwH6o#iq$>S4J|}t$&v!|LR#f2cCStHsr0bWNy6&6k+OCT=N&&BxiwjnEdWR z;}za0QtFrk_a?AQGeHkh3`}YSlwu;^GPOWdOpv^#+52a%r#)Y6B1MUIpq^8Kn2n}g z_IrR%u71A*&oD%32W}_+0D6n zOyDPtGH0`uw0mcZg4vUbj1zpUx1OGHVZPjH2b%Ffz>rQthd0Ts zmhl%Qw!D7G#Hn_GVy_(2I5c!Kq(Kx)cQ6O6Ds?LyF6aZr0^~c^ z3DXZ?W7(1rgf^4cIsVny?_&n4+#62$M3Y@k!7^csVsl(8_3<4I*0V|Poz6<(mVNFN zxSGNth z+iNxvW&Z+7T88*W|7z@y`ONBk!$iDvgiS3vNLxi)CXqx?qk+(J@8@oEmwykLzlEeZ z#zuv@fQQI8fAd(S$ZmPQu+goSC%u#~IQw+^^w4%&gVzbo9>ZV2=>j9+0|M+DM1|W$ zDr`~w|Fv%bT9~j%l<2x{h2YRSUj)wDh2$4>H(?}jxP1<2jW2l6VIwkpp1FDpKD8=T zVAUC2q9J@Iww?%8=uJayyj_3$T26RlC(gwY7QF7$A2Tkvi42ZJY=ciehPO{7r+hcmEtD)C;ZBU$#+0Xhp^O@4V*CNKhq zfJlLwC^bY=LBHok-!9727n#qQ_4;Ehg)B&S!DyA+i4}RVrek;>k}<2Jk0d9(KAK1V5TUK`!U#a%#W74- z;w}6fP)jlKxNhQlD!)x0k}nv6O-#bP^(7 zMd=uxO=XPE`q?9@J0-L$RFC401(7$rc+zY(St(?}6Sqe?xrc$>;qvy?kf8+L^hHGG zwDt(+Q>7MFT|pLLk(Xj*V9=1!JNPs6?%_GqswY6oMNTi+2AN#Fr$x_cUA-1cm+sDA z8+vrz_6>VYS;~$OxV5*hRJS5T1l+ox*k=7{NF6XDa5MSz)Y6SVUUFZ9eEI?i4mxsSkEm!}yUn%OQiNAni(I@|Uh1x?0;A@nW4iv~L8{*v(s- 
z0TtXGFPiqjo%y9B75yn72L)pZ@HN-CJ@Vl~ak{6M!_fmrFj0?hFott{&ADuhyAD#QT z!2x?Q*nQS%r+`9+;7GM%GqGxRQ9#t_Rj)QB=gZ59(1=c5x*PTEV13S$j#q4M-yKhq zYqT4_Jf6)Zxl5ArSCmjtU9Ngk{Xf+u^Vl&j|D{p9h1$nfCKZ9Ej6R+=MEInkw z!2TrWs++e!5X}?ujc`J|;Z)S5<-KB#D6{c}-$9Gajb#J(%%OupV~;ehC=vVit-7@1 zjzv@Mi1x>&n7O0i_jE;;q^UdPG>Fdx-ui284vhN|^a);rpp!6T-8^fdtSr(uu|Q04 zh=Vyds9#^aNS4O|9evp1RmMI`&13zRqy1OLPQ6;T^e8PaTN|Kovkm@A81#WCiIa5r zKs%#stRCju%#NWd<8aXDnoa1M=R1%7KTs?zHguy9L3)Kh=D_F0y4g1ZR6{$IV;{$% z1gLDPze@i{HSLFe&D|JYe+ppL0@mvOf}O8QY-jV{q~YI|T__-J{>?Bd5WBHU5i+mg zeSY}uPi?^@J@7-AU_hqP+wcG1eg()RuU-kd|H~^u_g}9B(&7%It)b*g`w>TGWFbmf zR46!dKy!uPwn~)y8-qj)(?bR!i#wZ(>g7?G-|hBH8Gk?VU_6bXMzVEPwg3~kiO1R%L=o3#sAQM0Y(rCkcNdiVnqH=Ww_;4uUnn!}tT! zNW9)p@}H0LuR#OwgaPzw8yA_y*W{W1{&!HXc_%2g+u#4sv29+(@&+EtlmAOB2c}B^ zhSUC^Hs-$rZ4&tb8a0g1?WsVP$L;CV)S7q1k=%dV9(aYQ(UM3V0PP-%7Y-p;SzlD8(+afnO$)SH` zUtwB2a}rM~cSlQU#_1ASSPJR;;zqAq2JJTIboSF_X$}?^uZI;?d%Vn+@ESbpbNjEr zNm-^bH*FKw8~)xlouw)Z^kMt~v|Iku2hCp!Bo@67<^ zL2Jz1BKpzu=O=$bA@MAfb{}_V3u(Yb0vWE`#2eXa^EO818R+Vwy?DHwT@h*+%h$=# zh){b^k3P~FjyX6{=+{a^1Nb$j0LE7n0H#$RfOYw^#a~{hXQ78?buMVV?|~rhW)B|f z0=?5_;wOod)k@kRw`Zt*%KBd2&2|J)siQDTe=g9-49H4t)0NJ$GhGsZ5qDM1UVJbQ z*(G~CgUSzD^7!ms#G{F4!vf2{nywdXac7f8nIa1Xl`#zyR>cy2O|ld+G5lj(tl`^|B(E&<}zgHun@_zP4d_wSSIo@mj__n-p5f1#BbTiumlFMB;nl zA>8o&0Edb%WyLCSln=mDXrG-YyH~gv2;rz`{iLkY`cH?j!$*eP!!)3JKU1oxQ6_JE z(vrUR2I*MvQ3Kh=FCSqn<&w?+FZYh?Te>AEmAglr;1DtZcqrW{w%L|xf zrS;ZVajJX*pmDC&H^}Dj5cfJYXG&1=j}*r8KnIQ+_PE-z(glB`5DryT2<9ej6vTJw z+wZduEEOse7G2WQ4JYyQXVy1#Vk_P@dS2tMY!~(FDbh4TsFbCZ=3fma%KikTc$yP; zT;z7C|9AnsPj7Wi9=o;?DLpLGD&eMxx2wml)w#VpaEN^JI*B6TW8Q%dN5~< z*KkiTg$Fe8hlK^h)TgW5HNbQAwMV};=}Rv%oNM*82K_v@S>(oC2HMuSyKsL}$-iLc zkpec$I-qF!!fIA|GKCyh=3?y{ZWTjQ*(6uNOOI5x_JhY!IaAiKh{ST1+H^C0*xDag4k3o0I@u4%g!@w$*VP90ZYeA#zYe!L zV++g&4!cGJ?RKE|fh|Lyi56QDyx?Cgc0RC#Wb8fPrbcqCDdTt;>)uRN{Qk+amM-(y}k7JT)MPsb(^H(F=s8fy+fC zfN&Ju^zh;*z;o@(O~!mumC?KOJt4o;ZOYCDkQQ(j9>raBXi{R0VaH*1ySvnUyYfGSjDvw}c2bo6V}5Qgc2RaR7`@cW-Ehe@`~R@_ z)?Zb9d*8Q|Y`Q_ZK|s2@Q@TT1y1PNT1eB0&>29REySux)<6ihZ=Zxz*_doDFW86Qf zIFRkybIrZx=k<;k1{=vS8yA*mde!QWzK;G|%Ud3=+j6at7qgGn0%X4RS@-Bwqd-yk$Z6G*88m&@Y#|$2$?u7^iYTM*PkK_aY z#bxJj28-OefNURD>|i~=Gb8fEGoch9E$tWxDnCMFTvSXtWj*0GX15&$Jotr{-Q&Mh zv|PsM*cdBfOc(v<12TE)S#khbRLkn-xr0U$_$9}T(u~zMc24lja+>9v8{beq=%ga| zbdu1dsJ%p~`&0&}goJac*(aSZnTyg?Lt9*jbjq!S7l%?~hxf1l{(UcQu>ICT1iZKI zQYN5yntH^)nRP7_`j`D)YtrPVYjAgvj6_~2=db-ffmclHY?-q+>4|Hexfd9QwEh0j zYY-)p_7A-6w7_onub{X-7_+fVe_7Ri$(LXcGU`vrDNZPE2?YH2$yCl3YJ<7b?>)Z$ zY*&Xy_=eH;=EX;r`nMO~tsFCf#WI7-`$33$@u2L22G{0pi)>`aA<)DPDLG!~aV6-~ zXF=lI{yHZ*72=J<4V7?hw5EqCT;^03^-zz$2+emaxM>SVJhB^Xy} zmt2WfyAeb+kI9q8J%|J&&2{wksT_>&qUEo#nbEXH57q^o-?U#Wq)i>|4b844v|cNt zIbTMhRe|MV;_%m_JbO@U@p*m#(CkmE)uuvm!}aQ7AWAOARlqM8n!eQR^4EQhTKcZ2 zhqPamJM%oe5nV7EGe1*}xpFp)@FPuhRDDN!m1@sHUw}iiDbfKWd7af8*%0Mjr~#fu z%+6tF?&dAFY0keL4O?2hMnVLe4gbP7X2+Yo#K|17IJSe7Mcm>7cO6aFXO0JSWphcE zLZ8_~F?h$4Hk&WH;b(R(9;eaE(&$aU^C$4AQi%!(?UZTC<;{5td?tzUtcL0m{#*=t znah06_a4T1VA7|hzsUId@MNT)iYB%GH$O$aAmH{E7{)4!NA67jeD!X?VQsmpIM{PCQR*^@vy7mMaBHzqBKiql^K6gJ^v0DJO(d5 zWaCq8xO(W^e18Gb4H=4>cf*`_J^o6RoKUUbO$NzFl3t;aM|B<`C%b8h>N(j|WL<(Y z0FTRIz0e%RcG{=7CDa(j@iKZo5xKZ;-(1++l;(t=L}!Yc0oE?vAx*I;i(?23Wj9jn!jC_g%q=n|b&Zbj05n*gah%>C zFwvWiT4d|~F64=bKw9s`yFefmRYE?vKq}@9FpEj1%?C)E%ohi)u*e!(w+dcot5~md z|8Lmt3`S_9unx7xMXu+bW)T!C$*+!Mc*2>G%v`C#BtV4riwPHZO--5g;u{7wGhRCW zlDFx~|J=)6M%@8rpDbS(kYuOPF#zkSD^M*}DFmpGRHgl@Jg4-G@5pKF399<#^O*oa zWqQ=2CtE*IHzu8*C-z0wr%Ivy7eL!L%RImj{3hUGMUiDe509(5GS_k8?4I^GCNXP< zs$$~0Osv;- ziqR>Zr_{Wm6w7SxS~Eef%2Xx;$+MaWN}iT@+YSU;_ZtBl!_a(2qB~@V-(gl|2ZGwN 
z2JYR9`UYvcD!?v7Xc+`CVnKg>SddiyFOYn3Q^;#fSdoSW7~b%!lkHhp{c-2;tZ0&R z0;%6xAEgGijUd;~KqPT(J`9^v-gQPA0J}>a1<3F=50VEWq+@6J?%}A7tg^Rqdn|hW z*%;q)0hIXTH^3oTG;le_4`kHdHkRER4>5usk}zD~4DE^A52+~{92Gu6Wn~F`nj{n1Co51XxmR0T7bj5 z@qE6x;;B!C1DyAle=%W*N$e$trw}3_*sVtvja@mUYY;CNRklV}Nv~mr%F|o4<(%WT zwWZH~XS{dF=nYC$LmX9+R#BXPSN4xWdDLA^EVi5BXL3nOuidU;b`&^B+$t=RcLq>aOO|DGnaBBbZAV@QV~b6x`@TK^!kymS zgmFb!zzy=j{awH{1W@`nQIlMbLzLj<3ZSV8;{_FH^y(vlkgjzeSHHGVD{>lW2dN>K zkEJk^Gl29#MLvoQ@y9zO0(ANajt!kR9>5Qj^*tZWHo3O{z0Q2nbxspZXWdAyworm( z_yLzQ7dMcyjy?X=+*PXduDLC;zd!#A7;EfM{#5?XAZsgo1&A3I)|?UXvV-jKv_udq zVDU`Oz_t6=!DT3=tCnF0I()=5jM`f!SoM0<}`Ct ziJGh_QY-n5*S3^meti$n_hOm}-&hTC7LLQ0#K9(kv+*=Ldc=%|Bg^kY-w7{=V%b(Z z*4`)4i0}xd1sAk_VoW}5u=Y;r66wXDqxunyyf^GkY23#%6ur@mgplr>8)!^O5scip z@4lRB3Va%xt>W`bsW@F-X=s+)H#Ncw@0`Wz1!GI#9ZUq&l-a1PbE8T9QpW4y0)5Bw zw1)t;0@B%UFE@#hukjcbUgu}Nk6N4JiuQ`A^MPdD=caBH2O(mXFy1T}J(GVzEWQMW zdV4u%<&Q{a7TV1o z#DOe3|6j4gG$58jwPZ1by^poU;*}_0y&p3Z9)Ch6I_2JsMKKp<=~zWc_q0BqNMIEaS+YlT*FsTz-dp*iiTc@Q)-Y zWe1F92(&OdpKRA00cFr=XM@`Dx!>P@2L4Of_FxLsNMQzDVV)N^Dhm}Ql)6^Y&WD4S z_7j$+p#-6`5|=Nvn_yricEOdfJ@m|+TVq;Tu~m}6cSA3kmKcPdPVBE zPZN}ZTGWompC>;9O#=Vn#B}j!gX<|YnP$%o2Xz4jhbE)m*+`t@u&52h%$G;xFa1hF zW=brOt*Q})EKG~%8oGxR!StzIHU%s29@sJ-9W4nd$aSE-kJ#31r#p|FAKIaj$9Z(Q z_A!tA`HjWfDCLgvd1Pd)1WE2BO6MAdRddHh59dU76BoC$dN_ARRhqXrxgr}HK6;Qep}jW_bSjoPBN#) z%=S18ujcOMJ^6)y>$?3**9>sW)UqbB*#NS1O5$JCEV;XM+B@}EzEw~kneDoW3RPHX z472dtPiIR=YesL9%ef=Dgn-pDK5M>Q%!qKXqG8p*3C!e&a)9;nw|t>0N}Zke#J`)f zHVrG>ig}ALrCg9g%p{rW)7BP)xa<>6m_Yt)RFHh~@%XGmI7!wZCvCP_lu~LXzYp=Z z0w>cUK`uW8OFO(Bf{<23JJP$IQ_dyeUXtUOPXJO9*K9)#y~e)+yWmfb3ec##IEkE_ zo_(;ehC3|-)N?1zl8aH6eG&weWp%TN=!hk^(GkdL$m%ReD7Qeh{G(+p*5Ha6VECq6 zxJ@X9{*2tiFYT>i2h&KV_Aw^@T=21}4f|Szgj42hT#P*pTatgs0XX%@886m2%Bmfc zF;Ka=Y?vCYzNfz~GZ{wu7zQocnx=!UDotK5@Jkbar{5*3!tbs-giNo-|EiGS3+%C& zs;K8Xf1^UyB#iO{6_0<)7Z%15Q|u~i@|?}I4m?q`%7B#LrGMLpQ+gt4@ts55{JxXmI|(Erl^u>+d`{IT@J0nX08AfC*kx3N z80*HJvdZ3S0-$n1d8gwoy|HUEe~=^DGs7s=4@km(cLF|dA;9C2SA|7a5b%B>h@kfN84zc=<2}DOFzL+&!&_>k zsWDGs#Si-K{CUP&_53RHzCgPQr+KOst9?f?9Ic%_Z|j_? zDs0eU0zTW7!9Iok0Xe}Lue&ogNo6jPm%D^FEv_oQG%47OG7qY^2ACHiWzQa0SY#g! 
zrh^i0+Rl2V=F7WHf|1DO;%jMDcI3&VCr(Y+{g8fVxwS~g^a8^kv%?i($3TA?vR|?p zm=T9X(n~IJ^3w0JPw+Kqd`R$u-#;g7z)i8k zE;<{Ggb;0vnBcJ@tybHTFWi0*O8;9KFn_f7-iPKCh+vi+QY*B_d~}w@L+4sQ)y~ghcWqC@=}uYOZ^KLuZbU1A>Ns6~_;(0MG`Vo$6}&vvzck-Qu<((r zM^_AnQ#>c#FL+4&7E|9Z*S27fIYM|*^@t3@*|zOS?h-VGQQ^q)#{3)v3yEwc7P=Er zoWLn4{gCYE68SlR$o}7AaH#fcADT{O>_{wiru+{(llk%Pf0ANqx46@;q0=vHk^|u~ zykMzY_&ZmP%jbG61kT&p^CqdRLfUVQGawQy?V>=m@zz$>`I;0Bf!z^?+x=@({_i?< z;&q>Y-4lX6Ij?xSK?3Q;4_t6eQNIO4vfleD8Xw~szlbPoS%MU6JwM4gi~7(m@m?W9yd~kF z8kWG5r_Tcs&xuw1-Ot|Gk4M)sUeAON?fSyEJU==`UQq& zy*Vjy(9@it*EBIDkk0SV<_e_qQ=22O?5q0DY|JHIIwr(mYawQXiV*#nWkcRdCuR&{!ls7h(}3M=?x_Zn7pX)m{(~1d0Wy6&%XQY zE9Rgp>N=Q8=Phwq%Zoh{SSYt5Uh6BBS3YSmLtUn^-zR&2^UAJu5Yu^dlfV34eBBff z70&^E!&1%rw$Hd3oAQD_sSv@m1c7Ze;UyQx8c)Wfl7~?#{Q@%e6)D8`%HL>m$}jTI z=y?Np$+hruU?g-_To&>ftoaHh_7aeU*Bj&rSxAd_9>wGh_N=^ZrK|f7&X1YVXzpf=mCy6{+BoJ^$gLjoHAq&QN{ex zvkGR)v0WGPUM=S1A=;nID&RV{S(VKbw~W@5hsrI*a6=AAI6u?s(`#*T?SW`*ujI*A z{+rwHxeXYu_?1R%PaV8FDVJV)8=B&In3rk+I6_FL{xcRScuicfHHGxw_%Z>s_u<+) z^uw^j*EhH11r`8T4L$h*JoaG?T+_>ljXqp8!zp z^-A6SEuh{RV4QWr-JM=SXS|R~g%`wUug$MPHoOxUcS%prjo3fa=yrkpeF3_++cF*R)4m2Z z;Ah;zN*0JChtU5g!apKt@=*pcE>ylx^2k#o3IB3k9`=7bhfq3wa#isli7NZ(eV zzishHXDwx)EP!YG;@?Q_xs~YmN9s80!lEU;Q!g?aRFr>vptA(uje8;!m!ON=(sZ0Z z*sL^UlXTSwdVk+ZvM&Xwa2FDAX*YLuBtQR`r5Q{OFeo<)kwN`8(7t9&L{P_21_2 zQsC9P1WLgEZ;}oI2c}QRPj`RRCKUe*zqerkueKCf670XePUQWzR`_qFG^zi+uK)FK z)&m~(B#y_j*|GG0?d~RAgTf;~{-~Na}juwD~5fR%;abV(1 z)MFypJr7u(Po2MANiR1urgWVl@aO@PW?*Xl^*H#G@xw75}d#P6^z1UP5Fr|GgRffAu>A=3u;s z^Kn?*rjP^s_Tkj_`ueNLp;JNBX7=H^Q;iPXDp`ciCwaw3S}4j~tD*&83P`4lu|?x( zRI-{HZ-Geu-WK|3bU%RrQe-8Xq=7gZahHn|F;~ePSHEw(;I7_9YDH44g=*Qb7}WAF z&5v&g|FO%PAnvw9E`uzQ2w4m=4A?B@Oo3!^8;QE5%2Q=x1r(?yMOx8pICj^_(b z3vO#4f%cKM5|eP@Qa6lL}P}vS3~B9AQ&s|LnQwZ(8&!zMiBODEPZ8XKS)LEd%u-A9-n# zZ}2!ZKKY-zDfK8s6sm;B-tS-TI^M4EzBJ0QtHb=TKc{Z?yswnt;zY&+msgIZ*VWN} zy6c}@V+#FNl%+90Q6cl$T)m8r+?HQE5Q9FK$Lr=hU0%*)q<`-dL6caAhyILUEtGF% zqyUOvx;lGgZUhb{DqT_SNfz@ieE|p+#P5<4qF>OPAX;$j`D4je;AO|+0wE=9U66yWM68%GdIPQyY&&p67Z@#T1xc?NaT1GfKvR?G*%CW8ZOxpV-u6K~2KUHams-ORQO6arx8i?|X>5m&)5>J+% z@v_!<#?sut=xF9Ngd2SmrlKq?xDmWB31q8r&A?PdO}8`ZC}B@m1JqY4ZuWR4 zo`ld84wvz3Uo=0P;IC`FQaJ5|ocTyjR*RY{SDHOC4L_a2?d?y;ywV;bN>pOF>Z~PU z0!Js{W*A0v|3orX*@Hv(^kQrRQSXSf$wkWPbQ*$nr(5^+S!<5U5eHX?Zj;j^pLR^+1Mpg^O&IhLg#qO|!xhS5?YTN7g4c?>| zCeO;gq<6Yzq<3`}7wSqr{&64f9bempbKZqUhGnrm@zj|v^<6)G(x?knw&xQIxQ`E04V`B|)T-(a@3 zpE_}wc|3mD`Tg}?)=2sv$3Ru9g>{<2jL~GsS06szzH57StoCXyAqgqMQ{pNDo3_Ntmfser6`dOkg zp2bi1CqIE^(H@Ke$I^@W3a)aX)fTbOgFOuIS1n^osFD5{UEJczFO7yssXT}2R6jTa zo2_?sxa@YCV!ET7!cf4FJrn%!IqZTWt7;$nCrzfBmAKtYY~)m6eL&thVHkU}HY4(* z=6sLeJuA<+JBDJYK2*1G8=_8!hAND{=(cDo2Dm2g3(wyG11gtk>bU*q#29UFa9?;^ zVJa=NmfC7$PoKrT&UvLT%(^~L1}MMk#+j*T%>8*S&yc1)vVC2Hm*LH5z1SVzP~RaIKM&L#QYsSDI3@Ax|QEurCd z|KyFP6;(UG9(2!U20WZAmpS82M~12@#c9;p?;l*lx2A+=|DGIn^$mk%Gzm0QyqR5(B&OX1Z-F2U{A$32*ZZrSD+iKvld zGuLzi>iyXw?~ev6YhN3oA9*(2%(_mIx6qTjYm9<-RAq1~gwL59Tv>;WT$btnKv~$M z#r5|9sst|QgA{j@sldQjhB@>3riz8e?$Oe`k1}IVveL^$G~(^i6) zzM{1pr7SAf$>On^#R^2=PO0tCu;01KZrd+4c=a~1ST3fjA9~+(kW<`48Gou*$bf8> zDKdVW0jTtBQel}s*YehGvwF$RoGys>&ik_CZ6LMxdi5d^iX3aPDT&SvwIg}XD?%?4 z1{E#)(?<31#NMJyg*lgK?G;bar&JyGbQf5`KOJx3iHBeA94bXW?Dl5|E855S-MY2k z9)9E{9V{96d81R0A?pn3%M>Y4f%ZfyO$S7}t6C^yRCoTNDX4ZHq|33ooS{<^Loy_Z z`5pb#OQ}mO#Hc?fAQWsF$CIY{G-qs%k`zo_+r}N`?J+yx&E!UQvtt%BkJ6Z^F}!sT}k*dFA9S3OIVHh znx0DwN7QfYGO!)l{ZL9Z|! 
z+_7-dl4_MlpTyUQO1WvuR7;mF?4wKg4X&VJF`^K8BPVm67CYQWZjs3!I?Kv9&q%eK z$@3NS=sq(0nzoQQ#{o9%q<(#pHW*24#~hBa7Z<0K0hti7lHc9x^NErKJ;iJ=P8sXuT9X4RI@n2U~?1XuBENJq;N&q!Q_ zi|wjDnlgc=ch(>R>#Eu$0GNZuvi5d7}si93;z7J}TZ z(AX5qKV&rWJ{-ZHk>yuea9x5OzGqx z)MCGT0U+|mD|oqdmYeYG%H@kREh~Qdu=CCzyj9*P;VHpKonZWnA4rh?fpN($W8EEN znGf#Ii-|Qt+0a)CjrqcwViZOz4Z}|Pq7j@(Kc;Z;{E&!;iPAd0X6e{Cs7v5CAn8AT zV-`{*St2dB?pv_uSf#!5=Cv?nQt^HuJWveGyY~NS!t>caJk=Be#kBI}I(5E~r)|&Y zrD7~kLyqYFusR7oXL;y%;L%h5_Axs}VIuC47U}k$2|Qwq&z+@ja?fG`P`*BP+u=<# zS|xeryQ)B8+g5GS_?5u%m)gRG6wh9>CvPf~gMAzYJ4Hh-6Yc%xZunC5_jRH>S7Y3A z(rY+6T)Yy-B9$nUqi9!wJm^j5Qx7|yU|-ZH>`ig`$)M1}{-cmc>(4`EFPEdn`OCQT zoGd=`I9>zey3f+2O5C`fUd&4Vt=DZRT(;HuWxi<7nUZJzRxaAclmq@vLSAx3%B3QP z8X5+$4|>b3X>!fRY4%oStF^PvsZr50{FL?|<~WPL3F`Wd6{XvfFNa=?w*+Q_G?4de znzpGx-N?bjb#Z}zbA<|n(eh8$RyEF^3MWcs+3aZ z2?AA|TSQ$_ElxO;-q)YN_!M6suU)7=uf5Qp4%sDp-4jwY+jayHdGif5GmJfKEUFwN zyzGiJ9yXof9$i|ZK9xKoP5K(IRScorz+^y;62g+iU+Q2x{vl*Niw$9-*?v2f#EKX9;>RX&k!MXk-nqwB^_x+ zxp!!HnD!?G_Y1OT5yH!C(M(lA;$sGZs9hF6kkozwpSx^_-xYd6M3}d#v<0+mcuVf5 z>sv!Pw7x8JQHf$1FSb1D7isx`eIf#GU-Ve17r^Q84*8?AU#sb`7)7~gr&VIDHz701 z&Caq7Dv5H(h!{@qao6|A4J%mu+N-g5?O-f%orAQ-+(&v}^~y@8$`~|*@fmM}O<63K ze&j~E4Lf->dfF=bcxDWX4!Jm#Z#K*girFF4czGZbOTW$(X=~y8?QlgLS9TI}4L}%+ zzZ_29oBx<*$Rrp}zVz{B zc8OnDH_x0Wd*RjU=A5=Tj(57viN;P|Pn!Qw*S1a*aDRdHfg1gi^Q9^4#O8(BORDyW zUj#bp8Qr9jBTd+6g}bWhwM)~jtD?gxK~3+w22)e8>q-Z4 z3y2Ss31>LdJek!z-`a=u`8};hbA^OL$^lXdB!7al_<9Pp+|Mv~4&*+MuuqPJaI7*J zc4yZ$KWF$iInzd9*+4BWTuszBC}Iwsr4R&@ z@EPQ?!i?vzL@Km6SGg@5-fSrxu}oE0NNrjW`N~HvFV-n&KJhc?v(-H5*chZ(j>tyY ze5WJQa+Y=BPv%O)DbJ>7dGvXH(9&9znY(*6IAJkb2q@-7@hol%(}3>FTeTf>VG9L` z9F*S&=qxk;IQ(%#WnmygTT*ryd?BRX42#vlW%oh?o;=*@1_VIgNeRlgE9iMmz`^I45ae|= z&0cG)mLMslj01mFIbVH9xLWKesE9Y3c`tnzmgT?4F*dNtD?)g-YFP>8<}djZQP^xG zmWT;`2!hy0ud2#6AQxE9=onmCNYa(a-`*L^wqe=_Vo(>kW*D&#?kW42qCV*@;u|>? z^7IW7!ff%1dMEvc7Cs}b@nL&}L#xtiGVWVA_TYIC72WXA|E2rS39tgy(+1~q25*1e zDe~@}#vdbS@hPgp>&XZsRFZ`1FB>7mF9Xn3bJl4x?#O0ehilU%a#E8&Wjz6H9?}Q} z%guU}Q6N% zcmz&6^*^N=K|bO4mUTW+onb*r1{%rt4s3OPd!Ua=dwyr6zn491r0qGsc2Yt{F8-kZ zRgx>+GeQmgCt1t%(g8(HC)wxVolb|d@Ohjtu1c*h_KMxGl|>iJ11j`>`xL7c*FDMv_eZ84498Uzt_F{e})@PTXb0WTVx z@6o2$pFOC^L6O?};?FiSmJ@l}t#41ouz&kj<(t^WNZ z<1tGP8M0CJ!w?+|zHH`&51}Xg#%1?+MnZ90`c_%32nd{&cNCYLTQNZG;@zm z(_WY?UxBil2R4o`aL_&H!jDB7uogNW(cYut&gV`z+Slg7&GH#cj3#nua;h2N_<+Sc zF7hDYSu=)095u`3eNp`>s|Od}$4exXASrm2{kjeH9mgT%w!;6Rz+hhVBTUV5$u%p? zKMz6rY*{jW(tfR$ovLa)o=aG8jNQFG(bJy>JlTcZ^gE*Zw&PQ) zB^`GCi5U@SGQh5Qy6$XL6~jabRi_mYygl zw3#kypT!ucZO3`WklPjZl(D!yUI~4F{gL7NT`N{)?ND~fi3@0G&u)S0jn{=D8@=K zqnK1a7sQ8yYd4ER^=^k4G>TO$ZGskQY&if{`GCz`y|NAEb<}l4R07%BCTW(&ggwx= z4%X{paa1YaEBz>=3%QyEBP7ITRLNg2je5pR}2oqze)Pd0q36bd5WMJi}96RyL zr^>p@ij2JKldO&0W>eZPs`z5#^C1L z^VyY+B+1>eR^9J&;fJEEN6ozh7V%MyVF(Jjtj_q2Cm$cSRU?vBtxw|lETve42rGGtAqfZrF z1Z?t(-lq`V_4UT8E%PTERakef6~2&A)-I_tJLyF~VXY^u8B8tyHVFvE<2c}S2gUir{>3Za>Iy}P z5CV;ZE$J2xx}u|l-$9qKL^_CJveIDExj}C(;IK0AQBC|9>`yQu_;g4WH@A!B=1!}#9US@*)n9*HckBXXhG|8Q-hJaObR~VIo1v!8UZWOuJX_qe^ zTw54%3<~3qh@zTVZRVFtY<9Kh6XIhC_ca4${Sr6=wY=!9HRYXUB;52uYbVb`0x@4m zG8Fn!SC6fo){jt#H0|KJaq5eN-+6XDi$^MnBC6(gT6B&%u8F@iTVbV2{y~?%A=aG? 
z%@+BNJvUqP10zfHDHK{6$xRQ+x2&u6>vErebiWCnXkzYb?^+LSxB$t2w4X-+b%Jx+h(2VE{|r*&?PP`0uFx>s-XTh}$w>EY?tu0tcO!?B@ou+g4j&kGS(^(P!!JKy-q^|g!;1zlCodte*;#U--YhViaodY$N8_;w<)Mh8ApRnlym^77S2>F_z`1YIm2;CG8rv0q6Fe1hzWwkI&k zz^K>w+2V(Xs}1&(os#ERXqWR9+KtJYu|#1hp@GTMcLCtCnjX>}YXKQZUdmO5l1^Ph zax#tU6FqJ7Mb2N~Yc}VeY{3JTZ>S?7EW@xf!ETO#z(_kF^J1rop4-fMTlVUuGkAdP z{VWjEkC)EvRvZrMtxom44B!sMGM znm1U-4Y->8$twNmkXjbgnrlUo@4^F95L-j^y&@m|1v6N6M{64Ua@@mK;ODoY z*eoC0+sJ8__hZ6xd&{zczHBY`30Nq)-)bW+l7t5)yo7#L^Dp7fm5uI+Lgmumo*ZJA zTyW~wSQI+QJ|Mn{V@~I2#B`1#s%sGp{8r5@(8`wLt{_Y)lu4xgTCJT@{yT4NuPmVUm zda*S)tzpv|?{B|6)0K{+Ne zVcQ)N21Kj{I>G6f5Z$HSz}u%~nKed;x|BI!XgK%E7jS zbsOG-s~D0H=b&yiL84-Oy!1EGC5Se6Q{JD<%7_{1cyA|kZFVQ0cCr`ICGmnt7@o(G zFTqSgpJcqsklTsg29hwcSW`fVK|a4@2P&ar44nH?lxDV3wRNgD^@+76=w;^+hR~<% z_7`F}`tzZOLjjXGman>+Yj?#Kri|Hzg?~kX1g?-=hgv5(<)Zd3E*0Q7YUAI%y9Z$v zLS6R`Awkwo2g~KvWB$GrJwtKI#x6rcQM!vv7-hNWd;|}kw$v#iPVr-VJZ^M=Xz2E_ ze{yf)R~6(~?I#VYe?a)*1~03J3ErNehe1!bW0lDl$I#^ioDpJ=j`z{DQJjjyA-Klh zHu0%{y%A0^iN;(H3HNZ5pAL_(M*P>rrC#CvbkDky$m>>v%w~mNbX>K5aw@LCJ>v8! z2xeH4$Wy?ySeu&~iXj{b!&ovG@sB;=c~>-84#ZhF24(v$Rcc7XE%fOZW6Wm0T-Dg> z1p=wQGsxnXS;D8%sTs0vq~*3^z-$&613Bc)pYJUF5gweg%%kECj0DK#_GakXtsbor z6WR}vG|lNGENzOXjucR)R8$(zt|odv&*_%)Zxqj|W7?Uf=`tlPl}lzFqTz*H5g3~uQZVDd@`TvelX>Cvfi;R0uAc;~opTX{xHzfakZ$r55^a9O4CZ$-~s@VKP% z+%T34vV*Dkzjqu>A*p(6x8n$MlYGr}FjxfE&jpK&xkTrKa~@Yy+PC}C4mKa|(H4wg zG&dsPpJ&T<1E!8Ly55JS-Qr2|55SWY_w-3-EcCVA1FvYBCIihxs)?UrRyMlXY&hIT z6Qv)aPLO4T(J$>;q}C9Yy3H{=9hgVOpYxxXQ&%DbZiV7Sy#WEuH*l@J z!fNpM2u7CDEXmtv(8;WJxAlgDt+xU)8V^b)$ikmAp_qT9X#E)|7EJEBb7)R{FZ=o0 zH512F0q~VVZ~ul49_L_VEbPxkpDt8U$Q4iU9&SuN-BGq!QLBXzZ|iqPdOBHe1h0}O zdN|{CYty_*PUI9dX5v|1mb$tj;LD6Qco*K@US%@sX0(Rq{MCDdDjd}T?}ane^1z=D z!h*~7a+``VZo}{WBwyq{D27M4X-m9Q4Eu{64oGBT%B4wH{OkGS9W zznFQ^51B{-pJdVB1O#xNqUDLHvH`716Bi^8O0$|8$sY$hBBEm*ypMytf7JnJgn%68 z#gLo|xE^iL;6hiP@06wFJ~zfs1jU)Z>8(NmNsf1@6&F=K5waTyG#=i>Miq%F*@y)_ zV2|`yiSQ&2h<0ApC8dBro%w!4<|TX=H!(SJl2fgpqX|qm&7{U1t#8E2_Cn*F#ElZ@& zjJwA#M|Q~}bAj5T`%^dIlm*0a@A)9kkg6t2xx3V?_~v*gRupgbu!(UZ3-gxao@DE9xL`Nvl6hn%0ih+k^3HH9k*UX4s;Fg|z=D zo{&M{D%5IO8ffB5(*tqSMg`dHW&offsLI!+PvfdKmn}(V&CNfUUY_7}zX+hyX_Vzg z!lRv;Za9C&oh!nvF>k-on@RT8_QU2xm`uskyCKvPtgYVTR;Hc57{(!#1zq9{tv z>&X=;oA>3)8C&#HFV8Uf*HpQVwv=8<;aOX6EPxcpx2qi&?9M(c_cvOkT{J8=7%?BT z`2g_|nK;vhe7NF&3*!A13OcuTQ2{c)m71AKg1Fls~>p<#K7iX^Me9 zoC(eDCk>Xjbgb^RB)n`h+<&0uleBf4X`rmLl=5nH{M}$zVSR7K?;3l9&$;eA)DSZS z@g4YtP)EX=_W?8=3k|Ozd@k>XKF-T`#$W)=67%A6VF3PeWq5m`8!)XRz1?{0WCbK| z$ucMt$cpfP?hm3`ue4AT%F|E(2l?4rD1Tpz;{DctD-77MLjSEW zaN1Piqt|KqeNQv1V5}OsfYi&6e31?)POi;+3yvnOjVDe>DK)fwf@KT~mBJ|^@_p)L z102Vk=>^=>vu!Bx7i+V;wYlWK2m1;ULB6W$5Nx{o+n?@Cda-5u?(lI@hb^2Hi--3_ zOdb1t9f~hr6e&mIB)E|@|&?Rb_L`u zN>SF5!nJPuSmb6SA19Rj!teDbfPfD+s_O7f5mX@PrD8pV=~^i~yRXM}6<_viEGBO> ztQrL38zFoX3#3abm^HYVie^W1RA(VNKfMznv|9tli_iATw~YVHFuYYn#BN#GUQ z_HgEDu@Z6As=tel=IoB9IE^l?NZ^CDs|J_It$FW38-4LEV%Hl$8+@6vryPzrR{efvxY;;W#1aZa3tB`gmU^>B1AphN z!sDJRvxlttRNxuyECVpI4L;qcF^sw*<{+72S|6B2J(oKU^zcbX%~yvk0HF z^ULjfKkqnhE5jMQtfteR#eSDAxnIEd*xQ=VY5_}KpqSK{IpgEyB|ySNu?)yr}8BMnyub@UE-`pwfRK@uX}BnWz8 z$Ue%Y&~1oRd2X`RmWAgMALUpNWGb1MF>_BOpmy%_rLcI;=-KSgg znz%g8rBbJ_QL!7AC4dzo*CEDD&Qy{OI7d>Bq_86u9h<8ZP1vj?X>8S(K@ifaQ%}0K z2t;23x4;?}c;?OMwT9W{OzioAPOw~hj!ffHEBTBN{NB9Eywc`f7Ax)^4iaoJ`1Zsd z@iUARa?!WB^}D{8=IMgKA|H9O)uhtu)6QBC<4k9DpU!hHM<8F|n7RBZthE4}r_WxK zLFruyH(5OCguvjjq^7M%idlA0!6YTnRO+PV;vse5%<~2Eb+};jBXNtK?ITS5B94NI zZTbma?*hQTDz?e2p%6ABW1i%(n3ef*FJW%#hEP3`2eWy21IJZno%^Lr2Pc&%*5c|7 zoBkoo1-vd=*(SRi~n!PV=fw}dySw1ZhO-wO0;e!+5+^qH{%9A$Y zK1MGD*;J9HzG>;dG5-J%JW|1<&IqOd$MJb24|AwKVYi~$dvPTsM9Gpi{s=&p0{y~m 
zp)dFT8cV&ig`!Av0h`%nEyN!+c=|`@i@|;>p7480_@uU0OK8S~5T#!YecJvJGOu}6 z0D?#yNP5aeK%1iN5Is8|F6^k71K9p(+9YjSB4VDS>10?B!bu*(YivJx2Q|0kSv&q4 z?LY7cv$=eIGR%~#O4BjzBfKkxHh0PwuPG8?DS<{GH6lD zybz9^hT+3BPaXW!zS{80fWMYdh#X)dRR6Kh>Eu(ZPKB&ha#DQGA^YU^)ZqcgOQOh! zM>n412io8~Chn4+2voD3xmomn@sx$A`Tnkh4***HPAI76ii43^dt^B^Mm3zqAZ2D` zn-1{vO}%%b{K(H9&Y57#T%?~hTOHCX%aphQryr}!R^s;}JQDa^QB}K29YF85l~6l8 zO&fWpXAUSyt*bt`f>E@vNz7wz2w!)*6j=^_{iR@eCd2Yo3;1a4i1>!~?9fV(YNmHf zSC|i{W}_G3u28}wJ7Z||x)`|d0!(Jde(^#t7ea3j(eKD+AMiB7p#B|NcJ1V7BB$f6 z!hX!H4hUb9!{tuVBx9YZgV~#X8hP)SnI2l?0DpaJ(o%{pQB-@J#f;|#wL#X--p_LM zh*kRK)MOuAE*IZW-~2I;9@Aq;0LlEvn(+wj!$spX!DJF8(7p2EN81y&f>%bHaQ@J# z_uN5G`c-pHDB9h)E|%x#W{w6q`WxsmDV|hBCyg2-%I%eN{@j+9nN{7Gwcr6HJ2}d` zgjS>5*iztLbKuIy4t7Z5Gp_G2o&C94os4DjO)uc3@pcEb>5To9dkb@LS}FjmWWGj8X}%GWTZMiu<;C)X zBknYCibb=>pOD+P42`@{@e1Cf>}( zv=4|gs>cp~xU-W{e){ab2!d1-@a38fzvw@31%SA^0eKM^_KNm#O1EDNu0B`8yp-#C zAD+)}uwU6Mu^o53>>IOAiGr}aeXOw7mzpea`@o7$-&?uThrb_{RTHy{N|&zoEV$}g zj?ymp6ZZ)|fy9OA7WSJ{i)H~8{TsNZy`guu9gb|EvEDi-FZ-n(8e6dy=~IrEX~5dy z`xrf7({-6fxCINgaR(kp%^$t#-X0^gg{A0y7oht{X})W4us<+!Z>-X%0_~LE^LzK@ zoO77Lh*XYHeY5oLIBR+-@SJndT$Vqw^%A=|%6Nx!}Lw>1B-8Q&lf>y2@lK zJIFm7fxHeCh)b7S`*QT4(r>)oz0+wK;mDQQi)fI4vNty}!NGhI1bv6QJUT+MM5?ZB ztIWQnNpaz(r|LkDBgwW+4 zb?E~ekENSD#V?Di*2~hyV|q?HMUsgN(vlfWzbZIU*;BK0Nd%;H>g2z`ThzQR zu0Q{{UXw2jFz-bp5+dTpC#3iWqxXZ@;ALFkbR_Z5du>0>J&9+yM5=7m)c!h5WjU4Kq#dngnwLBpOuX?WqLCrwWnC=E ztur?$1iM?()(m}TI+9o7*KQ6c0*_&lEgDL3Z>wK2jz0Sa4HbGkHslMM4#Nc?BCG58 zPkndq(Sr;@3sa(9}$EweOdk!Tq`O>6qFmJCB~X zx5kD38k^#@D;C*pdSuilaIC>#fJxoU5T5j6w{8rHce>xe(HeP+Eyt{(SetSsHty@~ znNOM=!kA*s$|J4$@mBQV>W?nbVqwL2qFSHDFB*V_CqW434hZalTDuQ~VOD9!pF>;~ z^XufX!ubp$(CJ;@32IaxwWDVL_BCLUoGjGKuRBM6bXj-yT3#9ERYGkA4DKc9r>nBu z;~S50hx?Z!$@1dS2XE;VQu(}p%~&m~QJ|vf*X^}%BE?p$pRd|PlVlGoK$y3%L*}qN zOz_>DOb8rW5jJ6rzvQ1IHj>68kHCq2*0eK|lK_K+Kz7EID+YmF zj*3v~xSPaJFyNTM9d|+JBf3corY#&R%0klnWJI|Uk&+#e1O0VD2mQ^UYYj|-;mtV- z-CHILqZBSM-kjX7Sj#Y9ro0fLxA%{iGq9C{A&{}Zri~fp`Fd2RhElj51NVhxf(Sj{ zVlF-pk4E1Uz~uIqbIow48e1Ffjz9h|4^c(bT0jTMsrQZPw93$NS}{gTknBw~@S~oB zMUUfg{LtaI>*r39^GM9+*kFX|O0bYRB!{oLj~0~z+j@fK>U{>DEUD&tXevTJ3`P0M zEjlcbtU7%n^PX!>a2WQ`^B1aKQo#1uLPg~tR>inO`;L3AMyF)!g%k6@DtFEgM2lcN z<3ttcZ|u|n$8pDbk!?es41I4)M6Ze@@q0%Fj!fzAqGY!NBPrusn4<3-Dw+aVPjPgE zWw71D3ioRc!q&(=T~F5V|7q7U5^FlnzjK5c2_DApzobVni z?+14bbBD0@$^l#tX3XX5koyd!a5H8ZlLGF}f>yJQw_Z@#Wz>5`@2%p&UAtN zV^P}6B%b`80B8q$xDG%!AFG)`Y9-*);Eu*lc2?^xB`?I!#37}4syovU$#(f1iSp$b zO&|Pl+!Ac4f8bCf2%L*~=sSqkr2up8RW#*_D z_1e^M5Dz^aET%>7%ANu@w*-yobJg_y&YDE%K~7c-Pn#*HrPALXKx}cq6-+8LI|s29 zIp7UP3G3xfZUu8jAKi?Eq&N+D(|X3TE-cYl_oAq-((P>VC>XU;Tf0}K0XXr>gOgu1 zXv(oP`#rAPPYH~E<+DLT3#%`lzMJ87>mP4FmDBh_m@gXj>jKwU&B~+K02_8|9{9dW zv73}?FH#7xwK4dm!}La{ZX%*cSI3mhYYZe4EMXV|L}`@k?>Yp3B651S{Tva|_UVgP zHAB_E;+)VYgn%6H88X5N&d#Z|?IGMtbKQP~Jd$*aI7gz(Y;6e111o$;1rNi4LmsH6 z1miK>Qd@a5j(;T`@iamYw56oXEa!s_{2f9%RK z&tzo!@)VhU3lwYE$_y0`liqVMu8+M|Na)(3-9uNO!-Bpy@DS|EL)Hur;PCMh$6OBQ zY>B}t!MMnP*Jg8pCMQU_m`d909xF%x5WbTJn93={fnJ0wsiDewLfw~gH}BE8BKasw z|240w!*osyrjW5?s=hJul*vlkxoJ)un@1KDx2=vJIC|lm|2!KKzG3qV{B1%S_@*kA z3@Foy&?M3Yy_GFS3qf~Kl^x}S#h0VsQ{1~Vf<#PfEA``(^4k_cO^4e*sT&wt|B=RzNT@21$SyvA-0|=X^rVDPIEJig7#cI9$W%lVSrApq zl%mOMZRL5qS+8YE+Z|%4rY?8jEOB;@3lubvJU*f9`>&)A|Cai}*i-P6h1Up(XOF-@ z7Ts^PQ4^cJ7%8kZ{3H*Y(nfkJxGT`?(Qrr45)*U!DVJYQEAmns$lhIz0B(uP)ks0LT8j zDc^SEnRAex^2$SiLqL`~(3V1eON-bFh`>%~|B1kc4vPs1?!AK!U)SE|v12I0kc9r> z+rd5Ao4;^iKmMuJk38|4w4#M%l&m|sniaP@#xor#mfY-3wz7xekHe?X{=p>-R%!Ug zqe`9HYu4AJ#H&m9hJ>Q~M6c(}u0@*9TxBJRLpbb~`O`vAWj5>FKtJJ!<&ierDcaVR z6S`gCv1T5Ttr@*oWak4J97oWeY_W{SiA;9~FOhv8!lP$s8rt7cG74r~6Z6V*awiHq 
zjC?wnYlNlbG?n`*{BAvsOay|1a*)vmv@{-{XEjP2VZpYv0;~mWHnP_PH4U>1j6Uh^1TCQ_^F-Ye3ie0#ofLzIj2Yd~E zLr0{(UMOJ_TJy*F2RJ+K&{7YJ8d0D|+SA2nGUH{u&_vkcd#XQn8O^iT+OVnFgaD*|%u}^%TQ}U539&h+yzj*hPMyj%trRV{3;Y1t;l?;w z@`oGspun}4fNUwELlRZKJoPrp|5 zuCJ$}NUxdSH%M(^wh(g{BO|@6!wO4{E}oxp}X$Z{r1uFp8q)Cgo(PJ6RZp2R4v~3Y5^5Fl>svIfi_6Azj=mI$mmD zYxc;))1h;J3;G>4Ihy6)yO%^j1!9O`>B+ZAKg`cG(3Stl*<9;7>R_O#61<_YG?iiL zP_&we{aICLg<}6!WsSrps4a_n-q}#F9Abu=YLM4chYdrLS2cpqvBtD?wba_O(}E#v z-IulL5O26>m^YZ9|Ay_iG0ld{&cGlRL#|H#72)QKz?fEmY%105zuS{m9)eeae&B;! z;+8%$_8fS-0&Yj$D!tE?!}^Etr#dbU&%yyJ{#OB&iaAkW$S}gQSf#YyBUJ!?MCG3_ z+f9r5%zyIc{FGlw#a&@l_8j{)FQHu;CiNgA?KAdy=aI?X`S#MiQ%g48=&OO#y*5q) zKPuY)4VMVY*zIhAZrsVaGk=OkF{iw0Xf@S+if&q5>{s$kLct)WDDl6i4;K7uDKohA z^y(I6L;mXBw?VF|3=ha|)mMzkqDDlnaUODjaKVrK1s^!H{);~#rTB+eKzaP7F^mJm zH~3Zs0nOM|vpZk@7q9RIz$<)y#VbVS&vQ?y>~(z6@+^!8_OW+BoPZuN?En6y%m7j! zwUNb2{_9_{p$=DCjVa3lA{X*&Oi7NL8wkjzPnKtP%)Xz=j%j<|!K6w6QY$!E#_Qkg zc$GpEF-#VVxaA2sl(y0Gmv{+jv9p2<=kKbh;0b;FKW0agezxV>2n;IJihkWgxqbs$ zS-J#+kAN!ibgr}T2y1AwsHvWM z6=z>p!gpjoCg#CX>N#3!MctJnr%NsQAHxG!WA$nL6pKD0#JT}l4GH&D@c>Tqe?R7H zTtY16>(6s%;v4@zKmY&Z>)SS{%6f)E74ey+;x@zvDS5njdM2=@e-{ZGx=f4qC&?B% z@N8#~JvKR+h2B2y%j3&) zgCd~F`~IIYF#i`;D}brw{@AEj@hjV0Lb;EFzR;khF;H_!(FLtc4BprF^x8Gl3Qe z*&pdVd)G;~9$;mE|ukM}7wk20fCuy*dow3YTRoE?u(JBWtF(@#=V?CjDS2!VJ7aACBfDNW;Rrrj0G9QO^&|AFt2k6P`S$z>=wEfckU#i5r z6_|BkVUN{CXf=KZIA342RhTC6{!MDdz43kfgH3RoIklt3Vh4H6OTTn2{mZ|l=FxfyDVt4dS~o)<1ye`zmW|J<6KS?l0D+_>at#A7&Q#w|Ej`yDLtX8fxQD zlqc|bQVV7XX#O2m`BJ~RUV6OGwLyrp-9OU;_3yhr1R(!`O%E#hbG|S z*%;)lwcy1RsLR(48EgJ^gU-!0F0j1YB?5%@l{3T#O|n~qAg&V5;CFs6zj?=@sYl1p zSZR)8FMl^y({Gx4B7>}eP=hPX$YEgpBlW{T!0k8%w_u~5OvGi1uZ*_ZT~LK7*D|2_ zoia89g~yHLfu~K}P#}haT7VH%qlJkMLIvXes*u_yby#=8@a6rmKm^g0B9WlY1{)Ck zj>9|gqq_m9&cXihGfJ>F6;uZd-)s|MSO>)%{EjCdYb}8&33gL4MHf}qGc)|xB#Jci zgeAXZYD!vy>SDexd_aeCRps?1lWNgd-{iB)ugg5Ps%=iGlj)$9(cBX424EL)i&8F5 ziXiHUC?}E)BcM_;WH=m3VBD8*Ld(rt{R6Zt@8s4k)@J7vyye}P8#rF#8(g=8rByJ2 zFn>`jAW^*fkg>o#VHTpiZqMIUtb#%4tyOQ47m_;SvNAlVJeni@>x$;rS>e#v<=b;?KMfBC-5a|H8MN1SP( z?ZwA#Jikh_1Fl1Mkcsy} z?9!yUCfxOb%oCDvviDqVM7V44$GJQ;)?_w|$iYJdx=H|7vsm75)+lWgh(BIYsLuX(en8etS`Hay5qh4@O#}!7dr;CPAK(# zL4XO_vt}NX*rH_sjr%pPWO6UeUgL}DalDS4L&l+~xR)r6B9Pgz&uKmKIXeHJtSfVq zh+0}GD+Ps=^ZpilF9vq*Y@lQc}YnAP3!IrEXXg~c)Hn@|xEPTw>d;X@GV?V=O zXq2cy+s_IIaSVR$fUo2mBp3c&)>tRIGuqT#gCk_seSsbu|b9Q#rG6hWM~m#FWD-ww-({`u%td@Q zZjHV8flWuKBz@g2(%+$r)ISW`uAZsXc&-{~G+h0ZhA39X<1f$#Ovo5p;t+;yaOqmb zbi*Wcf%976`O*AD=+Qygz2`$_NS*Q1z(PpGlcB`~ZIs@34eaEuMVFO}>*yePb!BLM z=i3R5RToG6dv3+1-YV$hry%tG@{0sF`C{vjFnZ)Y+Gw03{1f)rs=-{+DhPghvB4QSePnMu z4=0RZDMF6mOp4<67wW+TA#t$J(9-tqMU41R9IOxZ?=H=9h2g?pl+Q|eZpi082%au& zOuSzHwe!eaGU3ZY`n|W9k9^d$R#)?raX^AhdJ)d0?L~{7ABmrJ^N0XJGKr8j`pS9m z(^hs;mroO0iJqcu6YOgF?tJ`k=FrK0=L`CqH!m;c0rG19i{v?K3G&8~tm62lA0sT~ zBR0ytn`)0n^??b_Vzc?M<>C0m)B7~TW>wyK*1R0d6KRk35qJTXun)pVgeX>ztD31k z&z<2^F3GoQXQIX+9=S1AR(?ed4YgfaljSEIymKHw*YNae!D1nk0d;RdSZ4}=FPYBz zlAAS|H^on7xnkFP?rYJ?YyTCoOpyH~dK&Seq;8j@EVp_fVXE$#6=b7Vq+95>uxtxK zuw1B2kYie`H9)>%pDXt8VDtiC zEqcuOmnL)kSs_{|=mJXRIcX1Q+SUE7z>NgPw>`1LbMM=r8jsxu8^7CQcfMGl9U_~R zl%U&17$?FLxSn+fOm{LSXM5<$2-!)wyt`Fat+6qD-B`BYE%noauEPLyxub4N97K5g09ee9-#-cVu^i0xy zoRRnd-(OzQFvBxk8TvWFo?|qr)6z~*+5%htD1AyHE49^%FnkiUX0Ft_$W(yq{#_`I zz*1i_XX=xOYuwsPl|G)1d`lKnv2ra-81cBQrS-gXWZj8CX5Gc^n4^r!@5NMh6)JW3#d_S) zj(pn)%ZXbXTk9ALluhs~VzGVxDOj=ZeB!okTg<5_-1cBd4a?7Pn~yiTkMxG-waFNVSf?GQx>_YpHCBlgXy9l4=5UgSLAr+VWmnM z`T$&6YWBF(0&lvQ{@Xgvl|7C&HV5eGbVr-7?Th))Az=k{%ow4x*PRG%-OccJFa_W! 
zp_)%;#!elwS$5b(Kr!+ndWj@`VN1r*!tmf^-UHjQ@~5d=&TkhmP2Od@>L8xwnojnU z9jNZAFn~*Pi?2+Hyw6$=Iw{(ee4#G4=2;sZd3Bb_AqkxxVWf{r3|6b|{gvTT*Z@C* zMo>(fjnn7Lwe?F9sMz6m&4};dt|ox2+Fo|YtH#O8^Xl0-cG#PvLAPpiw?@I)RBo1j zjG6-oC}#5&>4Wt)$5cE?4x=i)O9i^QSYZPmco9@-xw+24!uj#DZh)5o_tbH%;Pf%w}WIQ0-j@<6^_FxV9~84+U}h%nllt$;$-4#P1wGz_s*qnnCXmuSVB)SZK(QCA<<&nJfBJ=0 zUr~2bP+m70cB5l1!Myn^=^O8ei9l8OHI{^OiU9mL(&IReo*s6tIoOJDW`eY}{#tw#|d_)#xfW%IiNSE0uj~vz5io zIik3w(mP`d=Cop=(H&*Q)+p2AM1Qc5d}#FBmH>4@hG$B21L3N$u&Y(__|}bo+~pm| zI&f&(!my5vSjMbL4x&$#*>>I29l}_63%Zjn%gFYD;sF+hq_JAcOAuW#=dluJXQ^AK zWl(LZEA-3VkepjTqd@mmHluD^Qg{Z>AmnDA4G4Tt$0?kmrkfWgg_$y#7 zWr2$qn@I(`{ujIBqNX8k31Gv2PcuM`02A@%9X)Ai&z)f2tSd?xMSL6P@`6#Kq19Xu zmF$Exi_uo?1Hei~i2?s2H}qxYlafaiL}l zPtkV$=}In%G2{2#N#t_64$o(kVWtoo-Bf+u#$@@=asvu}5xeECU5!QDOpEktRWLxo zVkT!mFwVm_zHENWUj7rtF#rQTug8|8E9HHs@siA^ofMrkr&({ypEEaAYq38`iwLrs zork%>d68Y~`1)-}2uFs&gW5XZeQZ?w5ZQTE-Yb|R5t_&*LG2M`*&D*Nv=gW{8o%be zRT#}+fs0f2$Q|R+sHXV$xFVj_nlw)?y{eb+OY||0APMIeOUAD)z6xhj%0rSC-WpUK z@g<;lBJL)R^2aQO6RM9#bcWZOYWNmn=%nlYFpKXm0#IJ{~xUy$&y z-Q0UWYjLq_SYHTcL1(Kvkv3bN84evxN%w#Q)#Oc!QU}y!ko0NMgF=>6HkvCCUQCY&ymk94(&IDa?T(_w)$ueh>` zLi!REzpGp^&-GyXj-b3O`AxX>sr!}D*Sn*R2>;@X8dV8 z`~39D{UkhbVstYvMZcamgHDSkDu_&He_WN9K|6VS-+6^3>?~8<0?16Y@PfwFD~*(@ zwZCb2A?$+J)&6TKe;%0?%a@r5zaKA{5HAjRq2+-SH4_9q9x*+t!=)At>u?sMfDSQ| zkx?-Fob)Zt;s>w03*hYcOBj)jEU~P47;Qntg0Mz;MLJ-OmyEm*)^{9ZZ`6zFnB_@d z2`evl?}iz=B+WDnFHnW7+ibKD4fOnL%ahZeD3!}S{e9_*iTZKva-$~s!E|da4;9QR zGJdV$Jx8fgOcC@?X&fkm=CNl(rl@bdRE#!dbutP-D<%nunrTR{(8AB`4W^1x1)o>;m&vKZaw%@5l4|gRRr*&rCNMGc5=uI}he!#s3)h$!y z1mH9gB42r4=5@<^?=9Ai*F~i?Yy32V=7(e^Y-E`aZj~PGqPzu$@xrJS zqL+UoyQxV8L(u}PImcunfN!kZZ>lsqR9Nud3b>l`x*0(O43tkIl)wCZ$e|p~)pOQ8 z8;czh5lH8tpeRkgmm?=DJU%0MQJgl-N2F~EL05CH`NSk*@@0!dgmpuJ&hK!(EfXLn{if}ong`__hUd($IP?T(tRYw11b zUDiCu@OZ+mGwhgbuHBhx6uiFDnyK&DdTiiO&PL2fm<(k!Nq<3KA_}@He+n2a*GOk- zVG3e7@Eo^HGR(_XgAtI&?ytgbk>6`Dz`~2;?eqeIofn4KJ`2j>=kq;c)}0>~+K*5L z)*QoiI9Kz}^h(M=vhKj6;EJ?wk$-j5zUwR2WtFPP-c_}SpRMOe&Zs;r z@e>g;Upm8l_9farSsFnY)~C0?qVt9kjJ_e-+EGj*BQ4L*tvtT{ZqCa0;K@zsfQ;4# z|Lc9nSfR25AL;G9$z&!Ps!=t1@N}C3wt-*|QA`QzQ#f8u?3=#bh!x^IGsRYqz?{ja z12FFKA&YV2jK!+FJ37&YdmL%2OS-TVGu_WO1kYQXo2#IV;P13UtKO*+is_)luMc$C zGFmkVt+l8hN>q)BXT#*zWqaaP2YeqHOC4dF7-2G^kb>6Gf4I9?o?agh--U;h1am&< zy&@iPSq3R?HLREZ=Uq9uNSCAY%x=q`Tov+N;rk03VYz12$# zOIu*FrH~B%6`=v0AtfT#AP!kJi@0)gbWs={d=w1#w;mn+dckvi82RG^#mD4P;Fhu% zded7J3A72vDmdWB7D*dQH?IFde5&VArsM3E6xG$;Anha<`M`$bsi8F zm8-%VwYOq;gbkLrQQ_%yU}~We32sZn+*zxa*BP%`<&Zvzw!@=CRVxNK22j}YT@ayV z5f-bL33ahR3l|JO?s5d#r+2P%E@Rz>{NB8BB{gQS@p#5D=U056)`-;BK;kqLGr0$4=f*_yywk24{+C28Rj-M0p;I)?GL(MD)%xoqGy9Zbd<@L*Gr6S%n5*3}cmhODCC0B%xS3@as%qr(YC_f{IdQT`fS2PeCv zG%mwrGtQwZ@8gqim*SzEAX+iFxXo^ctsbFO`?#`h>@xO-v&V$q_#}nwdd=u}#7(eV zwNK9>R}px>8gvaT9J>l~4t{=eEQnCL2#NR--q4nj1c~@2`6EKFLvZ=XI=*B#FhL>h z%#77ZruSX`SZ~;u9hA9<_Z>kS`ESh0yP|dpYkL{Gjkz2Hwcx7vN!8t$D`-O1$A36^ zpon)?c5t3H320=RNb%@Zw57T7!?k>xn(+)W89I6Dc&~QX*}(9QO@97Sys4T)0-m|! 
zjAZUl-@yB2RTj9-8xwv-rUy-c+L8w_CBLMZH7ik}TRdNr+y?S!`@z_w&hWijK5T%B z?gc{DuTd!UB}MxLmrg6Q$V!AB_mTmLzf!-=Dm$|rLuO?lrB3#Z4EBT5`1_K(l@E^o zYdI@7lBijBKV*udI5rsJ&%ZNvj%MBN8Ndq7efl1!MHD|(mWFT>xG!F;lcx={T6+C; zapkp7d%i4=#{xGUoV;p90<(^szWr9cq7=5HTq7qLe_AxBJcoZi&hd3)K*Zx_cs;EF zq>j9<*#qtHZ`Zd2DjA)}Ty<_Yb@cpw(10&$#X0gu(e;P5*eCyiT{hi|t{Uy1hI{(D z!i1r$rocR(z1sbw7iq#3&yzIViCOEsTbI6PRDtu7)V4gsE(N8d!X?Iq<%fRmdOpnE z2BLvLe^Z3x>oe1?*Ap z0^2}I_(PJr_1vTcrJN&3oIogq41NyVo6>zt8_3s-LNlIRCX|OG;)KJpOyN8ivuzK4 zc}POGB=Xi@HJvWdhwV8NWmR7aEL}&F^#J`Qe8RL|UTLf?0z+1or>uDT&{n+GKIN`9 z^&R*3+r}ik)XjjKmHA&`WP@+|K5!LK>NOwTk#?AtI23+0>t{YX6YlKyM(jIf@1ggL z;8`|_nCGzQOJER>ae?@w(@C|z4_EUJ)R5x%^`8UiqRUjey40UyP9G$#-Og%?(qoK1u53 z3?KdmO8e#kousJHY3*I`8o46H>BF1w!HF8HpNPLsLd)&`lmZotLg^MQvu%eGlMu*g zWV0BgZLQys^eo5~1pQ>g^B}im4Ev2a-O{J37}{IbUx8XexFj#KGi1NI5>Wj58Hj;C~ZWl+VpJ zBN$geQ+cdXxA3b5wu_~#1^u<1ef}M{clX>4@p*rms9XOAx<12oqB6xRxEfpw%X7lf z(dTC#F|VjEnBtvgyyE|m>-y$1i<|8y^_KX_aDPdY&F~iA7rIe`?OyiLE@VS9$?QT@%I^&8+PbJ`C?Ia@H4?!VS}f!mnb1EF$()= z@;H3iAo}^#^x`PGIlM#8_(QpZHlw|xR=L_D+hVDZa%i}7s-dzAlN$c71fqW&N>e@Wbxydb42Q*a{1`=!67bQj zG(>4oN>`Y)oXp4h5SohAt&-@QU_YPKpOchz-de9+-8#>c{58a%Ym(=!h7j~&+4n12CZ!B<&}8L4%oRCJV^zb-tkrAPBT}bV$+CEmlZCsr#U+ z0qYp+RBoG;vx23tQDMh2FP*kW8vG4>wqd^^L9-cUxxalRE^+8jF@L~y#YX?(vi)-@ z#)ZW2jJfZ?6-ujK2U>_jgT5DF;FDR$@_2(XIal@rdj^H=?6(Lz#6>ELeYNVr$9O9b zd|56HsXr0?vvC%_w<^kAaKLxlBWXm&*qK`^2})iGr|Ov9H5sJaV>hz&f-)}~37QID zJ7gKRyE$A)r?r_Po2%7=t;V8uJ@z$ho{MobI(hwtus{3&H^v(EI>fq%^?hZDZM&?c zlUxUeN9MWaPSHRCfqzpumzx3;V+)4I;c}#>a63N?6L>DB)>2(0?3pM^CcbpBGr1Rw z&2$UHaZ>;F6I9aB~Wd>JY|ZcpQAU zpCcLmD}6kFj zXf^J0s#@vx@RL$G^6JHyrb<{~5$W^M-pIw*dgi7{e_Ug*GF0h)VxVrlXM|o1u6dHu z%`(;P*HEZaV{wkONme>g3e#r9+ll8c>Q0s}_CvdcQ59gxhT#nN-!sgNY|rD~^C-V~ z_l3v3KSvZ%G6bj#j-NNL(G(UJa_(bdw=OY^^K}>%ur1a=mAfy!LhDJh!#S|z?6llJ zHoT{K1({5UzoFs|zCsOZBPbzjONu`!`1{LBw%NSz$mIF)u9@)iG9^BjmjpZO2O=p|c?UQ`tQg@BHp=b1}p+>KTNyEo{upu{zRsLqwIS^tfz%Y~y0l zb;CFB2-E6PNhu3?`LKf!%+fvE%Xfge+O|YnvGb`Xub6Q^goyajPJMKKH}7F3kAt#l#}vY0`N))G<&qtJ$`mW~(JpmaQ#Itvc}Gb9n8dk`)G@ZhtNByoz00t6(WoUn8h9t|0r-f(%6PDnmuuO2{H-sK^q_+N5uaGB zFGJAV%kf7<2Xu>&6;O=__;fi-W_{l{Q5{)A=*vjy7Ie zgb0(H_05`72=lASMk9IUE||L~MXpmcilE*a z44l)OL%J(iilmWr*|3Zc0WAcC~)Fd(h01^x{}uV!rqFBX{XS zV`z9P;6{zN{5OlwVSontn*4awL)S8#Zrjmw_H<8sa_Z}1#2+#fkk(}y2C80*qS%tvQr8fMHdZzh(HfP)?J9&``Mi+sN)lqP z)s~#G__$I)rdsDfGkqPZQ9c1&K!AjX6nU+XFQsO2g(^dN0Ul#|&6$OJyP7%HIN$F2 z-?kK}PY==}t|DRVP7!dvMeRxWm1AY8<8B>$j1tQ(HX<6}kLg%{ zSNm;(>ehqztPwkXAm}*<0M{EK0(PFH zRtFp%xrXdI(I;q?bQTM$$5`XYm#d-AP_~?~NK$i)`K*=B(ed*5N*UpnYUjsI&ijk| z7im?4w#dK$ruQ+rD#Lgzkyu*3_`%r`L`1>H@eHG&+Li|4z!|LJf1-A-L2t0f{RiY) zbCk^VEY6!Mbwu{#a3g>C_$q4YR0;ZJfbXY)Qoc&u`L=2kGxDL4U(Q7b#f4ssab;Ht zXBG!MQWF|c?$zSW+=^Ei9@gy19Co}Z3P_&n`%J+0cp-&WrC0F++U}U~ri7A3p6R5Z zS9(VWd3u0~d^%lB{;L@%j?So=LQ;W38QMv%q)SB`@B$?{Jbz3Ra2Do&gL*iVovcO+ zn1L?%?D%cA0;(#V+<1_h@8G7_Xc*`E<2Gs7OytWef%Z=jep*}P2CEtSVI}$PS?*?m zSuT=l++v9G{q`Z}C+=-poi^K{&t^?JEO}gU#z8o|z87I>@v{u&uEVIP0&MU&gFVGG zy_1#9X0!_H_WL1vyLem@)5SWX?7KXhu5lx>K%rJljW#}q_O~LGp(EWBgY!zEg_pyO zMcwML&4Rx|n?Z|~?p2+cY6Oxw~K!ygWDxIV10!U^y5pk;$&2rvru>` z->XFloK%7DQ7;QEc5L>~+57g$C)`#sg|c|O^Y!7BBWufW%XI)(Sb32EOZdarOWVlM z$K(ED=zFekmW%a8(Kzk_;DWC7*6GXiA23M=`X%4qWyyg?TOyFpI_>xOd)>rNOV1(CW6lbs9hd8ak zba{bxajJ*z({@O;o=|4v|EQpjxeWRC;-Ms+v4vKp2S-xhej1C1&^@0=<}3nBS@4%H z<#gd6O5e(+f4r3^n96;t!&~ss6eC%#zS|93%vrH)hvI0==%@lUIQ_LvYx^OhMjJN}64M z^?v)K8Uca~Lb2GUXV9df&5c4`l$_8}=I_15!ZfqN%Un8~f%>{b$JL;gySZU6Mei>8 z(kcQNLAraj;r8dVF6ZfEsEph25wp}k>Q}UEMS6;H7WwReM<|7BWAwTO-m;hl9_i5O zBx@O7#NMjMf*FpI>Bd)c#Mwf9)3V3fU8~O@TM{iB=?qV%()#W`2UD3E{}E$saXdYZ 
[base85-encoded binary image data omitted]

literal 0
HcmV?d00001

diff --git a/images/img_mobilenetv1_inference.png b/images/img_mobilenetv1_inference.png
new file mode 100644
index 0000000000000000000000000000000000000000..931442dd849c68dd4013219575635d974e092662
GIT binary patch
literal 72038

[base85-encoded binary image data omitted]
zyd=m5vd=%>?6MVU7A1tCSbmNMh+}rYBVkZTVaBO%wODTw3%+xOD^e7KjPxqt0@+id z7M6o34uw?KbGj8KwU43WB|5|ACu-Z2mFEGa70d<=hBp`c7Eu?*e)~Mv@U$H>3W5-E zdE3FpBO=o%D^SqK-O zmg?8^hXS4fPdBV#G=O1bDgZtye5aD|wOmDz;SWIQD{ja6=|$0zqXYOh3ucolL-K$% zwSnCuxF&kqjgF2^P9)&+_4xw4i|nV`&cBY`oNd!EX_v;!Py~#3NErv+djKiEDX=wb zHh3MfDrLN=sl@%ca47(5^x98&h9A#58chI?AKuT_ye?=sheKA&z51%^wOhJsh1-0y zKoTB1TKL1)`{aYLI_mf(@EGA+<1(sW{h~iLkBW?m=lnb%3pbeS*?tGdcu4Fr&3>Zx zFtVl>-LYg$@O55BsYkCLhKB!pQ%iz@Q}aaN>7T-;BRV*1!&4UvhDr>Hm@mWQVK0~J zWt;zxv9}J3vJ3k~XUL&JL`q6R0cntip%G9NNhPI`?hqK#qLh*@=|;K{q@@w0yStmS z=Kc1*_L28J@#kC@Ogzte*1GRs-Rdv|*Lbc5$7A`?Y>nU|HMixlN&RS0w&axFlU39m%nKh6cB-ZJzATr)zyTnSwH3|A6@0_5qEwR;k(NMwr+j zZ+OyFL+O&RX0q%fT=Uvjz%;nnd+DuEVLgeKd+uX(u+k4qfHv~LD6j~uBfMM-`@Q1P zZwjhfO_Q$x9$?>=;3A3Hu5O0lkVAaL7xg#@8_`Qj-M1AoX72&^1fqNVPXBAh3loyKycmxZXg5LdyW%+9Pjq|5p)zx*6cWqI?-0o)X~%5OQ)DpYanoS z6u_HCy~W!T!_&5XB`e=MOIf$uN@+aT@}Y@1Z+R6^{4^0dr)z!^-;G(8_Cs z59+sZ&&jv*l{b12!a%ekSbP5y4Ovl&-tB&p-tR}$l^N&CcsijJp3V<>_@9qB_Oew> z%w<3So*_56EIOF+WPf9-Zagiwh??(8)I!O_Vkm2?3RJC|RJK+@&!u!jM|Os)zJ)9& zxJ>P4c~r@XqKQ~YyRVHDZf9$yWysZGiL9B=91B=vja5>(*8{YK`VhWjCWBV9yffSU zqGxy~t9Xr)ALvCybq5Da&3UG|OfhjC)y7(Z+S@^3_|C>|E#IN28t-E#u_06-{@oX) zFBmB?Rt!~V+{rqxGX?uIwa)Wn)xv_~{$;gZjD{}oY}Gk?2fZ202o7#`Nnzcft`$zi zTb|h7a`7SbLKHf9>j)krkGStH+q!XSNO7i1DA>1J(&j$=BB-Yl$#uB)zL=+}7xvi( z4R>Cz+F(8Uq3=$X>wNzXhHbSXNMj!+<13VUn#+g zj+PLHJDCTAR@-H%VDdnyQ?v75AK>7`jXmp}^I|H2-B>P?|j@yznZ05v|M^)rN_T73?=$N^| z(x@I7eUpNjqcCH@rJ0DS8pR`ezke%Lb!lc2i zHEM5YA{{~W7jm6?&GX)F+|ql|+W-{hUdz0J(+)pLG)6~(jblGyki;`df?!^(sP1TN zy_IIa&S_nJ+I+`rHh@&*`s%o0v*sY7j|xv_$Lz%CBO{-Zt9_3IgRL|IP{o4pJja1f zqPrDw&Czf%C)`8zuggXFND>m1<}hq)^&XFqihEGMR9(Xlmvu;UmEd>L>%6Tc>L+6+ znZ;+Xr116uiGs0i-A3(oVo0j}E5h3a z?ECdxF_U(+_|zU7#i~*og}PRjO{8-Q`hYG;N@&=wKYyEf_qmi5P>)XtYvvc~biEn! z{x*e@QVwip&1iwU1*h62L#${sr=??3If_lhP!wEgI#;Hm8?Hou4xge9B827}h5=zx zybr$})qc;N@`TgRfE_wAl2zk(O6Di*Bm6}M6DJgvc#$812>E@zQqwV;uDgT7@2FhzS0>KBN@^tqsgT_d5$(sb#-p zx%D_xdr9aTQ0Cr=ZO41O@4kj2HfaqK9K?`W0AxBc_(IanmJ45sDcj=2@+V(J1UPgob$D-@ga; z5>0cVOz#;`#3ziA;*qZci`<=2#&6S4OW+K3iiNmz{v#Y2sP`02%=njhR3Z^U4V zj_<>NITbwjgr#n?tfLR|cF|TEw4Z?xMW~^x|)|N;O zvyT^mX40b?1o(QXau?3AfgJ5bB!myp1f(nfBG!F48N&t>sN(NJmkrGjQMNoTw!e0$ zxtDJ8Mu@$5GMA$PdnXq71o`G!C{b(cR7B%P8(_jkG zPOo(qNBt4ih9>F}86b*WfQ=TUKy8)V%4Rwf)!&R3rPN)EU!N-Lk=6=tCH=0Ue@g-hU$LRw`gEEcf#PW96czO{0oFy) zwq=D`iF77i6a5m+vBh{nz8tH(=DiN!L!Fwq8lQFyPoU(#Qp$P$%{K^xd`jafy$q} z6L4}$auKh#q0u^f^LvkCLcV;t)Z;syufRyVSzKbD?{Vw-yt2mW3muDv_Rr2wo3Y<6 zbB?Tpx^*cv-!-_)LW=h0s~)F6e1;bfvxtlA-CIagY*Z2N6|gVj{nxKSa=Goc-USo| z36VxVJf=Ooh7{>j@i~w!Q-8Za5|g|wA=yoEm@ZSLVNk03qf)B>dNWere%4s&$e%y!JiQcw<^6+ z4?rv|82##-?ol`8`<#NA4mGfSbGKY>zFqW5nX_=vqj+J*l`v5(gO@vzzZ-ut)R|1P@j$+ndh2RKH_F8``N9 zMR^)8Fl*kQ8t~x(9_6DQGjt zjuFbo=3X{U|1}~=`2oz&`KjO2HEKeI&YqHE66!BnU zQnq!|h6L{+vfm$3o5KI3gix<27`$*_R*q-4*~#g4#-vP!Wb)vu{k7%>zd zjuxls7+zhRs+3u%jrVSduWiz@y(%v~Pg7_DT2=FH8cNdpZ6P%1C~xFjGIeoYF3u>K zc(FB^V(q#gtQ27F)yC{i9=lA5az0P3QIwO1`op|tiMU|NS zNf9GPw_Y-WAxVUp<<{si5jiiJ76SZe1PcA4O?H6-s2KPg3#jJc_`0Ejp-OLmUE|3AHS@dI_PPz#z;Sr z`{b$fWin~+3@!HxQsvDQlB_fxM9#Btc|1*u2QB(+S8Qjml3Lr zN35%I0Ah%QLekBp0SDu>e^ack?f8=nF>A(LjorK~Th++#p#8%7ivb0cY7U=|=!Epo z&Yp#fRx4wn5vqzK2-B$PkFE$mu-9C0uO2?*pq%H2dnXGuw_p)77>uy5gQd90Lq}K# zfSH$01_c!_WyCpfw86s*=MFX>70uE7O44Yc2rFB=ssu}h9AEzijOC-P zjD)k38voovk*ri3bp*_KmE)RYVLtp^7ducK*CHB%aNsN<8ryX#?@8c|fY-6*t?o;& z@Cf|yA6XRpQNew^k{0^XwQc(JZ7u0vTMs~L<5^J*=+PQQb;2_fh@aZ#1}ch86c&H^ z6~*P_^k$?ZtDoL!crp$XO0RF0>t?mJf3kUq4R4nUGOL2$JsTg(dY)#yIX0~n-WcI7 z10lP1MY}*1ga_bUHiX1p!xiCIR=us1H%h~>#wrF-7dM{9KHSlMN?rKmUknr+C591z 
zh>+L)Bl)lHztj``zD=WYJ)EoYDawerrFf8@mceL4`d|NMn|KR`LkC~a>e`yDVI!J0 zU~3MXHcd-W(rfZXPk;UMqk0rx12bAYUP&#+&Fc`>cuk5FdO_}%G^A|_tz`)xjwH0$ zXrbfy(RjDxgz?+xj$cv5%zrW_L& zzvcSZTlhebZxAo$`4UKpzDe)SK?|3!|3Ry%gqh?*?iM$wq$|eJ5|WTCPRIP`SI5Qr z2dwr4Ia8LCB{Df`xlvoU1tlAq4=F|)yc-ypm=p&3(^Rz4{gJv?Q-}!0=zm=SZ6rNm zySMQ86znwV)(JwZJ>m3d0&`6ODqH{!{1W4{qGxpfKLG2>i;^y&?t?+hMt`tEP<{Y# z4&*$sl(_Aibi{EaMXo0JjD<`@(qG}FOt+1` z!UF~V{F_0beMbairAxk2(=fgPS+}&bjE;h5Kv5}CMCD~TY>w8!uwpB-t-^b^-0g<=GDA&BbmM{eM^H#k`}(Xgm?hiA1GENzS7! zH?y;|E6>-=Rte0l_{|l4_1&@K7CE=kd)nQ5XyN2EZ}nfp)om|!+h_&~tX`3zNQ=dv zrnJ1-3bZwYNvygqCzL$if9`s)c<&y)(!y6!kMixqKFoXVOe85gSD%SKtE5Px$B3GS z6!uOR`<29EolQ$8j%tE!cW>HGamw%S=4-}=2`TQ|34DN9@ZoGN`uUNq6Ng&ZOB>1G zFe*kx0KsU3n?=v|xX&=M_;*|X#lKj(TzYK4M>>zmV``!WYs{&_f?8npUf5;;O-<4? zq2$|Y8m|+HYL|5r46kT*opb=ush13h-t}~M+_ruv{C@W7i{M|Cf#Ar z>E#~BaHktL{Rr^x5_Kc~`w{eMh=v!CpdkM2V%r|=Qo5n;U~WSgZi!@LaHyMnmJXvs znF@6POa(V+q_ob*DMSekSdU+ouAz(t5tw8VJ{vybIVbQ(P}v2{*jb}x+D|3B_GHSV_X#7eShP^plPyNCEY=Z&2H2;C1Lh=e6%^x z1O^vjC|hhn_sO)8Dn=##AE*O7sqt^ntqIoo5|elzM{&Gj(ma%91%fs0bTxeH&B6O- z509GsFpUTHd%N})yAH3gax~xH)-bH^VT^7!ygZAEmYnfSB;$muTX0uT)wwI8=lgS7 zt*L4OKfN12?Q+xbh&DYcCO$7cfuZ++!{zyLTlyj}X(5>mft@dVgUaIONnXy^q}RIy z**Z1la2VXak;|~H6a3ACtkOQM0En7e31~d;P(z6*D3a+g)_J#4aR-MR@}h{_l;xTE zzENtAz|U5Bmb;>Rx3!X7=dhlXR7fI2lpF$F+P>dsWYn5Iixq!86RvzVHECnZNMtyRnKV$fm zUEK%WvyWEc5`^A_yG*;uIZMV57!qc%4~R_yBpc*EX*D!@i=Cbrx~(@{N8)wfN=2{t z15!i9ZD31c)meNb-Zz1dMCN)$-sxs(e%u*KHb5m8yZF@no|_DN-GR!)?=cWZ z@eQCxB{thhXLqMgJjlGLFEMi$Ieq1_&YeNgSc|H1wfQG46F{tz)|Imu4jk`xS8FkA zDQvHAF@EaW_~Woa;aU;J_VQ~R{661;K|4k{Fa;t*cdS~s+ALIMbIA4ai*-y7l`!w z@MneX{9I0ogCMHx8^m!pv?%Pu#J#;tH8aY;Mz@_El~tAcxO-lT6DfVR-e2zGpqCAf z04Lk{YV-lQ-fwZ2gqJ1<=glsU%=e7e+q`ca;`U;o1(sOqj!7MyabdB z=!K;OuY6tI_BtdiRG z;f_kXOrKr=Rks01qZuyp<9>u82;nq6NwniwXoQoYCzA9+C6a;Zz!`u3xg@do zO%=hc1hQkV!BgzY7rBJ;fTx`*Vm4 zSS06m<+QZmqdpux9I1KtmCj|E$|Z+wR?{sX7*{PU0x>{8cG|SSSUF!y)dSg0l*A_llBi+y5LrX# z91K^W0rRV~4%~dmR)~o@G6`(_6SuH_$jhFJE>q1C6-fp_$VQr`E_F$ayoOzDhO$z6 zC#O6ogO0d3{4m$GU+WI(+W;f`bXf39ik*;g8-9$*Qpv{V0yl|^YqNxFm2^Jnv|JO< zE`5bBKX6o@EE6GP1{THF$^nwVs^@uFmh`;dbNmqrso2ozr=p!8P-u_xOr!#t|C(!X zpx=dJTQ#LUrK8#Ols!gi%_@@{pzl00>rQcHx#bX{Fqgj`AKD9~{0KsN0dz-a@!fs@ zqwZDf@Ak(KkbPsU^GEj1g}$M8e-UaPcAAN>dbm{(g)?V$W-gRGuj$DJkCS&H(0I6$ zC>=W41sPPNYzt=#hx3+GEndyMBp~(5oGifDkk!$Vb%lJn7PJBmlyYpnG?OQ8;3T6ZlZd>LHAd#^0DCkr)H4Xi&USR zMXgKf&q2|_td^@b>>t=vReg@RWvC0(?fDxyC&doO9v5~A(K1P^wc0E%G81A*jpp99 z!B+>&@1wNv1mETQp}vV409inXzsDGg^}3`E7v8EJRK~RxM?E469H=%{#9?&1JY=^2 zV;qg{+&>U{9mZVvq`xKPY8u6!yKx z3>U?=pLKNwqxdi8c{KfYuTu+F(t1SDpqYm&Q&QU%Dq91Q-i4Vy4y`o-u$|fhleV_D zIL_I4nTKN1zV?eGt^C-D-AW_&n5n2hlAa9{}&qyxfrMieq@y7>~%;r%cAlX|Mp zo%iLV_O@N$vUy>NXXt5 z;qIm&RQ!}-a==raFPXR#^P?LVZSiaMSVkPBY>Kc%Q>gueM%0Bzheoan z4zOei_^AY*lPbr``ksYtV)K{T>~~V>ox5jS-{YI+1+Z$xYTZ!0Y1Gi~JPa@0?vn__r=mDmt<4<|Pk%NmxtDIT0Mz;^$5 zvSM0+^cPm2h76iNdTstN3dPz-9tE6?5fF8fFx(gfJkbOX4`}PI%oD!%A)Mv!kkJ*I zX`9kx}&0 zCSLD3o5tcO+&SI$7!sm?c=SuC6??voZ8d!qMGzeq!cA2$|>a z2Su-=vOtn2bb1pX%r6sV<}_l;Q)IH12plCvU(p|~pWla1WE-Hi(;)0^T zpthn(yG`AeSWtd*0dx@*g@^~%i$u@S8z*-zY;tEQiOACzH%(rCk%>S4NXx1!vg6KM z*`_}&f}#q`%9DgA^GhQa9TzA^gNCYZCqzd(>aLwTNok^T@7`ap)l5VgsRi+6vR>}g z&?ym=re@43?`zLL02WfMOU@}Z`}1SF{sG<}>0CG*hS>49L8YT^-!1~~qjRsT{MQot zQYX;Pmv`7y0Li_zpWs~Ikd-BWDp~hrVv*T1;ilvyEbX6X^9($jjkGoUA;NhJdfHR9 z>I#IfAtCfB<6ryJ80qLFK+f^~B(u9>K?NYox^uH)vLJoBTyx^+8Q)eIA^sGaZ4*u2 zB8-a(1TUC8NE<*5hCQ@h>?F~pqm_>r(Z0NIc63^*{xivAOE4yz^$ZDzZ=7&_YVv+* z_>0Hl=_y<@ZBYEH0W~3cFmSnd5JBi5lJZu+)DQ#CJDkWM-~g7|qt0A+Sz>YvW?7>O z7YgaHNLF=)O!litg82O?di{H~bS!6!a<)CDr9W$vMP`1py?R1Ro=S}(KZ+TBmc(n+ 
zxMXbVyU>-?yM+GXZi0#R^;(^N%*mj9<8sSVG!XHMVee2NGMj#&<1naf2iRm}N3lz9 zVfbSq6Q5~sBSwSB*Md{jAbqsX$pg?m7}&jZbMX!;2!Ub?U-N^4>hneySDVbI1A4j7 z1U7+s=}W!m%#%4TJ9J(0r&o|nf}25of^q*)6&$&Ms=%ACvq=*3C4`U}gCNeLVoMpC z+)7OYYGLR2QZqY~bkchg*It(keJRZ*GcvMrYhPi4PC|cL?_p8l_Ls*t_Ds4T4f(s7 z;AUt6{WHZTSSUa8+Moz8bd z5v9_E*Hca22D)e2jrJ$7CB`4P4#Cg+szj_LgMWQmBBE$Ezeh#dS+iLsFc zLbC|SVHD9;Rq@_;QQrUgi6oke0%eBb3MVfLLQm8CQ3S<|5Id}9W+uh&2ya&NaBY~) z5uyf*$4E&|STdJaP%2TBla>9t!zoe6d=m}*;rGwgG ziWjn&Y=1^(htF+}l_|=uvg~2!W7{5#J|{+bwuVQCt-Q)28dTYMG~-9m33#W{IXHiv z_1Z_&?PTi9N;`9Rv>Er!vGT!^%i&}V<%zEBL0YW`e zl*l)~b2=pf%vDni!7;2u<9kb*p;Uo{RYBv4(Wpc?8OBmFT-?A` zt{1?F8=t+s78KL|0T30)X z6N#B~G)h$2nUEY*!POd^BX*G+DtFsJaKCvT4onEYl7z?d!i_NF#HnD%XITjp|w9o|z zZ?T^&8%Yd5J$brb-h3$hT@AITqj@3hQ-NN^)7CeyCG(2XEPO{|sB$;YP>RZ(*7Fn$ ze^SMqT=xW;HuI#$kRm9qO|IXtV~FI6zWyj+8^?rYnXh$nGnW-9TL(>ytymNR?mGhi z?Hi>zcuA;>RHehs`59i+0P}ip$Hv1!e@{|f*A%9JPrLcA|KF_(VL*ltybHzBZvibq zbRkBG$G(n^eY=RkQ@-fVL-UD0l>~(n|H0~b{|_g0o4bNu>Jq5$EkAv=`E=fcA=6Sg zPv8bc8W%juxqq9^7YjuE4Iuwx1}KpMZVQAZa0&m7j{ZZxw96nP8d5u!{wpy75&-c+ zy&)w=z6|>WlQw%O%2M43tOuIXQPN~EJ7)oiqM=?;iF-}>`+*GN$6)JwLiqmUM+KLM$~SGrNSaE35NQy>^>Bq+PSt94Msp-> z#S2I_X1~ZZ0)rI3eEn)>2vpE)Cw~9m9FSInSDk91uIS(S3Yw{DY3Z89uWzFcq_>6t z`v=Uzk-q>qdA|}qt8w0RTm}<6Dav=S5r`r{LT3XpHj;Qy$$Cb|tWj`R6ue9Of4qxe z%s>3b@BDB@_9}+vD%%#w0tsBc30U3I!C|8UXdoa84*mdz5hJ1@&1l>$9slJy2!d;3 zK>>!;LD)c#rjNs=4-};;#jj;@84yhkmq&5Hbg$PL!ED&}0K@2g#DAoN*c19_-$&E8 zo_O5|GGg-VAtDe$7R2W7{8v_=IDG)k-^WAiM{ovm-b)Pzrpx9S zI>pxq?Scaz1T5x;7+s(h|Ndi|5pz3nEwn^({_5c$jBj1)uy!+N6I zcibZoGiQv6OrgF9>BN%6h5Knhq!FEwtb;tXR8b%|4uo`Kc7pCSe*|QWD&fkT%Ex9B zX7ty}IH*}j-(lB&0JtcLSWmY(T&V3EAUvVcEYgQoB}g=jA|Iv`@uMD*>W?h1=7q87 zuoPc183&@hx&_BTmyv)!BSQC~<$eG)ijatHn?ZGA#=b}E+dRPNiyN}?vij~ztj?Ly z_HPfQ#Ge(Xpi9UE;3`Btsk_`%Byr;4?_iXAV_n}|UFDS9Bk0sq_)+{w%+x#X|2*73 z2p=4SBr2a5kU5_|+(v%&%Fnul4Jo7$V3Vt?b1^C_QHM1 zqFBPd93wuyISY57oa%qskbUf*CTk0$>})g(qH4bbK@j!VR~|@XPy*Bz`*wwqv9Z>P z1c*7gx5XkXJU*rh*2df;Yncs;#;rhy9>L6(yX3$P|Aj#81U|>w+Esd;!1N@JFWd(e zW0dVv4SjNMfn_sMcub7BIaU8E0J|CQ-L#x8j(n z1|NFOkIPTC)o`)*32UTjhE?GX)Ssdo#56pz9Hqsi&B;WkKtl%d=I3R8`&QY!Q*YN> zwZGJv@g#1cvBKvO=TeFvr_~s5HUPpjMnt~~?0V;2wS4gAX{rQVU%P3NCsS3!l7h-> zM)T?VgbAk+ohTxxl^BSKui7jhw|uHBvg_~PRgE|T{`D8o1~R15U)#0`o!kAu>=c}# zY`nC5d}?CX7Y<0r7Wo5zBjPo^A+x(J=Do1E*nz`QOWP}Uo(Ez% zAfE|uWy(4^{8EQ90b&+^d=M0AM!R#2dy9#aXAiw1N55)jYcwR{QV9)J1_<5SnUXjl zIQ|1Bs0hmLOaaT!D{gL#4$xaGHTB74J4vK>?>{>DseGTpY z$3U5D5yD=<;dO&}mKKK!p}g%KbwZhm@CZ^{EUZ6M=n=nu`=VL=g&bsn2swEJpGqAS z*JJ8AZ#;D%-xd*#-g6MU!32io;A;;yAV&#-6L)N_czEdS5j7vi;WK@--X!sB-ew+p z`sScn!0mU&mc3!r(CDx84RrSwATAoimqT0nU2J%NlFL@wx@z$b@#X$nk_eIuCjX}S z6U0&+KVdZhIsF!z75ljIWNWAxXm_DMCy59N>pp${OfT0tsLb)4e)M@8eCOgihXPG8 zn(cd4uuY9aX8z0=9h?$T^KLqNGGl6Q0E$zO%nD4+1g7h==G+dSSA_F!YR@0-$ z_$JBqFVk#FSO4T@CfNK*O;2O(`z@>CM8!9g03_)`;D(o%R}a#f2~?|Wd`nN)8h=w* z_xrhwx12@`)swE}31P#)@SIJR!)l5|-5Mlq(&Mr^@Kbo18sSsW_{!U1x!X=T=?L%W zE4|nnui&QpxBFnajg2So(_uRcy-8qeSxQbJ>fxV$J3rxhJo!kyThHYYnv<~Ev%s>Z z3oe&QPdkgPSe0A;nXBB_X!A=G!hJ;0EteoQb@9BH zjh&x!kP24YSCYkE9{#EpDH*g)KNS3dIc>uAd-3|La!kJ}%K4(nk~~GlGPUnD53T(^ANKKuC-QK8`@;1v-8L*cV?W z*|Jpq+=iWir8+qng3bSOvb2<`2UHMs`&&MNA)|+2(--)Piu1R`1Aaplt$F{?p@vVoi5<0OF(%u0!Hm5JA>H}bqQ@wfV+S=+|pO8N}h8;dkcnSp{ud&#FCeemL+3Uc#rrt z5CvT`tv!HK;YsbVo0}mLx@@+9nz*dWaXV35qwqT?CNQGe4jd$!XPcMD(GSTJSou<+ zW}Exy+_HrQu#QssMtbI*$-O)~UbmUw5oc|nZ)~7Ptzl|3-0)rfj zB~>EB--#aY#ZaE70iVNLoqV=P1U{dm3?37)nJ{rMLV};*3)`y-1$TEFFcS&$9!1>G zYMVa~JqONVbNkJa07QKgbb_LmRLBGjS;ZR0J2OogLHYJWX?K=gp9B4T1xY2GVf(Nm2tv`#tR(dc^y zBNlEsCK|SF@96oQ))vxv#8IxO?i>|FCZ-Z_m7y7qkcDf^69hxLiHwWtG2j&g^2!`Sy@aVWk$rL+z&8*=9Lc+0=$Q>-!q-TJG}Q 
z4u8VgS9b(5nC0)Qg}JDH-3F}tonR_{dPhVCnt!UO3cBP&&IS~*b_xhRaUdhw%10H) zGQEP@TUw~n*te)Q!V}qRu1>wVZD%0JNpKtLnM-055j#@bp;c0X;(~&%TPcUwXL$YX zVBB4&$*+avtRQje!W(;oOqKvPyIjF$h!X2 zq~GfL%apJV^;hCQA&5BZ>W(qEC-uku*-jOmrlj0e7+`fRSbMD#egw^ubZLwVsHix~ zHRdQ)#CSIE?f?g=I|fsb30be^rq^D_eZRH9tHuzB0AA66wl?7TULN=jxv|ufy0e}K z-}bG}pHjVO3#K$(E+U3x-+4#SJ57}A`)+_!P$qbTXPRh+2Bxy(dC)JHI)1iTcRWQ* zGR>1#4S1rxr8t|HvKSjMGH%EBf>>eZv;$O7mZ=B5BKtvynD4MpDxS3|EjC=p^oX5X zj<5QZezZla48ah1rhniUH%-I|r(zrz8_$TL6B-K%LCkI^J=R~w>=_z%jOf6l#L!#d z_`BQoSz$BU+xyA1}KabPjU$Sizz7fVAddWc_TGyCsZ&^j=18`iksu&|0Pa!zGL6 zJFlI7+km1nr(@syQ7fT!z3^5vD~&!-@NyGH7xr|r87I>vLXi~gm+de7$a{ywZh>Yd)fVOZnEbWhFtfOojN3+(9?r#t0R_udv0n;gX9axF@da7f$b&!o- zLXE}?>cBgbF|Qpk)7Gk5=x0El+rdb( zzkjxjWw2$0F+;E~IoDp5SuWD@(MF6p*xSbgXD4#3*rR6r)vGW)YIO6(B8%WhoGm;& zJio}~yxb?x3an6ATR{za#En<^;(?_cphOPFfjhtAabHDtckT<6c?r?r27>CuX#%pW ztWf30#&~c&q>nb8&A&e=ftW!2qGc@YAsRn*Y)jMQ0Aw&II#I+ILlS4ETtREozFfT304{20_XszhmipR9gZ zAIX#F^VzC%{Ek0g@p_+=o8mR3hMX!ohuYbVy>Q3SZsDr;OMB%X+S~5-dVvH%Dy6!b26VpebXAiBZ_Y3n` zPWlLU+9pXeLLqg2RSwIKn*;HD{cqV7%69@CH>06#pb-_G5uUt)!WRGtcK{ik44xAu zW$WG8{Mn=I7mIkqc<`qELQO7RS{UrJn8@`VwpU-7AWQGRR0+Lcy0W^*vd#en)VCOA zr(^USyd%NWP#YFh@HUY*GsA}(1 zi0RAH1~r|WxH`xubLez9)$G>4BcRDxMZ5lN(+V9SLQXm$j@+j>!w;)hu837s0O3?e zj!s&Rhf_fYS(jCB~P`P zu5YC+wdpnlqtRN2>(l`;(zSQss^<#>P7vY++#a`?IuqhT#=MMKH)qX0}C>`jJNa%g(nb<9s&!Cf20MQ-FYD+{n z3dSMgWO`a(IdHxpJaF<>njZ|hzRlKas0Za<(Oh{`j)SP0bK$cTNrNX>ydzFHLXyh9 zt@9a;`$;WTzDw!8?)D(a)x0m+C!Ob@f^E&NvN(mql$ zVPQ!=LCA?=hJhSz# zRV3q&|Mw0P`p*ut|HU3n+BP>UyTp8-i9DX`Waur5@Z=#N*)2WAR~g<9Zmt4wMo8_r z!FD0=`HZ?xO6B|jadl^?)#;YzebCFZF1z~%{2yHDe@dTD5uYqOG%x@R*l<)?2&8|a z$V8rf`bPAO1(gLg;oivQ<#*G#;_ZUGni|v{%8a8A;kjEdw?0Ro>tU^Im3ko>XZPH_ ztx--`5Z)BjMFZa8btx@FV;A-R@Ie<^KT*0aD1ZEYPXHhb!ICEN$+AT=W8wG7@c{01 z#l=|=6RX238u#I_DtA2Hp59sEKLGP|C0drdP(m-iA`1VLl9Cd%2E|l(V*yZXF>Ppo z8J)zXY4phq?Ii?%AJL$u_7VRGk8uJ13)Q~;y}gGcBK1V$o1v%d*xH{AM+EG$%A_V~ zro9$VQV%*F-PrSyFL&gK-($-)(%^kQzW;ZHj*MpWDUn2kAWHVW2|WGR2SaZ?fTB2} zML?vT|BC`J4uC5;KMi@Ij?rcJAILQX>o#&d{G4z5{U6`zSq7{ROo^o?#k`>f`G?^6 zNl!=;0{MpB{p(j`^E>>GZ6GA~$wvKrhqGQ1{qax)0{DzR1BX@y`ok&Vmj8Hvg#*=o8YqCum>32*!zI1;|XV4{D7;j3+y;bciSTbd43|uR(EOtwZh@{v|#}8 z2YD2+gtiV<_FQ5}+eZ)5^$`Lh7eUBNiNo^K$I)VS+8&!_2e|J)-y`_<{bw*n>5zqu zmRrTCKG3`R();5Bbh$UFBFq=aZDQ}kqaJD*H6Kqaf&h#S4}jQ{Gub|l-T3F`|Fv4e zhF}c9_3=6YJ9=T;A7V*|fec#$**UC$c_=AH4w(I>knj>bM#kq~d1k)I|C#kxnt2~s zy3iMY=cfFUfsh78Bc2ct(;3lv!Pc&_&YFHe{Pw`{37!!oVb2DvZlf`lp2KUa+{A<7UPn0FLa_e3Ez0a z-t+qy(W8M_T{Y``O|IXx@np6@%rBUZ^mPFIX{`J!gVk8+BaNxIrz_-ysKdbi-K7+(>*5(?`*$C4~4AqCZn1G=>#lt`|1s)y}N)m@>Tio%e-a4*xM2z3yvpKNG8Q z4YYkxKz80H!W0(BCHa=yv$Y!y3mQzxR}y2=6Uz)(LHVccw-$A$TrZupKrB#c<+T^^ zy~?C^cRaq)o)$br(a0l2v~AR)Oqjlv``^h&*Xn-9)$0Y{8GLbxcCH1EHm2B+(KVJ6 zmCy69VO3%|Yrbocz5bxpJJd7VAR6-+%zDP7M9g^mN_ec)@B3t3)d>qk}wX z{s4VS2(N4eB@pBPN^X9Xve~jbZ?C=_kSSp~Zb#)ua*|t6%~CFfayc_=d$97n1K^~o zi$W{I>k?YkM%heiN{p9hq6~}*5n&*6c1+N;JD%Jo+RGo~L=;6DTmh0Dm_S-0`F-WY zAYvZv?#0Su$f&^EMh|~e#Bwy4BrFhgTkxA3@qnp3M+~=d$Y3RiXKtP%7dfshJ>m4T zJ<1IXGygv+yG#XtLtHMLYc$@ZI5h$Fu!cp`Bw+Z@!s^CfXfu>v1rfoR;*$ zGqRmgLv5IqPhX!umSGxzCHMO>FvzH)=t8K0!C`za&OnaA-Oqh=iqm;JX1 z+h)AIx+opX@r+QWH<7irXpfckhGhXk+gOSy0B z+wT9WXwT|T`{C@VNnQY=*pPrWxnSos<~GqD&B~o6nh-~C}BdiARe%MavyIq&uy}Yisfv7;$F}0E*LysrNZ0SZ1D062KJvi z?iw}fV4puA1Iavv?p1~LcP_%ty{uy;fG;4k*>61+@Pl(Xcawn`d}W|;BrA-p>jIEu zBq+NSVzVahN|1JP{IvzbQ9dXjBoUaI#L%KH&sK#C0`V)d_COj9I{i-+-#v$6=Zk$D z7W<92gDg(63J$VVc3R_XI-l@QE`+p$Fqm(wo0n+DbA7f?T|{kv_9n}Y70os;A_15A zt)!j=p$wOM+ip+<3Eyn-3tV{7mB&*an_8lLW?_KF>dkOjJ{G$ca$4_n=sgYg zaQxx0K2Y`j11gAmk5prURH-+3d6Hjkd_O7!S(YUWaW>yhMUUHwmBm)8+ie;ynFvZN 
z_3&vvxSJkmd2G{c|M3{ih_Ri=K(FICQ&L_%e{Fa%nnyVm4YF}Z&*xeVgjHmc38t>$ z;;SC%+YA(m4Wc5K$ShFqtpHNr76e-ye=cpm2FL<5LV-|+9@nuudeXUF29In~{ZzVn z8b!YY5*1Eu)YivC#cI{=ABKkj5sn^|$l1(37U1@>%#u0okrlTe4QpbV#YhxSW^a46 zo+mRP;={zRPhOPdDDvHEGzcII+(xZ_8N>oR2^qcyN5p7V@kb$TbcDE*8sDet7?AbC z-=05)E37jcAfsNSFX49a;r=(ESwq?O#Uf?`kENH{$f%BDpObnTu~qli{=F%W!94#t z;BT``I>#K*)4)@Q-2z>Il$U`&%E3!8akSUSaU*&Vy(0#Aj)DF%D@`j;HEs9JdC3;?*o1 zx$qsV4;7@Pc4&cQwx)rTf*Rk}ZdHQ!DKA?&%IYbTJa+@I(2+%)qbeQhCcyP0p1ND1 zAGjd~&n<$y4~Iw2-uHA#AHK9jA@qk#_{9vlU(eth0y%8_5w-p&44WXpQ=SAsJnIXb zwtqi))&w19Pb3%71VRc9)852L#+O= z&r_)%_SbQcU4WA~t}@^aTmI#Ft||0u;mCR5{kGSO5W5-VP^hRiDny`5_97}ly9o(u ziOH{wWp||+bi_0NKSCxKC%evdJC3erF{YNdHx<5>B0%iY;=7K`SM+}o_TKSS{{R2@ zImfZG4iX(%A%%?WY_c=U%E(Aowz4@g%ci0-vo|Gs6CpDqdy{qSdGLLn-k;Cs_O93K z_WS+WtvZhDT<3W`9`^|xlt@b^lP1j3b_Vhz&&yy^+Q%GwlixuF*wW$vDYNz3NU3uO zGD1&|p5HP0XR~<;I#RS&_EG!(x-;76;DtJFz`o6s{q4$-mJ);fV7hSW%1kgoidwDV zkd1S{7qtI4G)g_RDHCA&%b#u0SiJ z0)z4RwV0%```wQ@%RN)TWlX(Cs>ys{E^wt(ChE0~Ta^7%+Ji_8Y}nY#u1}aIjaJC+ z6@Ve$_g{A1qq~Ie3yl-7$uAczcEWioOs=5LG{gz!bu2) z(}T~k(Oyh!O&0u2^s2*4@i$zqf!sSHR{SJ)SQFv5D=I>4@Ef6edJ}GFqGYFg0zVGc zelp_@@y+3o_@JZ~k+pXpRM@k=7qWXp`*NHS*iU2Vp7-swQby(Xk(;E{a* zOTMN~7bDp8CLeA7Yy5EZAYcKSW3I#3{8>lNWedoB5+(>@*=sB^{I@mk?CF4flB+~L|*SP zy9)hp4P)qXz01kBj*M>HuPkYsRClYnnS8QDC(0=XVInMQVm+(Ny@%E9Y}X^b`vn5= z$~2>(CD_a&W3RO3>b6RT4j2=>QEzRM8PjTodNpxaUX#_3 zH+~w42y!w>O#cGMaMWNvBNs`*3CGUG6=O6BnUtPX;k(cC<3}#rwLJ{TJn{*V(LAf| z08jj~{+qRT`hFA{nIE7QJj4!U`gsBon}xx6h6K4<({C^F2E7jlD~W0e1|BM z)k=*ZH1hIfe1}acb=%2@UaIxz3zwg+4wm-zN*ok`_{J#-uk)k5ujD`TdnuM~7}r+P zJs*a*25-MpnCY&V+SdMWNv@qc`T$rkXSlR^NdiWT%d{@#hhbJxg4rV*#QbNr$1d)cwU8~28G zOv10Rai-?4GvJf9jCe`wnwMS`Q?H}xGqD=Z*Bz#)W)f>EG{ndAkZ5Fvh`{cG3T#2s z4jAo-n4@V3hgYSJiutzRP=Y&4RR9FF7>GJ~x*$1Tr>;qEGG0l|0tz=4tr7N<%lpzf z()1|v_OtB}{N+#50bluCmo@x{n=Mx+{Dr+8#z1N{Cg*Ag`0_5~yu@QuODvg>L-=i8 z{&p-+$lb#X8IR8m@$YJwP9jS=aSwzPD?lRC=2<6&<`bkYotv-BM(w0-L+-Ig>wkd42 zj1-F)Y?d1iCUoSlwXc=ML-?+H5{9zZab%FS`LmC$iVX6T39CCNdmB18=tQ-}X`>74 zQEeZYBgKpC08FO~W|7^#-H_A<9gxMqskcPJz6)Sg4h#aDM&D-)EGtgnz67ICrgp3m-ICb)_ zL%83L@E;h1Q6xnb(dSM0UNsY^u9P0zCdf)h+)k3nx(W9SYToYy&`Y-&&p6l zVr^-=QzzdiD~9YV4|NN5RF&UcH+=!;TjleTPvMZ!M_io-g#-!;8OU~(A?C)*t7*fg zY0xR(^dZ8HxORamhEeVZKsG($22{Q76!-J)E3xOkKNSELaIkX4uD{bYOYr1sTsBU- z8o<$S`E8K7v4~kYx`Z(O%n7YhqR?n#F{y9)k_R0#;t7}99ZAh3%-6}cA>`NoKrqHg z%GiKIA@}hw{iq>5P>z~43DnC#W=~6FALN)wA1N9wo^qr%vE7mPtCY1IJl@Lvp7ADo zNHLMmD-!wU%V49oWTnfXP-`%^VTlF@>{WTC+i=Gh@0Jo5j&5-Ip)G0=oE6Qhk$%S= z*>5@~nsN9|O;0`dIO)AJke}mU4nJGK;nkJN!)x?{nO@Ypi9XPfG|IJ~S@LHi20vdjBEy~Fpwc+yW-FP4ZO#Ji z0mBG4*lY)15~DrHelFGUmzkg^>!d|s7~qjw7mg*9=YZ!ZG*&2HTJ9j{F__}5wY4I+ zoSW4gP;qn6263hpAkj_k*u)GmzG(aTf&VaHljw%{9;vi8Apukvf0T)ePNN_4p&2yp ztm}_JdnGqRg?%xV3$0R!bHer0-2~yL@9!D@!mQ4Q;js}2i0)QrHHD`L z(yTm&hr6f)wq&K_MI)uRKv!u0OHX=V@D9f!LN!M~I4Vt4C{cvK==v##->D;IIbJ;#6x{j=EoManP8oapECiT}Ed=s89d zBOnJ`CxbNOMVF7hTjTAq;ggnSxag?{&j{JfD@tGTrl|>&IJtZ4@i!}u?(X+O@e}W? 
zxb7`}TVz5}8Nce19Y14CnnNK?0eZ#5%wbUe#9!1gf3}|x8hRA$Rl7hC`e25H_y?Jy zx6r+a)iUb=z1b?Sjs9=D>c-bKm)2KLpR^TvGUa4iw~2Iie#w@emommBq8hH!=tMA4 zDM}Fd!-9VqJh;}IJqZB}P25aN3Q1zyhSA%roL3WtTD{om^=iE&2%t{A!_e_+ z?{-oZqLQEOxi8IVmO7v%s&1bQ-MTX+5Nfy?{IlNfecjTSYnWCT5x;e+?zhcx3CX`; zRu-%QqZEamckZ<)2FKu~6St8QdzSh209NekzytHnrP+0t>+9y2_{##M9flX;+=e!}z+zXS zGSd6vHFIMi#E7Krh@mgg3C+wegJ$4e{~GF`iasB;b?FKuq~v@vZWQg}64jIQG=FSn zo4HcN?9&&U0|51~YKcze$!~HL*L(Jt^nQMh(&p;3ZRZPd3cI<%1a`G|?proL%RJKC z79W0HKv_;|2cdRux7FA%+^(8pHhHl~P&sV}LF0&30QTO-%_p1R+Whg};^Dn~yy?@t zB33FUnQ7!ES4{~{yy30ek9}vX!q0sO$s_S3jxuye^io}BFE9@pTzj1gPfdDw@S0Ow zd2-;E|Ayi6XXQFLz3(!UNYQKJ)T?+hHWY-Q|Td^`|j`1{hi=yDDnG= zAjl*G$!fo_xoO&BvG;DsSvi7HfNZ@Wbjwe}yuK?$>19Z)L$7t#u>ev6f}^!AD{%du zK`1Od6L{JUT^A;X6pc#FGRuYo{pw0(u7G-%_t(t%7aOFSp6h%UpD*!|Giw21CQ80W zB4n!9X^JHGP4`ZqH>L4qDL0OBpPn6-;efz|3P+lcQ{Fp;LEPUo;0ZC{c66LwSWw*oc*L?3mDU@r|UE=AGETf@8 zPX6c|r~s=aa1_(d8(blMD4#rV1yNkT{gTu0@Wq?2who!wj^1_GLZ3uMLoEoHAhXhy zc>c>Ast7`+>C&7>{>ve5Yq&&cMfSW$OF!<)Kc3OXzdsS7G#QjAE;0Y3wh+^(Lu<56?kZ*VQ*&?RP+UZ%?RH@cFGW^bXSVfi3NY>rA176pc`DWXr=tLiXtKC|bPMST$ousOycT?SX zFwVarTE02kC@8TG)S=JTzdrZNH6+@4E9pu+c{5ZIlu^VPRuvJ7d+XPy2>Z(HuT3xm z0mA1XmeydZ=EsbiCPYoJN$B54?K$o5cPmmFh|)`B|LH1yS++H`r7xI}gs{=U(0uj0 z2!Aik>ed%U{d|kHv-s{W?KNRL58oSQ8U-7fzjEwPw^aXCkU#YI%d$jY`Mr>C{Lxty^F*I z{y`=|2m@IGn*?e7kiM%I#G`c3Jo)sBThTzE4^|oS#l8L7nMmaKPon?wA2GcIEJREY zfzFs$Brgs%oXMsr)0zC~ZlyfTRyx+$Cz-kdXTj6&`6o5G=|HZyjV)E#IRtXjo3xVU zlTS9c{wJ>vwRSdgO{eYn(3rCQuSpNPO-jiSf*;D5h7eBv6P5pMTPnH;KDnY-Y~3e+ zi5;<2DilQOpo*dY@|ze+!>{ru)3Z+e!`=0tSA}3-6&3Q$;NMGnVVy zS?cWq5CIG5a^#g@B)~fqby*B2#Lrewd4rWS!~HJ;bE5E@eU~--xna3!~YQz90MZiK7zaShE|4R zcy9+GR80))?7brT?5|T7N`H`MN%ZiUI@qiHIRP=yw>D%eEtwr@XcBn zBO`#!WI^!2bs({b`}USkWn;YR*JT8=+`qj_Cw2@ivFN&V!>D?fH6@4=Ct4T6gbVsV zUnh#VC}SzhxweRFc#L-;#xP>aN*1+$>2FmugU7x`Xn=!7W!CL?1?)Zmo&fOwSs6xC z3u+lU0B2(eTHeZ|%oV62`ie)RS&o{I^R9bF>3o zN_ctLl1pfm1UuF=IB1X{e}^7A@bU}}4f%@Pg_XG?aM83hFJbl4Rh6BM}m z3LvbC4dlXmCoTpcg51cpdB7M+lU=&@7i}p8i@)mlg$N3P|4%2&Si{+IS0~Y|=r%56 zh!h}TxV~(^yiFApk&LHIIW07=WtO}*Qi zR0@mVw3$-v#pf=@6XxH~We1X+d}3O`yYGY?=;PDY{vkt%XL0;;0m?Y{e-x&v+@;^; zEtyjOu;(LAi$t3Z>Ojcy3Y)jr_#yd|e*>oR1w~%b$}5P+QP~@FHK)1wU1SLr{;Q z`VI8f#K}^i*RjPzEHAqar*C=fKLJ7_UkxkBB%7Gm{D;sY?ZOL8(?XYv=%MixFh;#o z;OL75q9~fB@D0Op^*`k-jtk_Kvq6x9uSty46QHp+Sd_RB^sMW

    P|2O`6f0?OE! zw-W(2DAE~mzsOJ@8{EGm>^MAAF1!TJ1l$*E`{*%l@)+`1afN3$?p242f#M_QiKoL= zMgDd`nDc0%Q+uGofgwK-Re1n{_j{Zf!C7Y9tE+@Zf_y|p&O^{n?_CoAf~mOS>|J1f zH3%$o8(3{hULhgBQQWXqjqTiGgs+WN^b(H)gJvhF#moHSbMu?I8Tk4;Z4>AL>J2b? zy6rDH^-3B$g{x^{eyt-6!M6-0){Q%u052UwP? z(&b68{AQW;g#5FO(HjByWN!fRHtK|#3xpU-Hp4eD!^?H_M|-+nQH@)<*~zh>CK<17 zFxNlC)ZfxqLjzQ78peUnb{zlxw^jU!Cprsjp@lWlzg8Cj5EzT^weL+2I}p^=xihCvV(UeFOzgRL#@nQyB(x|gZU zimu&>#O9aza~y$tAu}XWq=4{{%buM5YDPRjt`tcG1a85hPar2xN}&b55K8*qQcJ|QtDfOEl% zoWUvbsJh0{T4frVZU;03Aak0j0Rw|dsala7AXLnk8~hAuw~_8U5L89jmxozq5j#I1 zDH{`BUt?LPM#s1IJhO1hrk0vC76z4EsAnSB#b@P>1H4haI_6N#<}AQLZ`q8U-Dk zo}Hlm=KztmH{m`PQ4b0&4<%04_{O4E>rOVZo}JsEPVR4RCW(0rWYfdymC$#$Uvn9a z_N2dn;_OTV%K^sZ#Hk1*VhcFl)t$guLIb2;T@G*Vg&n97!nE&VD&6)13Bp_O_h2P+ z37=j{OdSy|o@^3teJi_@R{Y9HMU!FiDamji{dIrKaNom7-b$e|P&jgnr}!G>Zf@fv zbod3edLYdhsnadc=+C|y{Qk|KLF-^{Be)6_<17;}oaf?SBXJONqt zK$*++0pER7NQ<5!a4ymfQ&l|3OQ)Nkf-gYSOb8!xGuiA6t-tcLv z`JuDh_pE0{_7l~(kxe;Ysy3E ziQX%}1nPZ{^0f=|+QW|c&!wg}!9zU!UPo-^H@!9}&0MA5I8l*!nRQMeT|iC=xa5td z4Tx##D~X8IWX5{XR6TM8qW>mXdW zbuyJLYI>q1j+Z}}t6lLYcj=j;E`z^ z3v;hzFfP86)&#>jo=96sBeV%!LtKvJyf^buu&79rv}@pnT*scC_Mv{R6g0rQCR$S z@=fPP_WEl~yV^N=O&~Mq868$OKRx)aSgQTv*5@ZjQYH-UvW(X0c%GHPmUBTwdr8H;_LbJ(2=sNw6v$PWK|0s^x1e7a%$j@kUB73o`8u3&G z5psKnt4&n!Ud5B2TCM_m?*#44I#$j2mEl$S|itKSfKYEAYm2{t6i;49>g895nhKKufpMYZ!m-63BXbkB#EjfyTpMc`x?*3?osB5yj`v zIw-1ucVYoV{g<|S(S(f?0JrhS`bj@mIf>04rc!etN5=_?VV(w^%5Q-8sSA{tT^vD6 zjEJDEBYz3tH&JvoO-E|?Z~HgM-IG9Q<2%x`l(vc+Jsr|+BNDk5jfRRa{&Vh?huPd^ z`L!s6a~4JqeJ3}m<1kZ1Hx|sRHxy}2+>DXm+NFpi7$ z{0*UldWi%T`~4Cd*w+oRD1xQTnGx&fE!W9{2^?m945Zy~;Tmc@Aj|2QXF(5hSN$2c ze`OvtryfFv^Nj=ze-%Y+cb({kt+Xr^KA!w($P&dHX*B z2!D=uQd;rSXI=DAdu`gSv;GBYDQ{t&G^1Ie)+Y1UG(&hFY_YbJz5b6L(ndg25c?$v ze4eTHN!8SJKR)^iW>-&-ea`c$FU~3Y;AQ@c8+q|_i!9ya z2g)4)w3+YPAv(=do+oxSItA5U@6Cs`@)50>O+?Lw1)`us#I5mkPM?uW(opu6;*tkK zPytmmav`fKnLZ0)7d4R>56jP(UT^9Bdm)3s z^l()Nx(T6<77d0ZF72RldVDk~Ep)&yjik+P%j`>Gl*`R?V21p*Kr8>DNG*hvx59y@ zD}lNLQ%fQP86QjXsym>Ou#JT9QLGMl_7cAL3=LW470%mHnqVpOupfRijjP_ zrYDe~pm#$`s3!=IF|d&kDx76}xh)@z2|8K5d+nEGny_;lE2E(yRdW#D9X5Cl_45gw z@O{wjzQa0l8sZgsolV3Sb65f+q2Qc7nme=nmM~Li=zHximAXF!pxn@l{eGdv2o#0O z3YUGbcxN5c1_m^@>TK9=&3=(6sAn#xZP(zSCkYlZJ2d_FTDQe;VxU`5!BFhu;FNKC za8V*PU;2nP(ouIq4Ay)(dKrfavJ<>W;5cXWI^d&4mtF%nHb?1RI<5ASPd|JSsSds% zN0m-#i8JBc~}dQd0|iog&qg@p55 zDhVM-^AGu(6vKJYN}74~J=lN^kEkRzg7iUa$%Vwy~l*&4${6R3E84&qmu|x8cNd^swQMwe4fm{6= z$c}rW|EUAuD4!eDukLkxj?A(Ne4p~yr0CCq4P?_3I}ap)CK!Ke4l61$Fq4c}i0ss* zPJ6MXj2}A{WQ<4AB%Y49y7D@IVK6t&hDSbuF*Uyr8#B*gNpoA?XyxWlI^|VsdjGx_ zSh`LN;XE*rQyZz)FAG0gep$UX@k!yrrE23#T$LJTJWrN#lFlA^b;lg^EPlakS+7Vl zQE}+Jnqx=tKN(j9a4jW2iSp8f@7TKYq?<7q7O$;5V{je!`I$a^{*A9Up~Pur*IHU% zYG@12x)1)E3*=WkHL|M#2#?A#0o3g6OEN{7WxGmA7UKKGFt29|uJo_?2DLqI-qSOV zWN6SMaCe@LyXA6BoI923ZzcTP?e+=4uO3?u>~htWE2`U1ZpAIyd-s8zcL@)p8Ki zSwT(Zpwv-dOTtU|7Zfq9(!)ef|TAX0M)Vhv3-;PK0&sGGx9? 
z_!5QG*lF;K5czty-qW3r{tY9^&c9o7l1k0ygOo9=Uy{Ue>BvfNVvGMPaum@z(I;ZYE$g8>17N> zof+>Y3im%djX!&Pdr$nb;FBC9Es<_X&3rYx8-_V0Pew|lC$<}6BWO=&gLJ5(!7Q-T z;QFifA06SLZ?Hf2EWZPm&f1G}?g86;+^2lumhB>EmkNTwa??;`o3s{;e*lNL)NAbf zH|pwt&V>bS886Wh4l2iU`RFS$=ZoDcH($7&$m_1J)~R&P2S?EJ+>6^=pR6;$X7`2- z_ZGjV2K}R^N;v0y)SX0sWba_TNO}lG&~4M%LU-HsE5#Pw_wb#=WEzqeM_|Eyq;!n+ zc)U2i{{I8|HwB#lhsc`<27}fQHSW8I&x4wx8o-mnAXfNEoxu8>DEGg?{*4VvjQRx7 z^J`Mc|CMO;f3AjD3UC)F{j%PA@RxJ}`!9fAAG|39W5{1YFTd#ot*ZzpOe>*Z+Chfv zP_53N zWBhuO!b)8km0?-RZ?y)pRM@a7Iv_$I*aAAYx8jKSC;qR+^1t{$);e)Aq}zT-%t$kh z2Jo0O=FsV+>n%q-Aq+tBp-!Ct;_`{cnuX>;D)|06W+w{Um2-ib>&kBeKx7Xo?75qh zDC{JomVBF4Hy;(vXQ{SBjB1DcB_0E`3xtqG7yG*$0^73%w#mHw?pvCaI7(QeyW`=e zsmQan1P{VTor!PFq{4UI|D`V+un?DI8Ufv-N>Ev4SVfaU2#4@LnW2I}P1_^sw1DvW zXO&BKhDwTYQQiInPrO#en15k2VsHKS>wvH!_E+3~9hNiylTrr4xVkABK^gVOnG-Z? z((kGoW!39ndHhxWwOyS>o>89=O2u)%lEoLKje(r9-emDu(3GSEXqkvpfL^g^uMEbj z#1WV=d_k#PwfM)1$>`5=XZj^2Z1YL2StFO~tx!jZJB0B*WQbYbsKi zYaiQd5cO~#$73@?$1ux-5No1LkTQPDu`l&b;@f+F^J1tJ>`_r}Kfzw1Q$Vx2w9<+K zj@z#{3a>!OAr`o$aw_iy3+;9myA*mM%uZX94}Fg^!>@>FkuuzN?Qgmb#Q7*FP67X% z9KLz!+^Y-3%&r?EpDYc~*q!mKX1W>gEng!5W{QJ17e4=eNnCU#aqImJ({zCLUf)^l zi3hLnm4FKOpSxlyp&C_=JVKVOu#-oL$g<{JOf>StYhMSyCrOU6l$Z zZ)DG8vTN0V;IZs!*nfgYa^*T;%;H9$u4oY)4kyFZ|Ab3A-ZxT#9@x^`#gK!`Q!zpi zzfDr9t4$5sS1LEES2{c%iI&tHbxyQ7zc)JFJN5E@zYG6%G z^S`=MQE&?IZf?RMkA-;~31K|)Wb|U!$zx)4lo+)t2<_UV7=)OqlkrOJXmA{Y{?{L6 zlE5%Date;ob`J{)9;L4#DFPwQfCldHC~OlporT|^)+#4YyCi_Fy%#N+s`|cs0G;N` z#20veK^^smOe__jdA^OdtMgN@xaEgtFd$}taru_A?iywK%n|64D3-p9KyKDvfDSKo z<;yfStdjFMBnRUS=Gvu^Z}>XZeT|!reNcTGKY9kx#n@}ANTuRTUT)nJ)#7yx1qH{C%{L+c7X@Dav(QZ<-VvYo$s--7fpDGuVvO zZo=9M1bJobbUR}BYF@GHE7SYVlXrlFH?{bLXS!d(0WGERUcs^gU%wK( zoiXnOSrSWYRH9W|vpbzDG&sIrholezwA5B(Mgl^#X`D#~be9c@G4P z0*uI1SEIwxUWLR?m=Px`osVCE35dn0=4qTjzV3+&-0Gw35_t(QT6vWD`S~oy+%jJz z=4+ksnVqm2Eqjp&1Pt_6KecYVE>DyGyp&85C-7@}50|LZz@dT8l{1SQXNVarCr4T} zI(AqS$y#00=k+N+DU#zsER&$7V7ui zaVc&-)68<4c&gi1S?Mh;7g6tXoRHmye}3S%SX%Prmlifq+avIPzOfN~zZ)fb6r>+M z%BrUYHoAZpdYc>WQgWc#Su~i64e1w(EXUW4d4x>KKmsgg-whF=3& zaUhFO8&3>?iwETV2@n78(f|7ie;GWXr#%wX;Eh}dmpQg1bNnBGyh}d!Id(W;=H!m| z*UdN91+$S8Jb9;}U@-BHb&dMBx3Xl2a4VWQGK4!4;yJ$OUXZKGDK zICyXU_~&Pay(FuNyw@M8Zs>n3yZiR`7rKvjV9tIt)Hwv&NhX{+&m2TxgP}$DGhdnr z`0Zq=-cC8sbn0aoCUwfnynrZ@PW%P@SivP3@uC|!rool8OFm~NC`-DOXERrRW!Mba ze6;G%es(UAVw`f`a0w36J z6k8qHp)OJvUDSwruCLiZCiGW_yP`cdYTx678r(5NCzt%ikX)sOXnGr6n01(kNr9bC zyc^9mo-*#UGG#c`EI{>&EBfKb80jF2oPb;ZgN%@4J!l;4yzJ9s-WI9df*}=Ji#2eE ze;e+Dh{=|L%vP1}pe3{fVb~5dxMMqwHNpmQ$PcZo@Aw9`yzga%mMsm*Or-0pM5cc7DdOsO6c6DQ**{w9Hc1q6D-zoqyjPC)AYsc% zyNaq<2lA8+z}l!-jc7*P2fam$2LyWNML+o-_q-afa;cFmm&2SccE8WC`D}CP0xD@! z&&p}a=W(bi2%T5xUal)h=jTgPdet-bGz1`vc{*)uVjBb7_9S!Y zp&+eMI|_BEOM=30owiIa&A$VyA7!_9AJcm3fZm%9DCgkUpU!weX@OCh%XBZLxV3(2 z?QaS~%|G|OydvU~XX<;Wg&{ev`oS%7y8vg~;_;Gqrg)e;xty(!8(RycGdU50GRAMnsw(8ay)BP4Y*rX_;&It@SqD;_y)S%_2KrN4Szwx z&UnNx0@oS#lILylz%Gip>A_-fs`*<|?m?9Dj+hCz$$pJ^YRnAFu|04R_7#18`|=7_ zc%i%^x4DvE{J}OA3q!yYnX6Z(ax~2w;-v@sE2ccVob6uSLz4%NxQ{uW7Gg7KZrKPo zgMFA%IMDXYe&PYu(12BrLgkc#%FoJQY`Mm#f~P*UG&JpGf2>;3HIN%+XwT`Depd^p>vPWvGLpuM<`(FJdp<@ta#g))-i^&Rp=D3nWh%Y7 z`B_AFxwhXtai|ReQKPa$zuA}h05S>3n)C^?FBPh*KURuP-xu9~$b6m+z78bnihNO! 
zZgLbNIA)8J;jruihl{q!jnw=`9!QILNV#Tk9jGbk#JATP_HHfD4h@Yi9?S{U^uvts z5>cW+Al9BHI~V_x!RsbDYKc6W##&pciEVl6eZ5ZBi{z;bIG}QoCB^sHdgQfND3YP~ zOz*-xw~75)!juPm+C`X~*gft3k1704@vFnISnjsZGZW8wNqqS%yKf~=iX1Q7Na)Nv z0JX&BZ25)?CP`V!J^IEYwmLqgIO$2ND%?#&-l;JM+X1Bos!?OlU#xF)%+jaE*{*?v~Y`MO8%U*LK4eb8AE`Xw$5 z0LJdidn<1yN=#)he_-7+`)*=YXi%eOR7Bgs$9^Hu7M-&tX{^inDc5ncsyd?vPiP-# z8ahsGD))8O1Nwp=J>hw_ZTq5DzuIgSNioL6YnnXn@hI7oH`Ual%DB#RQH``1m!&l< zmnoSpf12lTI#NQDvec-!%|ON=YGcA8=mM{RZLXXDo2%jK0U@yQIoez%o6=R&Yf$P< zf5c{K2#N5ZRxjQ>H%+m;wZLB1p^UglBXt{pGb>O^amD{74{J zLmZ;N3@B0N8}~8>6SjNJIM0LK*SJ=0)q8w@tdw-^Ge{hL|9CklSyEyuNs}a9w4NTi ze`O#2-Tj1gK$Jx@sV3_ndRS`iXlHR~+n+uPa^-+_X(uCMdwHxPWy)4ow>5;A&Wvld z`}3=-%IsI;BwER6Y83}VA3gmto4{b28XWsgmlv;!MP6Xa2I{rb!t8N{t>SM0|I$Y-WuJ8tVXcIuiWE)xx)S-wJyDx z(riale>zv1AV7pW#*+^L;i-6gTq$x$c83+#T3O6r3%yE-G&~E*tqjia=Q&K0LiIw- zU92RFO`uE=V-f;4Y2Sdp{YkHT7Q4&5E7mcRio_Sz+tGg>XM?}L8=Z?e;!aRZH~KsW9&%;Q7hO z=ku3(W<2`YuI`HQ4@2+=D&pXL^JfyBUriu47*bNuVQkAhx56Y5QWkc{RlM4jc%Hgfe7;F4*VMRD6oT@Y zyls^eM?d#68@W0|OjoehG9A(+XwnF|EaDvas3Y-9Gt6-+jf88YcX{O)3~z&lq60dv z7Jw-#i(TQC&)*e#ZD*dIyayeJDY|RJG;qjf+!E8%U`m<|#; z23lXF(er?y7Me8)%3=Ty>512o24>yQGgQjQt9{mQA(bk_dSSlBD!YolFIsGql?sA@ zp<;t{2&gy-B05SWH%}O?WU%6qkzh#oELawcd8C*PVI??E&Z@J6_;AD;Qfhi zOPi~=3uMwU40(iN^q)VjEIJx|MAYs|{rkgHu??)b<)tcBy=6#_v$RekcL zTRsixk~QgoNe(~qM?Rm?S(`YzsUWpy(`Grq;tqfHS=N8ZCuyhXnw$Q%Tp&Rsk$Jtd z``MAxXfNAMgYwuA29J?8MoFnDiFcAMZTyg7NH;+0-6@P@FT&9GPn1wG|nLTzTUR zOgf$1b=C^(RVi*)uN-F7>4F&9MQCkxi`xD+&MQvso z{T}4JUsaIe@7OWpJD^4upC6io00(D5lSixgl*YhR#kRLH9H)=;Vn`wnT_IZ+EfYG^ z^EWQw=y!is)a2+bkBhs;u|Y`d*lu3)G!2UIE4M254wZ5jzdXU~=tA1MK^YRjh4dHciE`Pk33+P{MDZU@kG;#V-r^`;*ly9 zYQBNNQ7xQ}+M=w72%Hi!k?EP6(Go^@Mz4*K8}~~`Lq>ObfaD~XnRzhaDicV>aLiN? zxlQvnCsgCZ9VL&F;Q$+VdMcKhu#NheHsiT{B=YuFe2`<~-qhIiWTb7N@PN_|ZmO$^ zH&R9kp*5gPNm~Bo2u1vT^;C^gZI%u)Z0qunnQY%%BPi@x`uls~^fq3k&Z=yL>soP9 zFqkab3v3`o0!1WBoH6d?$rle*1Cw!>Si|^>r9m&Xz>T@@(%+etOo!Nk*|``wO_i|e zCFLz)xI=d8{56?m(#^fKkru{EBiqIF;Q>VXl*twHZR>2|2jv zV4OCG?t@#Zp)P55NbvCsq?$nAUhYOSyAn72!30Wav0$?i)NlUBokstPGBE!bG-X)k zFKp5I2bKuEtMS;G-?kFm>{4IxWQC|y2?N6~k0^6g- zqaowQ`UZixjefm5pldhWXZCKk&AF{Q-TZ@_ zlhIc!tLlTOg4vpa*y*T%tsQ#}SHV*Sm-C701H>yo%)`%8YS21*!vT*vshzn-9XH`} zVFFD^?cW}$Sp46d+-R9nUf`nu^#X;!K=aU(NZTP)>uo>h1j~ZOYaBe8yJyu3M zT_&YTIrG}3P3s=4tFb56Uyu3e)3tc>ae+;B&~+o|EbOlSxf@M|71q#9dhEdiaK|TsdxT0z{@gf2~=m`fr6*Qd+-V zb1YG^k9}EYMS+!m+RKceMX@V^H=XrY*IT}9*1PG5XqrbN`2XN^8yn;pJAip6gmWq9 zrtk5t_1M*Zea4^qVjahEBS$LxnM1edzmg%|`A8(+bD{aie}@oaY_WE#&Eo#K7ym|g zzW`u(bSpg#_g}enzpY#pGGNP-S$5+5m)B+Cg^?pwqGD(M;amhBP^g;BLlE=!~x5UG8m}! 
zE^S_L@AnA%V;}ncqkXZjAnbQmdqvn$3aCdI=3IhIPz``PO>lGsSFmb-)K(Rt2(l8x z%{~6fJ_9fi$Df58-dGYs-H7e^0-8mTn#l%F6m?SrPgr(rfJ-T0M<_6I^70qN@czMM zVr2wX6j%xy8`=p{<6Z-oaO+{>e}{ImfNZVUn+Ad}p}Yfg(tj)zSo@*>ycAZ#0EFQJ zOsRKXQ2ZyhbENsdb2~xhMMt8@r}>_w_sLLRt@(d>m5jTLKM9~f92Enq4j*po0~?qJ zE<&pW$Z|TsZTF}nRt_+vR%W0bX_~e1|C;~@Z@B6&0vub^1!B!&L15-r5ZhsXavjf* z;)ZCnR3bqmEq;UeeROD1)d~ae-V1xqgFlW@2x048EF+F;;lNuP{oKWl--rWAgN*gtb#ut|N&Zo{#AO2><(_}+BF&;Y^Y7^DB+&8E1BpywrtN`pS8u%At zeZ2NNMW|rOpsCb)w9L9MD6Ll%5PrraL;fa+BK1Cq?_aW(P`=;i*JSNa;=U0IM>9w# zY4Yu-op&JMIY5KLD~5zZc%GE9EA^tnI()%q3v1Z}Q{@)JohDq>7KjcYiQ}c!#aVT& zzsh-!f?M2f1DMEuZdTjSG;B5lFEYE1IkxW#+i)a~26=z{I_6vo*-#w3uYkbK^bI)P z#*8d+x8FE>EwYqb+27!I4kiy7KZD%6%bORV*ftP}Xa2`rnQCtzfW{0?Rhic3T|PJn z7?VU=nH8`xavl5<{?o69>HST+hss`TTwxxM`JYr##(L(55NCo6#h6*YGsm+oHp6=F zD%a({CNA<1K=z=GRq%cUzJ!~z^{l~ckPP!sJrvF#iSBFK_ zZt>2*07D5#cc~~KA>ApUAcAzKQi62H5E4p>lu9EVf}o@@goJc=gCHFO14G=G@7#0F z=Q-y-muLPM1@>(AyJM~OD@bdq4gwvy6K>STSdSU2!~@)Q@yKp7T`9pTWYC`^M!I~> zcMH_BL+M0al^m)Uu-}2?DNsGP^y2L;|H?~G1VV6SdCQQ<0H^w4Fpt;P3Gw2k)V|<- zoE!P>hF|D!_q49NuzEXLq|0TkA`%43u)Amio15CE3As5L%u6)!OuHk?zWG##oGldBN+rZ#g z8Jw0qe2^?@*(p#8VXa)N0Fu^?0^+b3-m8ecL#iN$RDv?2mzGlf2E!7lpGh<912a6$p)B;Wo7~PW?F|9b%K4^=iW(DGoI3Qzr09?bb z-{pt&9RrD_j0zJhvOoFojQ!LXC_u_!Ea~dBMED3_q5NiRSGr*!D_qwiCGk~*0y7NR6gKvvF!7CON`WyZ^zTD zGqvssI6jB>X2Fq-%TZ2Ln(=l_c&r-@3E{j_sscaOjK0Ha%ig#L?D3zNK>Zd!Ql!&W zL5%TRq^=)vmF5eGuPV}4mh=h9uHAbM!=FqZuZl1OkKx2p@19vffQuhER09eIxHI3G zmE-jnjDd8K^h*Py2Yg^NotUuxB<2{qhW!;K$Ft|4{iNV0+AdIxVW3D0^z^M4Q%>~d zklLBPd1sh!30Qti?LF(c-q1`aS=|Wyi3`?TCes3d4!H;U0$ktHj?FI$fHNlhR~a5u zPbp=ZfNDHq6=(Xggh>8nw&J0d_>1-doS^>KlX-xqDM;ZZ-4?^h4WcX)hG<%1=1Ag{ zWx6>vsU-k5gj zNu@lw29g=AUFK&U>Pl*9Y!q`W+OJ)ASYdD)+)?!(lMe^fq{&DZAuCsd;&pQ^^0Db0 zWlArlrE`m+vRV)aBBlHij~rtLz*?L@)?4BQs|R5?f#*)=5DS_hZ1H>bXHCF?v1l0> zV8#O24fYZ{gS(Z+(R(F9r?M|KaH!d5fr0!k4`tzvjCDj+6Ba`=kvQE|0+IyTl2sBX z@tJ9>!0Czo1$sq^-RiteQ_~0SXVqU(f+2pMMeNnb zPk42y9$oR($m@w6i#_)i-Pt|-tF)dfe-Ek30c1=dE#J$ZBUibc$8LgRP_S#v8wFVr z(EMz-g5OIfWo8HRDEx=pRUjnv7eG^e!u)a1w(D294Xb14t5%!!nI6u+^Z9^2?@1BS z0HXZ`iPk?A5V5YfYnOVD2Okka)7uWvqn}LDt&ammz6I1yt`Bi8e&yz3Q?K$(ua!ZA zcKYX#vv-5XtXF%U)1jo%yCWOU`khe&j?-7acm+Z?8F+-+*6{K^Sr242fT{*PT)kJo zG_OGfO|LCRv(I=J(Q9CifEkQTQg%_fdod74Hk=-=pA38M7kp@Vd>=l?*C$TuR-FGh zY7!~zHuCkHCY^qKRT8AQjjjt&97fTJ3~n_nYe#XI@P6~3{_u(>X_HYS449=)o{UT} z=Q{8*Xvoc8nf%v1<{2UPwkT{4eoW{&2RtUJOtVi2f_~age9lQV=^Dq{3OHHO`OWZp zLRi4v&$bdL+D<8=CjxC!X{~M=@LW=Svazwe5I(}n)nsOJB&ueJY;kEnF!^j5@mL4BRgI@ADd3K|w79=8bUz)t6liw^CCTNU z`;-6hU7xrvd<}XWr-z5Si8kUH5MBj%tzzzo--@ZQ0uYS6uMYdaz7fKYl z_n|+*``nNGiv{XYO{fQD<2O*NqS&=cmo@gfO0`-reLbc27|8DGw;nIQeO${dN#46z zzb*8$l>8WR@@M7CR6~p`G>UUTNv%cPg@s@Qb`Uc%n*yVPXBlK#7V@OXm#TK55|OM_ z9P1AD?SKemaY;3EIboG<#Q$t#w4}lO`|DrHS(1-#QM%THTe2WX?z<p>&||5I6DN95R{NPsNm}bl&+CJryX&r;wcI>sUCD2>hR5n2@2u5+_m%=eYyx9cC%sg>6u3#u(w_~ncz6AISNRuJ!Jui$|DP$3(x0p^tzW$|5x5LIk!INzibnsL>LTe}PHGzOMQrEX{Ktm#>>v@Hy3_z^;+e zbWeB<7|3FxcbmLmARx3)8R#b6e8@9e)hO$ZKat>>?|uwq?o>*nJ93TCEYF)nXuUTAng9wOVvjp>xFb z?|$Szl8zq%p3zSR;6ffK)^CV+M6mFPcx==}0)vU5;6rVRS{UsmYAI$)WY4(f=OG+! 
zwPx{!w(oktE@y<@(DRGHJzXQBG5*lGIYmPEs3oFxj8Bv)V#6KB?mjP5tW}~P_Wf@RetvOKpxlJBcX4O2Es|bLaFVy;SZAXp+L8j0ELu+Z3 zp%Fr;|i5AL@ry9%}YjZ0|Y)OMH?0$IKfAHz9; zAq_|c*DHSe2+oUF-}08?RUpwFb)5qw> z{l5T@;YAKurEf+KHLc^L82groJOq??4=A^Y4VrXh&R4>n3*IQpt@c{nEI=V$s z%XR*1m@SgI37&8hxEMkRaanWv9oKMbx%2Ie=1*S4Mt>pckt&PX;$RNSfCkkfdA2oO z!^d-taoVlbwL}K*$YGxzK;lm5(3U&SFa(6`idJxfSRXeqer|a_kl7fg@H77XzI|*=fnsYxZ=JPb|MY z-W)C0Xnu=T8+pXLB5FNWnmVIs9SxZ#N+TMNzpXx4sA1|>qgPRfS+yRy3ogLZS?>*d zgL40mm#l$HPABuSnCyV;4p7Z@qESnP=LN7F#-rOe3BL6v5}>+_F%U`&Xs}rAYNRcd zuvBaKi-wb1j`k?E8fVV8SnK1|yf?@Q-fK1@H)NGf-zx>$(w6EO!~LG^bq)bGhWYgy z(_Vi{p#ncDT@79$enm5!GVyx&0?2S{56NCwgg2i`+xB5ptxB@A5VF`<9)}xjH!=8h zcKe9>=F-E$I!S_|D;Gr3ZD*L_57rD-su;+0x(j`~EmyBerYu%bBC%^+J~uIbinV&Z zXN2d?41A#%YYoxsBnY3VPAy4Sl{JsL^xHbSzztvUcNz6@waM8?0jZ&6znjBo@jGuU zvI95J#GigcEqZ!%+*_OTbrE4SKT7oo0UiLV;Q5}5!wNh*iFd0D#`{wKC6h_gLhsC{I8mVD+s1 zFJ1%YE5P|7cd@(-lX1?bc@bU_+7wm0tgChZnIB!S+3Jt(Oi595R5_#GuQ$opa`1V_ z0MW*tzz{swYfz3J_*?^~k+KMyShsb<4#Ax^b1|UY6MOOQ^V8?Fr39%PQs%92m!x*L z<_uY{rhX~QH(?}6v58b77)ZL#Gqtw8#QiE*a&r$|UuG5_15dFN_m^xE3^2T~YUjm4 zWqY(A_EM7e#ayza9Nh82$-{yqtX4=pH%}{xJHi8VRdG9%Ud$bq>uiiG z>`|EsUE$ZuV2Ft!Mv&^+bX~sK5l3bhe-`qWUAu|+ZA|7NGZsXKW%w${6>8+BMMjKY zR!Sy@xA;&?UEOu$3nWW7AA`M^gT5{}?^qM_+5s;5vkMo?iH6D0gI{HDoY8@EkS)m8n{Q2O96z ztwkC>s^IPSvRL@k+A@Th{Y`c#Xwg8M9f|6Ts%c;4T8lG`>_Jnc0T74~kl;~Aw=9_H zZn^a-l@1xFJ+n&jSD;vZ*Yx!>hH2dP%|#a{oFhu$1k=GZioJzluz!ILo%Uv8Ynp{q)p^j)yLYRO09QVTxP6|;| z1Mq7xyPdGw@YVxl?*zA>;l$tI7bYi&rVS4Mu@mjB>~MR@8l)wfITF;M)Clcf1$?C7EU&ju6DHdkIZCo!rJ0^#%7~#_{a>=R?$zv@A z`?P=qcbfz?H~n<`+}v+!K7!uBc}c&2uP|8Zz)@JFmg9q$>+&_dn-x3)Uj|(qJ^_w8EIBG zK)Tb;bGP?l4Pu3GbcQ-dCDw9`hc{;yMoXfqB^T(}Kkx|D%*kODqyWxD2aq9g!P~uJi6S$~GGx^bKuhj*GsNT<@ zW)==k}=0RU;NrIV*F=ZRze$qQe>#D^q`GhlnK{wY6Zl!&M_u+(hdXgr|0tbWqf z*i94V96(rA-&VDIVY3-_e7_y9l7jBB0_L(dK7C%JA_`TV-X|R_=$mpub5sE)2Vx5$ zq{U!>?|;nuf||bRr=Iik%q9G%UGP`CZEF|`oSMbDyo&AQ{^`3ru9N*Wz_;nnl>vcf zBW{Ba5WYtFXdAbzA+iM;DwHOi4zAPKI96`B8p7nI|NV;gzyNuLymBJIn|M*%$>(8T znhtZV85RMrq`9d?l0se#!>$6G1|hcj_YXzu?*u}TpLF+_xn7b&F0U=oT;Tt1FTx%>`qH(KGu_PQKA&t-&105BGFz}yhx#F7qCymyFpRb zCxsSQ+%8i6`_6#(7oHeSOK=1vS>X9Nk52f#&G%+a<<_v}ABcU^yljP-&u(^OPpN85 zqH+(Zb;qQhCmA~1B#LYj=eoQnZs6O2z>i*HZob|Q?c%EIei@B+3Fy1)GBZ|U^xEKw z9P_ZclO>sdcanKZD#_x7kH9(+kILMfRD1wZ&&l$J?@qc5JK{Y&9Pzi8v@dZuts-)d zoq~$-Q$(8VzmMGi-Ek&kcZ!bg;`LnXi}tx!RrzcLEmmi~f-%U~WP1p>C#tbpjDJ7({_#G5EKqz0&Lqqp zv;*ATMsQXWz51G_SE)J&=T{jIvI{u-?4K2G^Ulg9Mf~%#{{6=wcT}0U=%9^a0yAp= z^;by4_HRQ)F!tA4ZN~rmb6JS-!8gq_bM5|PwXXij6mwlkY3AF9mC=#^@m;WC=+%as zZfua;|J!-@e}5E78q11&+pqn8g-D)+$$Y)?hlW=R~0)iK4~H_NUZhtt0Kl+e+^ zfj5Ec8)zhz+8~a)J#P|IMk9cq@f7gKeVUi#PR zi(4k>h)cWn>c3VF>;^axSAKpX7`?XRJY6E0n20C?1+4zST>vW>fN}Zel}Qm82N?jQ ziHth-=6!mye^nL#eACedTWljK^FQ~t8?4?_Uu_n{%;!MH2VFS~-p7as#$=p?JXtk08w|zo1*HGIr2ABX)y^<}`@!XW)HhayfheY+{iLOU?Qj`jE!Nn9tYHnH z{4LW83THa*q;}N&C@S+`tII@0(%}&e>3x*XH*XM5aE~5A0#PBKKiVLqO?itzmtOzVf2&o=@NW(6~vVBDLe-ziz2i-&vu3Z z5H*!edP^7FmF@TA?@ZJQu4$!n!eC{+x<#U+7?Z1;qtS)JOl%G z^yuuXAVr(8!`Tnsw;Xy63a?ULajYj282&?Enc=6=1teVo@40 zISeol>HU32Pm-k@JYbI7zsNE?nFKJa&w=fPp;wbh`E{eO&-IGbGJ`;%^3SEQ8%x>T zCJoc4GmgQ0XUb&UW~(VK(FvS-gW3VV-*BvSZ~%1eS&W1r?-U6+u+goVacbW>n#tYa z6$+c~0mKcvrR_$kC242t7!Mc^i|Od*gvhYnQC$tKI^@8#{|8B|gu4t59zE2Nso+2E zJeI0VbD|jGt%_KvzBAPs$OW$`LN*A!KnFreYe`8oUrEp3&#L&S(S9v4F2$TJ%{A_k z@0k-@Sx2^OE)Y=X@QSud`YtW_Aq|GszElyLsIrssu$6&XF$-=SWP$+fEkl8gI%_Vf3Z zXg&!fZS+O0_18vZtQYEy;`0-nt_5AxfC%HmMD_^hut!_zLgz!aO(=pF{Qg%a@&@JR zFV))-$2kk$>v!_TW0~cOX|l+grvTDzTcllDd4Y+oOGp>6eX?o~uFtQOCr963Q�Z z>H}iX#vb-877W@LLTd5xm1C)(y;Se@f(4^m7wneU)pJI_-@HY084V>N!t>vbXEz86 
z#+W`U27cci-D<|SU&@Udy$CuI6#b!QFN#sRJu!4@=n`cVQ;E6H(BIeylG%n|{$MOp zK!Q#*(}t&4^29P~Yqnx4?%_?nmQcc(Zyix2I4St)G-ci+>~$-N2GbRN$-FIbOuB2d zb5RU4bq2P!g+~4-D-PWkR8@dN_qh{Qq)}CRzAANzD-pl4+Mlo`7Iy7RSX+$84ut** zS!uD(r?f-Nx4=>$XCrKc3;P<~+902?&z6A4%-~aH`=z!=+x50uMf!7gpS}Q_#GE>= zI+OhGd#KikwqfE|!uN!3+$o56`!zXo`uo??6Pspqmt`Q)I%h-K zl#i;Fb6;FFt?I-k4T1d}vZY;MB1%}SuwYQ}DMoWQWc;QlUK$Y+aj z%u~EeQOTC>{i9d>pxd;HKitOm#iIvJ4KS#IEQInNjSi%udRdsqdLl_Bg+Pd0BB@4&YY zSG(gWjd$hTr)WzXXSF#^%p|zh-`pklY5Y7>?By~Q;iZQ@(H9ENJm`ayiw2igFTR?W z&}n|-WIxxSoh6TJH&aU(a$wy4p~Zvk{EUc(f1E0ZM47dP)=c=SKr=F~Rx+Dv+>zd8 zIExYoeyG1#VCG(zRSOc;7`+MIvjML>?geEVhx0M5Ne3!DtyVx%P$k^Z804GT*2Ur0o$BKIQ9?0RqkZ7^f_vX5=Xx zI>t^pgf%BW=c&?z4u0ZVvzGBXF8~jOuY)$f7+lLzrm|id@CFcboR%0=T{HHmBpQY% z#}q=h>0yn)J=oxQTOy^xoScO$zY$r9tzgFEoXM_zITu)ME#!FP9@&fsL${800;H(vvSQ%Y zufq}L%p(*Ojp$P4N6bn1T-H`X7YFI31S>k8*FJxg3v1l?#g{9G|52e#>OZCkp0}73 zrO7JL(B_QG-+59`r%0qRZEiQ$)f7ry$M$WHrXQNwYJ~eVkY}#)Co_2N%2=g1-}D%t zX_;==<7b(weBjXow7~ZK9oFN&L*G!$BLqaPkxeQyMb*GTtZUXqF) zbJ=>Mc&mEGRoKw|SI#z{&KJ2M%w*C2IvsA0%@wdJX1}(TDK5<}zc~`>L`})gb89cn z%$Y+5=@OX*5JAD^NB0+GbMHCYrxbnX>iP#axon4Y}MGzUVClvAhY9BhU9sZ6@+;3RU=Z6JPBO zN|cjI-FF(d55p?S|5=7#4TA!!ZxIVcLue{jQ~caZRYva>s}7PN>FgP`8G51=fNwHB z-zTHIuOLsC)m&G?rOw83q(Gy+`Z zF4+zowPA9(&W{^}GbqV3=L={q+GB_3`n$VxOlOKXa@A#wI)^p2eOBqA)n;=slI zJ)iYz&zHEH4=00O;*E{LP?#9%<*~X^%&oP3XM)? z*Ub|BudV|(eHj&41YQuGd1g<37*5u)es7ot_!#OOij1zA($xweyy`h!y!V!Qy2D;56B{eq+t5COtU& zBy;;%k=SIzoOPZod^k;!;Rq!AY=)4kmR&Y8&wo%RM4sO^R(w;pEBJKZl9mYC>}WBz z#dsO}nhC@Jq}4?h70`=TRxRJ{bmu$8>ro`yNt!F*L#{!8qLR2$VVEcpVY+n%Q2_(A z_3yg|uL_~<86=K`@|(bpm~{gnMcRIsZ=gnKLTlwK%4*?(~@ut~xZu4a-2Y!4@>6UB}f!Wi6xe7NYa+Q9im zBjjKn^?~1g0llF58C(cbw-a}kQ`EUFrbg|*a=0a_;TxkNkZXYt(}vgfkV>ON2QqDqCz&*ZYg+4Bg3=%DfT7*fpAiEqx9|`R0!a z*9i7n8Hr1zv&H!nSX>Pw<4s%Ts}&%9)S-i}mc99bA52j=S%S}M;0Ug8nCYGGgyH$g zf?H3Ee9Vn3iO;UzaEn01Gk@T@vet-doz{w9=(@S#4kU}b%a%B=k^bwcu5m_0fvtj$?>#L<6OYsV2HrR^ z(g|`tUOSadZds6XbV53|FAacVNmOP~1fzpc6k^PZu=v)M^K+MmT^fmt7!K{1^pbkT z4ZL{1@6fxz@X;E7%q#+zQvW*sYQ@oz#_NwP-eg$~R7!q(=W|i+wlPcC=FfB~b|#vb z>$AKNmP$Kk{}QAu>p5N*>ux*D{=-W zQ*XC9p3-z!_A!KG;hjKYtcAaXj!8}7k9!eTtER6)9}{L|vyAQsenSRFN0oMzG~)cC zc{2%%{9r?|Kmrx_kj%QVP2-)E8mCR4A<^8YVfqoohd?$4q#|of3#1djWS7ww;l|%J z7`<=jnJqjh!MbHJg75;}vmO}u=A}m)x;wT-{^E(>sgQ?bKhcaae`(A~Ecp2@E^n;d z&Hx5RsE?YU({MhkH+E)w_IZ84r9YdrQr-~WnpLsh!2@Kq0#4P8`v)WMiKnGS;Ib%1 znYa$z?|mc7dYR(#zUJ)WM*_PXWkx(H2c5{iq}6a<3>+dP591L)CKcVF#k2T4Afm>8 zC*lIK+UO`YbkUL@gCX1xcTr5V{d-f-Lc9Z8hKHMnTDZX?shnas9<=N47jH3i-nJZp zlvaLDF}Y&mpKQ9s_8B+IVET|`IlP%vgE#dAED?r0yTm$NXTy1%sUXe7lA)6u4f%N< zhQC-ITs7-EbY+NL^dyI!ovQmwG`*<4^FD=@Z`J*XCZ(vQ2I_~WuV?i_Is)a{Jjbtl zEsIQ~utG?n_I+Urz}eS)-Fr=K_x&Yy6_CR{l-X&0&~*FjfVvgSs>MK+Qe@Wqvfb-y&0 z*ULvYuB31*j72F}Uvdz47g0Z1n3t0eEwhE_fDB!+I59@=yBySIHx_Tl552xAz*PCE zzi(LGG@_dp|=OMTINt7c7*48M^|$Ja5QXA+?Dhj1&R_cockhIz>TR ze*|Ky%|jEiDj9J*6qw?Nxaai!anxg9;I>Awn$Fa^sS?~3dKSkKqbb){s>_5mb9+-E z`&OLv2b1qBuWo!F)73(p%pK=Y#}bqi={&NUl&&f@%A^S(_Co8h1Y9aTsFOZHaz4Q% zCP5}#y#Hb=Leiq9Z8%{S(O^{2I1akuXG^9kH_foznAvG}Yg%aAefxHO<|~u8wAE|$ z`{H~>W=`c;=P`Gs5(}Y28AnnSE8A2wl1_{A#XLgFLMnKI#xMEdrMY8pQvBa+PgoaR zmW$E}o4&13JvPZdae4ln%;)(`ZNmA{#tV0vhwCM>cci4!FHWtipoi$1?cYk11Yn+1 z!Nubc@ZHyKbG%b4d%iQ!7uA4r6*gs3d`O63McB}}!YqXZOhI@26{O%yaNSS9RS*yP zidGNZ|6ahYe}cDS^br>Geg8rV$0!8XhWzfVUghl>KU6R87mGVT;G#g5eHL5*4F?kl zQC5se8CKI`TAD5e_YevU4Fam1eH7{AkZTiuxtN=L@yFuOz9IYp;Xf~dSVlPLO}}%t zufykGo#Z4LE|Q(tlo8y&5F*s4xoy})kFQ>F_d}JiMX0Qc=Epqs&Xj{kv_hgyRc4Mi zcve#i4>iB6{$Sf^qU2U@UMUHqu{7q4tvScUA^fpxsWC(c{bH-mYWgg6saQbDrHz0i zsC?zkeciU# zM+`O2zA6J^>OH{_TAPf)Oyu-=f6d-1y*({4J^@_to=WP$IQ`9C)P?W0-~^GK4@?~$ 
z_3olkwqv+ugrW(K!TkyQ{b*hmL^NU`TV1x?@AUes%+>p+^&~qM9`PK@2SRN>M8D&4 zXy(t)>WDH^<$MM_+>K%CH^fQ?rmt7d^CThJ_~TrgqlC?`F;j?H(a%0&Wj_^u;oDAC zKS_N)_ zbxehKLcN~MpAHutR6H`8DC`bsp!QR3X*5Wm#;A@^vF-X65Vf-tDx0hMR?5+|QMq#s zqS(e6PholgrC#_l{Do%OF*yWO$x-`JJHi&z=|VoE&+Asx7Ooj$esohy=1o+Hf|Ne5 zvK<+02oaTRnfe7|*UT?F|GFbIc5`>qEnKOZ`R-+^*hz0ci9E{HA=by!ZXl~>OEu@} z&dIAu=(qxD3O%+eGR%2(PJr`+0nX}t!J$=Z`Kf;2w)Pd)gx_U>6ml3pm)0}vI;#fC zAmeS$%@lK2in5n>u>os4TM*I4)uXr+?po@GKWQb!QJ%G;==s`)=o61m7@E} zSi)!hS6=GR&_-2*KgOKlR(~OTb?u9A(__O3hL_~SRBQk1Zsg{cB!~;ltYc?mSe;_L?|C91UC_Ckcwr z6m5;>4N(hd#$xTQ$e^H$GgC?L$!j~X-6`ZYLJqNhsH}#T7R+5K&QCTC^!YK`P(yw+u`K zkDhrs;0=t@j@(##{wMMicqaQ7%zvs&551t^S8BMt@Ua@NP)Q}SJ8EaOKNsL{_9WB^ zJ21Bpg{8j?cqg!<1TP+SDG_wGFj;PDvqQa`Qo8%Z1f;LI-J}7rD7N3329pi$x~9~J z1vDrfbAf0X4XFE zr-v0r0!(?^j6~<+5Umq86W;~r&nM#iLorlsXVZ zhZGKtuNXNO`JBdZj_g3kBg2{~(Ld$||L9%=4cP#U2w|_y`}k3qxnX!%Q)fz?twWtD z%5AkE#JJr2K3n+z7dkF%yP8ZE?78^xyPaB_4$ZHcn8j~q*Gg4Ap^6Dj>HVWq{_Asv z({KU6J@1KKT>c-}&K0(d+>;0A`I3zwKgAz+pMP*(I-Y`i(DFS~H|~GaDE?J+wUY+I zmWR;SSKEiG|L1QL2O2VgPMu3yE9qaq^zS#0_194WW=k)<%<5nALBQ98`>OJ#vZg`* E2ibb<{Qv*} literal 0 HcmV?d00001 diff --git a/images/lite1.png b/images/lite1.png new file mode 100644 index 0000000000000000000000000000000000000000..711330ee7661943543475d862504695ad4b7327e GIT binary patch literal 258476 zcmb@uby$?`{yj=bNGc%GDAFlIw;s)EktwZzMYhwYyj%sL zZ}>L#^bqyT-+tm9#opG%BV(dqOAIE$b4)varM^MGD=LIK0WIcUWba|*gUvBm2Oj<_ zxil6ZlvgzoYw|P~F>4MqVX9YeI?AaOkwyLbR-`!)m7`&wHbH=oWYsiy0zocf&pHP2 zZtb3sQFNrfqDV**ApGP=`rT@@H8}b@{KroFTfXFhk3|C#uNZ%duEzUF+?WU9;w&C(IEdvGGyYY zVcu+9(?}-rK1|pP^qCP~UPh$|ypsK3g-v^c`7m%uSEIh*`!DM z(5{CcOr{Im`Mt3uzO-aWal0Vdv3^J4JJ!wFrXitV!4UL+v7(d-dNoCscUqHbDS!To z^zFBo@>q2K&#ml`=W#E8W-bE|3IKIHx0K%50ok^5?Hg>!Sx;N7JitGqlm+eu74>2FI^D;7$DEfDVw^WE( z-o0f|Bwi&NBmN&xHhrHfqsIA87?Dq-)O<}b!kxw?5e4ufLcN2RpVr|V{;D5+V&H9X zjB`efE+%G%PCg+f9ENM zLroU}lP0w%1%G#-E{#;8_g7W18b_}dlgfmcQIQ1FrbQ3SJCNvne~~FSB3nhW5IUJj zHypg}ScUB2bMX;Dt@Gs%hD~qqCuU|;xlRst3~n^0PR25eBAhQ@y>1vMea?+2?OrQ( zC)6P)m#(7jMonxeMUS7eEJl?`@1z82()q~U-)t&OmG|aWY1*1xnRJu{ky>E>vz^Gti zFll|Dd&0cLVXm2K5kpH1itGy$;p`|?DL#dIDR5>%fkn=4Hou}q9=}qxs$=Rd-$XzG zWxkHO0AnzJBx5FP_F)=D?qW_$`iPlk6@m^UOL$F;rnrENo9Jztq!wv*m*!gjphlr; zq1G2Iw|qs-T6J9Y)FOjCgLF^P`wG`^RPAw)%-gz7m}eFpdHNZx$WE?{n__cU#x1J;+zsLC=N6a z49Ho?wNl=sAf+&-Q1g>xg?#KZ0@%Ooe|byH^D@;T)B)M?x*^zc&tcASX)kmbb9ikp zcK2YSvE))FHJ>sgwYXKR_1$9&qO}o_9ct9@Q_^<~RfxJu>MCS8jGnyQtAkv8d>G0O zwar=4Y?t%*knEEDDETR~T9Pi*HH5eeu}izFFeIGBIcZqoPW~Hhp?p|^AaIu*Ie~9L zc9X%hG(X+EE`Puw*u2oZHKAHNLpyTDpz?N;0Hg%Nf@y5BY&O53ees#aRC`=!zWj}@ ztIkvAsSl89cN-I{Sj+0+f}gty)ex?^hT(?E8wmDb*OFWN&N#9|Gkj0TTTh)@$5_c5)=$9-rK*7`myqh) z)Q>E5HLy5*vAbP8Tw%2<1sz?K9ZSPL9NCSj%>gDsfm?iRU$r8p^h!-j!xs-1!Hc)e zj~8~wy%#eZFw2nhmk&RzKJ$AFy7RT@x(nR>@HF>Ky63-BxaWJAcnC)tKz2c#Lx!Nv zqMANA#BnEHfB6Q3pWqC#F_<{WPZ9DY;7L4ct-qPy8~-9qFrLMWL}DG%S<=lH+<<)- z-dBPg?h;oDHP4dzOFmvoy1X9<<0X^{4GGsHx~A`Am3=4Pj~0Q9s+3$l&oH z{g6xCPEtKbrS39Y9^M(c_Akd@TG}hxNfnF0ZsVgKiXIv0Oj2P~Qk>g|ESkO-DTx+> z)kYRc(*}C44}WY5S)ErI$YdEe5=1ISac`Qpi(k+rgzgi~w7a+772M&DMdwh`4>RKt z>Lq^U`Fs+R>zD4=I@kfU2i6R5q;Ppnw$J!f)(*Ve%o;LJ_##~EVm;MV1{T;-+XN0h zR`!(VHEkb!U4Np`DcK1N67a&f&8~T0{9e@_ls@!JA^2G0mqc-}{)%F$oknH0A`FJZmMLkv>&(D63K`9%F8-5!pOx-0#hMjsE4Z8=s z4245lb()qs;@Z@@V=BZNGj)@n4o<66r!XeCh?KM-I z$x%x3OR}-Tm{T>h($FuDt>g0Ai?))PgFb9t>kc~k9+=Fv?&BS7PZQU|)I+5d23gzn z`z=Ovw5=`=0~<^CF&4)@Y$?~8uzj|f5iV1zIl4YFZ(W+EtZ&`$-tuYijYFHF%%Ef} zmf8i5Uvi_9@RO*K%;89pS(~4l^PhF+i8yg4achx}@Kx;7Z5TqiWI@q-E4X{KJGC(=br_;9 z$OYDc4P0x3oXsv{x9}m(&3LWbM-10;u+nojD+7uLG((@Q0F1uI1Tnk~4^^KwuU~z)Nn-PedA@AdZ<8I#ygihQ(?-+! 
z!j~}+5ghG^C*MVy*6}iZR#^%4JN{_29Qq6)ix#0p8Wr`12Mf!62DNt)b@_^OBSo3@ zRAAdwi_Zc`Fn}!5(~3CJ2SViX(h-MK2Mk+jZ4d&&3!2{_h%%~gju8+<5oF%KQ+Gk! zZ$|SrmU2Au$Z3~Xg4zK=1~N<-uP`uVzEcH;^m_6lc=8&N!h|WfmjIi@`W>NFMfXPdUtRP_>GSq&>pSzOevWt_a8YC@ z#i>Um@emOIT0RqR*lUj)OJb8z`JRk z6T+{5-r(O?pZVwP$?vb8MC*>1;UK}^>7T3Y0-Zhp#Qrkt`R9mE8^Bf+0C7BR;PpLU zqFl!nq1MMwtcqkSL_6DZS|rQ+iV4g2+!~D2x3_2>$6SZ zltdKZWA0Q`mDChkzPvVVh~pBkb+44y4~vT)7M53x$~C?U&u7|sO&3tJk<90{)h|!$ zTCG+=RK5&qyvqu_7M+Vjsk9Vey>1A59^W1}V!~H1*FR~J;HoppQngXl62}D_tb?ZF zs+sbu=2+hL~XJ*2gWGD{yj z=e=j{E9LF{{FX2m<-3V%&*!qDJ;ln@K%?YZwS|y-U_ySqG&jZtdj52 zm)HI7`;TA{eT5gX#CO7b>lrG}mrP1Zn%~rve6KqjZ7f~)zB>8(GsCmg^!yPJC)cPr z0DplXJdDN=U(gmH6X-HvmzJ<5kb^8YItbD)QH%!_Clw^>7zF6B#fIs}O)6+X;^ega zt-8m@O!kJMV+~P9lc=@sbN<`AoaC<1mAjTb5W@Y9< z9#lj;)i4`YoGQB*hmAr{z?f@P7iFmNe#PQxJdc{T24<{Qt(vw3W_&SblyxWZ_K1ck z`hNnC7O{4T74Z2p=d1@r($&p;-<@a>3wLaPnqK0XOF&o=wC~b|i1a14?e2)eIxTE( z>Efob+H)!E^KwqAY;l!Sr8MuPa$>*ME+I7VRR#Y&heeSkTpmU7LG9|eF4gqD70go$+z*wA7(WgL=zbH@WsGC>u5ly0Cc8cK@C+WX_Wg8J9IF0){ZFap40UIJL;Y+79 ze4HQ6>9mWYkSrvDas4wx%bkP50%k%f_B&Z4-mud#QTOy&3m&U3&MHehAyWJ&a{m+|0%{+;(o zA8>Hhq!`@pWax7DmeYj|yB)4;$V)+hg)lkdi{@pGlB}egnq+v)LE}MucQKicj(Jwo z=~^v{wc^F4W9Ri+^!D~w`ISBOvnKdMg~xF2ak1w4vyw~7J7OVOKh;B@XI`L|0r_?!(hkCJ~=f>A&=#2 zL5*cDc`2COQTWm#Xq>+iGI(=i@J?Q<11tLrdl%(E5zUEug(A_~QXDs=FQ@0nwY6QP z9$;y5Vq<4Gu6nHbbYVZS!c=eWY&6n==i-w%P6}{#+)luO^Tp!?N3a@EY66F;p4<+n zK+K)GxoJyHEGvPV{kD?;F!*togD96(sB&W-Dj^r24LeYVZyb3%sJ1<3>e||ze+WxL z5vS20YIDlJCo#ZkY#_QWUO%NooJX5y&cM7up}~EuB*}`0uQ*ejhd5>Ph3n4Dj&%dS z6<ek#scV5-K7>Gc=bFbBUx?4xSYu_~L*rJtp{$?p8H=4)9 zQ7^!tfqy|kadh2wZ6!AMV0*Z{>{u&z&-N8?M~H^QzApA?-Q}^`(GkoqZ_#Hz3mk&z z8%yye&&|y;s8VRhl&`3`c<>YT_2$VCT6iD$1w*__bpvT zr8b4J%(3V3$c|ki*%}R7fa2KMdrI8R0b3w(nyAcI#-t0ixiBqdiF?IdhAwaY^lI#L z#D8Ip9$dtOo`mhaOloJ()P}=mUA}MYq)A2l>IQ~Nj!E2V{TC_fy>pm;zwYMzQlYVE zrZf%|afIfr0oTD^e6BnDUmADT;aWXM z>;sc-(Ac3+U(8{lWg{=zzi#+nU`vcPBE22_Gm`tQ!&ldE&-gAh+=}jbJ5H$ro%|9L zN{uO-3|rzMEcDp7$XngTcP{ogJvQq&ldbS1bSK|V&3Epq8~S%@2QDA6@&s><;Ok) zuk#;eA)WK@t6ALgO2B9WZpk$1>)m5|Vh&NQ^T~#~ogL_oPZ06wd6DK*^Gd!0pV+2m z#Eq;a46U^D{) z1@77xP7mI`K#C`?;rL*Nw(UM@H&J%nz>H%LYnSpC44ZXs_U|j0 zWw(vCx+_Eb8-zWP6&3hm%r=4Fdc{Y6x1Vcf`YE|DN!RRFfkzFE%}`fUqf^)Mc#?e6 zCVsuWI)PJ8Z>%P%mu0Z#K&or9_o7~lylM2{)QazRK5lADM=^kZfr)ctpf0{CqmepOVwAZqm4s;A%Rr@WQ;FDq^3US2+x5pei6NAb#J}J#%9C@YP?5 zEc5xInAsfZqS&qBBPwHYL6=Ruo*wEn%73S74+bJn0O)w($nz4d@VZ-po{ZzE1_im} zqT7D=IBWpuVoTdoiI`~C4kc3gd~0C9FYwEkOOVnR8eCtIoFqqE( z1&=EU$Qv38>Kaorm1>_k0bJ66&px~H_68N6o2O|XkO_-FGQRIOzz|t-2 z{f$~`nu|@~4_6$ehQ8Xy%_j1?jltdnKF~-Bxn>B4Yo$HvCD9578*= zwf#LkJsCB8I9nD@ya8ANm8U%M!=h8-;~vM+ckivSHPmazx)y3x9or=-ipt3bgXRAK z+*47iYwzFz)u%A~Sq(>029j7I2pR+bn9^7#Fu%f%@@VjbpHGsY9u5KG-+@v1PE-K+ zy!@q*+oit4HYoKKV`P zq6Q!f=Io-1pu}41OoqiZu)T4NKQ2XD!WCJyfj!0(a4UGBbfZDD4EOK*`!{(_cy0^g zMT>hWcj(4H>Gk8%@&W0QwIHJ+2zA4Xab?X~-m zItF-rV92OI*Er#ry*0eoDcIXyK0>TaE=&=cDVDPTIWHl(F!}=HIFOQ9mu_y$!0}kN zw!d~`AEaxL`h|~t*P(P6o)`wFP7Q|SYS56m3)K{00Uh^(gJU9&TI@j_a5uq@M>}gH z7~xXluJO%_lx$ltpT6r}f1cIj20>naF2H&5os`#I>Kojlj|2|z zM7ke0tv3?jaJa*Ej4=3Sh^#0Fs$Z)Nb_57Gx;T0BTHF4vIHHKpBHktN+3^ST=UdI4 ztIstB)8nO=w@BsP_K@#?i9<8VI{Ia+i>S}7{^YOiNE^oJu|d9<)zXSnYo&pct$N|8 zRyoB}$eXIv>8(&`ByRjL%iEaXpMN2 zQ;3mHwt3Gfnte^XhE6HOSDWe+;tLkq07;p5NoXP(Q^(ETIXMxpPb6STpqj5N0$A^0VDV@KW8U{LeqwO{CqhVA5o6x$kMyQ zm|VcKDKSQLVI-D)pMJ7G`JF1-bZy8ypg{O5^c0GdN^_(&m{J4Gme2GFl88a5zg8Tf z{7x^KQINWr-`95!YR&HUGQXDtMz*Z4!SCw>4CM--PS$lpa}cZTbTUI6Q~}_51vpgC z!2jX3QK@BYO4p_FGBrUZj3DyuK>|NhDAN{qyxSe5(LAFu_!nVp)%EONzSR`#!$H)C ze6Zn-b#ta}qSZinNOAN5sFdF&dv0US{-T}*&XtmJ92>OE;24|@EU-`-#sKWHa?6JT 
zCh(Ks$X_8Kzt5>3a6V9%#5}4Qh;UhN=fpo%T{{9zGL+74s;v|h1g*O*rRS%=;zG@& zEYZ#;$@uUe()Xu;9xG72iMo**ICH_pcDfHC=X}2+O*m0|utv%EZuJ>8Z?Xrq9~HWH zTfuW_{*?a0#QYXVN`>&t(e+~h=e(PDBKJ}r>REF47tQK8dNLGEMNtvKpO=DO(uI!+ z#0o9cqs?!#1Zkob*(-HKG2s>e9KFZJ=fmhnK>@zrur--P3KHXhZYl*_+!6I9iP`%G z9@X|u7S3*c?*+{7mDj`MOX=yKrSdZJ{bavb$O;b;veEpLk|GtXmpv1ImV;`2qgR?GWoCXB@ zE0cWHr(B687soUr>fL-Un2~trm#ZaO&XQb&B-+YZ!lyLHRU6C$LMVrssRNOWZ4)z{u}I_7nl9 zK_@SWeGqJtRAhZbzpuR_WW(@h=#d)@_%%TgG|t?|hVkv1#qS6Sw*L&#=hN%um1PYD z>CIj&lLUtO`(s~D0KZseyKl^Q`_$Lh^M>*%g7ODL&HlT)yV7K%yGGKLKc7?;-_Yf#xkRLh&0Y8HF$T)5ECx4$=rJ3kb5 zg2=eW!J}7`+V|y4U2F8r;Pp;v7D0XsxGp>4`uQdG=RM=w`9d6n9jEJ>WDo29JrGN- zc*-GRqP^+7+@j)T;b(YQ&lwcJB&6JI8zOA}vy zjvpKD#lIWIy|lSpR5+L~)!^hTXmL@YtWTVf_^ti^*IIrj%9l)IV_Fd|=}MvM%NZ$~ zKwOv9?fcc@*tNGUBH-=W_}Jl ztNGmIl7vNE6*J@l?vY59RCb7i?ti`KBs)Zkb z4|~1^5s6mxZ#9&1!>%xqsLEVXVlsaOYvBZ~->N3}otE`x_`)____S6{Dy15r!`F?6 zX+3fJoJ64&)O-Utc!!uSfm6eW_nbT&@5i>F_Dg_v=-Y0O?7NYlX*#k3ss@VZZZ3{* z9_~U1{k2(q?`Nd!m~re%kK{t7P{3y8j(V4P#CVEh(s6AwfZZW;&~H!lO6C z=?xR)OsQ~YaO?0IV-8botVC$M;xir3kGbx9a;d1%dTdKzWz1%r>C~_P%gUANedZiW z%A-OmkkUxu6>>uf_I8D|!~*%2nr>oYy2$k-r(^dGQ}i0w`KMJ}A{;v0OCMiNu`Ds$ zYa6uiZWSi$ReZI=>p}IgY$4((j|~jdZLBou9^@VsHLAwAYHq#nD|T|@U=blb==Tqm z-%FKhzJ9g_7m}OK_O-d@@^j4TB>KQkl~~h8$GqY3$P(~OHV*ebTYb-kon&5C;HPEt zGJ7Og2__14oG;c!KAd49h!Zv1KyhhP@jzP{TDlH5dJqhQhLnNo14IhhY+WFRKr_>f z07`?+^0$Bhez-O5)?Z_0X<*drCmu^|;KQyHe)dY2@OrUy+8ZA64 zBd=POr`QozA_^x{sobhraHw=0qRYv3!8nkSJyxBzgOrmk&-~)zsvf_N(o%8M6_Bv_ zl3$eZm6DxdBS7hO)6Ki@j4tN;E+Ld3A_LT-Y-}*HTBAIqoR{uidoI(+zI8GXSYf^` z2bv`UMdGzIRNT{2X{G$HeO?%7_*kJ%zays)RyNlcIwuzrQ}Q@8%-<;ZlIIg2Yc*T1 z#uAxgA-U8E3?J>7!6(?6m$h=cv`_7*Osc2UJ7|#ttOU7Gcp3}FR>rLSg@8V^+Ru#Q z3navuPrX_I=WG}kY-v!!^(+H>gB>{Hxr~y!v3MR4U}6#n`gU9CQJ_RdNt&~W96nY< zYQ63mO*zTnkH}a*q6w4M@GKD22mb#MEJ7Q$if>HKEzX#6xra{MRQWV`EL8who2!Dh z#Zob+9~cP!f)q@&Z65Jh^D%K#swRW92Aa1o1$JF%x6kl$xhk$dy{y)L`{X74Yot~B z(aL8V0zLD3F{WL&6H&)&_W@TA2kFyiPaqTXLzgw~!tp^dIyd}!nuB3WvXE`9Y_hhG z56rmb)VwcEbHjL@U1at}^J2a-GJl@4b0Rz8n5&qtVXiFx^->@xXQo_8v7BBYGXvkE z#=%k(+FkFyIOcdqMszJ#qtS)4{bDh`IweJUF0bKQwII(t7f&F2t5u{|SAG$AMe&Nt z)?3s7-3;xZqceEi`R=8o$O$htrB|FJu8_opz@;iV+n|ntieh;$24rRcs_n$t@5cEP zkSgGGlwOn4Sa-C%aV`f&V8A>`}9Twn6JHU0A~&=3afZa@07v6+*YgyM6v+Q`3a1!zq?R zXtEj(koe4>l6fc|xs!#ajVI&Es^i%jZ_p+@%{)m+c_r8G5?{JR4S`<(L|)QD5{MFJ2@$JnaXtK_#5(lOmSDSJ9udAbF zQ{5)8cHk{D0Y5@06iq2LXAy{_I^}UWj4qI1qWxucrZ*dVQ2iIq+FPg5IO~~x-lUmc zz1MEc!MX_o8AjR+C$s3gJl?$DQ=q*?xR+|LSyJL0^EkD+-Z2m-6!?N}cw_reV+gMi z;yNXH4NY};D%JRIW^0Gp*=B@hmZa{(MdW>z9)c8)2X+b+bazgj@18mCx9UwO-1G&m zydrVm#dtc&tt!4H`Jl(9q&cty3L&9-`n$zThy_dO zl!^;}(c$}5Ul4eO}!L;44Y;MsQT~H9AMZUKmI= zMy5c=;3LQRF>&9mhP)#4W{OBFzcWZvxCgopu9StVuKPf3fRY5C@m6a?&e-W@B7a_LM;^J{*}(S^jZ6?$4BQ zfZC>-2gDu7{TvZ2zEhy44qB;G4cr*y-v>TJ*XHp9YyKP9d_`IUd8jMo*dU5-oa5&^ z1J@6sHAA>Qz*d(JC_XeOF}VS1(W#|&1~sllkMshGl-VaOhAJVK=9{56(_mHc*!`;F zlBJG0J{F`Ket(CFQR!N*ekR5A&1hibGWDxR89};9lpab66F=0xy|0 zn9}j40Jp#|`Tv3rI?{ZlGH#8uyykiOk5X z(A@RY72;NboUbBeOV{sHnXdKX=HiOIExA2|3H zmj45=BRr7$R+y0&Ild8K^;+hjcOOUd+~$3J1}Aoo=(b*6zSAqJko;Z%@~6s7K{Y@Z zj!;RcgX2#3lveI6y<_MzJi4K;nbL8C6o(!`{*9E&0aRi6LQ%**jEng7eikH7Dhgwo zi*8LCgB+|Y39PGBvfssf@7ycuGS0JV!!97gmMdtxi378?ez`+;``?(Oz|v2<IAAC=&5H2VeSWi`oN179~OD-4!TrM1pK&`a&+qE^TuHOb0S$HEpz9a-kgKV=0-qAA+BKrY=5fw?ehi`K1` zI8tza50Q%Fn&JfmO?-eu(+Q^JGh{?OTl8Z$@kK+|7ces$Tn%n6m@G14BKga-?T9rq zO?~9nRM|qt{GtZgLE{%_eq$!<(B9#{ydvi8zw<fJ;zjyAso;JoSbX+zWCZU`=-rzj6YbD@t{V5|h+pM?hl=HXO~(J6aDnvsh>M+VpqPh}414Y`Orx&L^WYr1?cn zaq-QuEG#Yw`*r2TejCe!9?kLEUsXi9WfYq-L9B}DY 
[GIT binary patch data omitted]
z^V9X3=^KxLEshSpqj_?%t)vdht28}fzNwuDZkKjCBZ5_C)hhNVg~cyLm%SQXX5IX= zCz1SEXPJjl6|ZTZt$Ar(-tJ?#G>vlm+d_PgOsYQd1TZE>XP)2UW%_R$XKeT53MTyW zGM`hQab)8d6AznR?(Qoy4f@F$wm^C3c0w!W@(C!s8!DyLwB|qRY^V_etf=zN=11lH za>di6&zS?h_)As2?j(!RsTUm=&9|GjeK zVKld@5B(1A8~$tE&A{?Eth&CktFeYNbXOqhdK1*mkH7FR-kUwDIUQjW*6ol<*q|Qi zo4H`E`z2}U=F*oyOVZ4&O1Q)52(#pt8z_Sa0Imwut3oll0u${3=^!pRuOK zfWB*xK%U!D&#-e4WDOb)9V@j#-BQewFzUYI&&9m_jBthd~R9)mnBt1QOiVV70?^Ra2?S93j za{^WNlHop569>a_19C^Y9huzkPijemtmOF2rAB9jP=dQ>To@w1B(3Xao~8d7j3^(b4b{~NNq z^IjRG@(04-{GjVnShOsF4{ThN{CJ(j zs&+TwLy<0XO{HZ)>~}3P%Dc)5XDoL}sJbqL_`YJ^4o7H8(wGlgT8HfqUkH>?|FhJ+ zBN-0t#;|UsQ-(dSrq{DfkC$A~t#FYk9TwBao%xLr%*I>T2_TwD2;?V?Vjlo~$I-g8 z-U5$1TE0O#HeH&Hu~Wu%;wNffI$`pQ)Ov9IWaMNyx!3CnKp7;?>1c2)S*-9FYutE9s~i5r3n4TWMUYejx6D`k+oOR$W!D*C3xAaDmf=U)d&%q{L>X z?BLoRRd%aWu5U9u)@M8z54_12yB-9-AD+SkS;bQWq)9mR9En|Q3;~+;lu4RO|s)0#H!p3cROCa!yi=N zS5OSZ=e&3|IYhrE@-|fV>Zv8O^-z|&I(pG;muxgy+z2E7!Q=%;bDs3ze*F4 zgyVMElIp`%3jNnUCLdyK{yLA0=!8zk1XRe1085jFVvP8z89F%x4gW9B-YP1tb?X)l z5}e@fPH=Z8xCVE3cXtgQ+%0%;cMtCF5CR1ucp(K8a%=5>uYJz>_jA){Bso%i*86?RxmtJ`=`k8vwL)TJ>K{#+FtULI7|XZsqc<H+|{5#H-^9x{A|c-~n1lzc8`d6RQT;|3Z|!jG3VM8(@#S0%nQH~38<>#sqZ zNWcuUP%(TtLrXIl{Ei%NWm#K0TXjd6c&Y(AzZo@3D)*J7DnMAn8anz9k2pz)Rn0mCjG2ACWB)qtvon1VdH?{8oJ1 zFiAXG6WRv;?I>r)^v*+pH=CYB7UM2DZUXLpufI$riYwWS@54ACK>DLGbEuhD849t8 z4$lxJsFhcg!O#P$e4i5bO@f?*+=xUPx; zCuNz3FQ7eMaFrh@9`K-G|HFYks7C6>MF(X82z=MkL(K1Hp@RaHR~{_ohtt*Tbl-;> zgVHMz z1iJ~)uH-i*5IgXxT={G1V9IQm)o+qg0W7zq{S#pU-cOE|;-Sn&pGq-sJ@sE78n~Vx z<>Yqlqqu@!V=P{-$rd|2VwMg4I;qdO)&E>{>)CnjYBt?xn?KP7%|~voW^*GlY^`$a zcD1MMaNO`qGxQ5vy4RoezuwiC!(Mcs&nPH3uxOcr>-CSBECeA$yTdef<>ZZK>*yVS zPBP2M;a}-HP7m43jyh=VK(9CVV3fe`D<3~IF)u%n{K8{Rk_GrLeIWq#mvNuG?FOny ze;}ovGSPg>B?`8?4;4B6UTyg>00rFh-euRSs35*RuMC{imyQ7xRsDCt&CJGv^aJm)SGxz^@h#oelSWN$y>ETids@w_e)4U1w4L z{RYYDUs~Q;yKezjoM}!mudhb>`mEa}*`^0pbG%55Azx81H8nOluZ&Qna!+|g*>`Jd zeaMqYF3mn`HBVy-$vlh=jD`D(WR29(e!o+ZA^OJ6+lwSIz+<+!nIttkKpWnA2sSPQnvA5hmqoumcdlmuK!hq8BFCSoN zk%*BGfz0}-DnS)baFcN2wyoVu(^aM^5L%+_{kD4>lW{99N1xTNq}TgbjvRkMZ#UU+ z1y#M4ZO_xB!^&Y$YIpD*-Q$d;N|ss>7)%Ff%Vh~wLqsC{6+Y0VGs2ucQ<3kHMeM&U zE{);DRD3?h|J&z}V$MJoMk;U{-g9Uj!wNIXmPJjGmOqO^z|F-vR&j%2g?VcnBmA&p z?fD%t%#%FHW@c)Jv{vpg7e3t|n}oFpVF^x?kBG>Od1JagfEpQcu^xhH$R6a*nEAp8 z@es#P+8rJhGh&v>g7c0%I9)9^yFI?m8&mgT$ShwW&Mev^TAS@K@@_yF|HO=gKT;6Q zT#=2X^<*do@8L8oH`da+A!1Z+8j1!-j74@bCS=a0+fM+# ze0?EfC0azK0k{pZ!)HrL5Tk7mH7mX%Bem^yFX<-;UPLI19ETAJMx}bf{BiE#@qoHt z_+$-flu=SY`OLe}KF{E9&o&OA&DmH+FMYoBTK^K=yn&_8lbfUEYyodqjhL`=B&85Ks^HwAf0~+kC^Yzo6SvP&p z(4FWR^@4WcOZRtYJoI1pqsipoUSITed6#e=Ay{PulI_Ltj0+%LdlK>^r~8aHuxu{- zj|{|~rN)jjpJ&*r@RTmx^(Hhd3GD25P+=ZwU(&Ho!oDk?b!eaY7X~rl?Rqq$L`U*N zPB#GT!Ve62i5GlOm=lr=dK@p|=;$a6xO^&{JZv3NeUDxhngga6ms=y%%F#(qb zf}GNK<5<`0Pjqd5e5EMI^^kdaws)Q!u@2CNyS?!SU)HhQ-0b+1^uRq|W#|vffzP-E zpM!`J&&pGYmrVgrsBY*zMcZB@0N}*)quV|A+N6ojTH#yjspVux(NxsT% zb)hR)Xu$#9J)qZ$a%HmF7wC*(!K{~<^TYD)CZ@3feKWHSezb@A9}z(x?nOws?by=R zkx}FB71tBO({Ej6NJ4tc-WV?>+C10IA8uHQs|7f=95YYWMY-l`fX=OWRr>3Ev)XmL zJ_cPC@ZIs|^YQ;C^f#(vjuVnizip3;cP1 zE(#&HB3ttTJ!*gtmw9n$e#f_=QTnWPf^qe5$_7-*0sp)M;&d?;;k&dR@J08i=hbKe zk&+OO%!#U*>gOLWLx^5y_M4g&JLzsj$Fsa8J+v7jDrq(>|g(=1ZN1=_o+P< zCv-G?y|mA$megr4AP? 
zaO8i7++(GZO-VMBYmF56);u@;>m3Wdi$9|4F%?BLZkJU7XofO%POl?r@LMK6Ifw7V z!O^J*8dl3e#N^RSgYg52wjmS%$mbBLK~6{FRVnlV*#QE`g9!*?N9aAgXCgnJZ3Q;W zTQumhHNufca7lC*RGM!C_CUg}->^{wIYS~OusP{2;fCdqg zudEJp^(z{S;a1ypelJAE@?`8qxO$Eknekuz4p$WJpaVO4i!v2Tgic@Y&Wn8{A&pz7 zn4vMe;}PFQ`Ka6fIp}SyLg^Lh3{U`FHfq#8_iaPR#=@BZMejbJT7knYQR9LqoD5GU z5@t#Qfy98W*ypfarwyIo_TyLK_mhq{cqL#RI%~6M9-d|BJ-z046x_^$>=AbL*bc_G zjc^@~wa+7SQcXpCb3Y1(3hUF(s-06>iieBPJ(oju{V!;2d5u_S>I2E;{4W{qw2kRB z=R^+D#{ENj+L8xErogeQGZ1hqtSWscHyc@vVgJbo=iYe->HYqB`yR_pky!Vi@73{O z2S3H1f>D5d5=o{{@hVYZi(i+$yi7`z#QM@v_X=25GHeN&D(~R}jt`BIgxEX63AW~O z-~0gmK$YhNlBf|+gv@;3D8)j)!5;U;C->#~Vqhwnfyv@1Dj98X@HLI^QGY$5vq(!# zyp*dw2WS~Br06&*XE8yo-M05Y2__C5v(vZ*o}rDZmk9?CX96*r)U4@4DECwS>$9~l zS;lEqm4kPSdM-Qc9NV@6fsYZheybj7(B?I?6f(#hO+Z!pmyt(Xx(zexhH9u%Pzyra zanFY*`x%P1>gy6|f=E~HKK@uR?gU7Mf5Ut4*+VYB7O_lrWmB^g^1<@N|C=ysL`XY4^bcpfDimQP?(mFhSb@keD$qVYt zWGT1jEag?e9c?ZS*K2Eh2pC*b)B0?Ry-!SLPr>?D8B*@n)6!!=3SQ%MyDSM#gC`4? zke`}V0=GGCWtFOatNTK2t(dhw6?vSVv9P)6DYXPp(Zz*&$@mi zr68RGRa4bQ3~KXBlFvPr5Z;1J~sYfpAxJOf5DO~&uH-&%MBbGdu2|?X&@i`MEW)= zp78?O*kJPm_s*F>@fA)zCeVZel*`~$8nPhvGsXZptq z)9q7z4U-4wQP}IXJ*G$+oNZrqjQBqoNfsT4#yu8gLJwDa;nkn$4O~jaov84xJx0`H z8s5*PFDhB8#WJIQXN$TkH^HX20gpz>)1$i6Fjv=O>}ihtsQ*BgRvcvX_`|sD zu`<8(F`+Xh2wa=OZEakkTq^GGTC6imV6Cvok}5SRA5Y_P&RUnbIhXg|62k};lMFd2 zA|h%Z!YD|wuj?H3k@+by)1%E7#J24VfiO{-?CLvyA!Q5sHg#&B;bUIhL~;Ct@1Wh^ z8A{~t{eB6nQ6;>f+-}D+S*;d7mHV@267QJ z?NO-lxs|$zCzjCf@%X`QrN#naK%2up+dsiilia6=v4R(A-p2y+)-14q7n3(1GFPIk zRvmPA2#Pjdec8u)Aj5|&HOF_M(@`>ucTu+eXFsf(D@~HdlXf@maZHN*e)}>j@faIy z?hk`!o&8?V=R!?fQG2P~$g7G;zgsKTB=GHK?^O$z$;hEc?K)c~?m6Gfqc2HkwfPMc ztY)ZBMbJR$RnR>9H7~}s7zs2q92_iP*9*>cI|u|RY-BK3YUgyah8;Q>;|%eBWfcUP z`60*5RqE8lZ55$}7SY>~}H>Vt_6pZqUV*LL3qn5g$( zHbgc8q!wlWAtGRho+Ch&xEw`SMR}1Apt(%DaEZq}9|@A>#BCsgH5mztWQAyK}0gKm(tx`coZ3b>5=fI6Ias>042b+ z{(kZm)pech&b6J*bbEmJdiRsFO7E8um_W<wL`H@i0#;W`Ys@r?uYa3-Z=CGq1+hE6TP&LSh< zC?aEnrtmKO-(F>mvnD*G-FghzE{Qr%`(!LA(q>BoTa6J7iyPhV0>jb6Fdc&Zv|YRkhzE9~0_igF=To$1(Ujz|=W8^IL!O#u z05Oe?h%y&wOSpO;^!9T!)QXV_qO*KS57tWa z7uIbaB}n!Z&Scm+N&id?sEy$f6>owp%$>_7*!!H>GOd{1Fh{&`_aoI}ecASiq+hCv zlu6+)SPsrx=rTb7au7uPYC*v_N=+EZ#WszvE8j+%-cI(?WeQ)TnT#&B&A019g45GA zr9*u;YDF`n3;p!U<4LIYLXLpVb16I4(+D4PLzODY{fSnJjRT|nZ_krN*@G{|5ap#R z-5uxl18pzpfTPi56WXdsHv}uYmzXybp=3)@^m$t(B;eX`OW2;uv>_xP$mJaHIQUhc z$6?!|29(`5H><-Gsv6>4yhZ9G`}(-ZXjS|^y;boztm&@k?WX^gx!?E9IVIftb)pjF zRLQz`u0*mkKKJ6md|NsEXzp#ym_{(sSy{vt88kU{MX zcfMqqF1}ke*O}=jj2#SqJ=Q$YPNO4IXdh3GK4%xu;!>`b{8Df4S5Lw?Yd_O z-ToF;e9e3Xx8-?mE^rWo1I^lKp92eXQ)6OK^uE7WVe3#Rjx_jiPdF8^Vt{)8)d;$M zrlsKs7IJy1o>gehWJOY!I23hZT-CvMy&et#y!A2)Dic8jrW)sU9GAXjW$$1G*TkV- zxD(->k53Q~IRv>mjH+-NU|%5F|72tp#$qWWOU4+68)xB;RRk|XEz>Db9UbB!WT{@h`!}BOGqaX(^eD)L^O)k*H>}+!?Z0A zWPd`%0_VA?#ZbUFlCgJWb>X4U_nxFpm;=!IU#s$7!V3v;t%EKh2CzjoF$jMK)gJV| z$>u~}$6XXg`>0UvL(QIc{SLK?s8@SD0q1lJ6-E|nyF zpH+ovxK|MLbYhiswBxlkbJjZv%OEXhKbD|#T9{K1K0?Aj`raW9L&;oQb_@niz?XjBp#+FJX@ueKjYLxY9EdRaGX z$lBvLnE~1?kAt>q@wI4#GrKbsyyXLCqaRXH94AKII*{rz%vCxyltaw?ZYztMFmYE@ z-oQAzT@DGw`$H6P3`h76r#heL6oP`tuz1}b#B{XYq(okC8RK&$55dQI^e~M|urP+< zEP5>x1!HZ2GQ36T%n)dC?j5ph7#PX*{H!b|;KOPgaOPT- z$F2`#c`Ckk!AzSQ`KFy01e~CA+Y1B{=<*H>0WHfr7NJHBaU(M4?XL;7s zlz+V54%*XpV(wL}dmPz`g3tzp9tTyNQyOjh^UIZZ!Nlrvoj#9lzW1$b!y|Ly5Rvgz z<+RH|Pfyw>OJSSsIp?i0xchTD(U+B%{eT7KI=*9v&a)cMGBjL{4p2&B#+=VPM2zbo zQ!?VH#+x?AAZj|}C#og*%eN_W4mf1bS%Fu+n;wWx=HA{DC;xGdz0$0EU9Yn8+m$50 z323J{ZzdI|gY5NsaSGTE+D+AnUr6`K>^w;!BWzziyiCisz+ShQhxZqU4`k%n$;UnS zwe5wP|4zKZ?)z9XJ96{tF#a~5n&_P>IskmiEmHv4Fm0OWYMM4~z3uwSkV*uYI+1Ol zsHlnl5L_#0wiZN|9f;wrc4SP<^a?Oldyp<4And}00)-5@|Ed@NKl3#Iy1Rf3*@VJ= 
z5^L4G=0Xw$nlwJ`kKM$Z%{Q$F7aGR5mT<=oR{pldPPLn%O#7tiy5`B;m$q}6U z%tw_`w~NofU~e(AiCJR8j*%QRkU2U3IwB>#p}5Ex?3Q%acbeFAOuBx9MOKNby37Gl z4or)T(~4rU%CFbb+b=gWa~?ZM1cpsoMFpCqPrXE+2Ysc*H15h1gP;xmS*>(e`kh_n zBRKZLX-p_bM1;*WEjr@^7Sp;FWRBRgQ`&VhfBZT@?MV{}Yew&o_xm{BW%P@70{s{3 zor@cxw^f^TgWPWFQOPdnFrsIYbL(E9YI39pIC?F=P1QZoy{y&_VvT9E)H7~atoJ?6 z8nnRE+@ax`mun46d<{+I)=04&kox(Nj+jB$CDVU0TRXc&=JnZmzmS(pHuf2S);_tf zTGQO0jhFO zKA=BmK1fY`$;k8B9(22%$jqsnl-KduA{ro0wcI>(U7v&y!mn{;Vs6*5=7=`cRfG)S zP~Gn9Ek12z$YafsRX?{oVkuu0{Cv~^5Z+W*=Ql1WtfsI1sD=Xbs_##WBo`Q>*ng^a zY3nUzr1?$mY}dxNzo((jjN`q-lJFKJm+};PEbUNnU1&}1ey{xcUKJ*^XQJW9G(T8q z1NcjQ@p&=VrIkX>e*>V#@0NDU|Ahb6(@UN4l5{@Rn|;}NOnSfo9zteK<99zA{}F>V zjD)3t=iRC*Z627fQKQS8i^qa3pu#c`)Us0*fJj=qK5JH!q}xB&_gcOJhB`Gu0`>8< z0He&QTo3bAqXjlq)%5J1lJtkg3EH$YbumlwzEQ>s6$YPkKW;davDs?y>;=xOaM*9Q zXf;J}{({|@7Tn7-^iEcoeW=?G7q%d9>Uvvi_)bC*U5wg!m9kSv(ebM6H}{O$nt>h{ z-#Fjma+cFfPSA6$5#&~L9JYE>#7BxObw>8~$m;vSai#&`V*c?M&q^tq^0>e!jJR*F zPYvNcH_qbvC+I%fJhf@?oivIWLQ9mhN)gzOCCd$@)c@>2a!^Z^)I@-g zrQeXBhO6+AVzNX=-dCDRS_U)ul&C$A!+?0>m}euKviDUQtd1ZCYU232M6@4u^7l?d z!^}0YnIFQXct^4IgVjmXyhQA3C-ePi7hjr%!Rl6o)}eAgW3|g4$!d2n>aT6>1P;$3 zSl`p@{>d58o3zmobTY5#)B1C(SvWF7Zs)*3v~|G;OP@AfVlz0H(uEQz7s44s(P{i53T!uNZyVZY)IB`F=j7Ww#b=S=CqHEZbE{Bl1?2@0nK)S z)J&;kLrC00RJdsyqX#AV8Rlp^!QnOf#?aM$eiKuXQ+D)dzuP>1G>E&MC(W^Eyp!BF z%Bfx*mY10t_B^ye-%9rP3OpSUrbb?E)RbPnqs*Zk;`jxUcTq;vEu*czoUV*m*znvR$r7>ZsM58&jx7 zHo`*T8_RW65qVKVkP3LJo((VaB)o5?^u?B@$gLYI^aYC0gZjC^>PrF@*ho)M zsv*1*zRGKx+Hb!dve^T}gdzi#_ri!aSL}K4K`VlDc_?MC_}PeY)(oGc(8y2%id~{n zq(3d2QvY)LM51PL&I=RkWvzyF=Rr>#O3rHdNzIhge{5oOq%}NO!9*)Tk-d=%g=5F< z1?^{Io3huosl@uX$y=hdY^bu=n4jEU*xP4AW%sAfff2%$cv(&KtRU+hj643rB4JZY zkp&Y=169JaeBq1V_W|fkA2tf4A|J~G^(W3`>-uBG>cv9Lp)b)ccBTWhj3G$HhGnK9 zHCpNtiq-UBG8YNr=n#K`5HEh2IVbk4+{cqyvz!mWL~1-hc_gXfF&RXh|L8U7?hXXN z+6EN{J|^@$-fwGdR19Ga%{G@$DmRyp2kDIexK3XEOnkFx5@P!00kX1sYVKJXGBk;* zLt2-qfZ(3deB2_Xu~Fx9sLL;JWD2i(YT)kMMP^jO}KT`XoKEKaB?5S+*&99{nfo zCes-cQXcQOoHI>b!1j0*d@pz^6CbvZqu_y=EJl>>WIe`{c&ur~-RCrqF2f3ap&+q# z)8h{8LM|t!8j-oLeed6o0#m%{x~1UmGle}0do6G0DUN!p5l(vWdP6V$u%r`G3XK9V zMi2D}5}(QzbCI&p`^v34lBJFLkAA`w%NCf8M&*~Q@N#J=ex~r80nPGWs&6NO#$oPx zZ#9Y@`z!bk%nNHAbW%y}Yc#9255A2GFiDrHv6Ib(9a*ttp`g+m75hKu)D~7Yc@h&> zI18Sqg?ish9-mPOn(ZAN{ET`s^ud1kJ;iEy(etP*9%7n#A~;^kUSdgWbH8cJlh!tD za+hnCHuGcR+e&<2MvRCXupJF%hiOU1w@?cCHe*mYt-_og5#S1ZIJmssusTT^FLU-3hhCp`pnzTel(4SZ0+E#f$YVgA{%pt|^V9buzW z=B{_mnC@+$dqu&y=VN{FRk5l1yQwo0TE#TP5}66eAm1dEOJLz1-I`_SY$MDTGgN)` ztSH`I)(MCGA8gLLu15Q0xw)?QP+oSZ|IkWO(?iRxlQnl*tsJOnzqTIU6R}d!ng+}2 z1n`qpBQkh~eHlq|WWd0$M6tR2L@SEd`D^zx?V`)a_rbF=rMbLxiZidq<)!OEc{mrd zX&=(Noi?(hU*p?T8=dOXLh9CQ!Ztd(kRRGw0WOqLNRjsu83MG~F%v~SrRMz4eab^W z;yme;{M6BY4$Cd2DCL&HYR$HMP?6qoGodil`3hf#bAO^Z{5qB}Ff8+_;zMGt9j~&H zuMXT+96-h%EsnkM=9fga?uYb0II4eYbhRlp*T=HIEkNf0)h-;y=Qx;q^-1pu^pqN2 z)F_htktUTYj*CT7a#$>hGA0-2-e51G11-)jc;&rbRcV_bvtNi|@4)`3X*e4Vb*3hJ z)?Yk~3BxKy55uLH5?Y(i4&W`Op1k+PO+(`Iyy7j6Od!SAxR;D*1Bl~NUqrA&+uhcd zUr5=3-TWOq6bs?x7{1sXs>ZF_0v6YYi0bn{qxy~BR=$r~Rgq0mFS&8)v9UqG3r z@gxJ;arS(0{0Pzf1o(WUuHW)ItR>{qv!&L0m_vbANyWl*BCC5MoCm&jt+ME2kk-ts zjyX4D3ivo@Mt0oFVRm$<#)S*Ly-n03Q^j=*is(ku<(HG`@nL1VBITH~XKUhWD~Pg25WRe{vh^85 zXJkP)YHx%!l4nrFoSGYyKUe#U zwg0Qi+dZC~bfM4#C^+L83T6i1zF;fguoS5N4}{BngeyV+kyxBsH-1jH+Hbd%TCmg6 z!rJ4UFhP8c^95pd%5ddFo*XcABur?^MlN@~s9wMsBN@TA&8@YO%*QM%I;Q6bH5TPn z>2HmIXkT%kBYPE>gJZgAfRsSDGsl$65@yBdSHN8e)QC$nYJ0s`!H;)HonM^{HbYlg zb!xT)UVcwg^Ux^^b380a4@GjMBJLm_na#BzX8pEJ-eBX^I@T&S|1BJeFXa{Am7eEV$kU{DXTGtq=9f-~ zKt9(g*V=c;3P`a(iw6XGvc=)3U`JUN)%qqNYFBg?1(t7}7CrOy;*oJq_l?Z@Lxc5y zfN5v_$|c#CtmjQW4sv$1^(cOCPg5W}6W(&b<`CXW@_BP31_)0QEjbF?E`}hsdzk9Y 
zb1xQsI(8fkxByYDHfZu65gxqzoh8kCS02Ct93QBZ4Ha_eEWODf`RMFjk^R&( zgz!N`gE@yGx5E)MR>hS*)6htGSj zP_Z4#0i{jDjI-l3rc^7m#q`-lS|WOwTr=T+uuw6K^^3ns>KoV1C$L*LJT30Khe1cJ^YC>=&XLgK$MIvPNHjD- zIV?2i+_j{7W=>3F1#>&xVxzV^A6n0tLe9>8MJs&tfS@lQrV!%>b*Z%3GRc4KmIn=1 zS)U3y39P1D;>VhBSWCdACA%~SB!rD!!Ago26!V#%<29Ky_@1+4mbV3(C654?%=KZP zbIO_1M?L7E14tV$W&XU8B-3@fM3mOdro){bC;-gGuh-`hwlV6?r>5Aa;)8oYBrvqb za6SFP!}+ZFf{XDVFA(y&$6~J5F+v+#A=>vnj{EW1#jc#tWVw5R9<;yAiDY&j1%j04 ziAuZNsM@qD)24eST&fIB)(HayjKn~0Ch$$g`Hm({oi{fWt*EF`UMP)M9-HhDoXk^h zv0w#`w5?V^ZW-oQ(%~<{qc6gVR8kg8z@248X4I+XjFt=^RuNMHrZ{&vnP9zWJh15l z>SP1u>rwZR$M;p+-KOWd1kA)~r7PAgR%7om412IsPD+l|Y`&j&JGHQ8K}i6BGdhX= zEhDk^Mej*b)o^_59+6kc;aSPaB3h68$Eau_^T@X7%>8%n+JrLbOrv-B4hG1UQ`!1@ z1?|h+S);{Nj#GgSmyWFc>%Ic3Q%a~GRKjV4B!&ssRY`-hr+byp6m5!a`Q1$bt=;Ko zt!3sp3GP1c-=x3Ar5%km_TCzi20gy~>UpFz?7r!qgSq)S9+e^8%o<>KiO?5b07%@`rPqo_U(f8w+KO_%eXk~G-3Oin>n@_2{Cvg%6wP003Gx}r;d;YoHjULgdzNjV`{7jR`PuyYi8m3=$!Y!wd-MBVHq zPJLK$sS;6K2(FRm2h}-1$Pq``Nx#p?!Np>b%f4*C-*H{tR z>O9&B122DZm3(EK5IDG2m*ICNe0f%7`>{)T(MQ3?yDtO6@0w7E9VU>=_!ly{Y=P zH3FRuQ?rvM_2$B(xpPyZm}K5L;BDYlT(j*xegdst7how}#!skckHYpbQjN=s z*x=S$oIt*PxR%*C{809bA1nf1wS$qsrRO_v182znVybbC@3!Sw&Y04%i1$sb7pb!d z@NUZf;EY}Gao59J1VV*TuE!#Va_oV1CdX0vueWA`X7=aX!Y6(Fj1^5J#EUGf{kPj< zB0=dzigo7YqqT2lzi*JQ`C1ELq-nRwicz80)yz_s2&r>cHwo0Cz{$;2;Jn{fyyQfw zTY>k)47O}Gq%bj9X+VO}6q7s#sJ2cw@ z!v@1?xP=V~iT->JUO{o>%{L*ctb9C7Y9_rXP=+yJW%YMFdA13QZS#Fuq$=}->&=Wd z_Wi%jIb!LSA1g{i|ig${J^A3G_Zss6#quvwtTv|)K5M7+|Hk?ES z;wyqq5<+;nQfB_Fh)UEM->?qv*pci;p>_L@uDzS??_!V%&YSx*Jmub>3jo7?h6rY)tAc|uFyhg69@Ep{v|`? zgBX{t*9~8bIvd3qrnD6#B~h_w9KeXuHAYqBe_X{hL$naY9<$Q&&vkg{p8P$$$B$Wt z19y>H=<%+P;)dGXQ*vj1h2=2gsI@^O>ktyrIIp{$sB6F?gQT$+rds^}=M^eGcSTLL zskPJJeMuq4&_8MoroII>ttH!pAvOH-xrO@bVYGH3On*cCheq@k7MdA%wr(+h4*w;~ zRs358kHmCh{U~JCbc% z?Tq>`iAwn;Vhn?oO{lzGp@E$8Ttr)0m44J$pF;iXqnjbJm?XiO zKqudakqs{Ycxf!=9pK4#skejQ3W9tc^Ms`twpeHK z=K?Qes$CJ|L;RWJ`g*8cB&%%x>m&R;m0W&NDnbAFx5+}-R3{sO#Xb>bjHEv4fT^-J zzqpN4yqsgl+V5N5fo$yhjlPk~Ln*@i{2MxL73CmA({Ij#Ti(aNP0l4#U*&9VQ*z+( z&hZ5?eQ$p!?p{Dn*(v${U(p9Sq3k@y9BbRQLSn&y%Qffl12L1-aCEyZNaqhavXb^1 zt87$}PrXp%qNkP!Fs5Cx9%Z5KfSGPLKpY4WW5xf3hkZGFEQ0oJ?EzM5;KgMDv<&M= zjic~i_WMQ5jt%8WmoPq|{NLIex5}_r4#O6PYHnSJxGKpr)yCYDvJ5nw&B&YdAN;II zH0>%9QcVYqX!LoQ2Ss*VD)I{$1D}-K2KFOr^$=}kPUxYFgVJ)0$vtUKUzD%~^`huR z_hxsx;8ol*jh*%Y?r%p{C%k46hos(N8qOaxw57YbWPY0CgB~H#DGfKXBoJ=;Yg)^hD){>-zmW~Mu|90d&$`~Oa_Nj<2&-LF zn~26jdxK)keuvmu#MzPR+O?u{jv1|=FBc-KWb>^|U&PR}1+cW(S=G3(wz<9dDTM+z zNA^*Wiuo`5an6P$uGY+5BYgq?mTPet-h8momt7(if)_FtJkh)A4KS?$s2M$>-QmXh8V-8&E2?VW z(;*_;=bt|~Y#q@uk?SzsdFs>f)93=>yz14$Z}Ni36i=Gt_8?4-*b3_A@7b8{?a zGL6kHuLL?)FLLMKYtTBa%K=@@^8`BDhk@m8g~JD(GdcJWKueV?>I#zj}X&Rg=%IlRyRy)0|^`F=G+ zNw%|AtGTxOdg+WEmr!9# zSZ2&z!cE?88tgu%7Oi8`)fM!w7!8O?uWJS-tMpj%apu21LQWY2Hjj~a75P94 zC0}&No^fc{Tj6p%u9W}d+W&q%hkk?yJw8Dm0z>oc#utfNZb+RJ`bknurt_11@ zLtGPbl$_Hwi%YKoYW4biHQl}svG0XQJ=N;{iAvSwhYe9_+)uh5)%05tS7+qZxE#n@ zoi4B)(qqV=S8MOT7<#4FP*9bYdz92XpJD0nuMw_YgV(B671*g6RZzEM2Q7W9+lk~`JbG(mNm-|k$HG5 zb)+eumne}|F(R8#*F#0*WO{As_x z{ki;KrimT88xo@De;cLsM~|R3-Mv2)a!jz%g}B8spcLGkNVg?RWY3KqY83UC z>sX?e|N2j@_wN|Fg$FH=+ZWi{zN+;U_&r=IP^riV!q~1sUHdu2-m=Jpc z1wX>12fapnx4%8^yJpQJ#PJv^O0I{`HQxTCfm0oWDJ)#)YytIhU`tkQ?j7G!dN=s%C>*0dnTF-I=v@K4f z1)Y{exaNcJ?wvC9J`x6n9AYPWA+4>z?93xUYm1$Z$5WPOsY19N9RoL72 z=L>NW4`s+}Uz%MATxpy9wxBBiA&XDW?FLxtw4*1(PoO2_T_~idHillbb(VgEJ5ETS z#~vcve~o2vb@=hBp*E*5k)pa2(iIo&AgsfqaQ*=+cJD8t{Qa-}M^0=$5TP>C9b{Td z14+_iVH8D6qll~I!t{yxQ;hgRFyPk!qxQ$^7?ev=3$>>1ipw>7Fr%1E9^mT{d$cXsX-Imf%A#1Bus-AN&gBE z_0#{5qaKoGjjr^09aCtW`w{Url~N3swWH4Y)H$AR2b+sAYV`G%RY?2!AXVMG&yB~u 
z=U+lhyTD$LU3=Ev-ZqbWn}hH~CxO4>BCp6Q1eJrdlE$T{J~zn~#zsWeNy(G<3hu}5 zh2RA{18AzBPC|yaDFdiw-~OvK3Hi*&_&=Y?Aqk;Hk-#~rbxojMB4muf{!Bav1Q9k*r~8@-%sT4pWBiBu9*h@4K>;%<{I0n?(WB+-h3@&Z>!Z~n!X za$esg-UTnZccU21ny8~`w`TPDoXth05ja6~cOyOG2f+3;e5YdHnxVx3Gj_xbPZT3oBxF zbTRxl;BspOtFgcBzoks`?~ePoBCJHJkMB!`Sf>j;_NN53-qug@T0zvFtvI8Ooau{% zn6OYS?`yN|85AssC(+7^>*OIYmE}MORijCficM&Ue+G~Ep%#0aBP?6udaa$B>3*Kk zwsDG@8Zk{vJ?cRls*H0D2_C{BHm;nNQy=!AULZN7;4gKd1@9KA6 z7$HkIue*YtS?ceCUe*OR9+MN*pP!6+1)yMgTJEdXXBt{+jXNHKA$Yp$8?zXo@k~cE zyyM86HbW-2lF{1mp<28CZqq4xc2~SYTMd{ z6#)wZRuGV)NRjGBihvYF=}n|p6_8$}MM@~T0Tt;bQbRA&dkFy)0U^?B2u% z2qAD6`|N#=?f&}T|2y}8eB^;7EY=)zjCZ`_9dphVlTqd9Y%hY*uRC**ZNvJyw^Zf9 zny|J0YptM(lgH54RS?RMmlLLTH(^9%Izf~z6G^99c6Di9xjJQ0c(PHjE=_%K z*yD|cK;RL#PV1gd9u=0+F}CUR_1=m{Gorg0m&Q}4!ECQOG(O4Fe@Opnb$^`Vq(F-2 z#Uh)eTWY|OBSF}k0FlDpY>-HpTaU=R4^!(eJCx30A2t43$Efoga?OWBak+_p;VC4aVys8GWb!ajt)#AFh0z^5t8kg@BnXCNix3 zZ&u<+Ix@rbOa-3(N!L)4ojbUd{*7mju4U;=yhZcH4KWk0qX|8)J+SR*fdpku%|8;l z3mKazN8eQ&9?|9`cck(KKeb`LakG7Ck~fYa=LjX|Jx00pB{_L+sK@J1XXv9EUaJyk z69ZI{<6=No4!Q8B?;#M_wpNFeE_%Ycu2+!%uaXT|r#&9QF4@jwX{vI}C>4~#Dfw=n zGD|5X-4ptYG)uOrjv5i)^Gu1rMQ*_&YEvHLsX8r*NXrvu+Km8Fy#CC5?(~^`bpO z$Bc1EFH*jfJZmz+Lbz#H2YL%Y`I-m9u-BpKIJ=HVR5EI5VMZzMK8zBN0i=?*nv^M54X z&6d@FORd%~lRf$46FCh&H8tygU)+NW@@L!6`7=~&RnGB5{*Pn<&3_Q{+cV`Cc_~hs z+JoY%7yd$jj`EYX(^YN{MsMd}E>fm@{mJq30oNWbec@BEu1t~$y)YR2?q*BF z2wQcBxyY9Z8m zD*6(>WEiDC;50|Z{vk@?=gXIJ3-{hxoOpzF2N7?4NdPqa96X!JT7QM4Ue`!bf_JYBXAUAk+07@x>eKw&*KwoYum>--(<$gG^|j`YX5FhE_H5=9``CaVY&Y=Rp9 z77wm!2@oelvF^fhxa#B)Fg7R5YZWru?j70x80nE5ISPDw@=-#QjM7?pfPbT-B%ro- zRsLLU{oglzK19t~_Fui@(5RE-IkJ6ph=uu%HPE{Y+B%V59U9^G<$f)EzT#Nc&`Uoi zak6y9PON>w_;m}_QGdE%_weIvm3i4VXItjI<_OdpjKib#(y4# z0`*jz%X!`hKAL_KBiG6s+K9Vib}6btk5cN-Kvso(k2IfmcXSNyTu3CI)5_I|=dN>a z?|LD>>x{UySudf`XlE|lNH=@-46b`l2_4}Mg(2u))b2|#shjWpb|yYjB+!O2BAytS zN2wY@)GT9^1f|_> z;Z1>GbWP%#?TD0kYW|m))kkyknQYDk4S&u>5EF&|{kZ6>+95dCT;MQ>I|V`(mrTU{ z4{_ZtK7PJF)M}mqWJE;KTQ&t<1EF)P@z$%SRv?po2dwYJuCvf07XCO4Y`mScEv)b+ zEuBT()L1)&UfI=)+ZS&uj0F(8(xi90Y*}DgK~nMk1%1$TShx#T$ufXCImkh!~hv5 zaE8SHfyO>FH&r-m_KyL8M~b|Ba+*2g!$gL*6!d}-Ro2Ua$_mG`rn1eKOc456*?suC zVoIw@llR&UVjo^M+dM0fY!P(o$P4R&&EaiD%0 z67I=GT8btq&zrD0MNU)#f)roPdkNt;72fUzOtpu#rtfgC0B5tTcgAzirC#*HCJeAm zS}PVFva`&1gnzj?VbF8}Sc3gzHnF?AR}HO+@4{$S5+@#+`)pPx_bhnkDM^FSk+(3p z4qAZdgzuVNupBHp57bwdG7Wg7d$Zev`8+FAM8y4lFuS^vPZ|R`Zd6X675)7uLQXZ`%unr!vYj@J1|N9Oc!9X;n>` zpZde0eRqt-AV3I{0Dtbj0-e+qYO{=p4w|E*F%btP*W}h7dcDTOO_A}(y9oMD?dvvl ztlR~`!Hc-ZA|K_>W-pLa7)wIqF;!qs2b*?|we-iFFQxmz&RiRRbr6ENRV(IaCEd>3 z#Q_TY<+SU1F)Yj0?<1~SPni3vDLR|O;=8S!=gT_uskRksGHvhAJjbr6$=!MT=;LXv zf*W5r9O8$T-^MaLx8{ZNz<87p*8C0njDn)kJrBAEUX%>qzJyKlspW)13SIPB>>{Vs z`+;8eO*A&&7AYok@MPa+7p3Ewv->{zZ59!rXwUJg^sh6#zW#5u{+1F)_LZ(K7Cwc% z)XRlvIc~iin(W-y>LWtnLtSW*C26aM>>TvD+o62Qobp|usHi2q?vYc-x&@jQRO5k! 
zZdzAw6fao`Ds-rwjnew6g^{r>r|ZzA+Z=i$^E6adkil%oW$KcNM8Y`) zYUri9&2c+x6MJ{hz37o>nS!zxwSNAXOL;k+?^C^E8fPC|W|qAvg1?76KbO{hL%D-r z>qPU^`G4w!Be3ra9N3AVwDTUYGUG1eAI4lGvwtdBExf@l{4ZlXdv<_lQaTKm@i8FeY%Rx`<5@EQe6+9LZ_;ot_(eMq7JgE8|^tSqWhYKd48~2?(%fw zMy{ytoVjw_xl_z?>vYd@>QNEZD(6xQOnUa7-ex;(NTW~LzLoY?tvuDTXM^jpywr(p?b+ii8m{}R(kvr*^Wl<97(R#+8V zC1*yQ^%|u)_gQWM8$j(uGB^IMt)f%rudW1&lpKukj^@?2G?|N7LEXFI@z{- zbTk;>G@LEk+@qy7f4OOjz<*idyth#D-NF=o7FNWT8;hjIX1LzRYc_=?-KtNwYX@fC zRpw{C?ct{R^uYsj8rrJ`_Q|k&C7dE0CzodKoOs3J8hdH!m|M<#QBQV- zY`0ceb!wH60Aml&JQwXj$`kSj{_`bB`mF0g8|i44rj?2c&Rk8gw=|I^cX^W&uHxI{!i@j>)UQ$Msxh3ICPI-8%i&*65nmiK8q%jPh_&nbIN z{RJQ}x&kBRK_I!!y~QuPt295h%E>BJXs08mtjK&B+-@B1Z1g&lqM6QfG>CBWJ&vY< zv_1c>QjSYaq69JAOy;jAjK2_A{yfejyYce;HL$Vcv(+N%H1A*vpYvJ%H5NuD#xVt9 zjMg4|2jQ-5As@sGKfmHmp~u`~`_IV!{oRZvrD+Q?RiHW6m^bVI#gVG?19_GHOoTlu(REZ(u*8=U-1< z@UNv_#r1QL3HK^pa(o6?$bnP-^dYjXz zj>ri|N%BCZNHmEJUdlKx|63tE5vm5DW|oI06Mdx(f2?|4{DZ;PeE4Ifi=n>mZPp55V~tJOJH*gts^(Sx!4agZHKuqC$FHG^$Lv(;rn!PupW=aMRR$+Z z%x5vCK@g>lybfVu712fz795;kiOuKMcE`|i4J-xh0FtF8%{i$3^_wZzWnGGMgSWML zWu_t@H*L9HZ?+&1cZTv9lR}7Zi8qOJOY;dw0SkZGej#XJ+_S9ChZ)?dGDN9h-nxDC z(=i`N(&kvwsKn{563ohvE-js+IKmMKSn;nfy4)uwKc);kXRCAkB;x!`%{MpGQHD@d zk#u?A(4t|1X(^pcmG@WAYrMWH$XcLVx&G3>Dns7G02@LAW7uW5JD@vgsdt!IH%yis zB%VlbWw(%Gn$mpnsbz=f*yvReQ6IyGN!DJUw?=z(Fav(5`Vf6mDrdD#!T4m77xQG= z;B^otg*wS1ex8U(itF!vpc=9X*5{4{>xW*DDF_CNgBNu#p&E+SxUvAxR(#zYws~_PlStm#~E9kKZe*Yx$F;6g0kvw}d^5ua~Uf zA)wr_p>k0<$)lcHgBgs)>xhq90s0X#gt~P)dYCj6^i+uyz&Cc2(hCw89vEw8;m$IN z97H$WT@^Z5!|?BL2OQJ*P775EY~X3dYn~8yq*$kwzDIsz;G zL8G^8ODcV}b-bd(o5uIz8d2)IcFzKL7m%fC)E3gPZR_508>`2?cJ*!7b~!} z>Y*}R=j@8)5{uL&&e*(l=f$jnL5a*@fwAn~$2I(yR)+cB7>rs>+LC4a3xdju{PPUc zU>9fx(bj|oUuN5;*U54H-JPn_i{uJ47CI70dYT018P^olN|(%>v7nr=xgmGDhH-Ha z=D`l2&S(N82V`~2&7rU&xA(C*G;QPtg($~;(h;(X(E(!%NtV~svXO6^T6d@BH+hA$ za`w&?@|ZIAkZNKEcBJG46FWpcS=a=uS$gl8Oh@%(oLs`PM}0Y+_vv$1BllinodbcMx6eUIn-b+s7p%lG~f#Yazh0GMZe_ zu}<~%6rH$okkNB#g{{wBhr!i+nBUGo3`wWreF5-$CysPjWePPeGq93+uOW23N0qfL zh>^nc#50O~=7s(}Uur^U#?|IePl{@V{45qqn|U#XcPp-JG|^n&G)Hn~k$M)jQ5|BR zufJ~2GZT6^)cD4R#$xI42dWw!;_fo5WTIXowCVN14f^%^VVr|2*u2@I2;+10+NC+A zwD{+o?2wYQJDYS{(vqz+hOAS}N%HALD$%l;C%3kl)N%#~3grl@IlkGOeJ7XLlKax} zaaIpSKQ}g&WbvqDltj45%yHO0X=o)A+?9*{VGS(61T)_{*WV`Hg!3AOt{RMr^)-pe z38x-X01i9KKJWkOPGTd;d5QHru!?Ae?0S7}_|?TgdH$n@>PUhm2BGHD?WljKhZ8HU z&+l{yeEVzIoxD6l(Nj=j`-)rof+Noh6Ku*5)(~5bh>elTwahG?Kq=`%kuy~qcAC~sB=}oJ!mRU&3ct?zV>R6}6+J-KTcLF@3ynF#G%33hkeD(CUY_n8b;$M&COXjrGNw}qZumFu#vxRJOL z;Y@P{6N$a%Xb*C5`9yLdy^M2qiFC&0p#3=v{C85;*&Ea}BPXMa z>!n~M(K^Q)rKF7#Y)7`u=X<*GWXzLp%0Fx5Un!*jDIe5ZLmJy8ya?CwnCl*0FPWC~ zHh^yU+4)7TjH2Zt+3VFpxOqHp9b|2o0{g2~4Mq(+^P5H?lcxKmD-F;PE5f!I_{u()P^0b#BQwv&~m9ku~!p z4|Nr(`RE?%*?x7v{Hxr?5h!0tbL${_>Jcc$2JCr5%R_6)`yT6)$kXE8yHovgfFpCI z-w=eU@m1Cc@T*A;^9lod)HOC z+oNI~QfKpoW7*1}OS1ia(lx`8?#SC7ko)W4dkSueVx41=REdm&@BMB!Uo{U-&;^{xc`8R?dHd#&(=%NB!Qje@Jv$}%7A8kI( zPWvh8JDbLO(~b-IZ6>WBkL>!qd&v3y5}ZeKdHR;GVq?uGIb3ph26e0~Co75+UI;H$7KN^tpo_tfQaEAIt@i3vY_{8%>(zfF{q8$_8)S}1uINSQ z$-?_Y0eoldV%HL`RR?N{$Wa5w`T;k68yzF)LSEarrZHmza5rl)BLWl+k%JNr*7v7Y zQR4au*%q50`-8=5lKC40oD&4o*}tCXGG8jwmioMa$n3*(bpCSsU9`Cr$a|qpJ}SNd>Yxn&U2Av%RYs)_Y{ra zW~p6A=N1(U*nalD{q#fFw#<#t*P~r4^^4>z8_|}XsHtkO&bDHNt$gFO&k6VuQQxDv zM|AaC_E<54(lczt>glLCC^5yUF%+8Z;)FycuUk{!9{F&`Ey^U$8x zJ|i7Q*B>M!L;%IJsHeMYd-jQyvw3_n8XF(0z$xWXht(>wV189Y(-}g*#_zT&S$|oe zO>A&A=Z<6>KH#xFLFy)8uEc(dj$^>7N_x*)zy7*pNxr1MY+tjfA~TD78Hxl?7_HJ)n39K~Q(2!FqYp+s}CSQDvEZmYigq zq$=(W(9CHt(;lN}Ezo53*T*T=z+&8)4QuYRkcZMdom;I+G5H4)J!!X!x-s}(MzK;^ zr9P@m5h=3~anY7ZcX^Vx;&zAT5gUYs->MYE!8ba6iBuaew?CV<_v 
zlku8rdAoHqx*pY3k^%=F7@f{~kWfS_w|!6M?R|K~B+FD1jxu(5Rx_bdB%9jrStv!h zg_+(7`H;HzUJ$*oK3kS?pGgKEwL%^!T`rrUSA){*g7k9PV+Gi$Ax=*{UHXKkk%$0avU_yxyx30gQe-CUip^T|Rr5ANP= zQl)5!MHCfx76YdsZWyt`Kfbd|91075yieZ#6ZFL#06X!z*x<7MDy?V)ow0Ouk+|e8 zlCR+9oHlG1ESM1qd{&+}-KO2?RZsQ25$pD5qcoFqZ>+{9@^cfR3tAn)u*DeEPP3)_ zf(le6!OEbJW=$Q9?!gVT@BWQ{K(r{7@fDJr~(%WW|nF*MXHey#{x567HJ= zT70#BoON_&O_rQEH2?I@0TjlGFnMbI>OyU2sm+T66k=yxK`l!*F7i;6qg*JvuW5o! zuID9AR&sit0hFecG0whBvhW7Fi0*!|0p$JL;lj19(dGp#AKOUAYm#C|wu_^>9A4b* zrZJR#y0rHJbcjYHQ~Fu0?8q)|F~s~;MJ|a0e&zNa!k&yV-b~uHsx(4of!INyyex7A z1GX=t>@M>dc7;6(3qCwinP}~OgF9`zN?ms6nYyj3_3YN9$h}spP~p)OGv!>;wN=Dc zf$t|%EvUd$9Tppf9kjWAx{Brvpzs9gpH^NvLPsOv(zP>;aK81TKX!Am$uAOheA2gO ze4)?cYZ^3WO(8Vnh?74>S8l(ohiFK5N(i8@8HEiO*do7g0HG0`Uj;!3g5 z756zXZJ*iT`n}%_f}0A*}j?6+l)onf9_ZSjz8;A}%2?dZftd)4470;y$IPz|UKJt3T~zf$(NH;3TJGYs z3V9bgV8acvmj{{RUiz=AS&WcxmO5IkWS&7uEcJ9^bw*C$+9al;fW7`g$Z9>v!#C&; zF?5}!y&T7+?{Tp1twTHWsbPXw*qUCD z;Zwf7b`nK|wGS^%eKyWA_1~`{ts9L<`d-rZ7iJ#OD=kFiV3hm&gMCNeIDZNv^W-mqxdiIs{kqDK+xx_^-TFkK)lS?DWisY^&@NN)5n|cmX~Da0o4~OmRD#X84`kE`(qC12VVl7@ zwD*&6CzG_BE}b0dwl_{LK+m}mpKB`XXNdJ*o~zN5b?D9;(YYQ40kW&||3G$qe2}E+ zHMe3^zqUq$y+aub-FkVxujAO~n_m@#fbenq-qjgLRyOJNKLw_I)D~PC1+I=-K+55H z*7rW8r%4-wc>Im6vle+Mj;y#H1nJ1+k;*|ECA}uhx70e*=Hyn>Twva_V94omVlnT% zgXb`{B&DqLcUpLrJ4-8I8KINCtWG&2I18aG)ky6#N7gX=wW>SKB`PN~`O$`AsJk!W zOSF=xVUZkiSP$H&*-kE6V(ki(aITY%9<-dpQpD?8Ogpi__B!(}ZAdrgDXd^v2OE5s zhooQm9zL8i8nQ4V9pPl_vgrnk+wtcbPpqG#VQlFcwFg(jc3w19(eBmmyhEqOhXs&W zjrwi0ExcFtkcF*}=vR@2Z^Zo^N&VixhWSkeYlLtp=CKm3mTG#Wixs6l?XxQQ<}oCF*w1XWh;q z1KnM9Q1r6lU&@=I{HFC~)*5%hLZ}<2tFQrQ9Q_dtjq4}V*GCZDCup%;-JGsy3Wisg zb;JDW*6y=HY#a)-gNlucJ2hGdCxdqu~s3 ziN`%?^@bSnQb(d*Ip&bzyssy%v)_ z+AM%_Ssl14ykpkJpxd)&cw*dtqQWUq&it7cp6Iu`<$+&@@Nw43hmUyAUWLX!=aId@ z1m`3zV@ltL%I!u!^YsX1U9jbVcdhE|lSOTrAX<1`vzicmBe!vH5~uvo?oLR=`B;+w zM@TlTS#JC=N*2QS6*mFEgMtFpUQ~Jd7Hfes60{cPqm(Xt$cdA`R!FW?t`v596npC~ zjpG@!TL*$BFW);@okMhR+i1mM@X1cRPGIp*GJt?O$8%np3v_2yQ>BI(u(bR=gJYC$ zK7G!N&tx!eM{6XF@Xca&b1~&=(_^o$Zx!wpznt_{#Mw_px%YxCdYab;7RuF@PO`ml zX64@pRpz6%W%Y$7&S@m&DI&rJbHu$+z^H(FJvi2Hi%F9mnatB75ru#+D<{d zGbzotTcCAISXRiP&E1&w<7}=9b{2AbDJ*GT_`nXV*UaBiv6(>Dn}zwdWVel#O;4#D zbnR6UpBbjaI?P?7dBwDSLT#A;zl3M)vq3wl0lqkCP8)xs!G`33Ld>w-CT?#PG(YZ( zTLvA3UG4MVqSHR>!gu395Z}=;&Oj+^?6{scx^(xZJ|kmxVWFDlUY2m@N1n7I9TwfE z-#FQj0mrdxp;@BH=PWRjdt3P{&1f~|cSg}4wqKhPXJ8`kWEoHwH?ujci8V(Iuxol$ zpiCta>)5oj3$lJhcum)HMfi~A=JI5Bb2^Xvx=|eVVxp^(K|c2YV8;E4NYA01n0_4n zlwOX{#-H3A8^=~#R9y7lWP0~%{Tjh!TZY3Y`@ju!f{($H0EH9Z3A3Gnf;aQ^4x^to zT>M+9Qr#;D-BxS@ZbbNt1Y|h+H;?2Vab)<~>{SKwXoIG;_y94{YnRL>%}kPCHJhY^ z76WBBJ#-VC;n@HM6zDMNm{36eetAwCCh&2p|;Gw{w(dOvgR+_sp$$hhVzmhhEm&y z=SL92@3+U3)S+@bX5>R6q8S}R%rno0==S=}L7+=$!5eTLJ|S|y$yL_eqI_NzVU((34NihGI7r)>849Inb*UK_ zQ)S9mhQGJMwFyp2z&9WP7b+pbo$V7Zfd{S!2+;d4A*9|;aC8V&`}60s?;HQ0g!KC2 zy-3@%)YM6R1Gg5Wb@|gwI=I?r=cQ#Cj<8zuxRvotfe+@7K-WQeq#S;k^n;6dIZRhIVK`` z^oTO)QvdqXN$T=@BdDMVs(2o)Nly4-Iz%`+;8GzCjYv+fl9`3YsAbZlrT&>plKDNx zVyC8M_7{Z#VaY5eZn-01SnM_~+p&|j^k#_tX74CE4>ZtbKlF%a71vW?LSmv3&C|JT zWMLE&Rto2HaVYctIDR&|&0BHq#J4ZQ7p8IY{&qSz{u!L}Prh=_SjOjxaDcGRY zp9F~Y+TzuQ$N_R`OzMH)w z?XAPkD!S6dJ*MYapJIM38F4s7Jj*Tls?*1}iU8ynQ>~GrcHTK#Y9fP)J^~LyIUNuP zH~fTT{pVR4kt~ozX>YHpje#oLHhM2MWqSfWd4%@kZs7<#$NwwGI3YiyNmYR?N|6ij zHxc8a=t&rjNXm>ALf|3u%cy{&UZ(A|G|1A>(|k7g-FuP8!>0wGW`*Xsxau?vf_+-z zyxy=$+g7YG{JF3XYjON1%#iG~kmBfvli^le4!u^UW_<_nXN_LfXCFjwdQ=79UK%ru znfmBE-vVyt<0P@YOyU?b6%r%4Ej1CbnHd9RILH~{9^|9LsCC0uQMjDf+@?grE9@dp2xGEyJST`%8tZ5A` zt;1kgW68)8tRfuIZDkte{RlYzTG%Y6fCg>73j>ZuT}R>x z7spJS#3cU)uz#VLBqiVRYiMbxoOdij0I@y~S#MdAWPM1Yf)BcC%=J(DWInt#0Ou6i 
znvnC;KGZAYGwi&Z2Qzp&`0;8cIQ8Jmwkq-;nHpq%^-Vi}@R`GrT|`gW?u4*;Jkcp> zNU?5eA$I%WlFBM^{pKo(AYic*Usa;b(o|%@?$Qg0jnL8DrtqyH^p~YCjs9?rIku*k zQ118*Yd`Pg5Mh3dMborwH-t7XL83s$LMdaTN!@Q5K@@kATnTkSp@EKS1$5h|)jD}| zes11xx6I691VRG=H=$YuVD@lGaaG{{dUnMd^t(iK72asUO6_nPc zZr(_b{_q$T#eEJN zpiA4c_o>6}Bd&?Z-6!wkS}?tJ7EGxFK5WTIE0m|CnC*K{xnW(hck=1enB@^m#xMz! z56*d|k0qN|@>Dj6p5oP`n&W`0NLoM&^WVv}tudIO3k7*1dremD#=NdsgnJdq`VwC` zm2YyUjY8bduFODHYcpcp?lVpAJvJcMkUW~t*N*AST0FX5>f&MoBqt5eU}`HDSbJ;j zJc8)cnwgAmithe%^=eZQ)Y-z9q?6&7ymSXD;c>2hBj=cO?Qxl|d72gevPi&m z5JoSIcspd`Gm2;NvBtfe%Bjl)$zdWjo%}f=k8)PGKCsu3=Gy(LWs$-SOAB92n_h8+ z%$tI5B<(K)s1TmR;O`=AgdtjckNzm+wZl(8Ms}Yiy46ep8785l7d6CCz}Mf4L(4la5L7su3kL)=Jm3B#p$e3J$gm4~W;)-ElmF*e4QB1Y=3 z6Frl?!?LFnc$CY0iklvj{gI{kyZp;&^3cFr2(Y+g!`lfS(1xn?Ko|2nnl%d5n(sY_ z;E9gl9ld-RXz%DeBzBokH&hsd_!GRjf%I^P_k^P6@mm}Br?wg`YPxoC4|Ket3;5|N zCDsVdP>fcNINOs`XyWT0p8#_9pAv=veH#8o+Kg z5c5MAJVSEGY8&SGs1fUY(WX-yiM+i7s)OP`{?bW;< z@p8Ls-B6{&_&qLG{B+`te^}37VPveFZhz3Gw!1u5BB3VQBz7a-+sIGGAs|?g@~xsje}EFSaVeZH;vO zOHF1%?d}G#MkllW=Fa$Mi!kvVbn66t9`rw!@yJHaS=sf&5_4I%i?d=uinyhb&UV` znw7r3mFN8%`VVcay?s>ku2vMaP8%D0fhR>63oG79IrKv4(Q0ShEf&G`54=G-HswBD zW(MJe*LRfa$;CEJ`Y0hxDSTy9z;3Ge_y~kVdR5wEz3EA+n0jA}MwQvUl1S?<1srf0 z|0h0aA*OYw>1FW|G#DySJSJtpONex^;@8;Zzs$;YL8ea_-rBl-^&;8Q5@@{3JXJ`N zEsHlmU-Lq?zoGa#Y^%xuh9&7!=Jxo59+rCf=&Sr!Q({qani%Dr4awK1v3V2r??VBn zT)D*|b;pv+gWW@k=Lu+l&-VvA36x+f!Yjr3Qqr!+jyWqr1qDq{e!9QEB!yQ^9;E5N_sNSV zCI50*xcc9gj^Zs7oXP4C+;s@Co~jWl7k$28QQr^x{DeW;;}PkM=JPYumbBlDFJ68g zbdB-B7z0=+?zso0>d~>KW*_#{QCPQ6twC2K(Ce@1KS^N{m<~Y{k=4jcR+9MAc!SMi zrN+*%eGP>O2zfhC;aOh;W|bDejKA1B?Sd+876fA+0|&KVKXv!(gf$ipXeizi&XTB! zPZVO(WCJT0X7V8%#b2>h^dFrnKP`}6b+xJ;AWW-&5hky+(vaa)fM;9dK7o&-XwhoJ ziE*QOOCG5HaIIP?awC+%zN2*WE>zf*fcH=8tweeWfGeF4=uu)pY4cc>_SDU#w&e%= zLco8N%)WMt0`Vzv@$c6=ev@JUt`D!)HzNJ2-PB;7?B>d{;zPcpvZdK)hgJ>^*5K{bWWB&U>Utc^uWZ%MBA%fc9 zlovn0w7khNptl}TGa)X1>?gziZUn%kCq9}gn7&s0?u{RR95)KIxmJsZA!@$u`25U% zVrT9tuQJc-{Q7?TdMJ(;O!nY}P$9oG*^qXi^lwku=i|R`-T%*ykj_TIvSFJo&l2tk zlZS}MCfBJ2m(aYXTRvN1DS^FvFUH#rZ5j z_D6K_mD(z1@o_5SSo`5vT?xA*qQ>bH(>OQ0n2#@?Q!_KlAY6)XHGmay`=AZG?GzV_ znUY1)f1>)Q?h4${xIUouoVImFNxY){jiBKwBh#Uct8c=8=1bqb=EPG*X6kXO^>#e* zP3V6cU4O@hd@<#bYon?OK3`W@Ke{$x&LD9{j+u!Cmn+Hp`QqoH_0Jnas)2#-#a0E3 zzl<+i_k$EYPP^2V-rrwx$n;*;g5VkF^k^qWtX@@aB1!)>PTFXCaaCZ%zueQG1<^T& zAMqXk04}V(Er=?FxWmT_a?W~kYSRYU`!nzr6vl|Cpv;PWVwSmSgA{4vNF+1oCPf#m zoK*u$5`gzPC66yXfKo13r+zwoIv zRa9v^K`$#pm1NQ3MwxBH<$)R&C*!I^FKJw$-?iChk7H>~se~Se`d6^|H6SJ$Xb|Aw3Bith)RsE;3n2!Xm3SZ-7u(4j#_W8rU5zKYtJ{^!#kAzL3;*W1ele?320RGEQz#Vptj(l0*?bb+~B zSbK^I5&hrvk$KmP_UQiWVs>n!)3k)6QIWqq&|J$ zjz!^LU6rHn(thr{rbx zGWiF7{R2J+d7T32q9%?T_2ipO@!gF7_2bYT!2fYEUs^GXs=vXBP&5E~qng_sIer-} z+ARAT1iN8(YESi@T>bBzYTPJ28hx4>=~eI-pZlNZm-VZlYor!;R%!Vk$JIAW5q=%o z2e=5_>AzEwIMH7lEx-R8)yN53CxC_*lXv@lzc&%HaNqm;vw(>%6aR1eldW2iBHiiH zed8qlcZ_rLw;3UjAX29^YQEj4^Mk_!lpsHAAHKNeLPz`Ke z9He|L0FY>j*QWf#zb4#As^AAe*J{ZoAEeF!fYF%K0IXe>B)C)sf%7D#wf| zi-G;G3EsFlP}lx0@&FO_ySEDH782wBzCUq<{2F=AULMhkRh@|cP8a~PMRPf;r(@!e z4h!r&VD^^H0CI`XW>URN&URDmM)*|EgH=WobZzS=&>i;|t0`sn)E(cGN(iIqfmvDp zF6uk7>%dkeAiv?*C-M_o2p6gg_4qe_mT$g$4e&q(```9Jpm-Ut)8YSB#BlTx_G=e{ z%O5Pyei>dviu=0VZ-zHeH~cne0Mg~}IyO)?{B98bzc2w%xctln^5InC_ND>l%PdH* za(`sQehtr?Ei_w%JE?9|2Yfbe}>Nj`i&5BOsZkvHzX$l&@$iU)>T-LBhxR@Dp1~u z5(t;yBm)O}{~HtosIGq-*nXc4dDIK8{6JjN$0=A?jI-F($G-g84f;ecu+wPbVk9bK zC%d9`J+c^Y?I>&Nf{^ys(-{Gp&6{sVPeP5;42{j(MkaRGyj?6fljr@|w}+72}Ba!nvSpq{x#Z+7&`cNEOlD~Nge zW6|Nr{=IIj{2m~SQc?JgTfy&pko35~us zQEmnn1x^9W%xj==vXlIB>|SpUr`J-gwH^;%n7nZ_Bc4{%WuZwgHurB`zww5ox8IbH z7SNd5sL8kX-Eb>0kgl%Ebf$cLLA$&6CYPt44!0ewP){qPVM9G%>+hf+(x`L^1Xdu{ad-VkCMy*B3wm$ 
zrtH)2UCsQa*4i;&>{M3YVM;tC&1-QGs4@JbU)_0OlYDxWdBIXAmwHdb-3Q)b#g!F= z7n`klNYSd=dHd$KHHak#9nyW)kK0$^x|1KbU8k05Fh?<)_?!`vU%ciV`OC(Z7ug5e z6l=Ew5-dUhYi2A;1wpxk16Qx{$|$tPjM(jM)g5P-x(IG<9gr3|m8$Tb(X};R$zmOR zHaJh;yRr%3io>D~e>VkOw12a?c6w-4sAbSn1Qe9!Us!VE^bLm(7Xw?I8C)e!Trxdk zTA*_FP!EIc$Z4AUXz_{|Ba^u5d6O6;isLIii^B1X~!gUW3$MwGn^T zsLo$QfBk`lw5-YCyr&EsqQqumyZdxj^_X!${DX`JwPIo260<>v)jOE&*eQq`NV~`x$KSv0CJ8%qGeF!U z@BO}y16$1xqIJ8x{NXXnRRvV3N}e5$)k6r^FOfNwa%}FI4pL_(t#mQ^o_ri}e(PTi zzaSAx@#bEKsh4T*>b4g!Ygmar-G|fu3JJm~@w8-E67#v3v~rcD=}Lo!?d^#-DmE^q z$;<2*5#tLqz)BQnVAH|@uvvu^-`nlAik{N<%O73}Ox6iWnD3OEl9F9id>%m8SsT=_ z+Xc|*Sg5Z)H8&4y;!p8*c^>PCS<_bjCH8*U4+Za;)oI;kT@zwUuf*WX=IGQNGketpi9o4znyvk;k9QR(u!lBi*N&n1(D%cE2387@&fJ zcAjPVEk)4F?YPbbn#)X}Mqx(xkyjbUIuCogC7w-e+KKsKPAsQRoE|E4DRJm#ZYOR2hqShK-9r~VgfZ}}HxqqTub zBdyY{C?VZaLkUWkbc0BTbPXx+NQ!h09n#%HcXu~KGxWeP#2KHx_xS_Pdp_}nf%`Wr zueH{7-McHkpFSy9OjK>&j18O_wJ~f>#6h8!+Lo36?`JWO{!@Y$Mo|I3YGOx+@{^2z z6Fwd;J3pL0%2vZP+e;MgV8<>VRMB)I9NS@p3Nc0`%ZsaJ48L3tVw791Dd+CFJ00A= zE+eybtprEhtcWw;zMc4@qU-$FRV|X3!!(`5F|d`uT3ybhE|^I6^>&6r=sE`cwjXd_ zn}>y&^NMZ`9g#LQvGd=743dL))^9qeCPLZ*8$&ARAo)LKNyrzxI<~%{@>Z*W9MDe2 z(q4amTkIM`MN?2C!pw;2Rur4p$$coOkVUsOkq{L)`_?2#Sm8P#hi|eQD*3+!1 z3rd^B)girNUv}Da3#Ie?0s}neQTyLb^Z4C6G~hcteRYwEZmVuy7xsU=ueXmz9m7J> z_&k`7_vwYtxt)h`WKPSeZBOchbf$H_#My*GXsNPO|4E>NLq@$o=usH zs%#-X**)aCax6ovD}rcG)w0)@Je#+|Nr=gx_woJW_4RzG0! z{O*aSouO|VE7AS-@SU~#qm!YkYt8DdH}dQhJXUGv3jnu)BUKt8>ZD*X@u&vV1K7p& zM+N;S()W8Vx7T1=kb}nhcwTYYjm1N&5>UL$4N@M{U=BGZnWMva%U4ttC7315505QPqweUwvfn1S~_Z~L185%5v*hvrrU z;>oDGDc$wzkW0aP_9!rnHWuwhR=d^R+N6nLwwOuG`7YP}E+)ale{b9WF#B)g@EiT= zsIBF8r2u9;6peL-nV~4FbzQ#4LeuxO#mYHccC(gneDQnWB8U2ZYkau7{gH~tdB1Y_ z=tkcm(Eoo8c_1nA!aX3jUqevb(-?5qaykeXM>LnNv@+5>NKf%|6*06OC!Dva4D3rr zt?pZC7>ZHqPiDC!A*upguEhRe2!c2m0wglf?Tq8<=r4!<^r_{5@)w2LzTbwA__}H( zI;=W%TDEL7Je_e1ohumseg%41*`*mzL;U&iNJ{WQ+cWI)Zm8DaX@m1B>x`2RyEaFp zsrCwX8XGHiGqAC5+GeZgIIDbP>+LCcIk(8{;rBi}H{|yU$CoFT!gR}nsI%q=t?%#tj#~ShGu5pB@pI%W#@6w{&1DGc5PI%k??y9F19<)orxu<2l z+342lZ;I?AihboC!jm|64(|${7VbR7&QH#UO~Lk;@DMZmkO!RQ=)x8^J>%bn{N@=^ zKG3{qPosAGhJ0HotGM-^QM+@9I=XY9iBNddX^?l}pOC1<+xz6-fX} zNI5zksKx}jH1X3+@-nq`-wc2?+KU4B!VDgjHmrXBZ+R(!aUa-yUuS!9ar*G7>7uPY zF&d0=K$Z5Z?It}jm{63yTiDnAk3#SWyCs%}lfRa4!d{SIt zy6~3UoU>4|xSaJlAU=-Cs9HB`IYnV52QYs zC$Kk>mEF<}dE0O8r&oHC=VsdE1YmJApSq`ISa0?_@Q{`}bX5YWcs_I!?&L%_a^<4z zM;Zl+KX!CG?<%yvN7CR|Xg=OPXq(?Dm;4XfNkC8^eMSzuT3fFR*MGT?T4gpLaNH>! 
zWsD$P|IrtrrJ|;a1Dm$h=zdDTQ6X9;T7eo^LkzRtBI_b4iU_$YXUmGVF&uJyn96nl zu-fdyZ-POWouxnN=pa=-EG*f} z-alu#X&t~P^UMT4mX8?tXq-kSPl{wGxqdP+c>Q8`A+>eUN;@Ze!9*ndvN%0f)C_KhOCOdX9;+LS=;3W5@Q*y zxSUq^k}tctp0Q1x0p-FjX$VlpN( zjvwe?st2O+KTPsBgu=U26dDLlPH)Udr~SSE-<1_H+1WZ$O3NsA{_%)z0Pgts)AjJz zq9O2sCtG@vcFw$NuqFC1V`LTDqU^fqctvCFGrB3*>g4BBUgnr;N-KU^7&3J0io9}P zkCB3~sZ_Os)5p!>55sgWeaoNUzZ$o7yY&Two%D*@xMD^T!oLduj_S*P85P|HB7O6s-Q}q0U$1NYa*%;f5Cj87|)|J9meBBeYfj5aiHB~nLlo=7(b1$m3UM;4{)+P%4Ph@UFP ze@X=V`wbOm#e*|B@%#2CLfx4-8uy^>Gjo-J14{q{+V(3bXF-e8%)?~V% zwn|JSjc92nM3Uy>HR^~>d# z;Db+y3dkLOUUI{Tkf$HwCR2tK_A zv6%7G!sVm2OhLWsbTD*z-ZA0EMb4{XNT$yINznhsVyj`KDDPDAOM`K^-pU^lJ=lpO z1b?qSX?2Jy-*J|NW4X0$>mjm%468;(W0E(X2o?7v7%3p={`@Ty9h8 zW$&d08eA!s+{J=)tb(TY_)aIcgk9@bCy!C3Qr8RqZEy9SHv*$7TBVz((I`FFD!Sn@ z2m6GA*G=fG-likV!aM%TDw&{(tQzRvF$X(Li8hMmuh(^OygFkN()v~afv?|r@LZ1R zPvv{y&_47`J8x}+oV2T!P(HbyRX?=c!}3p@Hk+N+_T=UNqn)XK1QfeYWxQr$VP69| zU-S%r@o7?}!^!Voj`{zfQfm+9JF0rzzj1FlTvKi>xK8g~$&dJba-5hlZVXF}lU4vPPjoV9w+cY_4Mc>r?Ks@Ji+pRbcw=qgajy z@~S()=*&zKVnj11zRLS%IO1*sfgFFN`YC~!+CQDT0?H9A;L^Iwif8}1cUJw_5hQr$ z!PNlfaP)mpalNgtkdr^97QMfXb3W7nCD!7rwRi>Peir-pl>YFI$@;Qe`)skKB|eu2 zA>QB!JmziO;J`7~>EObFkA*}NF z7;N}&zv+_+FI{`f3q=(qJ(B|yf8VoL|0{3DY)nt*Tkj%UOhA#E9RGMOM*vh?Z&k+9Z=Frpeo-?y_k*t)r6^TYD&Uv9e)b& zl7BGY$6`GNv09bVYw{a)Oc>KWHc6)TI#0tLD0yo4SW(U6Z^U|(DE2Ay_h2yLK`VfF zaCLEHtn!~XCCE8nCMx_Dq~4K|96*nC1;22QjTGy797=nVOWfmVd`Kl!oUgI1{m*7c zA{YHwqQgjV-QU>giLm{hZd;t!&VX9eQ-t_-=;WI4pju;CFZev{sLRV(ADOfF3iB`G z^-hn9=lTBa2_OIOJ>hiW!HthkL2E|Zhl>em=lcW8tMV*B6TB;Cr_9mUP~gf3+>*?< zG|K<5J0bbsw7>8U^=|d;U3B&9(|bt6&dU6gsyd~~+S7)#^F^FW%Zq{`A%f+1dUlLR zO^A>H8nMFx&&5c*gWM=6qKj-f+@L7G+Ng>l?gdG{K|&-KC&k zk;p)8B%SAHMuN2pd-ubHKKLF#nSD^g-(e{ly6aQIaQX1W!iz=!d)~VdIIXT>YaZG7 zefHXUtI7JUz-g$1#3@oC%FI|7|7DHV^g?Yng8PBH0_HGzY9cv zHLMtn`ae$A#0)CbvF(Y<(f`s7yil^?fyS!cYE@+uQGU6u!~wnyYK70PlTP+iM6CkV zp%2G3P~BokB$wuKORQt7p4U)Wg^o`>hk>!xS_6r4cdMy4lJu{VedRn9>SYHo(k_O0 z!krV##ys1QO~2s>y4~VkU$Ldi$GH+3N0-8r1by&|FWC6HvM5gl?60O9QTBCJD0do! zXvL%bz}y6_ZN1Zf(sIanP+cFf`fP99r-Hk-dF(NDDYzb2>|*>%4u5DM6Y=$FG6tE; z?B*B*wTmnM`CDryTv0jp(-Y1gZid!!RgQ}A+Pu$4TzK5hPt(dvT(JX}a2F(J=Emnb zQ)rSy#JxYqi(4&^7?EEmVQlifOIX{R zFXq@EKdC6GXlK+0-^@>COIy=DF7zV_@!W%@6;0I*dyL2vmnGc}7-IssODuNwMa3`@ z8p(O#)SV1Fl*hRLbXchWqMc;)aj96^ig4(sim&?Jb<-G3bkF1IKJB-zkvl>Wp=?g$ zj8LTsT@KxXA*M%xhuDjQaS=JUr`6X4Lmz-dGcw8 z3lWY1KU&LxsTR&PE;cpPYb**E!Gm{=JiH8!kDGL#k>ZY+H#EPn{9t=|9#)OV{S>RP zRpxB0*`e(c4o%~#>;J#Zsc+$SWh=HuDdMR*w!iP$X8x+$cRRHe&gB0VmK>0`%^F30 zbX1>VQ!BkPP&OP@<6oRm{#oV8bkSozPMFWWy(L4>_VR@HZ=O!oph$^#Ov>2hn z?>)re++H5vCZQgmKB)IMRbV(56nxGRa&qE?? zWxA@xEry)4ljm;;r=vEv4;@^li?n!Oe}6_eJasGtqdE7&40m|MO0AE$55&npz=AVI$XUxQ$H<|o71&zJe#BQ$2W5nK}ZErhxzHOoE zSbbsj5DDmT;5eXLzsy5ApRFt6+}^JKu?L|&4*8#MVtw7p1!R|XW-OyXu=zpcLF<%2FD)Z6wnb&IyZtPp`gWwI+VkPh zi2p6P^x!f>#K(IDd{Pa)G z`t2*jV8td*zDbX_8JsX0E1!34uLF=X8OeY}Am&L;gQh-L$$CQKmB@6epDZubbA5I7 zIQn}nLB^6YWApZ;tnmKKHYL@GL}A7%rB3y!f>=oVHSs+`!z|e@vT12XO(Savqw{|9 z5CRE!K&lOy;rFG8;soa!zWZak{2a)?rE=yJDh+bEBJ{x&(MnY8! 
z*vb4VKE99gx{tnKZa|HA`9+CaMUtU$ygnW9Q+p#M&(RYUdGM@py0hlx`~8V0Z_(VlJ{=nZafHNlNjZe(tOsy<8prF-M@LVi0sZEvgN zr{eyw!pFT`1OIV#l~X?bQ9o|od>a31%o?sJTXA9BTmZVJqWt2eCF~s0EXt|vcai>; zH1X@%gkkm5jP+3#Vr99P3F_iF{1RWh>Dt}?dUncg@YJvVV#cw2@}$3^cJc1jsj*m+ zhmD%Uqw^)n(%ciMO%Ol&ZPOS!-NhLC%NzVJB!}SUKX<#6tD>oPYg@-wJrvefB^gz# zRdsb4G*DB4`fs-8BHs|^+TUuXY`vV)!y+N z+I_0F)^O|w-Q(xqJUp{rD0j-hAni}IeH>1Rn6@-8(Q$cmI<0qJ_mZ>yp{Z#HGrS=6 zZC^t4Ce{WA6IL#P9Nm02Em0X|fZa}*69E^v<3vhy5E4N`X`QFvos!j z>y7ss`Q7V|CNCFac~?midKBhNijE#QMKoc~6g_Ir1SPusMBtKU;2rXNSR;Af#Gxjr zNr@-AX;jp2`V6Ng;QR%i4R;~;xT7=IlC))3SJ<#t)76SkPfb!|% zU_IyQ_amTHceN|`Qb=J0{P`_!?9NiPsKGprN zm<|^l);~;4mF%C=DMNbapq9!o7i=HpH1#09!pB~Cy6Y*%`~)H zf1DjAZgI=Fcghy~!5+16BrlM##<}9K~NHK0# z?miWj0hIa*(0Td4D4>T1OtNr{uKmzFN9=#>o|yxpmc#Y90^(*z8S{43k$=PHISQUY zn`&0wO=f`YNm_F0Sm)^45wZ1RA+aqvFo&;T(Vjg)$#azkyXcgcjO6rSsU$V{jWs6D zu8e%2EDOM&q$B`pX2u)rC{Xe3+wju|OpMk~w&d%sq!Us_I4F4jM?%@5n2iZ`*hTu8 zA1!#SaQ~4m(J>t~(=mG9Hq>Rr++sqpvJsUvdKPZkn2l54{V+qsUr*ah`|-g^VHU5K zOiSziI7)#H_oV|)qJm_W1JT53qa5>ZWsZ*AKJuj~!ak+2$Q-H8(j^8epRZYL1$-e4 z?C~~YXq(bZcppqQHup$|Za0swF_@|C)o5`d^O{yEhX_e0Z#QLqHYVa!m)^c^{G9W! zh1bc3Yu;pRWk-`Wo6zzDiMD6NeS_|lPbKwX8s;9nOG0Wq{IU_}9?UNRQYgk$a9~*_ zl#s;;AHwTT1tCH@ruWAHpEz~>+a<2Ipius@sGHF0WP}v7U*}vzTu6r1$$zoRl4uC@ ztdHU-y^?~af;zmmUhN1)0MAgfoFiD31FMpfWnrP{N85|Wj;oHL%(fcOHdUVBm)i1~ zBREHo+x^A%S#(2x@ZN2oQ3v{J)9dUS3qQ)?NYq!kUh|X%F*%ZRG{lx;4cT$77&_}E zJwArT_Y58SVSmYS0xr(1;y*Thb;d>w>X>`c!HkKoRL7#xVwPWZaa;W|xMW2Qg2Gtk z^lHe$Gk|;4zA?7C^!JP?=0nHQN~kusxly+ZE%hSXE{`s=d)8a2{va!O6aKDjNeOGE z%I55tQr`-uY<-{;;aw)$Elt5QQPDf;eUG5B0n631kKAKF({6Y5h0swXJ7`ev(#B7W z{WZ0##P{#dxY+y8R>j=(-u0O3l4q2eQAHg)qwbMhl)o){xayViogrb4GK4I z`jMFv=qtKr6;9dbp!2*9-SK@A=mCJCzg$ekdY>aEHnXs3AtEqQf>ill$%#!eev8ww zy_L-{L}MQvb>iB&Dp@e#Wv9Ns@pf1;_mVl$D0{EjGnJ0F46e6l0U+}kt5xPx-b_ zm%u^C&z~KTU!??5kq!Dp!}>Sj; zaqn2h$+i7YJ*~~L0ih|6OhoObnR?5R!&#~NkqKcPfk`i4VZSrM71^}54dZq2Yz+#1pp;&=1X_2DmGkq* zXJVdeEf||U_I1_!BjWcj-Hz8mdqEneD!3mePH%%ygs($5O(M56!oRHr>z<59@IO;9Wgzd2$F8K!azl*NC_A}$wnmaP^63PAJ`HyyCyJ?; zw!E{)FWS$&EAZUyTd-Ii4<)A00mRE9J~O!;Z%wNTry0W;RiNmkcjenZslw5w0q32T0^cHP>mX>CDl_q@AlRyV(_8F(OmwzF+Y@5qKD#`Wvpr>hS#wM}F~!^VKP9@cTd+7E$ZR z<88|31TIklS2}Symow(7AAm7$a-2P1=JD=Em9c_X$wWrIQJ^j&GmH&0*){_2k5)=D zP>g@$^f@|xYZp2<^+`^?mk~r1WB7nudeh+5J#JhJ7K-mL_)+E)w7TT@tJxA!pJGPt z!j3*sbXN#9aS#4N8c6}1sI;==h}o~Qhp=wPUoBk-FR`iAU^hYH_Iq>R+kcoW?kJN) z7u6}YISFxQc7~m*hKKg>h{stP_msDj^o#<0T69Zu3_5^{N=OY_)Yshp&pD1ZmvEpB z3}yY5@AvVvbZG-N8gT8*-l5{A%mkn%`Lc)6s~%x*0F}Gz%*`I_onvS~Vlh>GcoLQ6 z$^L`4T<^#@)ZZ@QvUB<1n!UqVd$>OuKhdO<_o^1a7`{Qf%kI{#(@e7*nr*|`8~c%# z@m=eLuC(f<@B($VlkYQoD$KV|Z5e|``5!MB@wAAb>(cC#VJsV`TuY_$GLOjmB+*9S zVWaF?Lk$W+`K1O)V_2#zC;wWC;X5B-fc2YWVQ5&NtXNY==laXC>tZZG2bficE++Yu@i~Wem{l2l%4{(OFZj zlf(4|1f9exxR>g?(#k(PDBVjIKa6Fm z#6RKYCNHBe7ZqRPX{iDfW51Cvc9p~#!AB0LWO#3w>OX(ZqOUx9@wv~e$A_B`1dQr4 z0;X&=j&1rJ1!j{=Hi=#z2^Kx=$c+?o8A(v>)wd0;+whsM{g#fxzZI!is(anWg~>M; z14iL*kiY)1eU=as33+orZH^Y5f5DeiL&9m%;eytOHHv_-o-)aKDx|de27ViFQ)&Ru ze>`DQnf~d2uaX_G`2fZ1<98_kG^Us!;n%wrfSr=`rp2{oGKNSG7=5ge;RG zTbtB4oPv(h23bCrvX*We7F-J4bH3vU?rntjAq%MJQJWB9!+bg>9VOGY!$XA-;pMfk zj910`SS%NVVqZRC{n2rI-v-yz3V9PX9F87>R~0>5 zi!cqr1h#qph3uNyb<1#e6teu4(z zDrbsvkf$-E^++)D_{^W5PQ!|#8)G}OB9_S@hdfNO#cwo@)4ellFhh@f^>a3(C;A53 z^RJZ`0T_HNrH2N(seMCZYI@Pq$#~s$t~`#LS3;$j`vp$JC^iQ^k*O`pSu|1- zX)4`TKIy^KK@w4~JEE|qg4tjM)Ghb+dgT82_CMJHyyPO??B zsfIke5w@jLy8=2;_pIeQvGU{UdfN?28wa{B8?c<`W25wWh^d9n@3HC^gYRr?L__Px z>;b_!yKvU@QsB3j?~(9FGps`^wK}@QhaFgfTikVS30pI4zPJRLmT=fz2R;yMbBCIC zemu$q&baL`yW!zyJDP^X@?$W3ZpxVXnV1{%QQ$oj3^R9{*D}g48e`!FbDE0(W(09s 
zW`R(ta%!`Cf|)r(yiDuQc|W-*r$wQ&fn2XK)rtEXlPJw5WIK(5hT=fwyVFlp;*>(O z-88yTVOr!7GWAm~GR^a`d)e4jKi27$XoG99ml9FQPp@-as^s;BQ_{wEg$MSmo6|UYb@30gRjPGS({vwsCNgJA=#@<=E^2W*6 zK(;xBE;j=WSEBGzG1ag59SPK@76avwN~|}p=wI8PlsE{@cn0W)-L!1n7|O`L^cCp+ zMD1?A7nTN<6&;ka&NrOtC9B+9>calPXKs_<;cWMjqPn+W+`M;!#iny8z-rHwxi5Dl zXG=E&_ad8lCkCl)j=i(4y(U6DrBpl3LjLtQ--@?B*Si1Ff1xXI8N0=ZuR#uhn!J=K z)>;4K;=L7VZK&zzsVOfaBi!BB*kCkHdXGwP=8vPnl>TL@|Tbh2n@Lo20TbnQ^X zbohU?$qG|Vuatu)UmI%dqBLF}2emZT2TD@*`2BX#3FP-8Sx!-m)Y$8I2HE|_f_sy) zhAk`UeQ$=hdjSMsiR`$De|`A|dxiwnHhkWf>T|_NQPb#Fj}Wu%Sgvuhlox*P9DSsv zS1%XUfeaVp=83IGy)4OWq3dWQT%(K^ zWO`d5!he89ymeuj#PI7$fCqY_C{i+h%~PYyOV%UFSUrRG&^>%-+c z0ReX(r5AfyL9?683k!!myN|c>Kl>0pZ_JI@JBBRX-N>w=*#_&+OWiXAO}>SbS~^I< zMgI^xyWC_3zj?4=PN(bltETIK|3m4>yf*taz5+U~AeJN^)S*nDwWjc({HI}c!;A#s zV?kQ~?JD=0*oWbY2hi>)s<|#qeUd zOa5|^heK$>so}7(#>kmZ=8{*RHD?&*EPW(N*Ms`{6~|c0;H$p?Qv@CAh*;5ouSq~# zg&25H5zUVuTE!yldI^3d{|zd*ApNewb5~Qo*}sg#X#w{msW{;+em4o@DSRQAhmDiX zOvXH3D=PrjA=606ns|S5vSXp01?NuI;w#~%B^&;3r)6kC>VO_tO&@?+zAV852o%Dv zU#58CC$4XW4B%vXZA1gY;dqgj%QxmLGZ9|OVa~B-7oU7-^^4A*V=dH!*kM%l)^Ci< zcDh{Fs8XXa(Kf%0*n6`q`%u%G7HnYp{vb6aPghhdF2i1=xm`Kq%<>FnDuhmdpz9PJ zT@W`KzZ?oGd5a94XQ)(QA8&#XSkF3QWb2Iu*w_kL1XBRNd$zVS^!m!HQ7*v1po{ELVnAlZHY?=gYHB|AdHO2-R8rI^4xZvo! zp24U~lTAG~3}k1=7hsZ_T2`i?kgyIt5EYx5A)R!MHhM3ZDkciY(l2x@X0^9e>Yu(Y z9*FWX<5u6-9p!(2Q%~~Qp-7DO38II3FUwAA=oxqu;2hzitt?9**@2I$_beNoM@P8v zgByhp^<(^2&1olgPzd*H&lyOX} zk)DvR8Vcfmz?nH0Vw^peR8X^Q#PodSA5+@$E5GL`Gqz53z~90*{x8exVZU)iTh&KgKXQfl!cd?gIquvDL}4lG>gy=Ixk2f0L>Va%-GkRI~)U=QM=2 zM41=x8tYH9x*FSvDo=Bi6HM<(c`b|a_VXGqPdW(WJut9Fi-|G@5_xILnMHl!qLcu2 zlo!LnfiSVF%9kS-o=mt6OFanR;&Xu~H8R{*>VomNT8Vj7MIi49&(lqHFZmA?qLAJ5*Pyrabz%YYut z!_gB2^OC@4nukJW0soj?rp;@U$ND8Rt6gY}3Ew@g&9Ub%;V?kErL=!Ph+D5z@COZ0=!2(56+8(fOtMj5n9oHz~A z*x3ZFg$$Pv@rJYe{|D%@php^UKZ_yw!%%zX*WQ)l>_ibK@hPQ~_PwcVx)ZPM2r0W7 zMC)Ilrj&k$W|Vn+zShtx93NvBifD0-BKf#bW{rNE7Va^r%4eD2({!-RX(C=}ya`n7 zY8FH%YmlCqa*IzD3EeoXmu{;WngU_a-GeW7qWovwotox6&2TNuw+5i#Tzj7c+%w4I z^pM4=sFp}6TU2-1nvV|68Nk3QqaYdx%1g|AMWje|;$olEoLFFhjhZ0Y!A2gb)NQe~ zocMlu%-@`8+#q^a6=DK}m8o-l zK^RDboKH4dgU>lq$`4!b7W8tg+&rfS^2?V;cpqFtm5O+ysT+{S)0n*oH4{4DAa;QC zvl}*TeHYH(wT5Ib0E^-Al;c0?GBN=olR_bB+bIP>TWyv8W#SrvW;Hj9W!1GC? z#)L1Z<_XxRZjFAG*DY{e&1mDf#kwZ5tH|JX{JuZKzT`J)PPk)3)qGkadu9=?XvyhM zx$C*$asQvuzK?=go&cFOddJmnvxMsekZkycHc=4}{mN(5g=+1Ojl&g9D)`#G^tR=P z$6AD!M=p!v}-{>R7Epb*{8oJzMB_ zHjb)xP=Qt%WpibZ#4TP&^Zd)O;-e1U1*wh7aPy5Gma-d`I%9-7s-(%oa-Sn?=w5Wy z7oJ2M_UMYx^HrI0F*Hw2-6F5@IZL zPlu*}PzHcFmQdyzYi5%f982c(Di55n*|i=gviU}0L!?vU&_4^QnGYiwYdvDCQ2bmp zYlZPd2(JCFLO#;-2t(QPP@W^sdu5YX!t+g>C9UC z%%~%3*#ii6z_P|^W5*DJ@>*V@#-8D-PAxQcj)&Bj!}CS?=P$wqx-?Mmw6r(t0t*Qt zfaHAp&-GsgN}@o&an(crN$c%mZ_^hpeA9?T0b?wnvd~l4$MRcC3l{X=ks~(r0YM|5 zguXRlh0a~A#@LgbIj~=xnU|PMF6V~MwR7@N_1x>u2afo&#V_Hc<%X|SEX9>Ffg#`4 zeIhV3ABkxmR>+EgPbk;+#bm^J)IrbuWAW9pCxh8X4GIUT!~duM*`DO-ssraw7QGX0=X8cKD>UrWQu8OCN!^`H-S#Q0N;p^MI+h1s5*uF1FW`k57{lFee5~xfb*Ex|JT?FNhbI;)nNCQE zaFNS~+M)OrF&pGY+VoWbJ;P~V6qDthwk>4t{^CAjI6yjS7?;dDsto8v0aiKLP8`i+ z4hF7jnKf8_Kei}M*XuO;6l@DUl{qinh!bCqe%2t^7?If|d$We1G>`;Dc#tl0p^I$t zw9p~&VdiQz!PQ>NUL`?zUqp=ZRrByqJjCNKtQrR2mEP)RLuT;hO;v2$8)R5ANPcN{ zR9Djf-K2gBNCTXTB^wzPEIYQ#kold7=~07A;!9(pMu&ZmA)8h%MkY4Crwy65QTZJ% zcqTK?d!37FaQj@jL+0=EFZu3{1pfTr%te9TJS%d4iIw*+-yqF*(p?+LX2@i-o; z;qyLt3ZXtH=#w*z+`Lg;e=IDrT+jW*3S%)RQS_epv@_%Td9>-$B^dwt{1CpUwuMB? 
zIApj-pYGgUsx^Fh+crq@oM%_yd)<=P_X}z+tFFh>wuv?hMB|^#>O}6Mgyxrk1}u~~ zUvpE}A8QXK71O=#x3BcN>MH7jDOo?IL>^eVzlj^Gg}h2$qFOxu1)= z@=S?VFie;q9bKWS3vQ%Z>V%^d3uhS8?AWOMKc0=_$DMidUqJO|ZhopbZmg(&*hbqx zixtjt%A^BQp|0Bw{zPkSU=nMInzd zx=xpGvsv%OfJ$F-8~Ms-LeLX#DjhsT4`b(A(44Sld&c98VP-8NArMT|wOALQoi!(q zJmA-p^Q|(Gd9H2pB8r5d)KjY~MGM<9%W6DN=R;hd&7RA`_}zPquD0YKrZGCvx!+k> z%zwVcCjJ!AcCWbZ^;4GPp5k2+w0Kr%5{Aa-bZcEl<`wLmZ77=IUAO@okl(XH7#;U7 zpkyDuw9wj8{h%^^G317W0qc19fY+o5t;3I1F#Q@nN$g|v;rc?nZo8jjB(;W$(l*_B zUF2`A99@^UY-^Z0@=+=r7hXBAL$-J#lltAOecaP26=eq`&7AzSh1O)+VFxbRttXi& z$m)4wg&T6dCzv!UDa$3FWPFRU3ZGSQ3Qp1Dn1T84Y`bo0jH}k>;x@ls>j$z5k!uj<91EnPbZenfOrnXjkX>UG0slzmyGaD;#6)t&Oy{$yN^ON4)5umQm91 zEcwI6t_)V`K7DGsSolcX(byK;3=>s;z{>0}8V%lmDQeSNv3}E>W}6#q!1}6BR1|tH zc}{Il6=1qX8xKrSFZERBcvdo>oo?LmSO;{fTeces`|9BG#LD=(F7Vmf1KaNL&6;np z-@~kUfOd@bW+l7SCd^$ppsMpcMqDX&uD*3IZdFu(>(4YDff{0i)>C8=nHtuqL8*BO zAzqX|d-gaU?&=DTj`lMxPAo`o3zDEk(fYtdYF>|HDLovCl1u$>_BqBx&udx6uQQOb z7{Pr}onn#NeEmjg{k~kU_vl=<__=&(G+2X3_Nt=HD)h65p_Q6U?}C69mzqUBFJ0$n zk!u1V5wwS}lVnvU71a8V=1Y%*vE?U0E!+o2x05Gw^bf7#*mNtuEw=t-{St>rKa=W8 z@u`>?fQtwD)}n=e&KSCs@HypvC(0LJ2pKM)(=gM1I)0ajzxInV&Bq$^#Qocq61IBw z4}z+I=Z_7OegE#-^<Lza#weYhE1d@Cb!gSPG_)bmuO;0&37qIVSB-S2ZMYP zoV6>kCV|z;N0VC~IK7bD^hzd}2S>KnhPYC{4;AiA>5_&S{;|dSY@mO3<=f%fGSSdl z@aC98yiQ}F*QjMNa2$&}8V7|nWq!s$e*uDLONeE@H$HjvwCn%c2AcH@Z`?Z?l-EWm zj!GExU_(G}9(+yqE{XpuExQ=d^{@sMthPjC_|$v3keHXEcXt;l6gQ~qAy#1U{NEw= zDE5M_E?!6tCB_haVQPtr=PTr_^eJrdNyVr8 z;z9XWeqZ}O3^rQe7AbrYK|g!suOpCB^~Nq4mzoJfMteOlQNV`mds1j|{ct8uWdX~I zd>eu;YU$aS75~)5msICLQ30!Jwn7efVg?pitIi$7&*3ke<2U}l!Z}hL^G@fy zvZ%XC9>J&tMW7{ZUNGG~5sb#JSKe~1A|s@z#p!+vJ2eo)r}X>|ZN}sN1Hb9@)YJ^I zo2$F3vv1_RQ`0E40qIUQn1QX;DSK>}=8n@l*QXOWqMaRIzjz5BQ2XM!q?tRBMcD^7 zs&s6a*h3sgiK`VOQ+EZ9b<0TlYB#OOlvcdrNDIqN5-#7a#5@U!mKz334q3({zB5=t zi}N|zHP}D%kVs^aglZbbh)MoTp%A{^`BZc>pmy=;P2ZnyYMFXA^x^^?jX&|nLutLq zaS4}9X7;iZGmSBWjFTA75i(>qUZNZv7@)rUsxM&ztR@1r zDBp|$qkdPle9SCzmRuG=f2Y+C7qYFc ztJ2zYAo-Yjsen%&Te&m-7arU!K12}6Tx+!32^#&o1vPYfV%^aZHFAAh0%|zA7N6kz zZY+zawSXJNr6QN>EEY;xUZP}I+8r4=wt|&^N!7LI-h`SK94?9*bewYUD;r z%_dHLnBKvMnnt68E8ohtE>2SRd{7!EpyIc0`x0sRdNJz`-fD$C{no17b9?&fFF!f< z>YoB1n*)t`uK0*Q9|NM~c&f;~UVaz8t13(g(k-*;6Di%HIy6kYS*X7Vpozh#?kIAx z`aGJ3XHi2_M@xsD=)V6s8ClQ2prOJZ;;LBNpxu80RW>F6;^Th<8p+ksBRQi>6IJQn zdrbd*E;m3JG_wNu=o1aZ9NK*?+WSct*U_fFbh*dV=tH-BNkO9I67GneNQ^`gq;Tc} zHOlB7bCjSx=qDw*9%aA1nauCici5Fu^!&;tthvAP8H-<>8gi{TG|FghTOuT{I3)WG zxs^_|WF)i~Hf*y@lEQq|w3_!h2z`*M8EQqyXEI59QRx@~3qp}Tpzb6kLkvcUh2ziv zI$|j`rcgZc0i;!|63b05Pm+3T5qR5Dy2BSkbhpdadFx&qe8YpMYV%rDI&NQN=*KJM z6QNT+=*KW?2T8cm{{mH%?oRRfoqXa8Nf>QniyzIzv^f%co-{e$A%X{czdJ9YqE8d4 z{esMD-7dMmqJ&n;3+M*ZmFK*C&f0>HGTSka zo3ZP-u(}+ttCz#9CDZ^T-08n_@~MT*8W*k~CtQozwyoQE_iimiED8S3jht^rveYx7 zID8&!)j#_291uUGylerh6OH-o@ucFs;vUU&UMb_>^jsR@OF#QqR+J2Q@sS^U@9r3N zx|=y&TVNY)>GusRFdcu{j9jy{BiAhEHM93CeI5#h@IUCJ`Dc0X?MCpv0_NqKpN<{h1p!*f%?w6P$y zyqtEIUY&R74oLQi;wMJXOkcNgUY1AaGplgX^^j2D4@>sG!ioJRkh6nCj3OzemgZl2 zQA}>VqB&hi?PkRtQ=5B{f8hzY5^L^MZC5U|@#tGBjX=Z!V6(F_yH>58q3pBb7u2W6 z$dLlG9Md!9M*&!*<9j3BwRv~Ub>Km^>P)bQ4=5KE-b74cVYT)bwEHrPY%oD&o~1qw z^Bqo_6glIP1^R6ZTh-KXlk+DG5b+F%Y6+9Gj_L2wcSfdO_8J~po;!qcU(=-m&||*Ra1KA zv~A(=+{337J*U-1^`V_C7Jhjg^mJHP z-{$mUlY>;^K$@^QCB&_Cq1n9+a$%%BRz>``90 zk3XqNM$Y0i$!(l1aRIs}<%`dOZC zN3!T|c;PtVT|M`P6h-j=Ad(DZICrYixSEU<3C(}j+E4qJFXo`(j2$SD;{am0s395E zub+BmnQqaY?s+rAXCt^}_5xatR6l*8jnIZ!opfvU#_a(9Rp6Ld1v2ERVwzq0`3Stp zwN#h0s>oP#V1>wCz?$Zuluqk+6)ga=*V~Zw*3u*fuBq`BI{H|*Qw)vRkC z-L>VLbWA{Im) zhFEr9gHxL&cHB$RpXgCIqU%xIP{47s^RE2n=Dq4j1+fuUF;f;Ek;U zUj8XPl>27-%m??&h+=Zl7ILPZmC?gED#x3^gUZm=oPPKLULYdW_lU|QR*&6f;Yu-i 
z5UZS-kMdtM%9LrWC(k^$;ARj8%uS9>57HqEylC*|{vLhzuB3uu9pxx!^ZEE87yIly zy<>mYEW{~PT5XDxmuM(`l~=*|C4;Ad+W8Q@fuq72^|S8mwhVtWW+_vP_u-rWozJ`Y zs$f$0hCJ$^+LVLO#Nyt-_bpSl!0+0%Uo(8Ce}r5OD@%}Kk}tchs+mPse0c~#Fj^z} zW8+hzHpS(B^vvvalTyp<5c$$YY94++?{(!;?^`-*(qPL2Ti{Sz%?#DBOd*jUjC!7D zn~AKQvz{V_{?4hX`uF)<$Tx0GM*|y>^H4SHe7yZb0~jtAPKkGh!N%nrG9_s?YL?J;$U;}pNBZM3N#`V^T`aZ*C z@bXnPxoIS&Wc5X(O5`!l@-zmwcJb-Ze!n60xFuEs6*|g5U##X`$a-~bw+EWz=iUM~ zLyP5HagGwC!NCtcW0nMBdS=e3?p!MCI;KjC7Rl)5!R-D#iVz)EP;XKyK$KVVTiZbF zUGq;FNJhz3y*q4gQwi_RWKotcXu*yoQ&-{FZnSrE> zKU{COAI#u~_n~d5gf*fBTthJotDBn`>EL6B(*g>L-Y?1Yc0=b-o1l|*-QzH~58}Gj zF94dS0PRyZBlw#1uy4H1`X>zGRR25dS}}t=yx$iB$MMjEFK20)__8H(%>9Z2m6Dxp zsRLu4QXZ}x8C|c!@!B+@_+aO12PbNJ-~H@(zwr)E6B#j_Xq@sV!SD=KO_g|q5>qFO4&1NK?aS1dZkyD_+g3+FwnL!*n%C^Zr2)FBIhv6JT%|QFyobw( zyrKzA8Zd%RVdbrdp6_bva!K{*n2N3hV|-i3!{Il4-3NHZ1CQxeho+AU*0o0mshSc* z1l-zrs1-9W^P~o)y-&1bt8c8oK1rh^ike+sGR?wgfwxG+7ywZ8mJokwNAK&L62dI# zoC*H%5H_JU%nKT$G2K4S%Z5RX_xcBIpXkD-pvqv2MD}?MWntPBsV>mcxnXM zFMD+hP{79e@_Da2K19^qo~yq5!ivb92JT#v@dyX=zChjs9!3^3A=TLM`>+8*8k$F0 z8!t4+?4Gf>FW!x;K7q6yx8L)&?QR#A1!wma6R`jBL6&R~q=7fiCL~AyZKTjECJiR2 zWPzNI9TPH8^^sz7X@@y^cZP=~n)SIfBjW||?FYzgu2fl9$ z>^Kw0f5~JnG+fydIU7CVJ2hRn!5Xj8mR6*3@Wk})T}Kz^6f9Pil^XvwMd`HJ04MQT zt$l^>)!p9fqGt2wF8YI*-GTJLB&G29Z7lK>O*<0yqI#RgkBh02x;Ck{6Z^a#0#jgX z@utFY(iW$)VZq@wza?Yxv5z6^xaX{qN#h{IRx#;Sbj2>+(#Ro|fv{)wF#2`EbL{vG zWZ;S&i*W=3Gp4%9wN`ghnlFl?rnbn^m4`W%&v}MrhNg!RNV#W*GaNMH$+hS@|yJ5Ynnm?q}S-tp4ox^5sROR z%Fw4}#Kj&lL78;0n*q}mJP=_YzI^OUE@qI>!C&*BBe+curts>E4Pk|97IN>jxO98;kHy#wK#;C`DVJM%Vx2eO;v4FE2(1izzc@jymqATE-=ymF$|aQZN;W1NSM*qc z8EQe}-%!`;QnuQ#<@5%147M%*yG!ZC%cOof@q{Ox>JiMPCcW@}2y+M4x z)d0p}5B9;*(#b#@pO~L6Hu8-&eoeghbEvHxye|LXtl)>Ir*cp&uz?AkfYcdNzu-g4~PeWtylWH<3xnv|tP+{mVXaZ^)ztov9xg}Hl z_WA|$*~|a?lr;`h+_zf@xP4A@mD9Qhynvk$qvXa{M)Wr%W;G)VaJ zkDw6TIub0ip6tc^I8UgMci2mov4M?K^t7`0oM6}LX3kuYGJI{$w}MLV;I>qo$#7-3 zlI|b^ITP4QaBT-!yG4c?7eWi&OHv#&86C7OUGbEuNE0pklO>L_!(ieJO zr_WphG-)uixw^k z4B1hyig&RXk|mS3s3qhaqwvk#LIMUiBF)56jU4Lge*P;uiT}%@GgB^LieqZ|3vhtx zDfmO_7X)rf8uv{i^w&#ds+8#&#n_|_CbexC#`LA7E9RAH$g*PC_j+H}6B=|S> zpI8X09dZt^yB#q%J2o@ji*$rwzvYNZM>wBk1n|heyd4+HmO^#vh*)up78ZH&{;K8C zNOq4hK5z4%JD^z2Nw*Y!n(Jz_mA8hh-GcY^wH3sxUwxYz?VZyKi6FUoG8bY_lfKD%(FfI3;3+j-jt zyHUq|3k&zabIFJSv*o@y{U$u9ouQi&Z_?w1;YWqD@HI0gyteetuZ0>=e!taGnm9iq& zNlzo){(0e`qr9eD!p<4puk}jYT|a%ANwE^*ZhLMdwAxLS%7`fm;7_?kYtEDiR|Z-+ zOANoCH&TLIrS?OTGY!c(y;%RDWl=}&i{0s4;j@%f6Dz+6NKkNe@lbTd)? z`PAM*FDu~9zY5|x3m)w>1HquW%M!`iCP5A7)XSrX+gm77SaBwHclSKXAcC~Y3<5|H zwWgs7(f$GHiVK>kX?pXA4f4TD0zAG(xm!7sVn`gNkPY&5|M^8b*2Z9No*9{+d$GgL zEM!{oiv%!Lnslzo+zqq87|-uTF2wkv4Nh=5fkEfKJMdoT7}pcKnRFdrX`fYDJ|t1M zT3)WO>O-QGc+-Qtkwd;;c>`C;-;lqNZBuBYe8eC%{|tUMs!@;i#E8J zsX`eu0IOxPgnl7O9Q^M{&H4F?ZbuKm6wd$iW^tSLdV9g2!Gqtwo0 z&MoR~;!3i5y$y?K)&ckWHvu9GWFEx}s4@;CPbsPLmp1{`o-9XiL}+pzgO$t@-d@Vt z*kVZoad#F*S}tWy2zTYu>uglr?qlPqR+%{+5ZOcTBSA+uljgFXRYLdv*FGtHitM*4 z88RQs#adXn2CDw zPQ%}#9Ze`S(uI;)_6%Eh&;6U@W_ozr3PfQsVZ-i1F)_le(B8U`Ya1W}GTOvJ`4NMS zeFn8dvdmIuBnOw@OhfY|5E5XKZ4!zj%na9DG`$nko~P7#(@ZF-m+LEaOtN-8MF8A{ z!8ob?-T=i$7NQ2nz0iT9*yMt^;GYk9`-*! 
[... base85 binary patch data omitted ...]

literal 0
HcmV?d00001

diff --git a/images/model_quan_fig.png b/images/model_quan_fig.png
new file mode 100644
index 0000000000000000000000000000000000000000..ea6571509e4b8f1fa00ee8f9ffb0a6870b740d0f
GIT binary patch
literal 315888

[... base85 binary patch data omitted ...]
zyvE_{hR)-+DU1vZwFJx?b2=O~?yx%ETQSyQB0nrqD?ODKFgi2x2xm4d4d3`=ERi~y zc+FZ%Qu=y?=1BKQms_7m+R(nyZ5u}GG7%<$E7HNmXP~Flw{tH9E}npV<-`R4PR)z> z0077z;CFy4P0yP|VNWSC7}&u;V9B`Q6`LfPg7p@0`}I1|Mj$`c&P z_ZfZfJvTZaua&)vC+MXVH(<}@9EtMC_Frt3YC@L8>o;nNCt$$B2S8Pz0>u_YiwQ=5 z3yfHbQxGuZH&VY56UD5adh>&W|1iMhgdZ@#ph@WG(!q;?KTw#9ziAuF-uAdy*M!et5BN7_RjH9`ZIKm z^%-K3{05KN{F-8JQADGhaR&0hd&Es@_$#$^F!2-GjMJLxke4u)Zgp97v@;kUuHrPH z7VTPq6`ttNZslXU=VWW#^}S-CJnmVG(hhIBQ{8HqpJt8(|8#_ad-iQ}tGmla@)e8b zB;@=h3zqX~>@+O#X`$PP8{xEtQlD`mi;B?hXYtr7W#%)_j{+d1UNVcpxT%c*MoVlM zWR~?4LSpeDy}#llLRu1l%7Urek-Vt~!G0`^HZ7ulO?Fs{qSMq|XkyczGB$ zX{KxFgzJjU+~?|oIHzz_F$_CnK|bP%k-V;Dvap4(N85vc5_XIC0=C);7b|ZOT9&s_ z<@{$6(3aVGe-lx>z=6UD(wF!2HrLK4t)*$b5kEq2(wn5$OwhQPPEilc&!sDt*keCX zkdaUH>*l_E*1X9)M?a5gPr18Ca3qM$jTd~& zZ^(b|_M`ja+6=b=_u`9quLxOb@2F5XQ#@mkdP1o(pvPys3V#~H{V~^p&r0J3E7cP| zZ5$jG8j$AWESJG!nVJjZdr~C;WKRN2%>{EaPvB@Mj@Hv?X-ncfJ~Z+i+zc0jxiG3I zUvRf~GO-MB$0yh?(CXtgpld`W>51mpQ%j9AAaZE?u7l$P1Ma(QP@7N8srTIyqqu5< zjfjysWcdMENWSkH_Ji(uE0%d@I3q3`^*W7O2~_S$JW+BP!HgbPG_N1fZe+H=!+BDD z@S&pmulQxHRu8l8huH*YAK{b(^h;P3pT~$ih!Tc*OHFoui+b*R{aa93bbFjsyO_D< zyOrXPHDM3h*%Y?P?7St+$6Bk3T)UE-0A}H?(f0kR6$JxIoz*`t8QSsd>4}9Ke(6BOHtE!}01XAVe;Y^!dVvJaQchI@NDToy)YGKj8D-pIay7;RVOT7T15 zze*qK_Hsnpv7LqJ`Y(9(!kd0>Xq)3eB=s=XR{&BRmY-Knx<}*RDfeFV*>qW?B->Tjbx6GcJX-D zDGmkhmQ$4~5baJgmG1!7RA15!Ul(&EXv?Z-naD9^7jY`I%0APrPMk*Q*sXEeFxuUP zhAX)tjYB8q%^b64=2t(l{}mF++yQ`t!94d%f^R%heH8tk!FRxEuaP`iTKwI%>&`BH@aaHiLs^56u_O6Uuw>_ zl)vsI8tzT8v*d9Gp*dn&-(A+I035w+293|%L%P4MbYgF8+*J*s>$)gVlC|MX+ZW$G zc5hT>M~%$7yWG?Rbkd?H?>nc&=G}D3G{>6EAK7daYsw|HJ?F$;YhdYPN^HXP-(@ah z-6Lmi^USsI&B}zcRKqQ1^r)y7+2J+`_g-&KRnM3DSaaDtg$kKG$l^TLbi7JLn!?2( zbqgjhCUUZmtg>do=X64kAZin*)?hDpy>OWwal?%++EX)&Go@3iu(5wC*kUiMp)&yO zbhXE^cAfneo=jTqpV^mIqK8=$QtxG7`C=i^MXkqQR5aO=m(A)LyRZ^5D~)(1JNz>Q zKJjzXxpIisKDIa6ZjDB<1j>?n0e86_AF4}pnMnWfgZ139`HJx$#PW;lSLTM;+bS15 zzK*C1p_8z-ejSK~MdPsT7E;k<;2V_)hB0V_pD57E{%BjV8 z*87Jn;rpZ4nbTG5s_4bNA*>t63|{v3s{cfxm#Gt3G@h7IDjQL^d|Z|~xVGP-GUu-z z4~p-2rT-VP)A^i|osnXZEcYIUES9_c%{4?hLtbJU&z$IiQh>Ilc2?4-OI<3ELN=!) 
zy+kl`mjUk5GO!GoL!+|lVVuSsdoA`A`t<&}@FHNg0s*|T^w{cYi!ud_p#$(H{te^I zR0L?B->6lxt~tUKf-vdK{}p`9@CCrTy;6Eg`DIj);c4fRKq%ibY>q)go6Izb3A1vI zTxoxR?SKek+`A`WFN{~_MfBcBn~k%-YB)0Wm@Si0*HKdU6C^s7sx%x*4cki&?tx~$B|ICx>A8+8VvjWg-?LR=Na$T7zj) z#qoF%@-LbsFF)e2HzJ>V87BdeF$E6STj$AUBSZ`Cb9O(@dW;b2T@A zjX*PF*~r_MFaXt-ybg9pC@X9@5jArmbancObeE61RxLT>pS{AI z))_{Y>Ez1D?}!vTvh{AhY|vD1r@S>cR|+}yN2PU%^u!cAdl0p{_wN+@5YTs>;+O{m zKUSGNNEnek9#M5H)P6D7(6*uu-Mw;rdyQ^qK@_s;wo#<Znt1klm~)e7#z1;V9`yc6z!|ISa{S&S%Z19jkc#X06XC z#^{1EbUmv`SU()1oD{9Q%JO6_(Q<<&lEXtbeUR#d=j@w?*FQV`nwloGr{+`a(pg7v zZq^&O)A=7^lPB7Y6Cb!aaftMBw}-gk-TbD5JT|VP6L2=9lAl~<`Odv?VgR;SBrBIF zi)qK?qoc?P@hVRV$61DLiFp+V>AzJA>Et2@f6Sr(4L5Ru<9422kWK&WFig$&fGy%T#uOsRfr6GKljs#vHm}J0!Qx*usy!JBO$8=&G=4MYA z9KpRB(c9y)B{Zi4(8z7Y;ctyuDnc*|$fXjRMER_^g%3*tUAjf#$hux7R18?bo9R;; zQ!01@Xj7uS|K*(u&&qsFVUPg%(r@Ci!sS-vL8MDpCsMyljUuNH$OM0x+Tln{;=Xm< zDVP~f`)jmZFn>xNA;sexMisfeAk(H^G}X>%N*OZ1q>ywRy66G6X^0>R2%x;tX{f={RIt0AX6l~hg>(wi2V<}3YQDns) zz9E-eii)aP?hPcg2fo~dAl|9WCf`Z#AFGJmc>2|+Y&$z$#mYpig#4m4sq~WTw$OCR z26&NG*sZMsY;-QFTRFR2tuL2lKzPK^X|2w1!8v^be+Qz^b`NQ8z~&cH=p~a5KyH-& z!JZx64`^uNT7b&#R3B6y{{sJVu6?qc8Z%0E*v|~bf4X$Wg79@kZ`&sP6$R+k zd@w&BX}nyOEyQ-HHWpp zY_L(^;y_i`2L`&P)_Nye+h3$(1-T*DCOOAnev!HOX!l?5W6Y3Y9Wsxwa(&ma#t+#a zrKuiw))hNO45O0FGhgx{7CSQZyqV0$Y2 z6a)kFOR$^ha8XQ}oUX=Kv;Wb%{t1gOUAb}@)i0*=gOQyfkBNBo0fVV4`qO3dinGM%K33q}xUO%~|7iQx%=EEi5<`=k zwdS)`??DY-<7!yEpZk2tdm&ulqFE`tF1nbdlFi>8bPyE0)joQ@2QbyXVz~g>KcWMX z_#eegM_uxx*&Ga`70@ry4_aRI@f(?u^elBu#$Ad5e!X<}+dyXa+E*+|+#Ear`*EqT z+J7Q<3uqqC>lReLSa#dI^TroCPU4$${o2zZ!l8J3dCOv31anxntSBM9GUPgzaHyS? z{zkuKL;R5_OS?!;-B9_v!bK6M9#iKFmoMUT4GcvVq!q5KJ1Mc9DdT8pJ%>`fLVDwF z*GMNim85|5T!Z&W{fTCS^0jYoCpQpVM8uhL-5JyW(?H$mUVsG)^usI{`rSFdupl$GRZ*Sw#K%(kC&0ff zNUJU+JvUog0i*p-51P)AM1P|HhsyPBvxARfsx23ymxdc8=A1REY!U;eJWkUB#A& z&)hWbh=WSeG8zt2rsKHkNa7Nf`AI_^J@Mh5;h&d7`@T$ z5R2k#Ygg8^_9)Qr$DG`jG%DsePSjVLKAaNs`&zDJUr?Xmvn><`ieJXPyw7BPKgw)i zL*yNX2sbFQnjN4&?XsD|`8b755__fQk>|o4I>%QfJuTuG8!SQgl3Cw6WqNjLbSwsGvKEnL^1 zchB?S&ul|l7W4lO;%`z21O$-1&xmf!m`lV`r~(b;Xl}>~kiBnACmZBfwbYE&oYTA( z>!$PKL985ktJhsK#=JqbrqXwX`0Bh9g%3ln_Nwkm;_$DiM@0tQj1aNLQVcPI>-ZaR zF#5s^$kw+xgXvFI7IVq$ZXYuW=R{5^4LSl_@A!$U8&LG)17a455F#LBn(L&$x>eLq zR0t@pp0wF}LUKjUCZnP@DM?l;7>6chieEE&9T31jEo;P6ui1{|gFNS;Lnj||*4Y*0 zY)1w4njnM-D-=2ms-&np#QPbSg-IUZl)2hN7E0F+BSq3u4aO2N40p}lEJ&*G2BC4i z(e~F5T2_ckbUPVJ=7ApSo;||$i&3U2gtBMZGJ|PTYfJUI(_rHITIz5&X-wsiH^FGWE{00~1 zA1VbMn^1t=;xw&3->(S!#SV3fKOLD5M-;-vL!}i2JvEI&i? 
zyqC^~?oYWY-?;2micFwZHYY0But1+Yjcny!vpks-K!hY|ERr*xewA!K*`I~ZaXr9Y z^1~E<5(f#_jMlI&3yugL7LVX9O*x|%SK6WOg;FC~Uk+EYYvEbZl~^w_`ois|Au4ow z-Nf@;RMKYWgANh{b5q18WjQsF|E5`)3ye!$ac66rOu+k^GPksBUBc}(h0)becH$f} z3+^bC&Bd_RQO*iF;3@{pLH)FJV9(r0ZWZ)%%$@RD`B*ByS!Ml}rzoNL#H6y-Hn<#O z)ft-a+9}Hc=j1HK7~M_p&rq+S@$(zFR4vXw#@_Qj5(xa%PANaFY9;KHavyYED|v2cELXnH*rHjht#obudp${=OJ{xdg59C9?S2Yle-hxq ztQ)F|`cP2zEwZ|YmU?0tsqQY=KiPCF*LTw-%O8dN6-kMXodFAKPKh(juy z>YZbv&oy$<(Pk!pm;Sp_Q(p!GfR}%+S}SpoD?v|4{mP(TPG5WzPb0V=tRa-KRKp$T zBYiMV*(7=wumvav!~;+eLJIbl-;PnngBOp6b!OjXy!2$2nXI6;{bEmp0@k4n#I)z6 zl`K!bQCL$Tmd5n6`h-{+UbwcJU>sLmf8CICE8wUD6jPQ{W_9sb{jz(@_{JUNEe3!1 z$g*E{>=J-qkpBc|U!zW@$2Ln@xrI5}RKhOyhLm+Z%J>jHPiHYZ|Ggcj;vX$w)`^7# zWousvw5rOtrka4iu8e*#k-_CdP$b8*4{WFO>D5@~l6IvYkoyOE4cm6qWz~QE_+eYk zgJO7zm~y*XbSLuJE1j1;D_(v-KghP3P~+`&n{tA+FS!dgYHvA-?1w>J!|%LRAES2j zF-bpS9yfl`+K^EjbWzY%@(>$AHZ$!tnh)5 z>Y?Pp%9nG;FRzNT#lPswxoGyq2}$2>J=3`i5qId=bmGr&jx2pdh=rsHRMK|A{eI~x zvz3%krDm+tUxY&rmhjNv`p-z3Znbx^!qyTM;>-V_|CramcyeVhdQ=y^@lh^rO4G$cUZKKWH|Y&9(Jc!t!-aP6SbXv-)*cD(-Z zwV97hW(wMJc;$TmK+oT=tqX&B}>!b4-yKMJ9#&4~bikVS>l zkxq#D{rlab4GOpj&Z}KI?G&=vYxB)PP&enxM%IQ@-odP?@Lm)8eC@FdH4M3@zY0&E zYY^WgAk(-S##1pt<;k6%3Ypq!EON$^RXPyEQlm9JLpu&hgu*q$H3 z&N+Lh>%8p&2mFq%FM%5a#;uDYMWRvJJVl2M_yJsn{=fLP|FrG-f`|ECYiZe{tS=ZAmHF_E;2@Lxk4pr7T%8sAwq~0QwUa@gSOR+kM zR6`O|OgCRb54LTlYOpNkQUX;uB`=e4%Fik(X4|V}`*MP48I`t)VFHBA16yq!N(X6Nm9rgouUeSa5)oo7dcn7`^2X~qvcCwyYrma z3Qn>8;j|AnoW%}YkURe2>Ek7DtX<2uPW`K;-Ah0ZH2>^h1j5PA!eeZwZ9j_4M{Iq! zTg)W{=MzX(oS4OLJB5kYh@?s(i|&b7tmDUQ#Tt7<>!n^JlnTlA5P7(o8+Ll|>{tSL z&zIorCw{U`===f#!kz;AOuF$Wd0lNRdXDbLc%y?p)K}9L!eYnau!nQJrM+Uu{`E-PBiNGq+XsT;!_!4Aq@(ZpH z(G##&&%%nV9oHX)Y&?pMkDhj1*Eq(5$B3u_*L{SOxxpuKad|VGH#;4zgQ5ZS z)7jK-FR_=zdF>3kQX~wNZuPZ*+z;i1Rr{~$;KiAqf;#xPOjq=ko{{8hT-FDIi`V6k zo|T3@24pz%kKb!V7u&Iy$g{Bptw<@4#oU)7cij^j>#cA~rZ)7^4Vc@N+ZoT$$Vx|! z-K}X4a15Fz4TR0rWyo>%T%aHxT4E1pRh7bABH_tnmji-d+WStp5kEp=vRiGPu0F%` zEyr#_3Qa68$+H28U+VAMF7lr#PuU>@>kqmnWvrfZKIM|?vusquJ!qohbdFBW##wOT z`ciX4)lPK(FsW{6qo?9Vl=jT}?hjo!Ek>9N7Q}9$A%2bPJGOAA{K>&rUq_I-r{kTo zRCF8SsNCYHJkxj$KTJX#-`PFiq!hJ1U)A;(2lvBl4+zjR!nhsuXq-nd1l6B#x|DvKDAwEsMkF(NW@Yx88-;$(PS@f^?!%HG5jNk&(?4ivwR z+L}=7AnDGlHY{+X&yH89IJ7L^U-@-9dsIJLdlmE+#C7cP^3CmAT<1dXCUlaUhK0=# z7h4C_X%_`tp`~xK4SctHR;Gxv^YA4n!ij=urpVSz$#%N{d~{CQF&al0g0et%WmeDf z-w7pZ1`?^&Vh#nYdPPJCMr$=9Gh8iJk-HMzv6 zbzz{};qc3d$e21ey9XMa5}bH2{bMV(2}5+;_Dw#=XV#4qX{Yy7BS*TIL5}h#`5z*r ze)8f;XL)HzcWkMS)R%q@-=n5k04 znUf`4s(&V)C45%jnkcTP;`FrKQ}-vTm!9vU&!SKIwvPK=h&pw++t(ovx-B@}0{w@5 z$+h-B_={5&t1mVk1FZM#PYFe?o^oF|nXE<#M@qjJ0-DP8{=h*0u9Io306-srj54B5 z5zejIuhIC(QlnoAdPz|1*3Fw_LS&Uf8i@cNx?!;@buMd{y0Q${<}^(U=KIReba)q<87 z9G0&&wT)@HUE(@?GVo@o;6fR}Q#RU_Vb}70yOSk{dPDbNMsTF0`r!+kLbr9?5znQ%28*Ciiy(96T?Sgxz?0DNk zLmfSXav@H0Fo4A9d>j9VYQ$*8t^U4(b6UIPx>i%@X&N;8>C7nlSLwoAf!Dh7L8A4I z=W8<_eaG1vPyilo4ow!~dyz=!elNy{BCx6sS7u8CrIS|a0oYB?v$!5$2xFzW7MglY0OycoaBLzyC-4_RjHzF#=_{p4Vp*^ zkVcP0Zn6i;myaAE}K z-9pjg6qg3KA}Pg7u|k31ETA|=f_rg?BEhNPR!VV)0>O%Va488IC~~vTID7AP?>Tpj z_rv^@_Z>5H&d2`GZ-&aYsL;(ao_+XJN!I30B;*Ai!(o7Qt0|ZymGE{T(Q)KkemEGI! 
ze&r;Qk?*@cqHMlb7g@=&gM49#4wae21MT>j$*nwR>21!10YOupc#jhpZXfggf-5eGW~)H% z#AqFBT*D>m^Yx;78bFT3D7J8aue=rhPha4-jsEaYtI4}AdBV0Q1B|kWM?XYdJEpTd z-HMX>@FqM-#~pu~?{%Xna|`LL(+bv$$Gqge2n2-1`3%(Fug`k;P&mBaJUVDj{a;Jd z^HNx0PGSCGQCLY~>R38q{siWP=Y$ACpDMyBLMwt?sfXLV%xGqnu#6+yr)IZCe_}G{O6W*Gv#sRqsSS1elKp{S7Wty#0`;5c|fl{M#K)(K-!J6CnM3r=`rObK3@{ z&JB1F*xk6|9a{HId@Ai;RB%`$I`JH2*SeCyJkdC5vrG>?iY{RKHS~cSop=N_lY$_X zJrpq9Z#Xy@u5d^ys|MR-zg_n04?ODo zh)Y&mPBkC%O~rn(;fZ+v+cs6%LqQQnh!gZV5F({E2;UM>SB+h(jWiG`oOtV9>Ux=7 z%6%x*Ol%5tY%h-i1% z)U*^^w&7WJ8d$mj;XcUpo){jRcuujRIhb~c7LB7fQB#6@CUWbslz~5Ii8}_%;ZWrQ zTgi!u*mRb>GW+36h^zj3S`%1!PRI-`us@b0YEMo(_D5ZIzP^ipk3+tN75)>Cz-08((Jod->p+0kFbBh*&ioecf4%AP`Me*ii>h)}Xiw*jg zgh$B3)hvVN`m@tNMdP)e0`aWzNrh`;2Rnwyh`i|{wYp?|Qv4Q0pclOz3lV>qkZMAQ zqj3XGmxTGfWG)X*5U#j@`l1n^x1MC0ZM@HWm;)Mf_=}{0F8Y1f90#^%GiomX6j5t4 zU_o3mg44Y}KBKSzHI;8Bokm})&sN=X$k#a?qGFIDoJ9z+Yv>;GE2`j@AG%vbS4ng2 z4Ur6D?Hs#m>^}oW9__u+PZf(5p==l$657&v3I|~3poV>oQfuLDuq9eZH+wcmNB32{ z!jE?>i!W|Bi~?RHuMtVee}5Kh!wmUbJDA%fT~HZ2(WB!#o!F^ubZY~WZZ4=DKc(mD z&>?zTk2yau2a*7m$mP8^sC1ntWxgoVal*KC2*g=(4>h2?h$p&Aqv2YtwHIF?>i~-j z)QPxkl(P(;_m?+)Gi)@>Xo<-@J0sMbLHbSst+8gSlP;*FzYb6>PzT!Q_sVn3yveYfKCxAwI0qEd-x6=o&14p8!0ZvEcI1> z{#|^;V*={0pUo9=nmaHSsSfTHm1u4C6KSq+R}W>z|Lo)4OC(*01kxuR#Z@)sy~Wvq zJ+?PmefiI<`@cicLVV{?MXY*)mxRIuk^~O^Jw{y`$-`;$1dM`*+N_#|FKN;#`!qaa zx=IxfzNE4ihOjd?Qjt@uES2MzZ{W&>N(@Vd{V2$6qF}`*j>eJCx`-T{vAxRQdL~^e zgE@rL6{y_>RI)>NK9V(2%xemJFh5wg9u2;4eGU?)N{j-~v*3)vnwCe+LW7L=)!=6Cy`MiKx0b#=S^D*skx+QLjKVnxKW4GHaJT0B?*5ZDP zk-9J^^I_hK(AM$o4*fu9kp1oG20mitjxN;E%q8etHQ+*8c9y^Nyn7v4F!VuQ>iL?t z*PV{L00UFPKfKL@h z1Ao_bq!#XW*K-6Q+~;_21%j%he{t9%&rsp39|q?VYJf=vcqa9qvL$8AG_D|{Y-oGfMP^V5v zRp!&w?p|r;*TwqAt@34u^8O`!9Wo3E#9LtvsN1?&oclzTRRX&R`MXK?->}JNK|YI! zpOakN1wKXa^X*S8r^#!U`@{jUqTi{0mxdqU|GRcHibcOAPhicl0eXmB?i|8pA6vKneyQ##bXab zn18yZ!@6DKSKQ-&*tVtSjz0ZSYiOg4D4Mf9bnxJ@BH5(eu{`Av%yb83#gerc*?#@~ zOtAC3Oo+s@)aE5w`&4eJ!qp!1_k8A`z{KB~!@ry==t8%}CiXMF=KX(a(AdEZA#pj!lC>GfzxaNM|P#t2v(7gSm9x2IO@#OA+<^djcL?ZGX4mNmTfTzdV;KYdZMhH;5XwE0^Dsgcm^M2NRt!7WxeC zSP+IX3M95^noZrpHMQ8ySf;)=@ARi0;`RiO=kxqfJ&{DcTx( zv)$)+4K8~$JBgAlVL<{5P`h!W2tNvBuG zUDF$iBXFPoZS?wjMvG`Ism67Miy*RdAS+c|{Gz11aixO9VnjND>`>BtNqfopG=I%* z1@JIsA5?0L-I=C?j~!Gdf9+9yhG}r+rYOX7!^bX=nA z5YP57*BlPHaqnj}y=(YtS%KXRD56I^}qDh`If zS{qK!b;y(@(NX^o~QoHNLW{hGw2S=og5m4I?TqR|vIjD zBoQhes@W|bD&B~@M9UeQ0;UO??wBw@SG)xD`j9PgA*{qMzbB64ggPW~L~vPf!>K~3 zZ?)GY+F)=RIR=s=r7pb*%1no8CTSqWM5MsSGeV{px|HMz1(W8pSF+My$C*@vzdCF( zC8lN%aSvz6!7U1JS_muS`QT%ZaOLJ@yRtZVtfQ)Nu2y9FGw)pp=WT=qITF;5*n6Ad zE*~&EIH)$qcH&{q{W`wtvqY|NYarHqc56c5L{b8?*WUR1T>2egTdl2tkL_!Lyc=64C5aSPXE<`IY&$UoW*Od2Y&xT{c6+a0C4CzG`A#Um4* zqJ3B0Yim9%TcOZ;$yBiQb3kRWPUiCR??h&<_Ey|j%0e5sgk4%9tB{DUtY9aTJaw`Vbt@NQz5KZu~GMoCWMh~S$G_2QtbN6glZm$pY>RRKF zx`Paw=xQ%F0;=~xxJ9*Amgci7WGDZ|6|6=c+KbG)om=%bREukZEu&E>Ow-FJW#8zqinefVqVa z7ZEVGI>T?{Q)~6bB#>#kPAPS;^DF3&r(V3>2@0-b*mzDN^#L(|y%`DcV7YmE-+1${F&hYD+Gc*`}Kz2#4t{mH1%NAy)ARr?9Y&UbTt&0j&E*rKR_G zf6(e@+>IuRbRSJ&$YTUYuVh`{`~L5U?X%WMS+|I92{ce7*QbwT%<7Qyf{MwkNi`kT zw4f6ER!(ku+=ccs;-s+gFZYe?8s|s5PRl>-toil^3Nn$k2X7F{OZIo#?h)Iql7Afr z`Rj8^aitirc-2gOJNZo$!+qGYhG>WnzKY&mru&!(w;H8)rDShduwKTWWl%m6299+Y z$R31u%m*DoU)RcB#BsVCJ09UK4d)N(aCe9Lbc=Vy`F6KpoCJ zOBhkl{PgFk^3!NKcb-5K9`<4&Md3yaOm$j~qPyDeh?R?V6`;>wnx$lSk~J-B?bfnA z_DTO(7xQ>nNFx;RR`ecn>vJ#N8ObtKv#D_A!V%?L7{zkb;*E_GIY_d!bMAOhGiY_jB|5+< zI_3F}ri~*aVJN-w!-kD5=KESA_#i}``-8icU?h{NiGk?OJk*cI=lmrXlIVZIN;RA`7upuVA&w#t`AeGRmlP`0AwH{Vr3JS- z2u)q&W`CSA>)16%lq36m1=;YEFF+e5)6@9%ijbui4n+-f(M<_JGL6hl^=JU@hnizSON{4L;MURZ(V;{x#6z zwh{>Bx~;5wz_ZcluBaQf8^Z>2C-8a3Hls`Fg*C6|_M%C_VmsJ@mkE!q&6LAs7xsQV 
zavkUUZL9NLLiW=5A7#S8C*%)6d7`zb;Nqt+`9DYNWVS1PC);VrZzyLIg{oebhkPN& zm7fo<&bg5%JGrjr(S3ExR2Hz-o<;i3Ssq{(en_Idj*jiEerKQFC++8X&PP{%&%j@n}vA$U4SU!syF%-_< zkgOYhJ0Uhi-GZ13N`@K4SQj@F-_|oN=Nk((xG<$%a3%(Jar*K!D1Laaqu#9M^cgjOp3@_JGG*$s#;2`dH>_JkPm)x!0w zM?s~VS7)l_j}8b|v~}3Umf5#LM}5V}vJT248pWAT5^QzJ@#tvP^Wj7JjAbuP(zX|1sECNc;pP{& zT(4SKfn{fe?xmGr@xv&^;^%2f8RbE{%u+3DQRCFN^<38#xW_~9;y+gG_sd2`IjxR> zNsH)=%h$)s8G;s=19QmQwNB7lp5D8~(simN=VYb@XEueL`7`6@^D>2B&*0;Mz|w)M z;*kRRGF!Qk-A#Suar>`=wIeWH$H?E| z-5-?d;-{Y@^9oz<(FPAWh|;!JaG|dXG>6xqw-9tT?N+(%?2nwvP^UhP6UE_JCBeFA z^`EV?PmRbyyj_;*s%HQw6ehU;LFeVw?Dni1rPxOIdrt1P_bJs4h9$7NpupzL!ZlO} zX#3&QaV6D4&C{ZnPwY39kjN~g{afcMOsZzERt}2#XyPn{JRtN;{->8Ly~dx%Ie7{F$S?9y|wI6lUd6$VtA-EQ!BdjOJMP$H^l?zO{9aKf5JZ zUmz#fR90RK^9=Lxy~-3h#!=+F?0OBo=}G#7z!x)6SnPdc@%tCv0y<& zC~DF9g%^)|jI1E5hwHJvQ8F^Uo9eIcx|~>lxS!10k{(u4SED}9>Ix$*7ziD5U-E4IAR zuY&!uD1JM!J%#ArMrm3t=NsQZCi)n?v6~yfYbRK&=SIKBrk5e0KdtUlLNiv=?_uxr zi}2vKLufSzLA|oe&_>LAW9go$a^`E?*e3KT?eM2MA9?%&C|zgbb=)*ZI&1}$n#W(= z$`|RR4Bx=gQ9cvMk$O^A@5<7%qxFwBzS{Qkgv%4t#H*Wfdc|tr`4D& z<=5l2a5?738!#jvCm2a)a4)=;7ThvaBj;$cb3ta`npQXfYGN*82$Ekpoo=X3bk#C+ zQgz~~7n-7yHfhlIc79vvdg)E6`mpuFxpy%%Ar5mH&9TBpmoLOfE$tFCb`f=Iq4}vT zREst%_LF!)I~(nXNSm?e+J~(0?v~nF5w1~8w2G3H>hLe8Qv+XSTo$LNz^@-GUac#= zJ4BOAuJOs@oSz;0xZ9X+a(;mknnR)q(3LP0C&Ld)&mm*=Tkz9p?~ z214C5@FvmXZk=Y$fv|wUeBZR|WmvS&O-Ajex&a#Npnw&ER)V1u$L9h$>TJ~46CO!0 z%KRbL5H3k>6P4Om2rXUX?&t^-^(SEtSe=OHOLqjV8sd2Y?iI$bkzP52#rAP5cZe?V17fOR(0vlXPc0jKgESc;*jWsW)G*IDU z_CzpY$GS#zs@D)nnxoQm`xgwU#{|vw^to!N=rYS^w+qkY&=6E{T4Z7V{d{>U2e-@K z8cH!u0B(d%>hd@9OlSa`mDhaB#H6Qa6A3Oiw=b7mSvt^2F?pmi_g*r#npcr4DEILf zttYu^=dmn-8>XEIYKS2xAT^w1T4;6tsf-F0N!z5zTbZn|_tvC7&P>5uQ38SAjXD=~ zzMfSx*w?XmMo16oYf~kUtG+jS_Aoe*t2-AijIxLXV5W(wErE|Q_&l=5T4$iZhR#w=Xq+(nz7ifRsp#F;d zK0ixbC8;(tBWO5LNH*2f!Z%#xZr^@oUFx3WsOvF$192-2Av%8Rk<&))^@Ph0?c+0K ztm^M6VI|3f;Z??|@0tn9dlR+x@j8xX9%YW0?~6tr_ZnkljG~64h9!*&J^c+A6yyp^ z+B*=gR3w>u4u{fmOEy;FUYsojtRi~J-iUpsLkA64Y6jXB9aZYj@Y|EfWv+qVQ; zjgVmyrxT&b?x=~b@Ov6K8xVb&0^3stiFcFQPksb}Kd|K5`GWA|2||91lTt$Lrs=;~ z|FB`-!9MMIRDS9O^tvuHoPvt7;nRBos35s`BJoRa+w!aYf(Q&Z4o!a$5}nkD`=*uB)r$M&~q zB2C*tmfXZ=oRPoP7B6gU@%99tqQJQhX>*MBw|i^LPV7TmWnU(!Om1P_*!f!fgX8r@ z!-nxfXKB{X76Io2z|I(0fzUs2xb9E7=HN-E?kHvE!gVKPR;2*ST1W0;ri(N<>7>1~ za8O}WVo(AeZW`db9;IV{Cw%d_T>SdsDRktq^5oT*`wFjtwMjS>iCOh*C?h&#$gXO0 zgGXMPKJ&WXx;6SH$93#s3u;PqW>n>9nT*3SuLgto9nu~)&iBbKJkMfoxG(0GZw%U8 zf;r`SLT)ORQZ4(;5rt^VZ?gt7x~*a7r|GxNIOjuZNS)59Nzxm3l(P$|f)mGK5HP{w z{{CrzdBvLF6+p+!LDocKe^A;%w!mz#x4zz4*5siM+nq$@)8B+g0QT&W75A)&@C2-5 zEAN!lb$$w3@{AhYk}Dhq#-KY4oJX@!`t0U)5%sgr=(@71;2IZFhp(E+lwFK6CTun2 z`Fy#d-$@2`n4Ane4tl-s4zrl-$7zk~3~7m8Eqw9Ib@5-%Y@g2WsJFA|x2S;%{*8ug zJvVswzsf8b2|{tgaS|Z3pVH1b6uCZegP)TRdw0u#uJFGkd@VdIW-acjTo<_s(@RC) zbyJG3Nnlp=Wd1`nt4G+L`j@)y5IUtxbQ2Z))oHpSUB`KikdmO+vTJXbR3PY4w?wzV zTp%sRxjxiJ&BFVwu9h&%$!~hR%O+)=kfZxQCoDIU=H9E`!z`3aGaO8*XH-yBU)R^J zkHt_Njp4*K_sRR)>oaVYf@Hcvs+Kx!m8qeGLCY2^hdzT_E^A^pkYo_)g&{AFqX`|N0^NOHAR^5sfN^ zs-j-^DqC}mm4=E+KP6x0CyozR={V0K<{7MTGx`MQRaI5X2-%9K;$qlhKB2FLk^ch= z2nz7I%4?`t^*p&7KRa4kS;%>BCodN8dsOCRJehf$Bk-%eDM5ndLV7n;er|>VnL@-z z{beh(zNmCtxP-fBN|3H&t31-1E?xU{5e2Bl|9YB%$$eMd&rZz55Pt- zH>LWWB!INqQKB(`(OdJ+D*Cb^S!|{GjUo2vUo!5Fb3q_K2bT0Y&)somyhTR8_1ZS7 z#I{%Vvq34K6%^Bn#C`hUqI4Ro<`KBz>Q2u1Ju zvm!s%8jn$LrZ-X&VC}JU7h!5P7=e`r&Kp+v`o3p2=*Q*=ubN~FVhPi9T=Ugb<}lw8 zxU1XqAl`rLv3XlkBX_ngbCBm}1|lkn&aF^MO&}w>*UWrn@rR}i*IQCw_w^wv@6iW> zRi&?MpTvh*sA|v}rMxQ{rIobcO}7cIWE~U9vw}%d$FxS(DOSd9Qt4a2q!`bkC(*v0 zb#38|PYNd0q#T6WDm>U&vHttf)yCOVo?hGNdXeP*BY ze=BiSeTYR;+&SDhqfDR*8H|U6)6imzm-22yYHHPq@~qZsBN0kdXt>)6{Wq`useT3` 
zv2yFmkGv$Oy9yp-I(6_#=(go8rsqIayi=>;Plb7w?AweAp?cvudDG(fer(>`+2`aT;kj*_f2z$D`2B76ycl)SssPhg+egn$eLUGQpg;coD=F*aU?`&DilA4ExLx@74VuN) zZZMW4eZcd+3lq=RT6sXET;WB=qx(n*w}S;^ybdpm?e@0_5jJndA7x|fu$EeKQu!f! zV&_(@mRH~Mj_+74mi5gC0viJXr$2IrMIJp{FoOfuPIIxXSD6@3aIaa3_p|S#eekK| zMb${X(L4c8zuoroyXkk=Om-jwIMuVv3(9HeQw`P*yXFyEJ$gD!3iAK)C`v^>C)}nc z)TYko3DZ?8QV}i7;|-Q9X?sqg!$?}KWx}YMF3hu-oFqzJB59MWoDvf@l5R7k{JB5p zTrxR{^6|DHp@~@DX8dWqM`7)A%qVwUyb(eDX5_MsyPEu|Zikgh*sBR@xAYuYlOTSf zbZmvqGsYaA=@)q03fHU(vh4iG7tnC0)4|CiWxOP@eb#V~SAwLQ?kJ{5qUmuEHWq8% zEmo>vK5pU^JU*LZRNr1gebJE^h#p@VBGbZ+{(D-7DxBRobZSQpu1WPz6J?359fXF6 zXB*s01x(~TwPe5Mw%FoFv6Bf~_F#9k5nVDD;%p0-@qv$7*9PQy(gF^0e0}C+`sow5 zpkfz^{8Qy5dNsAGCK`(~Pu%(6ew7)YkwE6yG_)3Jy}IHlzx!Z$`Z)d&CgBp=m=eps z$mFuL_>H&F@13{h344-wB*#Uv z7}`0W?0UVe*~$p`Dq`ZOR>;#fjMphJFb}o+;xrOQJl+^cj5iEQR{DbKKaZ}UpF?WX z(*> zN$117`3%^$Ojcn5z?p6|&+$a!yCq{Yz7T+*{9j<8wV3<71c^Pe$qIPyQ{pYa?z9BA zeBjHQEG3Y|$j$n>@;(t|ro5Sa;(h7GNs5MadXj9i>qwgG;~{wMeff4)<35XrU{g2~ z+>Uq-AR$)XjgN0FbTe672z}W;1tZ;&%eLqt&+L$bi#oAacEog3SQAaLG2*$f?m0vH z0I}!$(V&F=FyyP1-po4T3k`b5-mOp{wSv8h-Ax`AB)vu|sAaO?T9G82r+}tGTdZ9t zo>$}p2U*T4H}S|i7Wcy+mA8oa3=;YWJ?KnMJC)u=7ST_$Og%)eRV4$5^HjDTNd-LUB}&cywqKz<57Nc?sF2?R(HI>>1rk#qh8+Po$^)mRuN z!khCjh*m^(+W#8<;qmleN9DiIO!R;}4P%6n`Z*(y--iud#wWSh{Xv4HW%azZx^Ift zpHoDgW`*za{E~bT?fJBy-ByivFm%+6Q8d}r?|EoU{b; zBWbWC5sbe7NpwbTJv*MQ?Q2Hr4IW%QeA=HW@`T!w`eck-(O#b3_suU>$7%_y4Pisy zZnpDu{PnIm$Yv9Z3$l7(i*|x1M6##0&vT0frLxzRLbZRWrHm@I&vRf9+N@$a_3A+! zk{HvujiRj_$8uU`3w(oh|0+SKfnh4N7RM#G<$3NI z^KEBak;9CO+}rGAJSabfr%QTCmrEP4ICl;h&pjAy+uW=pJCdf8pHkjL^ryWmNu5Aa zKUh=TEy11tM7A@xZ$mB~j{lS6u}iXKn@nDc)^|zIiAOwC#|tCbpA&EWxS2bbfRjqA zFXDUzxc@WRWgM1wK#jCNg;2Q$F>5#Z>_VaNIzm(MltQskObYYW`Dc{k;6NSi+;}|6 zUd`lHxrM4lT8FS&u$fp#f5?Xpz?I$|$4w|8Ef6g@TJ>aboj4g8oD>>FMH*C$)Z@4TPR!Z+~U!4n5x z(7nFY)})~W(RmJm21BM}()sQdHkAH&;w+(=~;=9CNWg>sj=U8zJ`{v)21r4!x3N)5EGudw3 zm1?m;UD8ZqQ*Jl|0TY7s8lto8#J4+wDxQ(*_CmwPvSdz zeFo}r@fkID>R$$c9%m{{rBeOdM2EhjDD2!(crh33diVX)?fl@gbBhY?`~!M?FhYeT zTb?A!7*I3eTI{OTDZRJnDsov~qoI)Mdp@>rWR3q^A)5@mw}fOgQ34qg6tJ@#S9Fr4|SvP69H0Sd;%SY0Nicz zOyuK%uo4)B6-F|O?G@*r)D1G5uRQv>+xg93 zOyS3F-mog#qk_qV6+KpKXwz{^Jd;P7_bjf>ekNM=JRr6PF>Z6XB(ll`-&Yaq1?7y; z?n-r%)m8+FT$UZD7@ZCS7?jl)g64_=(0HC`1*B9{#xWO{uxjR)A;+D?Hxp{Pjw2EC)z zjHu%jM^ehgTCA(Boq@}#7L*xJBi=$dnGK4tVjNL|ppuPla!S=7$Lk5pbC7Pc!vK@X zM)F%5g=;Q}t?b0G6!irJyG|Z5?vRID&3lUheOpILxU+9B4qUpKgH$x}W%#|6s4%LS zS)`*60XulN*%xb`+l!GT9rf%&;~HQ2yNF^K6EZmzlX@URFDh5h5C5i{x>tF?%uTuA zm&kEKbADV2>1T#T7HjnbDj$Fzn7Y3u?Fy^9_UxO)LHzrqa3{u5k*Upf=`>~LZPvhp z=&NDLQ+YHUOvinVd$nx+502;UC?d6OO4|_)i8*!LjuP0mGaLPo)g~jmTWIcQFVdw# zi&yU`R9-XudUf5?=||($tA|~*R>0)g=~p{TXKigDNHG#q8(w10mUYj?3y;v z?Hsy#ctoDHM!HIKil=dqvu1X>4a>dbnpPW@sFyrjQt|!05dzuj!fQr#hg=P!og;Hz z2FGNb)li;9L-~BZ=n&$Ir_n!N(fng||36^_f?SduTBp+?I3Eke=W}cI_{?^ zy1K%Gcn9dt^~a3y;cT52>BM69g`Gd9wPpMQ4L8BMkxImPcTP7A0JNcfRwAAj&2ndc z(*yVmjCkI9p0-cd@P;Nz1mKnvnn^#)xgSzgX9 zfdD>RdOc@O0YQk9jN6mi^nv5ENrNaNY(~=41lcKTNQgV>Bjq9S@aJOV)@_P6rRu)K zUcxO;NYzA!Dk=e+mK>l~pgb$dfuM19AvA+;ItMZjiUZhyQL4W83h$KR zZW2ELyWZdoJL4yz_={s?5;cN_B2QjnvMpP-9MEOa_ZSZj>}+_S z>!Tw{{QBA@f~XCpo4iJQI_#R++156lRn;)`FD*c+x-kl28WP{^vgE%%*^$g59#yhB z;bbYU?mO~69VjMnu0tJ^^%CppZ6yeOsmFPK>KT?4j&JFf=UU~}tmdR! 
zrebbOjJ!*q(VQw=2Z0l(@(H-PA|!Rsmm;{_b4u40{ajxL@wSTn3M+{FS1^QrODS7O z9ibY-dFXzYM|K@<6dj<3JZkq`f1k`cCKNM@UIbhjnv&8P9IqEm?*i@%(_^}G6zeNO z3x^2roZ|YFV=2ae>u`_^j zhFSKlj?{v{N)OC_dEn3#V)&!mkTGwEex8zV+t7RK_I<*z$YTriTG9a<(-OA!m8E?fuC3#zE;+7GU zjLTrF(IJKXR}V8a2@RC7rQd6PFvPxc8~XO@CLQNS+#M$~h+YV95S0+QAfAdCFl zn~RGWbBsfFZG%sPW+?D%BPAe}Rr+>Y>&dDsuLcLgAjnnnXh<5!b7*rta9ytPYWUR7 zku?dspmwLQ9cPVuYyOXoiQt2@`LT%Moo7qxJYDmH*^K^4as4d3Ywpidv zG^8s*`e%QNpoedb@N0I3q3|cSepkHgew1E1uEzfZ4c&2`(t%Z%8#iakwbA-xxxD6k*R-e_{oPA0t` zZ@LfLo(u#N%zQ_pFMr^#e4+jyFYkXItD#5uh0ZJz@A4Sr{2MDQ?Naf}L`Phy|L}a) z)5Od8{rMHxBV7H*a2j{O$a^lq?8YrcC722>Shq!tx4(~JB$)byr`g0YqyT{1Kt}C`t!VsY`nrFWX$kp@W)StGMk0n}xblUH4*)8@l%G!RW*MR z-m;uVW@9Zr*n$;KFG-HqTSGv_<@@z^15apEq+p#GZ_9Hm=X_Ig1Y5?YC&Hg?g_KF! zcPa3d8sLdoVmKyVTWmPwtG&~EyDnTKQa~DTd0smGSo5<}{D;j<>`dXJdxzb|DecKdE6a6duHl?&e^2@8T18LC2F1Pxe0gOU#JAv+ zmvNU2JIQJi;3KO{)-XFrlr@zpl-mGZ`wICPeO6wKF!Q6p&-ED?T|=W_jtcw_c!k6j zy2PJL{U_td!-?5ZTU-)$J8+vea0$^5D3rP4+k$hR^;!X9Z#qg7x%)y z(#679JcZYPF{TQMBaS87oLTsJC;^q>W{hD(I3V& zje5?^DPc{(<*XZ%%T{2|sL4@i2D;WUqkgz-f^-w|EB*c3y5%_H5BlXaCC=zi*iylO zGsL^Nfmi|?Wj15jJyP}eUI1nbd%M@eZowg8@_6CohVP3NMwEgx(%_gJitxu-@_VyW2Y+DGNmTWRb1_09Umz!vkeFqAX9iku zoSs4RYps{jbGOx`zKDDB6fR4;*g&tT7wxvg$^yKPdzp57R&B}HA7HDNHbZt{pIFCs z!ss0braq3Z2QT-@MAiXdeiom^4&rW8-j68W3qQ3nct&~}30<@NIn|SN*4j1HdANTsp#K%V?K`>@<;r19lu(D|RAzUD19@diq;WEy#$6$lclb zNJnVSDa{ss*M7P9GZ3{A{X3jY91N@k1@)T8a079=1OBArjuVf0oFP$50R=`?8j zsdszl7e$A6%)HPRd`pY`(fcDok97{vk|K`|J~q}1#6N>J&;BF}%K0vyFdvi84sE~B zxx}zvm0ILGhWa$G1^w|mp)sAEBE36x6uT|xi30uvc9vq&9pqg_K0HB_R|>BZt!9Q@ z-?U!!!_?-=IFlQSnAQBlACWwDd6)6=KUvuSc5f67(T-0RjUX=Ve_4u8Dp)*4Il_v_ zg!+#{vwE{)PlmE~d75%i?8G1Z(|3YwWeEehEV8}8qVchmJ;for@uIn?zWyg^!*rxv z+>%4sqB+i+qd2&8nmC2GAYQ@$%7OjL57`O+5N-c&D1> z7a8hQ%9LV+-S`n5B6ok?ECLGW4_*)T#}Jo@5ErzR3u!^bC^tTeTEA9%1L&e4^8@6r z^C)by$@?{nvc-vhB+gDD6@5X)HKfRVD&kQG(PiHBSo^Hcoh0ViO>a#<)nz;gi;^{c zoKB$z*y{@GHEd<;1itHnwpr$Z{FlXmPb_9A_T7v-UQj)>B+8~R!|CUHBTjqCsu!eB z9lXOgC|NjDEEefSO#&0V+}yb3h@&0C5lvoxQI##EC@Pm;E2k-U6!Xc4;40L|PgFY3Y!Z2I)o#36+*oB&4MU1*BU*Iz&K1L6Hy;=~lWy zI;24w{^heu&vr5@lOLx4}O6iWR-^hVqy~ zkZan?J?ClPt!)0}&11vVMsHfmB2qHDFQM5=4^|g8e3d_U9W(P=MQwFes(eIMTa9`N zZt&dl1uc@v2*xq=hQ{c#ls%-&NkViu?{s(^c&7Ld^Q~@*Mdy48JyfO2lsS6CO@T#n zQX9u7U%_BCblHmX2034!9LmgNO!LnZu{l<|t&+DqStp+A4qdc56H4C0g}Ts%a!>B* zhn={qH(#RaOkO6VmE7>lSXQ;(Xmhi4l)Kb}xBPe(gpeQf9%LVkulw4qyz;#29dbLcgvP>geL0erG%0*p2^m< zd*hSKh_#qATo&N$nc}R!7KK$Ob-R6#Jn*4bkYVpxEpo${^U*U|@AzdzFQHzxgLEyo!?JVZVIMpH$%M9K)3^x(lrdkiQEqGBkOj zi{qo+jVIC3N=}1hA5yg{9%S%#f~g<$a@8;iRt0Z!uG5X0)5=;I(b1 z2|5-t54K|BaeQKl9c=QfRlu9uZlaO&28m1rBM%1cxYfyT`&YQz) zvA%EKNo+_|FmjMoL@>y9M<@LdAEsto_>3wxgt72SF^scmx{~ff@U}9$?VA6&esZl> znGzz{U8#v`q2;wrkv!!D4)i@~N7wLDgy_O@Or)D*4DO@%ZwYfrnXGqc7v=e@fmSzD zGoY@g4Fn2!>hua*kDNiRC12sv@)34v zX(Uw=w!bq6B&mQzY>zH`597H$E`;o7$r(G4!yOx8Jo3$9T#BXp=c=wo9l5RY{+e+SAm+ZdL?DV?$)G9i<2B|-dyfy;_VD;fYTVxtgXum3RSZ=dg*0< ze4#9C&KmoOe5=CN?48d#I#oxUx4iB-cCTxP{e#m%kG8rOmW0A`=6Ssq0Swje2Qk# z9m>dHz&GO2QMKA9T3xZDhj!OvDkzj$2I(Tm=HomhvQi?wSwPei*UQKHV&$+@g( z=gKulrkpN4cm@q;Br|d;s(4Q!zq`9eY@g#ZqC>9x`rk<_Q#y{h9*?uT7*z{4X4q_YvfHFQ!WiVKgga?xy*1{`{p@!$rg~d+1h+Bx*qnZA+orcFFvdP z%^~ghiLlwRZAYez?%2(OnOy9%q>GwK{ryhcioR?TZ>^Yp z`hajJiRo3z1=9`VhDW~Bc&|{FM|)?uf8C0|)tgZ9T3T574v z7URO`KR@!{@^P2ZIPV*=YI91|DLT8;cMxwU3O>w2-bt~#`}y#4otu7>40@OZAY78; zK@)WD(pk1Y(yo68dcTt0zSQt>lB&^I|7BD8;B!g|^9-?Oubq(Y#j6>lnJOj!%QiX1 z5RcK2(eGtUQsa&O$>#m}`~T&&_*wKO)xyV|^osvNPygEl{FiTi0J{^@rMvyl&+||E z=Z~K-QNxBJ!p)NY7ijbwBmJ*0#KVES_mOtsI8Ai_+l>GBZ_m@gdLVs{X8J#l9}Pb| zO+WnE%758C|7G%|Qy(KZU-3A6`v2qjC6Ks#F!8@u{*U|Pzpp0I3a|d%TK=)@fB(uG2{CM~lrz<)o4!Zvm8)#sA 
zTNq{U02-`Nh+-&{3A$JYo#{J{KyX)JRO^Shiq;yx+ zwWWxSivlnRAa`CK>V-rryRCs9`)qr{93@mNNZZ~J_=fzUk>t%QgiV_o`&2@%*!adZYhTuPqmfB7QzVLh6 zPmVN~;#K7a1I0fN_sQ8$mXC0tV3AB5?{#`Hm=erv@%HORO*dXRYCb+XsOWvI?bhjs zL$$)795=DpUr2?)it`T};GHWs0csV8O4n@+(?qVz#xi9ajH)q?gfS8yR8p>CkII7# z*JPtDUi4_Id+cyAoJL9{qtZ}(LFr#jk}s`l#lv((+JN3o_Uc2s&Vd4(@ha=KI4;tV zt){`QmrA1)&LiJQt&dimtL+P`_ATyn=V=5=PE?;96+Y9g``PI7TZ)A*@JWOm?nOq7smW2y%!_FgHDV0cIAc{a zeV7UrE{E`gY5m?}PtFQdINvy(*HZ(KYC&N5es_W-;-F%soV9O6y_emix6?zjUoE;E z1r|eaC(P@R)+n<-RpOlUn9E$5*U7Mw-EB{Um!G$Vn7r5(%oFG(N&oM@)!pZ7}9bt|gnmgRR zRiG5h9vZVg+mV!T>nPLCE<@wg)IyryuxPTVm0gK2_ePS?3e*d!jA%DvUOT?um8p<^ zkLjJ*!S~|2O7_!-`(?{=8NAP9UM!8Em6J6~)=PMoDn<9AXgx1LDA0YP_))#!Tw)Y% zP3X-iMir;;Teb@?W6NI`Rt;6cT>_z+3`@PQ=X`5#g(WqmCvMTJX-xm&@An9@x&2&W zUS0_w?@mz+$6suDOMel?P3(BrI89pd1|IsyoiR7-ol$4!!pa|a@NP!jJKc8KWyq={ zMM#-~Rpn!j{d&tKcar%BO$huq2FZsf(g*rbMP;SUPyhTcE1LZR8PS0HopA-UlK5b8 zG0$(hoMDk^Bj=72b@5)u(EA6%CGV}^T**&E6UMoM>f`;VCl#wzWP8c-29e^Kn5S>Z z#N=ImK1Peq$Wm-(^(sn~(Z(B{57uLf4}_QBMLigLm(x$0#AWPfe7*!~1-cF_0(-Z7 zLVLG`oW9Rr+nqS~VIeC4g=(oP7=qR1mKfGnCb^ctzb*UpjmO6=o(IdZ6_3T++WCgr zCNgyJnnKSQXo^A8?g04K>)76ry`+WV z-_2C-i!*+T3nS0)Xh*&e7LP*9?F9C?WKAUR7kL>?MEoeuvqL3TOTf8PRa{6?5uAyx zlM8~!f1SW*_2NNy)^#RWJ{r1>Q%B|f5k0yxjndD7f8?Q*iNqkv&T}1WTX@(agq2k% zVW<}5{CqG)buwSIJBi6}`032fd`1|9?-U%SD%-q^8xWX3O>WLu#GiPOf-p0@5BlgZ zzE*i0)V=?;3sbq#*S0Mfd~Ho>t4UgI;DJ)M60bbW_~#DBV_M3$nd?khy&-nw&}hg! ze@i0lERlf+a>$mCQJKwX`FpqN2ss1~F$Z>p0Aph^h{Ls03TIl)6uqa=V0?R_yHO`3 zf{0WwRP@IHiw>o!?kTtolzB{)J=dNPi`!}83ddb@vWh`>YyNAk74+i|NZ*hVeEq%o zpI}f!6bZV5J!I;(bRP-umMyz@X$_@D2>bz^+w$@=FV_1N)RGN90wqt3}A|>zx=K z+LOUNhoLb<>K>XaIIkX@p0i)@XM^UGzzSS;sL+Bt>{>*0543NQhcFL1(Br+6es3Xb z15I8|&HcxBNyqN%{Pg&fqXULdF5vIffL;Ap53`5Iq$4Hz+-&xBd@3S)Q31BzXYBT< zGZYYPFAwt*nu7s2uQQogBh9SK+{G+?$v0dLBiXi#$?>*gl0G>K2JO$orRsqmy?Am% zQ{k1yUS8Wp-n(^Z{Emw~nQV!`R4DB1jcE{tGA22wtE-Vx8RjwmVfg+w@~Vz#hM6Dk z)(2){@o+p{8mn}R6ywi(r_HAQZ2cjPN!=9#=I+3jo6HV`Y6}-zIjEBN>aJ@i!|56H zSg$7zphny-kKdoe{*N^i^)4TjW4<0N@dTv|<a1!$vD~=zBV~6yi>y<8pyQsONe- zH@yNO>-H#Io|JaA3G>N&Ks^kfc~Tz$O!~kkq%(I7Z1T!xrkiavA31iqNl0ym=BY$+*-JfNbOjy@?(_23K~c z!f>wO*aQG%P+5U7*k#Cu*im^MzSVhmFG8otB%n4W|6cPFLxSbxl?;zB5~wakB|{dh zL6kzp(mSk$A1C*`AkmXH$5FWzX{^I-zdZcG*`M6*nRBb%b`PlmYP|8@3b2PG);!7J z{&)&Sl=XQYe*5UKpk9d8<5A(XOn%Ni?q-1id+wDa$G+PEyV3DZ$AHTo>3mt2p|VO5 z%DV8@gG1-qtEt&eRHdCL^J*#C*IN09`sIJ;XR?qLkGvQ$jMEM{Gu47UvGV3SL&ycY z+7ku4mu$Y`SoGz^51S=ejUey`^A@;5^TIfrs=cg?LPP6UNltc%DjR2|_}le^tGi-1 zQfSEJ#6rR}@~UNZ_hYvjF=i0A($lGJ-NzRG{)Vo4;<`o!i{rRJ+$+hE)i<@+1ycUl z)`z?6=(d(#ZiA+=W#x{oPV>=CuYcLsUoIZbXkyUNk3OvJFZnzTn>Nm^?hWtmcaN}F ze8(p<;M&5c)5oGv)JVe+jU|YYW%n$bXR=dZ>=#hm54Z0SWaMH*_cpof1ek7lHN22Ng9<8i~aspp|x?Y7-n)!2c;r;@>ipl9Hm4Y}y zdoy__hOoh6bL~Xi22!H;7QHQT*bBUXi0*L*+m@4t4DAIZ8{ww-v z_k-mRX{9iX>mn-7Z|{9g4}_>jQ`P49FsnRk1e&ee*9Q`@b_HxF%e;RHz^Cx;-o^~k zT%!GrysbV1ucN@~!x;|Jr8s1$86}T&>DO~MZF_};aCLB1!b6y52G6PIY`4`*ZqIs2 zuX)LLCW|UXj;o)HuM_9q@*wGAG7&QyEqBx(zrWeekJEee&DcDdu5CTh*opf-(eWJk z0uX6R3#b@w?uBN(P* z9C$PNVcb*cnGhHp0%Ha;F@2{&(||c!_Vli1M^E3az#OUd*jKmg1&^=VRd#5~MLcjV zs|)J#2Wyjc7h)fx`Tac6m%=<@@9kz~CEw zw$q=jmJxWn*sT7dX3RD{7!$>{%FPa8(tHypgK)NZz zUMELVudB{?U+EQno%LpD1Rfyzo$K}Tt=m>}br;EJ_m<9xi*T(kPx}= z?Tf?!fBsj@QGy zN_jd2Xt{R!-xx{R@V~bDFv?ewon8Jcw|D@FNc9vpSulAQUG0Gbx+B^96#}nLG?p!D z9Gnu@kRiU{Wz#7Etm5Y!29y|qBO2B6OJR@UftjiK4zcQtN{;W z19h27lc2lTqSu8>^7i)TQjUIkpHr-U7LSPs$vwC+9aW;g*u0aj5Z1Sk*J|i$bC^q9 zX^;S&%+{#M>sV@w90w*%BX&kAit|CtZ}jp`)-G1@BG2>)R8hQ=ym( zHtN2-C;j?v?H5QmKN(5CSSF{|>EGbGNB+TS`OeNOv7=@{wv2B{}ND~Fq%&LoXT z8?7AH070>zh-rPn6f#MH^$CC;gMvHoI6tba1GtMINC~L`MM@=y)h^QF%UFFoME*_3 
z>b<6XPo_fD2v}mtAo1_uievXPQ1qf>LnRV~3+Se=5&`%H7Ez8f5^Lwp&Yn*c6K%y6Y{hg{4rYKW0@j zXqNV7L8L=yDesGMSJ3_!TEZ`aD$PM}lgE>shVCo8KC_!*>elXmu?N}{;wb?ybEq$9 zKwM;#XwP65EN{@U8uvY5Kdy;Nw*S=2{^H(al+H}iVs#x_Cs>u(`ct}QkGJ+hxN-!i zPR2Uh?d(z+_*>OGnS!P*(R5$!XWIg>6;0+9 zcLW9>K#+Rp&6hp6C*?wY4utgez%FQew=scn9@iwdYobYe6mp3-_g3jcBc_&~oSRZc zA+`E8;h(;@%3+IPF*hqG08+d4{0h|aAh7LX-?t&K%cvyou zjlesEVekAekX-~BjMrk$D@ofXW0oD@`20Qt6UoahFk=bUp2>ZA=sm0T$MuFlKsJ)_u z>(A9Sm4AF9_@nwkb0ocKTn{>rw*IO>Pdd{2jh=*(S@p)5n;7q$|v_G zp5XXxIGr5rG~&k;=n-Yukh9)A!E~r6(tg_FYrwYxkBQw}y3xW~jHi}k`xR)HX;Z{$fwN z27u(r@X<1KFrE34VgpQd)0$@|&>$L#u8TQoyP|GCwoo?_#%`H0E+Uks4y3UZ*} z$<4eXeEu*poFue{Zt>#Do0i)k6*X=Eum?^yhke5bw34y$mGo zliXa6h&280EL!5f42w5DE>mm1O)T4O{98MnWg+}y0@K=PVStt7teMLVYkY1^4&uiw zYx9(W3gPS9kQd``>A%C#Inp2F)%k>?H`kFgN~Eoew}m|#)lhxBUHIjB9$S>C$NrY; zE)!0P$P&zTPX--yYr`yWWv^iSak1WV?vexBpdP7pQ#ehUNrl70q+vueujRlONFZ<$ z20OJ~EVCUAU`(D56_|YXuI6^h5ztB4z5&DfQg}0eB_>pj(FvyjVC)!Wx0N2nfwN)P z*x53MJzNo9J)U7bTD}6W?YC&Aexj)!RCn0Vx>+ZqTQm?u#YWcw#S-t3vU))$30V)g zKOgL+r#mD0?P2*B}*L?{qTQ}ausy#pkn34)*-|x&aJT=DjQQMmGy9{XRG!XU)L2w9NPHy%9bpd zRrFfxw&Z@KZZq>Aso+B@??*hrC&X2B+F*w#A5C>Pv^BfA+Gf|q4a2amJ#F^ss^v_&nB#qlEqrx{h<&q&oMAQFD zh03{w`BkYDwqPCC0#!e7CESEpb62FiF}`z|caZM@(q~l2*L?jPaa1D@yGFkQty}=I z_%vz=9GsdsV&q0jr0+-|=%CctUpZ3hE8cex$8)}4!#W?@9TqR@Rsz^X9iQqNM%iNS z>&Qd(QNy;rO1B-2u&Bd@Sfj&LXOam7yR%AE?)m*~W8~p3$+?VbNx_Xbfcoidb!;jf z#}tQ4mst|3Y}FZ*9u>p3pnoarB^2dts*XJRPues(_!ICWflkY*Gixbcgz7=O2ioS;`>I8gaYHR|( zjegTAaH-A!4X*EIEZwCH!%@Km#*vv&q4{^+WHz)R>Qcp_%}LcMvornj+HeO!^2uu@ z5*;E4YL9sQz{JnA7DUk>##~vMuNsuOgOgM61Xt=zA?&2Q(dtl(;VvUWDOj6@I{f(33BLsi2%&8dkKy?q8tqKIzFx;`JaDWof2nN0FS`BIiR0ljg z>~@j9zETZ{av_~A`7H|$L7%N|(8Vi~H7vECxT4Wr{+cZfQl$Fb>YLaeBrMc7#26Gw zI~q}J_r0{5iS-|)>6wY3iltJ2hL|{ldknH~k5 zXp+#JfHo_GwlsY;{M~rbF}V1)G#(1bgf6GVviL=0j2(2DiveaBLzxN${9eb0nT9p= ztx=S-E}K(fVt#Z8+MT94RqkQV!o=o;M~{|8jU9@%vjmFlEw4_+f@2kLi4uY(6JFu_(o~NnMre}u@_z27c0T7$2-EfdC01E2-12{Ys zy4`|W!!^`Ap$|q%ZJBOyjfbh;{W}}Mgyww?N90Sro(J1aA0Wb{vZxh0gCO=TI^ZPa zv=%di6sLknX>*_a6-8!J{C%-7Fi9P6eRver?AKm!n+9Tf`nGLOSh((*-Z}qix4=>7_)pR?UR}$a6naF$7L4?UQ>r@k=Efl=(8NoE~gUH!Ext zII|9A%5zzBrr#;Cyv<}nZu}k*BolHZq0V5GLuQ=bWsmXU5l)Bbj_!%ojec9K?Oj(! 
z3z^zd^iWFCAp{Gne-uHf(*n5Zs`Zv4I&r;7OE&k-*|^6xJ=qaDRqih8U+9&dn_Ts5 z%en$N7Y>Cg>%Z)NF2h+u+q{W8qy;1=&gsV~>LmamUGKGmDu} z#x`~@=`Mr$kMt?Z9TS7Fo=z-y6%g?A=|)8_#MQx(237oO7(Glf5BL^Lx(mX|UfCpG z-}l8PtEUarN$ZXg`Q@O0uZH(7vR2-$Z^a_68i!*Ef99-bc_@o2O#C16CpZxrhpWI@#Y3 z#zg(g=s%jYCdN0-jtx?Ggg6Y{N-ZJx#VtIJd)fRB^Qt?*sf#Yq20(1ldp;%soK~)m z*U|RPfR1LdI21`cqY0IdBp$|qg$(4@pG|5lx-*OZbLviLIo)^HzBF!%m_{Hc&nzO6 z){PkUSMI!fcI9$`etMAu(~M9o0dHUErKrY&XK`n5*FD)QbJvUhc1Jhf_fDMmC~eVT z6<=I+x#N@yu^eLNP`bbTsj>NV!l+t7E+Kt=1iPU6$1e&8C~a0nLGT zj)%rwW|s=@mI<&9TNO1k+@|jo&jNO>z-?#cHt9kQ%}V6(ZBr)yPkDX>4mnlnn0}aJ zg<$?A3U4VT1D*T4@NPES-?ku|5;^d%li#B};u&1MYRdZ}h(?$1HSPzS@i9aRedJ~b zliHwP-Kyf-U}BZ73{L(118!Hf`M{`A6l9cy8e0EcTbQhA*xJI{q&gC zLTS_?puSg0!w>sq;+$Q2!;qTshQoJz7cHi#B3%c?vT1(3M9A0+ZgH1U%2B$Q2(2r0 zrY|z&Vn<#67tBO0L4C}KAHY;!E%0N&STE(8M^*dw4LET*K$u5kMqztz!I$ zeVTX%kGGkOsN<0H7{lpsZ=-HJAneXJzd~D2&=n-aC3cor7<3>Z;b|A>E|fbhUuqQ( zE9`)`#py4x()j>jP>4h&fvAr`wrT9GL%h&V8SN~%4RQV z&*M-k#ArbA{L`!Tbx+ zh@=A$_tpb%jLyG~x1DMzL&$KZ-wMI?Qp?k91fOcqf9BJfnX6pU%V|ReKFj16;A3eo z9$>PUeuAi;C7Mmi5z(Y{1st;q$(&XBLRQ*CHP*wKB77zJhtDb149Wo3vMMoma1C6r ze%W}Y>V9|D?N)Wql{GO^0*tK4v2$C>aGV4rU*7w-% z&Ait}5DXb83gGho>H%2R?V)C9wR5XMg(xtv1hNE;ly#=Uq@=uO$;lr}$5?%Sik{}KRf2jh z4y?LJbVz>5P>blv;pSMbOPc@LFz_SD?uj4RoSTZ9DD!?0JCuyB^CMXx+J8`C zsPFhor#K^?n!9CWNEFj=d9q_pmh6zg`(SpPCk0@gbj(*OS0H*nhLLmjYzpt=nkU-& zjlVYRS3@yZ3+%JRXLA_e1?tfGI0l=H*Mg8$SWO2_#v?95;TB;SlXd*`eAz#V>~H9K zdKG+j-cQ7i7F1S1U+|z0eM5rMxN72JGNff}6tK1fd2uv|W`CXfm^%-o(dwy3Y&AgdDPK*Er^H2rbZo zMO!*wNxOPGf}>PY-{jMYOBL|wX^lT&ZS9wQ$mS||LUyCc)t7)l(WY$XVk65_Wt6CX zrrOfXr$wbkXQVuKDOhVDFT(IVf^wbVM&f+fik^8b522u382CbA!l;;w+WB{gHdN;D zc(W0S$1a(#JioRsA9AiD70QGf>*CuYy&j9|9F}>wI$U9!DPL@O9EE~PkP{F0yV4sW zF5RGSoR>0JXPwmowN5sb`+Zl)iZOh~-}-!m>p_n6Bl^{J`maJ)4{l4LP9{oqkyo@? z=BKXoC#iEk;(G8eEx=_}A*4J{VqLro#`&ef4Cn1zE@Lw$Kf3#6SVt|a_Xoy;VvbY} zhP|Ng7d4X#LnM1O$mXikynf%%6j_zopz$8#cbhC#IwaEv+5X-HukRs`?ygUbf*_6j zP8k~s`=}`E_BsTZMyViDSWR_qIRNPxsExGAKR~3mv+8vsOt|%w;D-E$F*RwTh*?Yt znRO`*vu+ky@D_xfLO(Tq5ImP5iU%DRy6A7d8|chH$x0G(w5vhy)j(qIMwn^WkH9I0 zr?^Oa;Ax`OR>2Nsys8$GHU+lLP{CZJxc&|VAyVH$fmIGq5&7F5rV8O_vv`mU*ubqk z)<8tb=c4Reob2~kBjP)RGI12Rwj9b=+pokf>Z--ZZ0)Afmg4I|dat{sWsU~pJA%?t zkS3=s%|!D~klYE;DX}D#uK{n0CGFc>ugcLfq2lM7#uD+%w*R)C8W`R+Pf-24L?<4f z8v$B?Y$I&RwnK34sSh`5M7yqpy!cZ&2+(7=VhPly-|52wAdAV;)i?xkfx>}t1yWwu z6F5YJ8K^xJt`hGB{#GDEJAPOPxwF<)duAUXzvkbv0d(rj$D%RFzm0gKQqBWc{3=`* zD}U(SeFE5xw!kXXb6={H4?{PsHnt<2DoH&!TJ@sZvV-6*g-qDVvPrDPc(yH`-6AGd z^2U#X+rQl2>?D4>nYQc0n7O|+hPhY0IG8&k&?4ukq*1okf}{*$*;zzE@Q zgl&|92b45e#S106H1linvv9L9JV+Li&Ohvt!`2yk-amLTXUW$<(|H+njB8i}HaRq|pbA@y?k39i&nix1bOQd#_j9V42iJ#<2qj zqNpaK!*r9qqifJBO2xW7X7tvz1K1*)a+?KEnl%zPA0T+p$=Yqtp<|o>u3dZQ#em0 z^uE+o!o%u29@+xp&@Y`Q_m2;Sk3$&<)#h|7@7||r+0dKeBgUOC2I#ruusjZQEP4(IV z1=T^*X;}PgyK7uKyEQ6NE9j&wv^O2K*+R}bDiboeNBtc#ckN0}oOX~@LmVNt2Z;yn zj)ZC`&N;JAz&kE-R?&6%66=$+P*QQffTVt+)HrSr-oWMyr{`6fS-qt#tiMEjs#r*l zK{W5ex#2h%G`5&Q=FLj}J0h!<=1;juVY%wsqb4U<(#Bg`1#qK+%eX-RD+adNEa%-I zyXy&j+YGuHC0ZdER9h8CKZYxb_z}q@zCn|yI^*D_`7gP(YN@RN zMWsp56Qn>Sx|s~gF_!>`?~{Nw9JF<9eG%b&8>{UKM1T_7XBNJsNlBLIecE}~eZ&4C zn)OTBI7k@Q)L49ae3n_822ux9*LCN~-}^KJO|f|WBe!kKC-Q=t{3lXAmqm8fLrb>kU@~IJ`hcb}mW|J4a51#2~mM#G2*UuUmFdowD9|xQJO^ioX@?EwTn_>ooBdl}(! 
z%)%y3xI6GPyHDmELcFP|65U~x%u~d6^wSyvYund6ZdBK*a#*~k@`?lRwn$c-Zdh#A zF6ps*Q$@5F;pFwJqr!9${&gNz4D@Faf%Xk(gXK(5>LIXY*Wvm*+50P*hB$z#$UjAS z@+`)D7ulNnxhTzh4<-_^_dOdjeC=L#rD-(i-e3=gLoH3Zcs-7jhMN^++m){#q_%&D zq<ziS}Kw*+jps_%rci@P?YzlceX#8)%06Cg&@?lw z*D@n%eT(IoXr(lWvh-89XYo3JBYki2bzwOuOWSA)t%gfNue5RED{t3h>%7d;q4({* z6Gvy(i(746Uj`)EIL`*{6$V!>c@v*V+7>Q6^7oTE8Uv-Cc@HJ`YrL_y-@qfg>Xa{o zv^cZ|lVI#=?fjK)Z8>Gn;=D)EUrnH#+33g=+B{?&faYfMA}#X>B1IY#5VLh?4V|s9 z%aoDr%|1T@0z*Gu-OA11b{1^j9C5}k384ICPOoG4)umCoP7#~%+wp#`Cel z+XKd77p!7yOE`DNy{eseYt73;7ppS^Yl-)Bu(szn9acsJHG6#ay0z2Bzn`d1f~J${ z*`efCNW7tu-86zZp9$a?7vwHuxc=hNYfWNW#>Lq*J5Q$uG2XN`sC0nPMF_r8HGsP%~jCuO5N7FpSTK z=iGed_gP(53;Ub=9Jtvj*gQ81CWe8BFBf)9mXo5+wqx-Ciiz+)+yz_k+0-z1QzxVaB_4CSy z?r8WLBHeSNcZ04JMP_{Z?cRRB@MXk(|ANvUwJa|ynZ-HpD%I5hl zxo7R{zC{&=`s|W)TR~3M8Zx2!7jLOPInOmPRssU9vBW+^U?y4tiG@lT{?Z!xFNLUv zJ7*ovv=P5mZ2%_}rxcN^M-KiK6j0*7KQ~4?RnY_Kb6*CegY6M|-L8wpgGI&%1HjF+ zAcV#)qzhjKSHYBhpC+Y#Q;x=(U1w`Kb@0uf%C#ZD_A_cNC#R_nrxrf5-!O<0D4sM~ z#vbb1RBnHaxjyD~;$h?X$#5+B(PTiX_T+IjqTT^=!;{w@)uf_7Dc7a2!je)d(e3D~ z@$Mz%I)1ktTO5ytpilQzK=^gur$Qo9V>!LgUFbgOmMOGH#P+!*D`x1{%LC(XlS8f9uQAYHB_eSLXp4 z{*RiooF9XD&bn`Xm2+z_%gc`_ zEVzy^4Is4#>R(kEc4M>why=e@bmO*tp_O-}V z$zMX(RG?(4{vOT~-r#i-&M5v8vIpZ;l}mt@nm9S3O2SeQb-VNN)BSI$E0_!6VirA2 zHmx&=(lwA9b@4pTw5mz?9j37Oe*%Lrhmgr|*lK+mGKsp*!bq#gOY3d7Qft!YF8vss zOFsp)>{n1>FPp}pnaj7TK!4hMa0a1_50FE&t;wi* z9cdh6PL{%@-`soeElbR8ms76eCo|7P?rrztgjqKyH#GC^lB%L;Y}AE4T?fJ@^R!S< z6g9yibX>9VI!G{f7lU*%Kd8}czMY%3y|qhQTKYuHrp^ct^;(ySb=Y~@0cNER7m?d< zGIJM|4qZjIY}r^Y{~@Q3k)d`ed)v|?le};@t|7uY=DkQ%6G;+L2c*f-R+~r!!p^w! z))tM-%yYI(HJJ_%=(ZU9xs=tlGhkaSWk%g-VF>*EJ9tG#i`JnsoBxt#y}j5w7Tf1; zb)AZV%)g6oKE0|B+i!imSTb?k=-wrBM}4nEN!?;!OqaKsNL+qXtoOBW+j@y!+XUmK zc|5DN==OCYDcy_dXp(E@_+zlYy~WGRs+JUrIdgVJ-=4n?MWVn7O@f}t#t zz>N|ht3#ByV*pZ&;EF&5JeV-dgms;E=suwQnMN{&efI?Xd zr=!kX1d=qg6Qxj1rVo=7+ z3a5X128yS1C{981W3(OtBnlx1w3s~*lRH4^+=aaZQQ0uc0b$c%!6e#Fw_sJkwT_`LgXbvOgX*h!X(sd{? z3xeK)6;S)imh((P#-U5gK%S@3+S3Y8q0i4 ztpGxl!oW?gtjvB+DEdWS>JG$?6`*_b^`Wq>!$TB6(wwNffu^H@M}6y6s}s+IzPxoe z&DIB4^hb`v#pWu`5GZT57Uin`p6k;WAUrDSRYQ=!Ir6psI4b4+*R7E=IhsQf?w&^d z;zCddtt2Pc`DZpG_sUtVn%XlQ+S-NsqKMk2ZauqZdS)3*P?IsItfxXMVg({~MY61D z9JNnQ-v_uNrlA7~t@asZ!0$2H7XDFac)EGKy9k*hogQOIU{$+p^-XKJK#5!tZ`Jw)zj)`WK)ACRmc)@W! 
zr~tS>^^&ItW-8tdDg*S&Iq&-tf1^>R1fSdJR5g3cXbp7CCBqhu1=-mTA>HVx9?zLL z0@XZ`hYo+5pVQNL2WfDg>COl1r06opfzgjHF$hCW!K6_s?bM<`8GvIkx^7f0cf=6L zXXXs9xdBa--z=};^v9;(K;|1TJl8p}0`b}ptR8)@BS-!ReQc|{5Y1O^q>&)1y3O01 z5gG(pcsT)4k|Ax8eGdt5xlbwZ&+y%F^3aS(`gw7w^y)fKd+2dFcu90 z5md)Z-22idm}bW3Jm$xu-dxRAo+f3M?7!3VrvvzD$5~>;87R)jPh~X0xo*2aDKY?5 zPNDjy4h4Q}Ri;o+ddulj79hGV6!2!apB@2dzQW=Ss7WYPvJA*LVnvOwGjGbt6;~El zI6vi~&*ZIaXK(!Z=YLsU4FklkFv--8a{|KHNI>ydy(|1$8ZR3Ppu|oW{&d&>PV@hl z7v7(7zXZDaeq&<(b21UHRTq$92hMncX;kd==cY!gr&d){`vxKBT!5M8g_6AI1m26X z;>#b)d$U`n7*l^Nze-zLlzn^+!z%GOfTA})m?i9=DWYRU7d|m5g?}HVcM&zGaRU}q zL)A&^w`DVbOT)y=PDhU0Jw_;|8nQ_i5F$n1dtXumpUVzj^zK0hvV*@cY4hN^unlA_ z7eBdPM$!yELq!0B5U)#x4Y69~A58?hBmFY$-QGCjBTyGY|$0coa0y@pM?R2GyXc}@z@2rLdtVRo7 z8w+Y?y^y2c9nK$Lf6Piq-?*IeU7Gq2Y!{+)dCVk%r$IW4Jad9zsd}Q=P z&NV&XGe~IEkN>wX>&WWKY6wm-qTJxoxAP0oufZhc0y3`nnX~Za6OK3ddV?{T=P7k4 z+<4BSnY(LyyTBXX7Fi1sP;!H?g)7!h5Ht(thP5vF{rPz{gcExzoLW8Eqjvd!z%q>(%$+51nYrU*joSpI=E}Zhw_T z%AQ)I2Gy^TAT7OgHeJY`yEw>fH|#}Pd;ep#qjAY29X7r4cN*EMH6=o-eom|}%?NUn z@W~iy{$+ohz1YM`<3g|0+Lu3UvecK~Aa3|rN2&WijO`spoVP0|{jBaRz2ruYB>FNz zctQE=W9AEe-5&Tzp6GF2&imB?`#Z0w@R90i43BNn^mR_Kw{5t!3A|m)_FKDcel0%Y ziJR_tcg?UW^}5P6`NRyH^nH{m@~myhTV|z@`eQ;6`w_lPM7oq%vk&xvQPDPcf?O-{~u#-9Tw&KeGe-oA&Q6rf}&EAk^+JZ3J8*llrV!z zmna<)BPJ!%C>gI`MR!UFC^-y>wDh|lWzG@L_xHKpKaLk3o$-0@+ror}k+oiaWaIlP4Ou%kr+@S7d^O^&Q@AfJz6b;mMPF~OQoAeP zRe294;N}0Gp>I@lFAk_b9Ne2Ui0l+pAIES{+?dUw3X)GS^5TLv--BbjuOIRh$xW@uw?;JvIhBX6CO`VtdQ9H zEal|yzrMQrUZDz8u76?n|9+U?Q;nTCz^Wmz;$`Ia@Xw<}g5xR$o>6!_XWb@nZQ0?# z|9GuCT&Q>qo!7qZyMKPcmgo8B4>ZiN!6w{_QNCIx!{hTal{GRZP^G_@6+%sXM(kxW z$~I;B0DI#9`1LPi<7w{7N2&;n$>6n9)% zdn$&&Z8Ce|Y+gF(TDq!o=Lg~@-vFgN^H#$yJ zeP}Z2gc|F8Rfg-~jyrPa7gG#>kCL%3N`ZX5K1%pJ0x~WU3^%wz3K2}bkNaR8RLiTN zsK|$y7Y!qIhr4G>aGI<65IcfEE~_mIeR`&tkoD$U+`%Av1lLLa(s5=dv-m4r{hyGg z7In|ZZLD%m+RP9nEo{a29ot-9H6)OC1wlNhi#S|K%v2o@aDN9$&jqjsNCCF89au`# zfS@K|AURav(!;%Lmi>Sgr5L{TY$>)kOVi1YeIW~YWC~PlT5^oeyMBAbX$*?_dt+3o zras>M!|(6#cl}Uq+40ZVArsE(LgE>yYVO?*JmKRYJAZ=++b<<4&>Ar(-(H$&0d27a zAtP_7L*F(Gs!`l78u?r&NlIq6)%H=0jK&}RJz7HRNE}t+ z(aB<1Q;8OD78h~(ag`CWN@ftOH6#|S!H%tZDr#lz%7lwJVe#?do?H;{>DY^Mscqz3 zN|c-acx`-Y*gC?#zkQ5BXthL*T6K_txY+Z&8xp0=n^;|ek{~~|J=y}aP`OYG8=QqI zF4H0w@rw|#?d#RcS6qNG%FxU6Uy)*ocH;2R8wTJ+k*ACn4k>rextpT7CvPe;*i5MK3|8*ePSG#sxbne#=!2;&VwgWPeD4X7yIWy$gET*aAz7QN|P3`89uJsl5zbf@%TZx{8yH zRopRWGmC)Uk&oPJLQg!3J{!eLf3lB$_XBo@8d%)U6?tQoDLh+_0O&68Q(8>$_RpJx zCUxerZkU~tt%eZu3`3cFQBnmHNIfLr?tar>ZilKO@q_RgGD!_vsOZ^&CP9fiuNUus zY`LPY>dA4xM+N2rh&^59PJ`RPUhpf}-WEvQdOum?7k`l@lIhK#=W!BBhamCLHRhX7 z_kdc)6Dcs8JHdxf7gpRD7Vvsv=(&_q#r#4%o}?t1hEPH_SDBzke1c!!h%!YUJ)yDq z1~5`18{a2VjVp&ibQa$4wIO|F&y2kW>cgzRcPaD4Y4qkPgB_rRC!e~1=*G$Ld`CMx zy(tWQ!7Avfb3NDGqNH4N5l&NZM8qgB_r}jk8Dl7eX056;HnD%Xo&waM%ZNxmowAa6 zQ(?<#q^3Xa6*8?msB;=nlcS_Nc7=i}um%u6W5CPpfH+oEW7A)ThMFKh0C@H+%IEJN znba@3nXo6roXkLvDRZ$5VKpFLWx1deZ`KQ|3^>FI9J<`ln*Lshz{Xkxm~<6uXtA$` z-m=HaY^#6vt#~@URe8%6jyES;J^T#vufWwwTQB?4SGtLERCwSbAqiFPwGR}75?@bG z;f~vM=ihQPcx(uY3VAV(p%PX`?EWgRC(QvtX!Qvsd;8^X$(WcCn8kg}c`A&>JBq#dmCR+c>l znU0+QW8nZ%y&^4W5K0w1*Ny%&Z~4k?5)pkun+7;Xqhme%qDSQbSy!lECnBn)FndL@$L!+BnI840xqIuYZF5y~-M zS0E|oi0YEgmaWbxr)g(CPom!3Y6 z8vU(KY@z)tTqmiB50;}rk4g>S_Q8PZ-f^K)`K`CBViIzm0Xsv8WGVyAWmtyuKO)oT zvdg^wRAFuCiQ_&yc85cyqo>aIHi7ywyl(u!Jq7K<0|F$Qfq}0&vMwM^bxM(|Ft%Z&$fij~1K5~N{%#7;N z^P$Jj-$h9uP*O8BH;)xe!aQUxjb0^G4S2d50FN7HOitNvKL%m7lXEgO7sn|=P!ZGPrnO%EFbZADc8_$ zH`?};9S!2{=MltF`+WlLq53HymUI0Rb+cA(MjBKx6P2epfwC7pNJFJ*Q?|DwqTjDsHI zpl;Cut`+bi0WWeIZhQvA5sAtCR)6+KA)*VGp@yB)C1S(~nvNOEC~R89WYjlXw9CcV zC2S2iQBoJ4hY#zlAZ1&8dwK-3$6WK16{+kp*ZE`JrFng}PwdWni 
ze8=o^mzTifd)AB_hX1{2Ce~-olxEuwwmos!P0_0*m)c)e)xe^EbG32Vt ztb29sgH%Ld-ag;lqzzUk5%VCRg$gUYjrj!ix)aM$AizT|mf&&Hn{TrQtz}j==bcVv zdp4>F^d>xCD>~9ND|OeMz^>(qeMc6WXE_aM3v2@Iv~Rne!4A{cQv^?6v~0Qrdf;%4 zy!qG;3HR8((;Y9hcoJq$gm#fgPGrcsxy7lz!kBJ^=O5mZH*l$Km#O54ZY;YZw>tnD$dqLt1@I zMG-3KhTKohXJYDgBl1*55PD)(BGi4IhMzq7LPCZ&S)5XyBHl^AUtEUV*UkEUad23f zb+zvOxVJZ*3~O&E2)=PwBK}3zF%m3o+pkib6LK?3K^Y|Lw(vPK#pd!{IN>ApDheCK zI}_z;^fCD{P>m}wo$=G|sOyYcgd*d!aCXyr$(Vkox&?%snKul9YQeDMWVe=ssk9Yr zt31K_LQ#Rfg0hQ=vfh|Zf5-2EET`ps4(wI+L~}MTXzU^$!G^31)YULx*0gDt(}LSF z^jgCY*j8GN6g!ZOlb3pogLy>u!914J0SMLHZ2w9Z_st))w@Ul>Q_;}8F(&L*eY@q& zu~(_tPKzb%RRjF0t%GrN!llGNQ*Hbfgm#P}H_Yh_!JKIb?rH{)`$b@+(X%wcI>2!C z_Qw_9y4OOb@v7iGddS0Zh8D3GA7(8sytOoj%tE7yQ% z-LzKObQBEpgbq85=e{YyrSsbpUi*KCTwn;S^pu~EC)Xfc_^cGO`LDB#X3^KuQ&pmb zT7cQDANw5#_AkgOB}o(4DO6DMaS95A$KY(wdQlCT&@c!Sf&oGO_>zBI@NL%K*V~yl zp&e{Fm32=+z3%<@-6ykG1PHgXClQiM?%pW&&yK;zk5^5uz$W)UAW=7Lm#NzMjsW3D z@6%6WTh5nXhfI(*_Hrc$km9+P?UF?iufEUdoa^@tzz5O}ob_rEVS`jsOZ%>o#yA1s z62|s|R>2O4Qod(SA(ns{TCW1Uj|RORvQd1W5cFz!n-|08D%OZ5N%5WiHk~EyHfMgE zYYOV&1^ZYa)*n0X)55Kj#R~-RLY7m`B8uu}ckRn=86AE+pEW_0#qE_L|0J0aXI!quMC#?W}`)3f}*1R2gYBdB*>sQ0@bcH1akMb3*(}VP;0g9BmGzgB_V#$F zp_j~jGl7h!QcJGCf!~_?(h9JF&2fTRq#eL*^s9>!uG}~DVy!*?6XZN94v&~F-JlxU zEjOwihv(Zm%HCcic&2B%P%BMuhAP}ClIgkPUPvqrDq|SIJLZ6g!t+Kt@J8W>nx6gs zYcdbW;ZSn8IgDjCjVBPtvW^5Bb5R}5t)0Oo8Pbac6(zk2vJApNC@#NoD`-69 zC==V*&KBY9Y~|W0#?D5A^mwp)*(|>TwGVIJL2(j6`{wd46JIk#g0}r`T)c01r6}EJ= zp3>BgG!M7NWe8Xg+6Ds})+4Mq1rcc+b@e3y7|%c5r2S@s35XnKhN3X z;SvJ^*hsP?615iFQXGU_E#MOndgFa5$q!sfA3!7$aPOiYZFpu%HQwDPFR1|*S%_4ajhnZ;$dk$yjX)96d(O0PT zmxZ{7$PZp54lKW(2#cZ&rIzE7VwwgOOEn0;crm9&AhZreIPAK8TDElu_zleK=x2a! z+a12r!pH^RROP8iwe~7Pd6+z4YPLLwN5B^Seanw& zX&?H9S+>7qN0&G}0so*XZIy^;Yb*U5H)`{~!(e=xcM7IBE_S2$^A+B4CrGD<0YwTZ zv3}*6XxJ&3U3qGA=NkulciM4C+S}%yV4yBmI{~#69)T`@*4NqRA0N)WHK;H+ykE3s zMFUN;2oLY?PS{=VDEe3gFfmoB+{Fg`jy zsArHL7PmZ4-~>(T(`mi8o*v0eEQdbGx@P7 zigCOOC)lspk>l}oHzZZ(f&4V~luvCtc<*w9xti!k)SwAlUZu zoVLV3(aoT;h4IBAVplfaY2pgd*hG!9>DWwR1F96LT~!FM=anH|OIQcQ@N12lh+?Bu z4xoM`rfs%=U1roSVzb#DQlnsRSM?d}D?ycDkF~_}n`dp%MH1}JHiR*P7%`DA0S~1Xu`sJSC5#Zq znh)KG*`sbopYt3F&}Xx7GZ&h#zSTX^6`zmXz%P|`;$!)z#o)@*S=-C0=6^aVHa6e3 z|4e7`(*#Wq!ZUe{1tcO_*4bxDsP%*(R=(Ob+c!s$?9<^S0@QbS2li2p?sGVhy&E%n zIqedApT(`mJ4!vex5wi^mWciHjDz4HeU&FGL{|h(!SKpme|+fEQM%KXnc`R2 zj!BZn3R?uBGgzhbZWr4P8^e0D>%CQ34Q!Zt>%r|Q6s33-kDNcq4a7nfJ!~8*}RAjoZ%}u0A^>ND6ia-_~@WTm%jPA|LrrTy0*nTv)sT~_$D+zp! 
zwkBc+rj#z^@1efThp%vt(Y!1??uzEY_>?v%onNUVjBi^6TrLHuj}+d1ZM-M6&mY?a1_QEjS@Q( z3`VHN{QL`U1=kTs_=w;d?;fr_`JNCCXnAg|IqT=mRdn;XG-_AnCe`+i^B!#wu^&+} z3%4@N4pXVAt=&tzEB`sv!u6ds`#N@{%RdqEZnjcFmv%J7oJL2qr92E#TK4X;_$Jk# zO%TtnB?g;hhUHez3n4W-j=z6;>cnf3?2?eo&*_1zuN%9M#K;%JaCPt+okk0z-f(jq z*C1Kq=^x)<-&%#GWYv%&c@(HB_w1s2M7}DADC|%z1GBc{;HORHMM}&m$go3+Ew{~c zI;|bfGvB&<8F*}I=>*uV`Nq}s@FE*v7m9?UNJv7)>Rv378BLv*mV3{MAYmR%Z4j$*#zC}YGQzCd>;kT%qQ3Ecv}7Us2cDjI?)sw!!pqQQw~J}ALCc$kb77QB zj}?YXqAqH<8|n(BJObMHb=-kKvY!xoH?S1+A6a(Gj~TKVFB*`uFm_h}q5ZSn{1Nsz zkA)~C1}?ZQV3mEQTg`65MVi5xVGrc5IDTUO8pKGlTrdac-0^_+13Qph7r1>7hd%ZE zC|QJ=L*rWMCBHkb%$d;BYV&G7_ebQM31Uz0;VAM!Ca*qwvR{w)DnB9k6FS*ybDD_N zr!!Ox^PlCwj3^lnfLyN)PyNRZ?{|7w2sIDAzHk|64fN76c6z+=vTg}%hGY%bca;#C z6rb4Wicr$o&nAt{S~Zokqo%npzo3h71*qy)tuBoVdCopD4fP(#A+z-bjI0*{q(lID z9Z#G;IPTXXYQv|)-{E#;(-1g(xN@_o9j{Av<+Y^=Oelz@d#95;fMoatEOF1m3Cr!)m`2)e7ULL>NI0kpgx*GW45|Qc!^t!!?jA|S zHV3ZIIqOh8McIfgIox)#O9-5q2th_gQGk6^H-P8z(4gh+UHz%ozNioPY4@IqyVJ!J zB=hX>AY0c8O{?(lXhY~6>rLf;r2hkdF*unOLj>;jV`N(W3>c5j#mRKxrXQeXateC}y&c%{Rwv<{9C-O5>T9QC zbf*ciAS-mP+)61F5yXB4vpZ@1f*W5fdHMnPZFYa@z$Vk3igjfQU4)EBp^rny2lScj z?e~xzy2ic!#ss&VLIusxZq%rN*R$!E1mX@QPJI~-6av1!rhg9^V959)DDKh2A$ke# z9EBM5j+P%((q8p<_>EvK*z}%>{1)vMW{U5_q zYwH3I3ce3NU}FX+rVw!2%pV+Ig8bc1R%~;^Sd+;3OU8~F74Nl5>UW=#?#eNGAj&ho z04i{UN(3Z9(7?wFz=<<-c!aiU**?R6#^)-L3joo!Uf%T^gu0qIul3?|f1UV*bUMbr zb^_F`T9NW50tXMTLA+HREA4KugkhAtF$dT~(JsUbHA_m!P8~IBAxN@dmra;ExEaEd z{Mrkzr=Ko;hq!VtEw3`0YK){7#&gL;=VJD$TL$;!j6uYT>B_=E3E+a$_g;(I&If7t zioY6yW(JYK7f3pukC5h38HY9k%wP?Z#Vx{h6DGP~6^NNV3y!H?8|zsq_UL@5 zJ;j)o+ANg!@(CftlJdKKFZA-BBmUzB6j_FEF|l6juKKZ^fn_dVaR+z-jx=y=*TQ_# zteH*s0C*}e-2)=e7ghx*odh37a13~cgdvUq6bJy|gAl3yVLJW(Nt}F@`I$HzC@Bg7 zOa&Zc;*ZZHU0_3^RwwBsOXySLfUuEUPlS+?DDl1wCuMTjLj+&Wu?At?pvdH;Hzs5V zTw$+X6AQ=c&sJH*75my!^Lp8tUMxa1T|wKi0{T22Fu!zst?9E01SUKzn{xz5iiR*g zeQjLdu?zWXe5tj#G(TZ{hWC-%Kd^i&khMG2Uo|TE4rzmSM@drN;WI$`>d1)ffhA;m zT%5?)!0!9ucJV7xWd+eQpXD>h9}AF6I+R*`0XBDMMt#Qxuw)W&J7T(Kj?M`_G6|KR zSp!ww0_&(=L}z*1?0&<;3DLX(NdL!{NFL1IUJ%x2<=xRwZ}xwc_8IcL#T-J^i;6wQRxH~a_Wc029S5xq__Yd( zBFA{NGj6i9vO73(yaTuCqColD9|XzfEcJil2>o~Xm9fh8!ax+Gg?4cyb(@quuaG^0783%axIv}o&e}| zA8ofiM@&L7_~Y7rpgkqN`_V2J8OScPZVTn=c(bE(7YkF&wk$>OBgi(D0>`@+NK5oP z=Uk(+T83WNc(1=f+~35sO>e4+HADFwF$c4D%n2C;>~i?$k!@doqc5A}BAot75N|gZ z8p3^Q!4sbsGd2nsx|nHI>%eK+UN9ofGTRmvy9&aFDkwgir*l4ufu~nP2L28J$NC6P zC27?QByDu6)k(l-NzqHy#y}=)ExJF;oZWRL+JwHV(<=yRKQRMl+C9DmYan#;hwkUQ zA~yp@<$D0Z_Xqe;BsldD+;d{4SCMKSQs-B=+6SC|`6HC6!LY@-Z3dY)vs^CBc8_-`(T;kL~y;w9@fzi)$Kq-{td@u$gz)ALc4K6msV8pm7UbZ3!=p6?Itj6u%K&uc}CuPo`FTf0W zR8Z&-#v0cyY&pd^{CRyOxw;uCLYAu?0a4t(wU5e@A7c2Qgj=>S&i6s=6`5)k6uX`z z>(facf-`MR2ebVIGkjh_8ghxCq@}a+#x1TUM%6?J*oL;AsE^5O8I!p`xBZnT?h?cx zCQxU`JTP$eL{~aq6C&Os$H^QNN`Y1&3zpFPb=(VYeBnkEw!_dLJKPj~IOPRw`_`zi zd2(uxiwM?GkY$eQ15UjE<4JF$^M*H?3u^&xzfifyDNiq+RafAdyHfvOA%*Xi@~Ipu z^t%gj_2BEN56i=h5F5AyPpy%234@89o9ge6#2wR?2pkUd7F3|8W)vVjBp^JaVDx?U z=N!Xt0K%K9s8g0!gA{7#4o1&Up|yFtE~jhrxdQck7(m9bODXh^&$kuc0VC9my8@}Q z7ErM&%OTT$&*{p5&@e}3c-6LW7bAv}okLlBpC-DR<;*e)y@FM09obKGi!f0Rz1_?epoYWY#uG!+b1)zy9LTZcP>-`|-eYibiugk7| zJldfEG^u|FTOk*DiP&=_l0w(X2uh!SHY8-VYHfa+G7=R}I^Tvg9GiizPfU4lUR$&( zI4qRtD)o;Se=i{zgEuw3ZM7~fM=N`6h7##2qCYfZL_A=x?CvUXd^rK+o@yYm@G`Ax ze-;5F)#gI)=o6SD(Zhq25hpYikN}Olz;)JisefaAv_|JK)ZrLy`P01elivkNt|Xix z>tG*)5cew2cyA%ge*<)|HueFF9Ycp8ZLG62R~^b1jY9sFYd1 zCiU3>m=}nSATnSW?03QtIs-`V>xyqQVRc%tG!9)_Sik!!q;>)f^_y8Je~UMgiLF z+wlDS9}~b@MZ&oEh1s$3o!p4|{IuY#@3=l`<(v)6TF?x+gLbK|Fd?P3I-8K3UfSlf zd0MVhjj?7l3>|74(phm>Fk3?cGuf1Q^me)*cy#=>xloS3;;Wy01B;(OB7BJDqep{Y 
z0#Pxo=GNiOFf3WQyzIMGiDk;+E&FL3_BsRQr~p2(wqz`ke_f>y?Y}nMvgNw`^*G*z z-hz^eMr=%J=ksGv(HC$2jRC)HShup1_ItK`X}K-d$ET4DzeL+O|JUgTv^Q^(k7MLN z&L=4S{*IwY_zH;ypH}pr-%-V~{ra5UXyNnJ|2zwwzW1%*&u5r7({F4d@M^Qi0*Xnt z1e^ousPzl{*Qg$xU8*hl>E_5!OgsPhKC<(u2plcGrLaO=>1MaQGVzpO1ZHe)X@B(L zKlziWU6gmvU8B199(eBMwyFpIvpQG{P@X{;#rN-VZaFF9zMHcse*#Fm7soFV5!c^y z1}-|1V;Q!7hkTko-SpspM7$vXol!FRasyW=Ia1wnlU>p>-|_q9iji--vHpFIX)`kZ z&+z`W>wo>WoD=wa@ePxTAMd8>%*8T^ z^$A{}3=7bBT`u{flCX`Wn2)aSjjGeyQc`}_+?#xv+`zUBjA)_knDmxuMsm?1+%}%w zL#uJyw*7ushBv6R>$clz|6-RbrlVY@_$6Ms7Dv0t(c+s5Qkd-&Q}R31O|}fnuc<&i zr@AYg`7WP`bmpo7%{OC32~YRuzdnRR{q(NFl3sda>X@dNr_f-(jhGw%`<2B!@xpk< zxFG&J{~pP{>fR}~e*Yop{`;Xnequ;L>CTYOzpqfZh_Bq^-q&+5GyTCHNAPbz8;V|DCP>vA#%nVG)7`>FNKmrT+Yf zk&)C8$F6GGI^K{bB44Z;w@FeimCgZLa6A=!XY+)hv24|uL&P_R;2a_hUxN|I%EE0j(FUWIaAE|9zwJ2N+K_!;MJO)x*wy;?tgXH-ToUL2MYcK0c)ENiRi}Y`ykN^x@jI~aC6=D zQl4o*==dNT@%(0GAz^R1NagtVTKMnY*fIv*g>Vt{@egg1vi~1L^7oep?Z<{jl&RLX zrx987#8Z5dguOO2n<#~f@g_4>i<~q ztis4e;NSm9d+Re%(S-6F%D+5i$0$EBI`QpDOm{O(f&wzpyY&aY-}-Z@Y`%M~$EoKtJtFJd^J)z51q zl_j_Tf9?990AyDWW?zW=^C4JM$uA6r1>tbP^x?tu^uj+HR@GuTCiXZ-L>x#j)2)=t zKa(vPV#{c@^`W-R&^D5&qpmiMPK;wTNGz{``&*WEn*h1=P=rg3GgJ8OtRQvbAawyQ zGV#d#Q%ZqG`j}D2Ufi8EUC)&|quYOmEt}g&o?WA)_vocuYgfJHN~S2ONiTWFsI#5= zukF~=bPB`UUh3sFIbI~nrN}hly1Tm0pHIEzHMyd__q>QWlA%-NJ2$baw~&$dZC{(- zNm9d%!M;jr%WE`S{`Rlymx~%}r)8-e@Zm~zH$AOMP2=fZm}AR+z6)Xu{_YGzoOW6B zINf2In)Ym5dl4Ddjl9MBchM-w23f^D0vEit&Z7v;N9tTZN0n!Lrd*yryWsfO1i^2; zRpZaTE@}Ds@vUN4$Cu$#Jx{JqPIOD@Fj;@5&roP)+FWdk>2Taw3?~b1U3>>gm{`p` z?=&ag(Wb-cTz&W7t!5nV-C3l7!Mv+;iE2M|#cy+jJ*nwljHZfp{t@=Wa=uMZsT@Q) zfBWpf*5K>chsxhEZj_QtTzXISCL-5yG+D#`3b)eNlIS~|Us(oHywO9V2ey9Y>YkHk zEj0H&1dj^3OsMlD8nS!!Mm3L)on`*#Rv-=;)`2Q=kPU-%!^{qRV#(o3%-NsReeC6F zbof76#=X4|J>T?<-m26g(`4l>NU<%Yh^l{Qo^6DQ5vnB!X4}^d<^(d%3yn>DQYPjV z|1MvKFqAlNf<~3r8;S61(gUJd*Q9U68u6F~Iz}oFJY5Ojn^`;PMo#NCtfyv1luPBJ zV4?B;+=KrscxO1-w|_UD)nWZrj?IssSHB)Ks%#kd_sGM2ckw^vH;Q^acx7qS?e)l2 z*)rn0{=4=dC>?uAx_~OxO*+`&_iUJYrfzyqV|hu{yE`)@+Awi04?BymHE#rMew6hY z9LTjJh7o0d)+GQ<^77Z$ws2g2ep1nt%rls-G4A_nGy&f+GOo6#u%BB3=rL-W%+P$7}=F$ru%(iQ}stHnCk9pKI z%@5wppI)R_1|Fj_Sl?WJywqx!MO@XN_=5If-PF%4B_q<4J+)xJtq6hD@s;WuW3+9E zb_d8EK_K?22C^3d!zqYDwK51{dj(34bK(0+-ynX93F81H^MPxG^?Hluj>aPBLNgQv zO_|45zwyoiE4UV#rHxv?2ZPw-i1`#xf3`oEfy#n3gt=!J%=51mgM*8WEB^EJ>=nGZ zz-F`(l=({vh5)tT@p~M|cX_<}1u6&%hkJ1mfDMFCKPKIgmKw{wO!A;VjjIQ7kgEbj zisnlS;@w^p$LAn_na+IsEZ?oQ!hq3jx2WGaUOXLr7Jc#kjwH;TzpH5P=(&8&EC5)- zP4pHEPqMU|-3K0`qI>D(NC}dWpASb5DtVVcC7S%9@AKcAEhvYC@yLrp1jPX_C2JY) zvYZgh7ViuMpaREjsr05D$;-{#%D{HLiao;M81xJZLX0KDPYF9ud_D?=PeJDZTiU+b zyCM)4>LGl;w2h@kuOV=-i2$v4QQ*>(9FyApI&W~@QGo?NT{9i>BP4%smMSqUVD8n> z!Egwm^8ITYwGH_;Jz{!bT{7(ojQ?TixE29|HcjyG|IT}bZN)Qx3YbIcwaWA`&ukjm zitE%Qy7+hOXsI2ptwJeZ-d$+9q_b_!{Ga^b_pR=oin=JJ-0Jma9on}4z+tWlJ4Qc5 zmA>aTU0;&C{Z&FCcnO`fV{_34$&!59do7HEHD>rbf-^$5vgDKsaDF_r+cAk8?}|1C zJwF0aHSb96id=_^HbP*MdIRtKw)!sBUa7}V{>i|Tl~GgkP|c8X9l-98@|nYBA-zbI zL4s}!?TaxFTAz*r977j<7nMu0Buv1g@&}WATEic|##njQ+2pAyOXt++XTKxz3pS5O zlqvNOUOr*3LIc~^d*YpMwr~HkYmT$ii+A~V@eLJDV~kXLlM`!W8=jpK;FD?g6DZ?C zh(OHZH(Bw^b6>5YA~Ou7FZJc#__!a4)=T8#seeXI<1z_kcv(f?sDZ?Op(Ub;F=!`pm$|t%{0FV#p)F; zS9#(3Z3o^@2h|^4tZ*ySI}>GJH(PICzZMiY%#a_!w|Q|^sbT$hwzUqcI%)`@JCXUS>7HA_OKxvLX2vXjwwQTZ{r9!Rc=m;9wUJAs$T*4K?^dpCXX!F7mcJ zmGlwNZXBHiQasSh;h~2}8VWyrX&v8(lf6MZLi6D4_1hV?ka<~o|34)&JTc*v^ z&o~aGn1o=8`uT}2O~e2!{5e52q*KLhANS=TIPc_u?8#iL2E~QphsOhhl=+M0pxS%G z6*$VrDjm*6)FDnCL*VrMYS?VN0S>}|0XeG`sLSyzKNb{Di&L}D+MTwEnz4F=mK`bP zlMPA`5KWK?^dhpSsoryr3NHe`V$IKDhE!6r_>)|3)k?xY!G`s?wEk(c`@F5GrUpZ^@>!4 zX+_iPhZ5+oBc&e_TP_bs%`nK#PyAah~65Dh#4b(=RKA@+c#tl*=n#-B&0C0v7K1*}*uH%dTNN 
[GIT binary patch data omitted]
zrDJ>2Y}RxYGstAO`3ht`vbk_QG zuQU(%i-keiqBoux(9_&()sL2BoT_7T{e13d{2n!yKLx16U7N^o!T}UP3M3512T7M1 z!-*SE!rqdjpqhXsGO)@j{6M)VGS+sfhL&yNjgP=me{`yu<##N+?mSna-dw=LG=(i= z<=#GHYo&b1T__v%y&u(7rrj01XRw8yDc!DThPKf|dStzFw+>JxG6(9CNg!$IXB4cO zAG7^R2Y@JWRK_=o5+LE^{jM399{Tx7sy!emQngL3X(s|4z`V!kLF{(@oo6OXdv7EW z(r-suSgdgu(jd&hL~n+VJ4nQ7*NcPo-KHrGx&XtKqHTQa=p7lL=aq82pL5JtZeQ3C zNKfn2^?@?zmfL8sMHi&T8hLg%(sqSKi?j92=LvuDU$Z22jbH(OLCe>i1rO;<)MzY@ zsAJQt$Xn?*u3!)EMckTGY_1jyvM~A%&|exFUMA*~QCizc+l<8foq3J!HWizp?4?@f z*jOF-4XWRU36Hd$u-l1bY7=!VwhoK62CRV)inr_R^dt~I(FR%{Af^p~v}hj>10rdz z3IP7i-=84NRK1jkmlUyYV7N^>lEdhQ{&LtDD{5DZ(_3{&FA_C1QmewNpJGq1Vu58(%Wibh@^C zTH_ntWN#3@c=6!L>yhEX5wLRkuZ{tRv9qnZ0lW(O;-)PNe?JwqzM8aFR}=J)Ju6D)XM z&^k1Kh)3W{G3MhAX#6xp_q%Kg-yy_2YEzWC0tMm5ov27f`vM*7P{uB9>)caEF&KTX zw?)Z|n*&<7XErSl``u;}MD?TZy+y6@?c#~<5S7ReZiN@ws>FxI0E**XdgRP1xU%}m-5>Jw=_GxZHKWf z9zqJoRgpDz@}m;zck`zJ@8{?>WyCQMC_b>ZdB2_fn$ZId;OyF^d#_^JJZJcS(DtE2 z8LFP9TXr${;6RBZE8QM~r~P%Q1^=Sl zV8Oxi%4#QN?C?hvhi2d(JV}J+HsZ4(5E&Ia%7P22k{Y0iotk>X2+j z3M7=YaE?(I1V4#fbl*@{0Abker3V<9`|z#hw-Q^Q-`SQ-f~-H;g^^?E7*L@G)CVy# zuGHJTB6Q7J59k{ou{R%yFv1qP%)TWNl_XlF@)yHc;hv-PIV-(aZRgGv~QJ zssgvjYlvL8i-C4!*LQ%@fPmo{Fk&okhK0VrAHo$9>X7W2dc4!>_uAF-01w57zonFsHf2&q0k5!0hou2yGACsyD3-_4*%syA*;LUe;&{ z(kDWwj03dV75$jYw1fi46QiaK-*2#p+^#KOr2xn>+cJ0I;+DYbWsow>HKzwWl`$ev zSsf76GY}Jnl^#@+wp`xna9<%;LF7L!ofXz@&T3^g{+^4Ow(PRnF0SOTrFgQ3Ygsb? zqktlgO4xq6nW~a7w`_|LU7tnH?1Th)z6O-UQ{t?Ck`$oQqP1ac&W$sP3kK^5{g`W)B()zpBbePi+cbIB;J+=PCYt3b+v^aXsTC2Y6oY$+b z+?>J^?62U%>Q4ISa`|PPqH$&oG15P4=WvUam-#R0S#e>N4G-3fLCvZYT;v9^$0AG#n8DN`@WLvQN z4k7RW$q1F3t(5BqtyMX=4B}o6@kRUU>v104^QEQFoCFEw`p6gRVR(LK&lbx#Zv)+= za-&Ysvq(p`15hya$F_Dwk8H7NTO@_gflM|55YxP?w*8(N7fM_};JE7jXxa8!vJZA|v7$oEd( zF2-fP+M?NfofNQsh?8~Kn}`$?$f}w)qnDeW@{0W;!22P{DLAuftC1$EcXW4?*0jfQ zS?sfg;SI(`rWo$^?=%Z*vmQ0X`HyPBabBD(en3oWgQCtcz&Nb2p}7Em%H3*;4V%*G zOREihuZgjB>%x`m7eqjiEPW|n62OE)D)n6e>gug)XQb6+lN~to-&NF0`=QR$w3F>p z!RZ_yTh!vJDiPrYqT`(p+$4xFUn6X52Nc25R)RGE&{`E!%42>`-=BUIvX0u7<+ABy zPw@nj+4jb=guiEx0bjk!yCs2$m#Lise}sajOgN6et=nopE1GT5Do?1A{ButKs!`Y; zDD`Dmi1q2sO4sJGU7b;RLH)ZNoJ>L8?ZJfpcnYCWk6L_9WrGD$A`ftr7}wAd@B|ep z45c^?123mC&?-}r;`wHA&Fle?o>)aQY7!sOg;E`l_3V8SjD8`uQ`8ffooZIOU~^y) z%0Frp$q-|wKr$Pyyg%jVoZPx=aYi5GvBeZEBPX=eYuH-(z<+?5=4E9ZYf&6|M{o{L zItzxve3A|+(Z!$IATbltEqqWb{j99$fnZniuAGos^t^?pso&Pe?IfNr_7V>M^sLi) z=hB_?qI+KL7irEmX0_5)8P9m`n9e9W=-<{hY}xOmdVF%(v<|(5Z~aMGmh4eDjcL#v z|A7puAHcnlLiw@{($}?*>hHw0Z543lm;d$AGiH4|h@1;{=HM)6>%G($2}hX8T-{GO z=t41gi&gKkwXH|=?(&BquO%bPxx?MjoSnXbrcq}MC?2Ur=stZzi4LE{l6_Ggz_y;L ziX^fHHL;l8#=hr>j!6grUH}*}%@j5RUC=ci3AI5|*@lxnI9JxHk^uH65}=4}XS(&< zPQF))``#DKs(4QX;BQ_C-)}p>BnmqOp&b4fWp4ph)z-y*E24rRAV`;zB5?%iR!SPB zTS2&s_pT#0`Q7!22};8(Pxtu7b6GZB+FV| zeSvJ}X}KrEe7dej980X9p;_7? 
PbX4F!X`;meyg%-_bo_6C9Bc zVeABFO2s?{NB#`w3*AC;73%<%)(9v+cFr3EALf@^ zGLrnB4hly^ES*X&fk_A63t>TxT!1$9L0eYmnkO%E39(>Us~!$K<@BZbOG|kPYnmb1 zBFRLG(;CG3P|F@Y&sk@o1iTWzdJLTUMU-NWrVkhl<4S|Wx~M*k{kidu_Qx-yeoXE< zeeB}R?8j4AJr$gJnV`Nbk6T+nUoHW;_Gb>?69sU%H%kU;nx8s8y6i17{8#V>%DFTjs^nh=X3V2{4$HaWRPj0J!Y*+{u_`%3`0i5@KplEms;92RQ zuY5QZ@5eIhin1onCqg2V;}wuDqj6to3*K0QgEKTG4Z=(8$4f_SPAH6qI(6Ro3Bm#A zpBU(}*aUXio=@Cv$FQ1@z9;ttCl~J^K(_N)vvLHPZ#SvNC!Q3F zytTHkfDkN9ZHe+>xXW3gBl@@xEN8QNi~ztP_YD<2>u1qXb1n5(;6!IrDLMXT9uX69 zlw{G$$JgGK#*3w@*Kd(jtlOvolob8Fwp(@p1rt`HE&|?UtuW1~!)>h*eP3gROlr5C zSCJ_-d4by-V~S_S8BJH|Ur0`BNc zu(B@#Ao9*VQhiNm7n16Xgoe+QhQ?I!rkiVp#QAe6Qtp0>m*G0i8r&A|LFd79XDqUb zc6KL4*@4$)IUqsqlgj?fF*}W#YAO`onlZYyS$E>c`s({Li``QYOXdF*e*%LzN+2W5 z3XsD@pkIT7#haC0^bsf4r5fYh?bqJ&9sYBg4pVHV>l?at1V^K~m|L0oH(wMVgN!Wv zIFl|;s?)@eZ1s1bxr#+FI1<8He{Ytef&4?cy4k@+oYdXW>$f}eEqtY`V~l0eZk0ULA18_*(`G+^`}n0x%ji&u$6~eLHxa$`GJYcgD(aF=mvRpkZEcKzP-Ct7(g4 z|A<7~%eVf0=v~#?+O~1Wed*zwW+(Zw1sfdp39TE;%7M>7z-x`VKP6mi{XQ5pKK~VP z{`KOKBj~ciGez7h#1bw>nBk$`Wf3jYCzIV8I?_pD#-rps*RQ&K2yfcz3=0A&CaJM`bgPkPU<%!=x^GaXa&j&FpT69rF>ET3KRcZ=fC_z~|TWpwH8$@(VDPMXG|vin?b$YdGLT z#)B(v9}Fdgek&#Z)=OYrwFuK{8QJ%~mGxzq2Yo2DJmE5n<~j^~+_v$CywNS!-O9}q zc&*MgVOA7$DnJt<{x;zE<;c{EeU;7lB)sF1C*ld%fe8q-Cggu(SZ0!=i;B$hrHLc|n^(;R+sg1{c2$8e9D zc{WT)(Dpz!=qt z5tXP98Uw)BH%h#L>2gA%TjtSg%$|H23dUCTWW+%zWLIOR1wQ4sA|G0o<{*j?HhgnY zit4mwsVc#@gK1v2M(#6{@>hBAR#PDM)y5s)BAlgOVv4QsC?_yD=URHtP8Nv-)5&Vr z?bxmx47DtTIy>jIP^?Olhsy$K{0f<2Xk^ptAw2EW8k-I_Ve%cR+7b#q%RiK=my*>_ z$c|qF&fbxJkDBTmIr8K)wvqKCycIR?5D;2-PaqD0Tm|F0jla8}0>0u1W(!^*qjDU- zJER9MaZ zJ?{!|(kX6veE>^=LFZ2?%qFHv9(m!k8BA_=#vP3*GHy=BOTM2sCYR?Dl+_c}?1~;A zIC-?{CH?oN@6t5ngJS-pFYgFX<^wiq?Z-A=?E4B-!H#Z}PCDuBCLY_ECiMicP1sM> zjB}UH@A4R%>L1yg+y5X@Js2z1sJs@}q~NACP;6tgz@QW5 z_TK*Zr(!XVc_M|j-^?RhpNM8@iU33m--{l;yd?Ko3;jFQ-=)=i?MG06e1Fh-1=r&{ zbUYvl0qQ!)LDTMk;zj6bBw-^>ckx|lTSmbgNH_6<52s!!d$L$JEx{-q2)S6@^p+Uu zWmR-lojTg(z3Z|{3Wr&&qe3Y=a~aI^mQhU@bSL!|Q8vOBSgi5eLJu-(R$G3HFaP>W z_axrbqby93i&u!v*ZUk1>*pC}4uywIbT*j@3|qY8`>xy3#f?>Om^l-!97nCNzIvJz zsehDP*9X}v9OOcGZiB|7^0(AYMa~IIvZ8MGIfIs4=LGey5Mn&%i2>uBCRirJA1tcR z-Ll~g>usNUf)3=i&Sb>(BI7!*jjhJ_-7~7fyV4uK*+l z`t81VzTt+G?@&uj*I3jZd5|$`RV3{eJpd3|9D>K@yn*mXp-PFIF~ia9KB7S+s^!Ro z|0cP3QZ-~Nl}o@2=VPT7iiW&LK^;f75InoCW7xz@qaHdYx&3`o*`mKvvS?#WJvW(! 
ztRjV03e+vGEsCwQu9wI-N-8fI~!8Om1$UdUzebI#r zMqd|~49EJZOI}W+s1+GXwHsD=3jV|*;In54gmcEw1d|KVOOyHfI-lFDC*A#Hbd$ay zANyY8eID#_Sv1;UNpa8Pr_Vq%vl5meH`sMd5QJ~}e+B2ig0XMUl4@UHEsbJG+bmak zKke4QM3_2ZB^=mSi=>JMr7T6p_46jnAtPN!t+_y$s#TfN1riWF zp%mN2R2roJaCh!sLo<74dwDg2^|@@^4~~zaHiTRyVS0;ct$x; zGm*seZIWUO+d}6YAzQHEH8pUZ0|-Ot?37CMD?0d9D=;XWB%LZ2oKdLTI;?=HCz!;I z#QWT0fA#vbuHX3PeWJ%VM(piMLMx+y!+byddg;IH!^>V21un((0I*r(A3t^_x8lb=wfoL6i5fuiI8ih(;n@-YKFZWdWi-uI$o|= z_UFf1y@@0wBm{7_yR3R#De02qtOngWYL6_;1)h=H6VV2m_buD34EWJY0p8CmhePI< zzrV`=zQFgK#5pJ%)j<>30-eZYfE2Czzr(bl{gZo#QpFKsP8)#+DRv)4F6&JV89bp% zZg+qh=*#wR?N_rH_fk5}KN<*6ZDCh}y@Q^b5ZVXBFCqB-8HyjQXZpUqz^zk74N%`l zmv-Ob8rN?N%Sqn;ecOg8S6wd#LU67U0xRf^!k||20jPVm`+D3DA44soYBki&N)?mi zQ2-J2@AcrBf^0oAoCumWYP%rz*Gj-Q{Cgh#Wi$b=TS?7`y1o;r^&z>$1@omDdqjfD zd$!%kYCg$HRQURX<8>Ayq9;$lsdHT&Lcf(CUYvtGf=ced!i-?8ByMTggYPwry_grAyx zC(f#O@|o45+Z52BF&NeT6j%MN8F}32kNZk~tkd`TY~+i%c36?a@Uz0^&^Lu>Wksh&WxZ_GM!UDS_^2MuE% zS&>M`3d91tExEVCm_h9ym+esTFpgK}YN;Bmf8TDTMYwj(`OeG9dvZ8nug^whZT~y2 z{Izqw5lldKEYk9)?I=+TEt*3W7ne_Nn%NEax^vHf?(B?Fdd5yHf+ z&)|1pXO^2oE}Kp`?*jMU(EN7?-;Zq=6{qhtj{6~{%Q5P&^`n$vkUS9?@6Mf{y}~JX zvate*6A5B@*PYN3Q=%^RkN}w`r;Q~UoCm$Uf&%uWEAhhUzYWK)$M+T7nv_578o*J% zZkM+mVoMyvH}!Buj%yBna{R- zKH7{waY9w!x^yr8r(0^C$FmNU#<=$1b6K$7V|SYghW=c%d`fL}j~IC4zt`&Ji&Yb( zaTkV@B9;7`H_T3Yu8ewWx+8snO0_)q!EA%^(K|wR_|N|YwXtT2{dx1n4@vvJG{q#P z-=>|;>W*NcSv0i&JX;iln9n|qNxL&c76(evsfwtzV4|-6tWZRD~DIZ@<=97>ig7sw4YOd+ue~h!5~! zeJ9$B3-jT01!f)r;foJ;+PrN4`#3&O&q@#atBTXJKj`jN&Gol17At>A4U5t&7b#xa zFe<8}a1GPh>nlnPGrs%EM7iBTA+3&{9Ug1=FbJ}nPW)HyO6ExTSb2_?wapOUxtmVX6SWtT zK~CgdBr4J#Gq9X4$$9njmwh8RW;rSMzca8jh9o%MZfkzU94ey=V~*5@7w%oiaC%e+ zTur?sK3@m(Q|qcD%JqMbHvq;6HrRO$>*=zKAF>=*g4C4 z#_mwQMC5>a$ze>hETszT54zD`j&<$5hAo?tBktj*l#gG}2x2}oSp|vfhbd*l$|`EL z>AUX_S^!A>-EmH7B{!wq5RA>{BH!4O?vode)aTHMRcmHPDw%^`Y85%Fxb~&%HCe zjJh3ueB0N~MAzZ-M!!D7PU4w6|E|f0e}krL9^Q|0GDO%R(tYx~B=2zNYr$h$?-1Kb zwlQhNDQiWyCGxoK^rh$&>7<&|l{z(3^u0*swx_{aLfXNN0eor*IL+^!Ft(KW7b*WC z1P#X(6p{GvjRK8JB10nw{ES$x7VjMb+-&39D;LY?{sRF|{ZSbqxw{KKr$F>(#4dr6T z=Msc*^wL`DgjJkkp@_Uumi@oxA4`d+CbM7O-a-+z6)YnQC!x%}dg@^+i8ZvY1S!A{ zBPophn?Lz$y02;@<*E1XK0+B1+xU2;Wv)oQtouvIPCRPIcR~84f5^>zaHcD4uO2;{ z!{6s@AjG8i73#a|w(eLW6A}DH@Vk=fir^?f8}~nxm!=iX2DJ`I$Gv*xrYHN<5fa)n z>3moZ&)$+7-n4<8*un`v@m7W z4t$O!8OQEl3l$epaN&0gQm85$y=pgwgPpZ}ERiRFhkWT_%J|#vBwYA($tUPz?RvWS zL(>3vn?XPqy02(QeM9rFMg4#O|05D;r?cejAfWvBO*U*(^qRW_+lVKA8c~hQ&h6?J zkBwL;$}W3J`lvL~*(Wv!_KNqR*N=Oe5d0Pc55qpeR^n-ThwMU8M2>7Np7qdHma;BR zcZ7gt@!V|g@FN{f`=Llh?YH%6jm@;;)n49r9!lrP-KY8kt#vuNhS*o#v)>)jOLJ59 zA|Z8AyxAgU-*6SYje^bsCA1fd{r}q_1zjD<^;*@C`9KJZk(jv@rk%k%IwtONDzW!& z-zp}k&3B(k7e^Z|)IoHk9QhXAAR)7A5>0mfz9oaIN`N@hu8&vOa5)@`I~ZT6=V{5W zPCys?@6+nNh6Kd!Kca|Sv*l=1A|wUW+=pp^q@wPA{myQa^W^P68b7U#`Mm zFXE9MUno=$r6^_N9>~+Ui6me<`&4B+r^kn;POTVAVibQ2mGx)VOfYLRTeLYbDb+x* zdMLYh(2Q5uaIQUWQM)BT32)}GO&YQWkn56n)~~rg?x~0JoAG`IszZyAH5MZP!ZzEI z4Z@31qbB)R_5m`Mwn%48&1XoeYu7^{PJ2FTT20`&!MAOGkU&NGC>^CNm+ zMEi+a0dhf6!Ia)Y#NN_rFr8RL2$LFoPeM*b{jGF~QKMh@#zWj!;c_!0q?CgDMveXZ zzU*uXkxhiN%MZr~)nYjvs?Ju}MBl6q_~f?X+|8;Vrn|~~aBh>`ief?LAY!lp6M5l{5DdamD7T62G(05w|`(`SA{b_OC{v_sN#jq(8%QS zR6pki!G8u?@E+;a6Bga<^qMH<7vGx5ha_F{b-NkDctP)-G+EagU*`m4W5!0=*p$HF zA%b)t0b;qZX#SL*?;D8QzKnU1i;oXeVCC%H)M^0;cerY{iLwcyd;eWCE!4G-wSymZ zr{fONXZCTb$3_l_73?3H=$+XW*(56SlM^27go_VawX?msG*lmjK?+F%$(Qwi<}fRg ztAIo`E$(oPqb@y3Lo4By#%Hz{%W}GG(tqH#<*N+zA>cM0T2>*8r5T(hpD=6qS*X}N zr}C{MnYO_oQioE}b`s+pF3a2mmhyL5;1VIDAAb4h+%BUrgE`tr*AgT`s6e=re4*hKjl<{7m)7&<8_G_FbM= z07;??kI-D+GagnRbvlqf_hDy+Rt4a3fffKX(u@Diuy~4M8CH|KZS+QO&_q$f^&djm 
z;tY|E4s}Y4_>B8`mmF#<=db4ww)pnO2OcKy3#Rd)Y<8N+Y2oXEW~-6OcHm}7U7qtF z486ZL3;?Y>c9Q^y&h8Xk9kJ1rI39ig_@40%aNvUh7+w-0z2mphI(63k#>orq;sL%q z73~&4!}ID-Q=D}2>KQHYhLua1>8;_IiSpZ}+?%*Gfr4$wH*`|S>I-6d9P-}#M1?2t zI$5_xf53xjeXn&S<}lY-s|*E;{xV~jOan>5z-is7HTUmAtn2%pcBQwFG9c6!ANG`)S+OeoVu z8)8opl%v&6DRE9l+kB269x_<4o7tJ%65H0e`O%45YIm|< z-a9q1jOrmJUB=+vC!%g)@Zw`XFqc#Wn5d|&qrtq(yqtGARx!S%_I@?`(Q1@>7{0k- z37>JzpgH#7>C1aG>j}S5)GOPdD(f#4HK&P}cG{HFO2U4&_PHOVadTddF@REI2mLt! z^iZv^hZ9{@bm!a?1lClInSj@u4M8z<`$&jhaPk_WoeZ`cA^B$O)k*k%6-P-4v0|RG z4nP~`Gc4)zwXJ2DAga%(l>2H`gKemX?*62zPpRV6GLESXTbGaugU39!P&ba*$*TgN z=;>630&m{SA7rg@$Kpz#smxh*no!mxYCV8FX;pm&fd^{(-1Y(dy0N49bsM&YX9(M{iWkJ@ z&d2j$f7`?ULf0%ncSTFwo@@Oy@w<@@rQae9o0$H=>j%)1d*gxEhk_DDg+~9Zo1wa6dD96lKb47?%#ttHaTw`z^*8vWe$;gO zIdLpcJ=1*3UVRn;`i7?)4P^8IP&g@ojnoOdJ{k7iJXwLucE%C%1Fhy0W`nU&pyGd} z6R+jr4!}D?t{3}we8VhLb)a|;s*X$r5~5fPzI{2yr@tfxRZo-Fw?Y(YHIL4+$Y5u7 zL4r2eRcs>Z6-V1vxBOL@!VElp}v z-;)O2xlCsMy+L>eVXrok2xb0AU%LL~NZ0X|_i8QUo=a8h$=mB_t1ALOxuDvFWR7V^ z$_y@tACt3nm@}tbfM06}!0bF7B9ANi|F1h?TA&5nQR}|?<#!Ym16&;0&$vf-MDD^79V6yzDeHuxj^Ma zX)6a;uH*rW(+J3hB7{%Qw>-#QD`g(`7r*U{=X$Ktwi#=ZSJ8` zfT*mF9i)!Zz|O%_rg?xLzz+^MX$(yg{}`T77Nl1*9}4{ z6~^mqPEF#<)Z2ZHTNe3Mdv@i;KkKtP1b@mUDghS|{YdeiQAA$b&QDE_n#9Ue=noIa zGY!}Pz(%L|`#Dlo7dEQ8C~ZsC(~R z)Ccv+x0TqXUM$101kNlG0Ob18muL8>;TmRdBc_0q zYO5)=HIVkJPF*Vt*2WK@sJ-ff2;qbV3;ovR>?M+z+RJvX*&cC1I;*YhTg|K!O(pDf z^wy8|j`U8*dd1f}I1;wO#jvw5VDQ+C@>NY4w$w*T7j0hB4q~B`LH7R5(u2j7jzBj_ zh8^;K@l52Eqm8n0pARq0npIB1^*Qz)>pEF4kVq4Ic|Y1-yLs_?)_ALQ)5<+eJ*3)d zw^ZXcouR&7vfHs~!1k~~@XvXbtiFBAKRc53av>!|>wX%jWp#Jy0T@>$qFGG$?Q8Si z_$kCQ6I}bfLTj2^&iH}E96?aug(`qiSp!&6JB5|5II+{sa`K%G@0$a)PYgo57VoJX z#pOE;vIW~s%~vB0BKT_`hODW~!x%G0)7E_JSzLV6-Hzw|>SV=E9}YySjK7EF4Iix; zdFYLECm!!K?#Sb3{&eg{&hx=5FPcRWm+`Qt3>JZJ1(s(-{m_LAPgfOoE8;WmtCsi%-N{=Fu@eY^IjEzTfvTQ-3R7wQu_kGXH? 
ztuj%sa1yntZA_o8?D#~%ck$JTf*P5G1UdsrOBR#eR|m}ZDsVw&P;+D&}v+vgk=Xn0Os7{no{I z29Vv>(!ynKomTDu4)*{!jU!_~%%NDy@gP096J&Ke73X%ooo!}cca}jk-j8;o$JttQ z!r17=OBPIpGL`Pt#GG&+9 zP3@T&_zl0I#Hu?iJ1F6lWhE3~5zjq}OdQHI^DASluj<*&UaBk8iz|oad%)`=m%1p$ zwNC@e7~?+4^cU)>r@`6;j?^z)5o-El%bNoiIb};d{ly4|i5j;^?X_Z8Uj{RaF}$_C z=2spn@)ix}*`DuyXC7``^_1ym9$eEJhw3s%-D)W^w*dY5zB^yRG!WmLQ~_ROJOy9q z(ZRuWZ#OYgyN|Zsr@+(Ycf?T`^%ONjnrN__Ax2w&%c~6F&7xKR5KT_Z=$m4%&pHOpcF-g5{RG7^E6DZ0 zM_gMBm$gS^@y{v}tn|RitK7D~-1O@C4slWs^~})AMyB;iIrjgax=9j93{TB+e`x^{ z&M;I(6772O{g}UsZK1N#zIAijE>Wlie69qKy=PKitQezjE`e^Aw6Jzz zsBT*79&zI9L8{!^f8JMChD$tzzl)aEkEA`m^f>t&LW45)aEAGpC52>pUfb-ZOP@sJSswr{H z2ssE5d?e7nnkg!zne#1dA&XwBuab)8ZUEEn2^wj(kF56V9P<`z0iWL0hwc=VwipO7Xx_`AXAt2VQePuwUW9wZ>9msYgC=*jH$n`=h#$Nvc zf#$TzctQy^2Y(P^uT7cm&p+Yv7}-8b-+N0o<5yE>IbE5PgcXxKxcUYzM&mlfJv6lR>rAz)GgOMQ-*=IB`+lvO$xBsL3~K0ZbmZh|Qvcpa6nb82p5%EQ=A#2FI8- zk{`(1Z>xhfqVyxTE(cz#kDMF$=MkG=CW@tEHf9N z&N>~C6#AZznHF~$f}S4sh@5L$tZ(u0ea%IF+!S$}ttbZF^yu;D-wEV?t=%3CW&5Cg zGI?f%cWf1^eGE=~8LhcwCQkQ7T3kA0%dI5?W_0Gd0#E4ljq1V{_0hFmaR*8^`<OC9={|zmk5}^tL$W3AIh7P}6yKql^B<+D@2-dAv!{ zZOO27D{F|7{C98$@+&e)#5U-5l~F*m;}&=UYU;QPjt@#9u;nUfJ7I|!zyNfN!)S@b zp=}1ux}NvBb;yoxck3*MXAEs&;rlT!qf&4q(sDYD{ zFM(+ZfeYrM9fbKbikQ?RhVtJ&WoH_suTu3E(zqLCFaZELd!Jfu$pHwJdoe`l?MSJp zx5kxk0$}63NeViZeUY#_tD3q)d1KMYA0uD4^$a!{0>;N!Cb;Ztrt=3nLq=Rz^NsHCoW3?sL#4qsczfW0M+?49C|C z8M27gimiz{v3tltCa*NZ&j-(WP@ZMou|R8)r`np35k=Dz7qx0|uraXLMio>GGdDUF z0z%H;CrpCwNQz+poYNwzH)cm5rh;7;DVvV&R>;A&^l{gB&J%Jb00>u@*sLUUkG&vD z%h~t%J^oySk~WfNtyY^#Q~C_K@(T3OJ?7WrwG(WvRUgh|2hz!LFoh2c)mm4Gvhf8vs|H`N4P`z^3H_bUOdg zeMW#TQZhZOc{t^~k&C~PU*)A-H{k939S&#bIULn}>hC`g0d+tcGrNXd^g2&F)CCf>H{BoSXZUV zH z2=LfHR{=Uc69Q#7ZC2W>^h0&(IShxOLei^j+?ZZMt(H+T5%giu#$yv(0jrzFoo^LN zEfSj+cv3b#34hZZp?#m%YQ8CmavYE?)@N`Qr~~ zJUpsEYB3GX<=0B}yo&@}hZt<`@QNQNdPK+{Z(w*1Z&-M}TZ*9JKCKD%i`+>z%dJpj z;!<;_C_dD4HB_h1i?lJ7Y;6}m5^-DHp)6C<2whW&OpH|DWN~jA_M>MZ9q1p}HLMQ6 z1YDgVYhRN!-_G$UaG6xAQmJ`SyR?~d*n*g8ZUx0&vk&^8Ic8mCZobyu4HqK~g@ZO( z_b@HM9|^f19HTrE2h!7{QJH;)tUCGa36EYO?Wl+En!3)=RaqrZy8zGFkR!D@tzeMyCCXOacW1yz3o7J@S=0T6{7kkn4Fb z2C^i_{yPB2@QgmE;OQ>PEeSHvH)S5S;2b{U&kr_tsA1uKM{0xpVCMRE;H$xIcXA52 zUx#T^DHLvkn)4E1ny+7fQ*6*fqa||&7zOE2gF&kvDG0d)wNe}izLpZD?H=Q}^ksE8 zHv;?QYlt@76=XMk=(fEHJF|tm?&Z1G)DX|jFlI{{y?q4L{AM3RRim|=iwZS$^ z2*xNxt5De)Y!!@6RvX0<|9b11Phwws{HKf;o!0IV&Wpn3uJ#|aBPZXY$0HAM_Wg!9 zgDv1wEadMwyTLYrN1Fk15#3)7Bl2Cn9hVwXh+*w_YGt$l>c_fT9T-_%c2E=lGzXLP zfFk6is6~orqcljihqgF>yHlZ4rhY3!JXWTNS5=1wXRXtC6WG1>g1u8SOe$%ZY9fP? zi>$d`>EYVR>zB01S+Gx1{MjK;dch2-lxc85e!!Q4@37;#W}g{Az+?JtLzztHg+Pv! z`OzKFerXlZC+=zjywyHfgLMG6USm8CNf;3TPyVCH5m70@c`Q z1F`}MAf!Tu5*)h8$4DC>UG0D1kmaQ;Lh@K2-Ab!yhfBugd=ud5Y5Vc5fE-`uOyh$# zRFCt|HgV7Q3jQh^(Yf$hqsg0t=FC~wW>z)r{hV?QKL$F?wUM&Oz#v-o$i#0Yb*pOx zDYPtHGUF!d9ipwef;eReV@hLEf`tUjFT2$?*As?yGeu)Mmr*>2gfh{>`(xzqkHT96-Vq2EzNv*$aQz3TsTMTq`(O8tBxquEV@L z<@n@==>b*4$m1_w0mI(XeDb17VBDu@UUnwf^IUI&1UN*u5KB_Q+OjTyz|aqB!#AQkM+ygIc+&B70|}Vl0wPI;O7b-P=a~US z_A@H(V&C&#{oGmYEcR5tMlH#1Cv!;KWs?m$NqY;2qPyc1Mx#>*_Y8t2jj)Q{3s6AZ z50;_(%1~1cFR#G)l1r({erfsl)Ho+)3?z&{hs{~y-h+te`E^}9_?sUON-=g3y;u58 z2NuWi%Wq1#KkTc~9n?J>L%be;Z~orMJMg==#dvUu8`tq`o&2MR>D$692Xn_ZB*x;! 
z2_xkTd3GTq@4LSmY=j$;SCKB`vsby86E_y5B=VF0!E0#{r0LCeF#`Rceu_c#8dBRD zM-g0$m?EV?q2xp_=U%dHz7C2b#+!^-=)bqLBwFa9YN_2PsLu#Tt2rJj?%M>mgwbt* zcqZvO8XhOb52(qSjJic_>A)7xEK&`_g#Cn>7#ILF2oe&+oxiBO@|2HuEZ}A^?=H%Y zEG`rniryr1e2JT9eKI>T(;d$>P&hX-LxZ~x$XRiDbqHfIkEHs;VM6pja$HxCt7Mbd zu4=YXl{`FNyAWg%&arW~>kjOrs7EJ%6fBW&a`2#B;$qn`L{1QF=G)nSv@XYmB<#c=o04-(yMg+Cagr8RIg9A?SrdC z%a}ePO7x4(Z1w88@zgg^87<67s$RNBs=6u&hpA(=KiXY3SmnV`ziNlx_JsvEX4>D3 z`g>E%t(Uej-y+0E2g@q=imw>H&wU`O`~&hyF(1dmj~?R_yvt0QdL&4w`8AImldP;S z8DtuIpCbvC$;sC1uu3=r0uK7E`C(vQKRw{Q_=;aA2*Ihh_W&U09rxMNk7D>a!xYFX zu$qn*6bO7ST#8ih2gnsTRnURQ@?*F!?)o`6J@Q>kP8EL%oUTkzo}EO?$$$I>v%kV> zAzpcGcf$gLFUOo%KA!>2ZiNW({faPuoy+`{^N{m-BPEQ#lf06fVxuc^rfBw9Za15f zhadlIe$a`yc|7fI$Im#*Gw0?!WvjjLx7Yu4;fXxWXCz%ffy5dP{PzNuAd=ii z!lQ=7#F}RJ%U|Het`iPv%sfBx!AZ^DY^3kZ9cGui`~|eYys{AW&$nU}G~nysthPv4 zBLecuN|mG4bIp>}UVSL&!z0op^sXElzmc`n~ zY}h7Gt55GnHq95Qu!y$aqR)A`Z(n8t9ziBV4gXTv_Ttf)uj%pEP_)eoPFZ#>>PKq_ z<$WX5P&b7q$?(xXn`~YhHo2oVx<%z5IL{5BY5tkA-WtxQ@*5)oc?cv(Q*}`-N0U%H zu$x%K!7dZ9CKby{H~c;bfi+7}E1IgAJ}@$3E*V+3 z!IukjUZ&8tDxaSo^Upo*e@ZHeVB`zk(K0PbY{KGW4Cgn=-((>6O*aGY`qvHjF$>Ff zr1y;b=cdC7OA^v)t-%H6wQU_?EK>%hrMY^k3%<4Kv@Cri+}u8^MwR=jE`_E0)RASy z`@(q#BC==x7+WG4r#WS1f#a(xI>bYv?qgu98331E4CL3cdb|KDAke+se6ORRk3Fpg z_B6HvgLC!@#Lj!!o{~`Wb?S3wq69F{`IJz{3>HmRs5MBC=J*%DXdshc1ET=7L<6qO zGX`@~Zopd>0>+tSCtOB@laBuG53s=w8l2NlETx?slvl|GH||;OAC$uY-_8v z$8g1nkesJ{*{e*98G4OJIR!)Ef(2n(9`(g3TGJhPE6!Pvna@}#K4>e!oWj^cBW{Lu z^@DKDPHPfF{uXhw4cZxGj%K7zLE_eY@9U^2^*i=oc#Zdw zX9*zpfm`t9H@tOK?WPzQ{<_|jV=2_-38wVv2w=DS0kkz{-A6gQqd2#uUxL+m@WBf1 zX8`Xg75y@NkfHfTPZ*&>M}q<{Hjx=_RIP zMFkeXcg+O$#%=Del9+$8s_(AIzHGysxEbCrLb6m4TG_kb_Ly$Xgok|n1hsm((%8ng z+_@v~$I0!o9xVer1^-kUDM$W6DN^c2KDKYqTPp9HGY$j%{;P$DWTzX_08%CldTBO zv;bV@UDHI5i<3FIK_H>a zJ#2CkM3=1Ffc?#dZ)DSRm`h+$>1^PN{gm$e%7FQtSCG0lxFLjbP*SbS927lOqU05K zb2GcS@6vW;C_nE=>A|P&L&K$a$J6dz!?@~xo(qDOPiF{qy1@nidVste6JMqzKlxyz z(flMaj%RPBC!d1K^`C4HD9w8(7`gRUgyuX)xnBQhARf`41y_`%%(rn@Tx`+pYNyUl z+}_Z4cd}U^$^Hg|j+wrK`JnhEs8!{izy80!lcC?2;Jn_@N4V)O%s*rQ93%${P)d9~0$nW>!AiN3&oBC|@1R~{SgrZV1 zslz1mU~X2j@++ym6Q~K&XMl14XQ&?uqHH6}!xf(oR2}uskh5C{e~T|K;&!_aioE3# z_RwrhXkz{0Y^FA-EOx=)xC8m))RtEQT93YLFLZXmU_7pe6+}gD7=GI(>ZyO8Ydi`; z1-Ujg+T)#x(hJ}fzRx=$Ylr#;f=rXzXwwuC?}iY~=fQK^;f$Dk`yq``)36eN>E?2)yT(-I zXRZpb0LRf6*psh|Ze^$NpB#gX+6Oybky?VVaXZVMDbT=SbI}e|nDzrwVS$aRg?42c#Gf-a)o6^pP`(#Hf5xncJ_x1?L@!jvAp47i<)(@ z#8bij3jUD+huejtiUBYitboX*8aFo#Z@Fa_6`ARP*;FOWocp zHc>05V^imMx9gM*=IPA$St%1IhSwghFkXoJAC-7GoNNUhIGLXcq5UyTFYn!1ZK*8* zT?gaAHdFajc^R7H19)jHLhhZVvCy4V|SUz%^%fU8C*8Y zoLOKo+2tv(aJO)-@_G)NSe!UhcVsXr7Js_Q-M9AfJO8+{imF-By7+G2WQ9O=u)e$M zLGij`jC$hEK1F>FbMgAYTg~Z7&Q{N(%7w9j!>v@yzmOAIG>_8I4xs!Z_g2jwNd~@# zk zxuG_xP?MEf#CrBC^C(SwRk=_2_E>!o%hUC~vVAdsHK-d_pec8Kd;~SAp_!_YPU74G%`&BG-1RhjHeB@5yY?hs%5LeF6Pe}O(+`So8b2g3)^}Cz;k%`q z<&!-}cHMY7udHC|MfYufvs!yeb>WG;NEe0tv)$#EF%C2I4!I1c3g;i8!vki0#;j@S z>3;;2zy(!6nyQGh2LUzq&%0yO=RX_#(S{_wQdqLO@DNQbJHVq)U-dln`kF zX^@hX8dAEH?x8^e>24)OxP+` z#=+K+z#d`71XsKss|xcN>JEEU0PnJVMq3QySY{;iK|fUWYEuN z0`SfBzD2_~i?x#|lo4P>kXH{nO)EET0~)!24=iehG{b&?Q2ck{$#WOgt0QZ8V0M%F zPEeg-x%&J7Y$C5}L;$MvtGCfP8=ALBkJmp5<|;RJ3V5N{Zk-GRnl|gN%}n|U3}Qgz z#&W8W9n5;xg!bq9;fMjv^Fut@AUFa**^;?KkP%A-opQ%kO0FMkcLE8>rhnz~7PMm< z@bh-Y$M~dl>?wutKoUhc#%IXLa~F$c3P8nfrEx-=>hg-gmj6jXGdEFj<2HUfG3 zBZt$2CQvcFA*`N$RtXjVrE6;2K+f1{W(mTFrco7~LKx@f`4{s`F0@QW9-+zj{4*XX z%b-`vqaTR5zqA0hvwM>8sgGbrJ4@g)yp~_P`*j~I6mK`37OJ-ZcG(W-Q*X1?bu+KP zK##-g79TBx$@$n8cbm|ugAY>w0xh8tc;Gwp0Fs~|saJh) ztZrs;6qu^OqRwsh@(ZWl_oaCFs;sFTk{RT1~MLg(6 zFoa^AIbv=CwUy8Lq#8(aqrkGczcf|UAFy6($pQJ^R@?8Ztfg(D$*o&8lb~GeMTEp% 
z>Gv21V;y{%&CK5+&9D(~f_X9Kj=VBCNs5%JZz8Cq;gItNJ2J>z*|K6B6jkn{z2=@Gv}M)5Yx?PiP8;|m~OO8atMC~vL^ z{|ag?>fkHqLm^hz;hj5o+;1tq&~dRMQTs5WXyP-u;aGb*530dbu9H&lO2@3_i4e0z zi1pQE9arJcZ0*7nFNoI7iFGk3`OV*OH#oAmTOK;nA8^4%+>)L~JkzMoF&si$?~pq6$(r)W*wSqaLirl1#d3KU=uE=lR==oBh=4Fq_P z?-Tn9zj3Ia998G>9S-}drZw-&^2XAf=7L3^ z*>{<|n1j#kMfA^7(E!60vYn(N(p?TQ2_v8F!xI~Q31;6>(qfJZdQB@XUj;ofS}RM; zoiB*s$$Y#iQ62Da%_ZKQgV8BRyi4SS8?r>bf)(g!)?9Fg0K4j$8&f;i45on?1kxKj z(E#YdkO|;u*a79=08kiPg^SJwb0?;WD&0xl1HJ7Tpw>|PO}K6xOv|%Tt^C8F2BAk> z8#o0D;-qWR(lE?zfw|e9Y0wGYC|o%|i?LhmsH*4cRi{3^XWEKez-o{VHF_)ke2p0u zZdhNrZs3_bZ$LD&(?xwD5@7*?&NL7eOI-h>trw`pczGyu4vfT?av3V4DvN#=C@*Y! z!xewc$n{>T(buT0yYadjx^$XuT3sRN<6q@RuWH>j*1feWxX4(=u*l|}V?sTPNj}zX z0hF3^Rfa~V2o<|)KhXXNNm8g#fwCw6@y*)8CeRCn+!=7)AGyja@YY*3;l88-K|twT zs8B!9@~!~DacUd2`-|s^BJ8b2D_IGGF#a4yr20r3>c@AJvtL{spMNmxQ(-EalW4Fw zne5l6hbsU5ZOX}7r^iifkJUvdP~xlKQfJx4zHYA&# z$ZdRG3m~PdnTHp%4IK}Pp+d{;&H+v!0;j=+*_}37M#r9SD{4GJ{A6;umx6| zJT^T1%l49@pg}DnR!5ujnb<92Wq|HaT7fv1Y%{qw3A`3JA)58P%l8esFyUv0GROZo zf;?}@)8s6120CZl=#KYTwN1uj3+oN-+h2<)@@0)#?_7$epc84(UQ%H>+AJ#B9q4oE z>It0|-cs3|RE?R&5qk5qLL@YeRY+A6pFt%)te1jVOfUj)%_qs_K3w zJM7M4^cJo*(0vrSeQyflNCd4BsuQWh(PB|on!VxA*Q?@Z8^=A6 ztkG~!-@LqZ!%uLzLUFM|w)l~E12RHAA)Au-=u zdABiY0Vg>N2o&!{AA=@=gtfc(ZZnNFF$+WOrmAXPAz6DIw{btfUD%EpDlcAQfHOjZ zl+!y~8CTM%|G3E-r9E^+0;o9}jjzBxr(Dro`Se}gt7!go3+M0w#J=nr&wHSP4B7|% z9zsD@A%Xb@d-Q8iB0>k{0z-@iBpHvO@JZ^b;Zk(EYU>Yx`yc;Quk7f8)Ix;0epI7a z4z(Qfnoq$Bk-eM4o2OjziUdC<@KouH?}+SL}5tp@%&M=BL{ub!B zb}hPOk*LFU0_x}j1SPmCloP<>0{iG0Y&&aMdeC}R7L>v)wr^e*-+~ek!P2sfrE`o7 zWv5f?4#zv+D?8yCcGTzBds3AIB3Ri};Gr!`aI7C`n$aUu8uj_28!n(%`96^Bax&2` zsD3{!uc+Iwumh;kUPw}3TQ9IGQ_ zB59`?Lgi7j-0B1{+3G z(9RSaiv_i8(`ya=nSv92~`^3r*Kk6*Ne&?<9S2K)KHi1AlfKQ*HP5L|v9f7Tl z46=Y}pl5gh*t-chgFf#Nr024)F;I2L0p_v#2>J!k4&eU;Ash4kK5CM-UHPa6RW0fp z$Y2GsOS&ST1~~=pPeN7FYG}lsy;iilCY95^^s5N$!Lp6V<_3LKQlK!pMs|OFh+rf+I4>4<`Q-K5^(ho9mB`& zwGqbb1(`c)JdwYoh)svN5j7#P<_whqMp4%Dq_Zwc?OmbQ24*%rIQ2gJ0CV zN7H*Q{_@9XpQcv}^Gy83+u>JU?pL z?ikgL7v?mPx*zWjpJePL6;oAS!6kp!MylS>SP=H;#^X9nkCjkm!105PA|Xm>8ED^96CTX9^OCJg&#vFD*UR z9{9epX!lQjb6OI1Ou6NVz9Y}ad%>}*B!lqAK^463!A`YihjhosDT$QX^&f0QqXHw| z4U#f#*jjF2Nzl8upx<8GP*`V6|9DM#v{Y@66=B%cYTLpN#s<|$`FHr>?Jj-Me)^&f>zwawL>T33~amTOJq42m93G+#cZ%p6-A zKjkS|ZJVpmZ#Fc=pC~tt52qG0gg+(uk0dDyT%2z!)83nyef<5xQ;qNKS2;h@HMWTE z&h5P-x_Yv|_q88!27E75!r}WcPFJqOnA`G!ab;p%4f2~2h9U5*rJx@XMbkV^8tXhx zmls_&C_#BMNbz~}z4GC`mVW2UU9q@2GSeWlEZ)DFjWh}p$mh)wy*O!7beux!am==g zPvIZy6}ud+l$6{V#!7Y?9IxLkpq1A8CO>LjUv&~U$0szw|5hh3o)kM0VN-(T(aCf$ zg(5Y|F#bxct9(0GAJ{)QD0&K$6g#$jKUg`GclkgE@AGlp4Mh5{x9^oeHXVUkI3YCJ zD(=QvExZw#EO-AtVR<(HiA6xH=E@_s!||fLX!0*Z+Z}2ptXpoDmacqOs?mv>O6+U} zvy*k#4L;8Drr7lc)S$Jd{OldYfdt*h5#` zreyT_qlZP(US|?HA!{RzFus5Vx|W+j8?_B{y9i$c+hUARg%|b%UxBt($uno1S#22{?B$BRNJ18#k`Y+THHU@In z(0+V@y<$lP3hf(ukTvIM#6&HK{M>waMJ!0>&oBUNpou()yml)VZ+Ueka@#HsDHp2i{c|hE#%*Y!aWHH^a zDDoa-dMWSA#polQ$Ah|6wa@)4^?aB_6lRSDdek1Smig~Ygh$HuT3tFAWD zqx{s~U30Wd^qpTK^XgWynX7+t;?!d{BU$riB3v^to;Mu9wb1Gt4i|7MDWR0xl~pA4 z{#;dr^7{-7Z2r?@k;fNX{KfjuFRH^G`z5YMwa^#KnN)jIY zOR9qtBGw_Wad%F-*VpWmIN;>3KF2uz zFDTLr)XEgUpw|li5m@Ar_g~83H%|NCvajY;!&|-s#_z!mF|R-C%Fs?*`~dZoq$@kK zpgS#pq9N zh~#F7R3S2Q!7wi<_~8F4DgSGV3P3$Q(cr2~HphFfoEBjcul|xK%#hdej>?ZOf$Bcf zcU|r7@*Qd}ft2@rI*EhY!{Q_5d83zwouLd7SzVvtr>Fj>$L^z-lb|=MLe9wj%Gduz z$NSe?{`0U`2sz}WppX3LxsXL|tBHpD7aZpp3|ibr$e`~;M$sH&|J#Mz171$HD_vE# zrXArv65*0b=ykKr?@KRM2TYE)f7r_rRzb#8WN_n?qcY-4{Cmp$JDvDn|L`YV*4e(v zs=j->&ccV%bn8t%r7HM(?B0os9rP9A)0uv5URdsON;h2z58%nuUmLZ5P2)XzP|2xp zG8Cnl!`Cj)?Te0FMgt}ejYKs1{{m+I^Jo5D=KZ}W-1EH1#0rRJ%UCh1^Koz<;Z?5j 
z6T{mkP%C_i)CG#iY9B1diZLNjVmybg*9h8!%dDtplkCgU8Oobdf2K44b?X21VI3p* zs8ethga7rY|M6pU_+zFhSmD86VE=2{`)f}4Jq!L%4^xhy`{T^k1N;AR;Qw#}G)_-^ zZ%1>||3dU)D-c~S)R(aU!4FmdV-O0oC1+4$n3*|Z+)Eq=>t!6Lz~*VTI{k=&TW zsc-r@qXWPRZ5d3?HS?eob&D$dy0S zO;f-;!V!zH{Uzjbdiix%q{N_JBUwG*GHw3%5{HB*bDP6v&{4zfAjAZ8py?&2Q}`D7 z@(>N&+M$mFVE$4G=~7vis>ASX` zou;v%CMs(GQey=&3N`q|oR{-!IjYgO*!!2;BVo8d$;)dhy07w^Gi2|jnQ@;O2&=Ik z;b)_{MZ2iO7`RGzc7r7^bEk|d;YZFIgeYan2LDnGBGa~ALw#?|)} zh+rw%KNcK3ynPB@l-1D!FC)y)Sze8=9-deZB;VS!f8{UMjDbzf>D3wvmx;)Jl38VIG>~9BosuaVjm>$D)Up5r z8>G@(gA>FFV-g;Ky=9gYIAYcZ%Q;F;dRFcxUtm^25v@gOlcvxtwcP!bSI9nC2U5W% z339O@=gp4nS&qyJMP(ndzS4K{y~;kQNi00o#UJX0UP1Wk>a2HtbIed>$Q!_Dob z8;p?pJmtEPW*l`}C+{tuQEKpV)=91{q|V^X4pFUa7kxU-xMlBFKZ#5{%q-^DYs?-M z=86#vE9kL!eGy!nkvnpdSxjjF7x@wTS7P1ZmdBZxI@5h1=}Fl{t-MS%HA~)|V=Q=* zYk5kgQM!9**JI{5TC=%iS-8Mwe*7AYGr*h`izjA_6~|gS>g&%0i=g>Z)u*`?a6zPc zsHNwKgHsJMJDA65?lX_UWh}gwo7;#`?J$U;CQpBgiOP7Tn3FA(F-To1{_sQs|tQoafFS~?oOJSk-b z$%)ULjZHs#7kG7%n+t^=h!PEWmB1B~El#$V&m+@3{Nb-(n9xhoeanzYEB6_1a18>b&MhYcTfz_@mz zN=Y;+JNt=5SPB1Q3P>w}Jv4MpyiGz5H17nR{Y82vFktPG+>9`V<>p(o=)@xptL|q|WO8SEWV)B-8t;42nMDu(GV2~hd zGPaLqx>=$(R#{x7PhQ&m=%uwcEGRbtipZIx6S^e1@p_H26FW|jX6o` zzm~JhMGcz>Ik{l>L&@#MHy`n(iR1P&y{6-pTs7Zbg(=Kp=Fa2(`n_`I4l9hs)ApNQ1zg(cOP`w_AB+oU`OOlw-po^> zY%KtbTZ*TG2C2wot7RBYzDA;usiv; z(i0TvldE>Ne-c5nzIAcJAAGCII-BWvV|NcL#%)udGmhHsN8_{x7V>UCDjyV7_ZiLh z6y!4w%Q<#DI?SWLmOQD(D(iMJ>Gd*Pr4skcT4Dz4XBy95dUJn%SxfWzo&xT5MAf~L z$36M^ZLDL(Av&c+=$@z04h`0QmFrLA?B&k;E58IK)9sO8zSfVHw9iCG62>}i8{6znf80PI6a-%Y(XSafYZdDvL=ipq-~ z5(yjGK=$&BNo2F6x7izuj297eBi>2(E;fs8t2o&UgSd;t9IZvb|5t7ctsQlQ=!iDB zEm_s9QA*P&NP|67?wtC4x!(cf-`fHOB+tA73Zl1OK=${q(D(8oDFLdh52K`^vL`d) zw6Uh7o9PA22A&rSLcJoR7Ic(m^U9qKK2~N3;&rqVp6#_(FmPpS^yaQvvcji&syMGM=Uo@mRf2y15 zElj_D5etsn7Wltj#ml?HnE>23B0X5e{}dm8U2)IH7{}4)TV|3R_IYxgFNNTf?+)&0 zN|=W#KY0SnRl=@*r=yWm>&n!Ux?SRH(IjgfbB3_LD`qRKVm$8g&~o%zO-7hc82m?f z58E9XpOb*x3yIo}PJkECIyPpvCDtsu7`I`Qob{T{W&dxo3Co-kmdDA#J3&NImRw&-wy|uulyW}+vPYbzu^P_ zVAZ^A1X8@Sj=PJD>&SjP_*e~<50R9*ETK|Iv4YbhrCsi}h!{#CA~#=)f?cy2w2VZ{ zf7!xl@LYddbwyqIb?-FJ(Dt%?<~4Rk8xiM>zb^sZ9E;EOX#PC}dT%!p-T>0cRqjCP z41boReV;=^qHGfC}e`aP_n?#_bflMkGW-ST7c zK8hNjD(@AM_w5Vd!4|$0;<36Emezn>zq8oq`?qjDmGgaEVh$+PJECJ z^Pf(uZ`MA%eCC#@lTIuAaA3E~{p>$?QCBdUF*8Ke=5;)UHp7}dBo&uLuH0pGa2QAY?0S$R18R`D||&e%;B#_$BwnWei`3~?HmkNKI2 z%n(03SWv8^-+9a%$En0?K5*694A%@dSbh4K|(O*l6tP&p!{oc^CUp6Jf!O!nEgPgsd8NaOZFQn87#J3MXs7X z-HMY_-(de%dFzYF$~Q-3=db9vrKH5N@*SuwjQ4Z9z>x@l%CprQV%$=`ll+VWD!7xc zjQ3fyjDmwjo6D8fc~BUop6*8M*e2_`3E0}2FJ#bfaGw(IRWC6*{mb*>ophxqWZOG9 zM9OjDoqPN6frpYO!Olj6wH<2@{ORQ97#&4*;m1YETsyGxmSXGt%c4yjdFJtZXnMsT zm~Sb0rIGc&&bfOnOm~A?nVij=mm0RD-t=J8w*YodL$bZ+HRBeRSp(2Dt|=wf6dt z(;|<)>o6F{RU77p3rzHu_?-6<;SmewDXjo|8X&DOMXrX4WZL zNH}eX2*BN_xIDvOgUvsusr-BFrWA*joHoJCwh2ZD?4YGpW(o-YZIiaRrn%#Bnm%ZSGRk zS@~ZDhQBSVt~Z0-%!%Q??s%ZS8N9E6H;Po(@_t(PdzCliS^4Ss0_}0|V`Fw*DKeP9 zg$chcm)< zE?6ubcR+;#B&kuLMH6kiLt^#cv+@ibjePK#-EXZ3=?hdY`A@i?boKvpZJ%Liw7*4W zCP5*Vbm6Ae>Q1I<{4UuWx)RRMu1P`&L>xjH(;Fd*gKP|x(l2gHk!dnw_T29ymD~{i z(EQL)FS00vLC-F7N2hgIre*yuxJ~loum$g@_@SZ!V--{#9?VhX(R*m@*Th!=Ea6W< zn)U&%Jkf(cH*e4Hr)UB%e>;(=cnXu};=JwNK?N6<#Z#l@Y=ZQn(t3IJSE~DJZG_bI zl^WC8jP+kP?S&7ch?VvS_P;K+91_T$Jq}_9Jh*Y=zVhb&!^@+x&!bG&9B11|RM*b- zn^oiLLK)op^#Y&9Cm%DH0Gyn(#*3my0|TN#<-l!r)G~A`DzZ`VM6-=h?DFL!i1d`Q z7NI{1qa6GgP-YOIf}hN_hO8}nR1;;VdwcBN_Dqo}xF;VT@hH1NJqBC|US?nD0vdG` z1pnkJSGUnmf(DCWSMC!_LyM9{waEL5HF-`^lH||b+1nC__0J3x+7|HvAv`+pD$ixB-Zrtz-+o$8e!m|vVSJ%f4AL!V2 z-P;E$xe&-QK4T2jc+GrvP^)SH%RtI+xHxTc14!0uhw6F|*rgQoMOFk7gdwq=0Qp{d z_tSyx$`Kr+PK00?BuX#csy`Fl1XdG%z>C@qgexRt)`?~8e1=KyIKur>F1V0_jUIwz9 
zqC0C+j6bhK#=4B3KhDNS>ZKrqam*dWXCEx@y%FO&Z_x;Jrr)8hI4|<|)9Z23T^)>Y z3fJr>1D(t|Kn7t7gjqdiXy>=Y8}AIuk``EbNhBBy4e%wC^)B~VOXWbFut2b6$ zOqQsar4J!IPhw2jGBpfYJljkf@{_;38|o8F^Prhh&@aP*?5Cl5#BRVL2{gKHPQSSv zT{?0aDm+q(LX<-JT41zIl-8+U?KH{B;+IZ*kmy$A}eH=RCt| z7$E*$Z5V*qO_acisqwcD>$wy({7dM+MR>gEoeHo3_REK<+Ki3Y3EAaEd=!<^^Rg7HdrA(5L%Pfxp+4VHo$ocxQgL2dN6-CL)R zY9%d?*PK9D{9rR&r>6(Ry|O>%&hjv7XzVX^?6^g7McozmW-3)r;yP0P&gQ)=#9LmZ zkl@o-Uovj;y1Ea%yoo@74etfM5`3jNp*NPV#eBCq5`l@g*_Iv>NbPuCb1JuN>4+e- zbCGSVZT-lfc{u}+NkQK+gwlm3%C2yoUk_D$wUTU^xj*Gier+aQUH=*kCV^yIEG@ic zUe9ro)pOt^$oP5m^8#M-Dh8K8hX?5Aye(^Yq##bNceb(XDec*+YQxPNM=L~I52H($ zvh5uuOQ3ezasXEVLAb)7_f1P87u4=v1AgP_G+@mVQuDM1EYl6WeIureb!&W$u@JFN zZ__#DyA8Wh1$YG-9ppc+>;4Y%07`Rt_)WXq{B<~u!d0Z?<{7hesdV`1&HOt!=gG}7 zNP$YSdJK4dxX({P1tB>UIo>>F-~Vf0@XWinJ*Fdl9_56 zbtY9_v@u}%coF2TBG9(0hL})Y3i(`y;ebiz_Xb%NM?mo!AT% zH9vZQZ7WQWGjeh=Y8@I5Z~9DD9xT0uLmuG+z2)@r%9@%R;%NF>wx8c0;nQGJBd{`? zKqht|I=f^0@h1SAm;|wwV{8)cYE~ATe-DgcAX&Bxz2YvhxYcob^ZRKxOAMaMZLqc& zL+r+)X5Rl_=%fYTwDnAXty$UUL>6mmvvmJ?-dC``FT(y=euqFKzV@IFGL2OQ?(@9X zuCoNbh%7MAztMqEOj z*aa=EBJ-l><*q{VO~82hp7|MLU3x?>76CT?UNzV42!DA zd%9zQFZubco^VmwVRVh$Ld;@CAR=$aq@kX|m1{$yBHvL|EV}-9Z>&fz{sp3aDllsC zOEKp8$y=>E(TQ6fkQQ|O9<(ZVf1-%1xeKoKeTwDJ6I!I zIBclAz!ITR5JN6w9drR62XFQtO#?WAP%G||?;&D2j`n2j3%MO=VK z+nc}jPSRuhs82YT&p1SJ@l0x%w3pli)$v~DYsosPBeWS!^^Y#<^wJ+~-NMO@buz>m zod$p+l8?+gV9|O2e2M*|%-oX61Kx(O6%rS!^Vz5PyDew2X`SRuO=@avu8Zm?3 zj79s9FB#>B%iAsIks1@A32qXpg5n6rn(*NT7l+!$Rta2@hoJN7AZ_;de116viv73Z zuN$iF$rIehV{5r__!j3RzK3#=&n$Ofj@>SgTom@6fu80>-82?6NNhlyap+Z$+XKYm z>VmaqM;A}oY<5eapF(Z~`R5z%;qa)(TB8fyU2iQ4f3W?~zq-@@R#Pu0@#5P$eLYb# z%qsmQ<(Zu2o|NSt%qI>q@q8fGuUnDkUCg9p6{By zPM6x_;zV1%;)&roPP}TpGdWphn`9TGX(><*#H-%yGa7}9@n!_JoG!lYY<5!xK+9^>Iu_|SK;hh-?8MpqA6HP* zP=f_Ra~|8bkug3L-eVtH&~pNJf`IMZkD-&vVUdPU#aJ&5f<_~zL+&4rbxMt&q9_)! zGd6r$Qhlw)@pqaahuzb%o70Xv8gJ?9&0{wmyE#54Euh)wG5i{-P}*@jp%?OwoG$V; zJOWkPXQMqKz#)+cEnugBT{5%ZdB0*-MtI#t-lL(~&x&%!PFSb=Eb_^{bMN-NQJXnX znW)sxVUSleTS`#pl(5X3-C?ZC%pvMw?J8pA%Z z6H3cp7dCf=3-HDuT-oTK6!zrh~9XQ(aYbhBpz*roUi=c&;gf@CwIg<|ehI78>{FGmC=`7IqXmg9fc5okIHyvP zj8nq)`-)x^+ijcarRVY*6?tF(y^+Q?gyLZJ$Q|%as|@m}RS9ThK;HD(>{iAnu?nN? 
z*yv(NZhFFwuWEZ0G7V(NH3z6P*q?ra?aW7JZmrT&uT-WPo(wpf+UyKWzTskJ5)yF`t@}a!wKuNkAM}OKiiL5yB2VZP1Z+?fUDl;W2Lu)D&jJGKEBgHwS`xUGx-(|)J`&pezMn@Lu5^B$Okl1X{1=WW} z>8*9Ko3RA^7+6X)5LN*h%pa3>EFG-N+yw7r*(7d!GJ(I;=C;EfL6V{1h(weWeWv0|O~J8u$QbeVk{XZVlWP?w=o59ecn%qPoY#>J~qFpni@T z#$GSKxX2(a!Q8+jYHE`@JozSuvJD`SPHfCVZ#RUvpruJeRRbK6>91FZi#k_1_Fn{ON zoCSut77-w42M0cS7$v)v8+2X)Tpd^VlaSO?C(#9pr+8r2)}`E5A2%%R4W--!VN@If z?4hc8O#A7jk07^`_GiKBb6~d(bLcY`I~xNI{x(49Asz=DUy2<7Zdd^wq2HVaQapFz z&X++$8SsS%&ZX40eY=B+>ufNqDsE6K>6g=i`{=XoN(U8`rzstG$@jQ5w zw+8$WsgUf6^Cw_>KhN@$)$?5@)kF1CUw-+?dx>ZFCy(k}M8Nvu)NcAg|GY-bM6}y* zeNF+ryGGA${z6}QCcKP&H7JZ#6O7M?jMWw9#fOrxogzbKXVJLjhey#*qw901Ma$Pq zwc}r`-99OCRf8$&Jh~{=RuR+FUoqYwq;|F%3!G#mOj@2Q?ieRIXlaYiUp_HNOFshd zB)TCz9-ZIr2){j2{pd6rH(A4OVx93W(Zis9-;aULvb~Uro8L;bSLe{n3vfT#o{w82An_2)N6o8c` zdzxUDL3++PL)oM;9jn+EO=2huw%P8vNMT% zg;hbowzFN%h1E@4Aog(Zu>b-VtzWh!AQXiaPs0K7>%|ITRx-P|RxdJ+fb;0;RWo%* z2sBbOvfwrQc0*tyU0|q-Cdn1o5TC8c-m^(_APsx(#nwBiwx3lAI|3|z&#VhQ`n=A%9=Zy-G~@_G|!$_bYAKL(FQGW`#;)RChC`+WEMS& zz$#m;AD@g*x2Ic9cdfctw@}fC|J@h(8T{ADSgd~+u>?5A?1{^><-{UA4ac?UEj-cNFp7tSB&zSr2 z9@GZ&X2ci20SX+E7~9jZwUwjP<}3PtPnCF=7ndAdTo*tfV3X)5HHK;Bv`M)ps^p9W z@W&DOK2ybvOrY%$Y?uYe*WKtJ5BpAFd}RZ6;R_w%CRD@LDrWFxc@^KJX)6~VQ|FFd zgGUoCHq5zY03C}WB<5-0&=EdgG8S!6CO04(%_uvrPG3>1akd|B2%Zlf53jVZYS^&o zG4T|yz`$NpNwIAHF#@}oK^OD)IR%S4=~`~uoEH{b3ShjqShwVUH-t%UfPHIBm5jU> zq|$9JKwnTr)-S>27+G#Dz)R-0)+?>qovS;V5-R_U({RDN)|%V7lWS%;=hY9jJo>|v z4c1;ucfW63sBHZ%Kv+vT^g$`RAKDjX0~GAL$0^D=BsbQ}l56Ha1aC7kEt-++Kho8) zopvRcESV_)^w9q6Fw#?v+o!c7PLDJ@GbzU_;I38^H+P!lwj`*1>!SsDrG#0R%FClT z`Ok7fXFtIN+_1^t2Za!(M`gF~Yp;9NaZ*W71?FyKTCAwUQtR<$4VYOL zpG)gGn?yJ#!n?J`yGd(ebu1k)Ym*SNZijC>#Dy!qDJ`*Yyps>GO*johx$$@4>q`Iy zzL5bfR-d7iaH0~o?q}553bKWbo60UsBz@O2@z(Y$*ou0uk&tv$?b&Hk*eK z^AK}vg_%B{p>D9ea`;|50<`wV1~WU zV2QEUoKS~Qv}(?@Bd2NE`u@r7^Zcd-mnU@vPYys&mhPYW?G*Si`E&kLEs}T-h09-C zL7z!WbID@>9ArK+{fGNJI+D+?E6(%kzWCy!Q}Pf8QP^50x8l=Yhv<-K*%z|%2E+BixJ6}pflt~>_e+Jx+2$lBG_1>B%>E{n`{xrW83<-& zW-s(PkWr8G-c;?RH+n983OoCYPd@RRPyXv9CHtk5uSgEkH}U}H+!|aVG8eal{GFJ= z-iISUd#0QkI}|nDU%cVnyy}e;tF1RsbIY!LvPFn`HU3#h&G`KpC^NEn zSNlDbr)dFjV4qV@*}D{62L5>}=l&a3L*gQ&4^`?)#p}QbX>C=eYcE~BasvfIJ{R6L zdC-_l^+X7GTMKo_TJK`My8`_@_YudRlBj-S4suz@4Vx;RcXRV-uM2OBmj&1p$T0L0 z7&N@X&(F6QQOf;+W|!0Qn)wU86&{vyhBB=_9b@ME+b*ec(oWuLIkfo%WaI*OV%_WO zn_z96Xhl5cy6*hEd2hA6%ruR3KT5unlzW>*8ELwEKjKG~B_4N`o;g;VZ&Y!ty#?uGdKIT=h(F5YK5Uhe`?#vJEsEUd%t9}Y{OgAjWz8>ESE$x|HC#Y0zSbs5QoipY# z^e-DX`f`tkVfNX4MpvM>dzP*gHPL;mpOHnWmgv(oW;buHDll7kP1VN*Kkuk$jGTM> zpr|nIN%`Xor;^-Mt555+Y?5;0$M)h5Q)@RHRX*{lcSR%>^G;pd(a$ua-6YqZR+mXh z#uzkAnF1IPKK*EXA_TxAv>3){-Hrw1>f=ZY_lrhJT1AykI!XOj+zPwf*52FeIH;hz*UOqy7u2KT(QPM9!t+Xlr zNwTnj(KYIMP|zAxb1A#|j;Khp!V=l7!|R^t`ToGdhaE>Si(NbTqBrtMi?9GI9ho=$ z^6Lk8fEs4$k&8^Otwj>G+i_|osBY?|wa*7rzbSm@LE@>xnx1_2y*r#tY4J8Yx~q?f zRnB0Ld-A|;rb>9v%ZIz29D~!+@^t-tq`MTWTbgKe_YT-muB_+7Wf>-uiN5|TrZ)8@`Tozj&W1eh-`5~hnUVnX`6C&gM9t?X2-JT%dY@D zLaZ_#>`dY6=UlRPFXcGdFek_zzwSL@OQpono17pk4)RqT=Ih-iwDl5oups`AWPq$u z?QV(o_~`zm0i}^7K0%J4XEpZVK!w*mMUWA9duMGKXBgdaAo~=x3#wKWVY?j{a=0NJ zBbbNFQziVe(Ddy$C)H`Z;Z&FG&}$h4I#zWDdA#E#+y^)XO#%j34_B=<;{@$L9s}Nw zR~sNCq86_(L(2=fJ8KbJh3)b?l4qQ=M$=Nnk0mA zmmZ0L2K;z>;hXdvz5B%U&QQH4s{%BB3j&eJwDQ#j85%KU$%cyiKksU;(e5!9TcybT zblgyuqB%1eG3!xgt6`*g^n2vcE1;vkatgbJtC{uW%@D?5Zd8(B5Ab*_zS1VK4 zwD|^0iR}$$T!mLZMhl;9^6*3nDasNt1qcO7+OMtRJjQ-%NI5KLRkV90lQ zsfw`dITGXh9JvLSG*O#p1YA5((wahbEniV0HB4ypB870Ch9*tvEg(H+)f2OsExa9H zo5uUh{zg#ndq(gkv@XvO-0d;a(Pp8S$54=~d`W929 zc{MUOI%;7vcSqw+x}-Gc{G5X3e!n=cD9)A!K5Mx3iCPGt-^S zSCBs=!13k>^l1)>9#c~V3VD%`cUt7C@|L8M4wSxH+vCYU2&`TEp9Q1ZFe=1&I)gyR 
zPh6@D1XoX4s+ZGdLAXm;P=ET-5kTzpJV4TmiqrM@^$zH%5};k@+;XZ#Uif_jbeA;y zHtEYQUt%CY29}#q5I$>Z;h|HzzGhLtwKe7>90kpKTJkHVUTx;ttN}3vt@Hje{@7RI ziH#)dRx+thjMW4_EA3G~Ga^B?UsnSBcbBf0YU>uP7HXWPFNQ(J|2l zadEWe4!WXWtCSrDvf~B`cK;?nR(IU*najfyS^(w^J4+c@>rwVuPvx$X7F=8@R&j7W z&vLcLY(Hvm;>1;wypEA-HKrQ7-p(fExlZTwUHKbZCAxNk%*1-V4Lv3kv~WXkm#;Uz zDjOt7a3mzNE`X#r5yS@L2Po{XpQ*|~ed7Uz{h(rzMSIHZTS7TJu!jrvD=%K2Y4Yx5 zyj})GECRQcH^fEEDHCtJAhvDo<#^e6MZ(NJ0qP{S12C4)C>%z0)2gbfs#^(4kJiVy zF-j#~C$qE@#Fn1t*Ke?L-KXsh^Z9*CKx4Vg4R>Wuz>*da_sfcIfi}^39OneWx7*PO zIPl(C-&{}&q5Xbf%H?EkVpfnvd!}kIU8ucZP@Jo>)V7o_B%een6%H2?d;Iq-|9=?! z3aBXCXlun0>5`C6=@2BOK}5QwyQM)=8V969kW`eAkS<~9k_MHKM!HLK==h(pzPjuF zcP$s|T{EvU?>u$R*=L`9xY2%`wOo#jC04ok&feQOP0JlG(BpRV{aMZQU5b>V?ulwWC=FHDW!<^aKChqkpg#p>fVbaG3y^-A7y4&Q9d(J_tWdHzYc3M|vmY=t~ zFYrwV05Pp=Ww#!T7FA|Hv5fdNaA*&xwhp7Uvx6uTym#7&V$zSAuW|`|h!!(|$n%lr z!(sg4g{-D!WnCt6l&WlQNwN0mcD!B2n}MPf1ES4XelZaC4a;P%M5|O?RutUC*LwjJ<*P26SSHiGt@;o8Vs)Ns88fn2KcvjL2qUAQ(X-R3QhPoAli0cHJr6fTFW60?Urc@+@R*^7Q5S|tEMC1>pWL& zKBxqwq8cr*tXp_}CMG~5U3?J~PL~vdA7=O}o+TAhd_(Z3FeFMEkcDjQ2-*U?lu};a z=x3r(r!)h)n!L>hNgMZTw98_*)%7h|+PsLh-hERA{*d|f9S+xzOnWY?32QX0tk-S} zhCHu}ZtCYvX zWla0ae~%WExS6vk98_nQtG%^A;{zbtvNi9s{jW-s zMpDPzITcFC_KVl#l11BgFtG^68fi`+y4fXu*fBc!M&ffJ3BuG~)O)sv9BaBV2KChH zubo^(9%8&ASs&q(uawb5MUJQlRuQx!$f^QR z+@}xc+X8gCr_qHR!>`iCKfhbzBGbMA(&U7=hya~-S%``7>d!4LXzRyv4Zc0Q1=eHe zy-4{e52CNhiFBkuYDsp@MDW*70O%dRUpxn&DJd`olGCg_e9b40}#P{xMZy5Ee#umK=tIZ>N<0CL*`U_av7 zsFLE`;c;4!-T^L8&S!BS0Dll+3vEBs0^*q_KH8*!IY(NYp6b0TBYz9$A8nI7YOjq;Jd$3F^fb?(O?DI?H&Rj`3DXYp&Wvi zFIKGXzMqhN89SBLO*AHY{TYX-dCr6WCnE;>#f46@Cq;HfkPg z1dlC3>u(M;BGRl9IkLYdYOe=!cP+5;ZXobwN(0EE;+Y@}qxjy3FIXn*O4DgQLq+*|DHqk_F7Z8I`83@c8pItk44)Rs z>n$$SXtWoZHl&yt7wkTMi|AhP1nOz%dz>}u(0JYoXn=G$Nk9M{sdUn9A{3bGPvMyR zD_KgZBA%-57eI~-`PF{5QYr%n!hgeCj;$ijwtMjitQz-KO}VTC^6O|c%`UCi zNbfQ0sTGuZChJk3t1c<4UgFns8dwMXdh>`;PfZW;ro#E&ej)eUqqK)!lQyW|l{=P+ zb?@^2^v%FuZZd>(krPx1niSu~%L%^j3H-=6Td^}8$({S}6MWB4LYo6_l zyres~uMqa}_yL&$viS;UZpE>M9!c!W5V`aWWbBv8^44|@;<2%Q{6~bNT-EkYj&zqMoqC^&@P0jS_Wlkjwjty5!e$i8Ko6Z3AqCK`W(UHSAGQS>IunEna}|GB|5)Oo zaCn)9N5{jMn8s&3m%Dn^rKVM;gM-<WNT5M&0ywi7t?_TTT@J*I0lcQKiDO4y( zIsfnh9i`1k+IOjxb7v5lRP+O8HEigxcS+Z1(%n?`M6+=u<(1dkqjrex*(YAQ+O=Z4 zE`RIdg@#Sn>WZwPvqd@EC0j%w`s1G^9a2SIhccTk0;jT-*(Oh4$0h+8K6OsJ@$!%; z(BAM~!b}>N^Qi4ieN@QSy6%n$|HW~~N>ctC$0xcO8Mz^7Bb=MFhA}?ZT7USkdSi=@ zpR30orQ%CI)|8v>RIFzX}K1JFVFT&_CE8C z)S4`J;yHr`e(mGLIYbhVs)7LFRD%_u%yS?Sis>UE>ke7VH3v5k9yMuMt&UgN$hQFz zM-#hj|oK-@$#KG$Oc9*MV*O z=r4bH8!!o57^m841~dx}&w~x7WV{!CKv&mAajM6Q&HCX^S<0j0afV>tt_UA-iL4mNTo0@ ztVh9~iJjjKzs0)Cja9xZw#v41@6fAt7EpMY%W(z;8rVw!z}+kQq_&TWz3 zlx<){9bTNU0e1Vdy6@JV?7s0Qq6`AawfAz}<;*u^1<{|;qoWSqfG*M0R0Bw)KOLs2Vu$bcg4$MV7rph-?h%H_79wF5%7wsD|2ti=iF-+9+pWde?#%sv8)GBQp8 z2Zk^rk%0BGE={TwCS?>ONr(%SIil_b6*`n=TaVck*AIUE1izi2MZ-%-`C37jVclO- z-!=XT=Gxbe?6*(XhuP@4ybdNJtV=duikXJj=XHDhcRaD-af_$fRzX85$wK?9Q zZ^!A^A2oW$`N&AAYYb=Et|Ze6@>59VN(kqjmlYkXyE0`^sK4qH?J5ddM-x?RRIWUE ztflE1kta7qyKdcB&?E9yr8`H8iO9)WPctIcGZPEWcsiIdUOb}CudQi*|8k+`rM|nJ zjKNwdo*Rtve&sqP1qlNbf)v=3b3?Qga}yX#GDnZO*3h{S+9({pX>aFDebH*rWFT;r z=q?~#;vFRV+o09j0@1s)+6#VQxfe@)Bi4Zx zEqIIBMvD;YN7GBTZVr~KSSa0O+cyrkwB0K}&Zlvn1j^2gh`woBVNX}Dajl*MnqSQO zYD$F(sm9}_U`4Hit77}g@EQ1#Iu8hTk zWna58Q~3mH_ibq2s=Rz|rrF&>>}Nn9X)1_X&Ii4ZytWC-UQB?eLWFSipZ9_)89ZB+ z)Ayq1OoT3npv)u`;MlCZ$Y|018zArxU-^jsQQP7zfmJNSsK7p`_DOg@)o~45sLx7J zH}`0y@xnEF2>T*uBHg5C8l?O9mjv6XO+SI4l}>$I>j{!XFcv|lOYr)j+OUod8&t|* zk*M(~@D`uF-UD(Z)#T*lHo!ZTD*fiv6X+AmphSphJb`m3%%(`EEiQpVyaL5+6y#6* z?Ji5sx6;G|@)nWcKk*Undr+Oa;6dCuWMapv>CVSJ3xCRjwq3n6=}Ti5ko)yh+CZZy#dc5U5HshK|vfTKsv(- 
zAmCv#8t7tF6KpRRdZiAo#4R|A;SG2p|KJ4W;E_`k0Fl*M&mJ)zjzCGQb;qihn3!aVZ?%pA1_9I4Sr=E zbS@_X?#=5XP@6@w5nTa!)mUO{2hxhXSYNizh`%P%77-$THInYIR= z&y38{TDlNjwFz==!~?E=XceGIwgLbU58~3ycSd6cT99#ucfQlikMQH1x9xmz#fJCs zr(3`pYu@Tuj#p4(76pWifM-J9?&|1VYd}y--vs<2xQ2{@lj?U--4r}(jl3drntes6 zPFoO2v1f*iT4qGHB9bH9Z+?g4+qacB52G6{GP~v-N)7Zr?5xhaD>yvX6mm{{XVl8V7xsaJ2@Wy6PHO`6%9!cTo=BhQS z@SiPO*PO5^E*<7gBM+oBrR3fg{pI-p|K|=4BJ8#vzyA>`dU@Py0tg+rqioWIb-k_F zw0@xPY$!lOb$+-K2sn|_dmD-$N)|(B3Ub>~<+%(MWf|jU^#-VMqE9=CZ%x5^*wvr5o1=D`sa)fPC#P$*)sHX*NeK+`on zgvVqfd>@c?&`V(l_DLc>o+UiFpRpm{h-SLgj}1SJnHu^+qD_RElpVmz=23C$ET^e>>p;E1fbwbO2V_tRUa>Hf%-VOLyUG?or#(lQfpXX|vwS&yQ(sQj4)= zc6m>4nXXsDZ;$QSDSa>oSI5AlI)JNyhVl!#WRXo?QteakjR`Zm1I{nQ4(lJr_E$fQ zmB6!(zXQ`cycAH#!FQ=M0TZd@?-Kbvd(a>nWw0|VN&+e2rcVoj3_KT1@oToiO>mSM z7UrthmnOYu_j!Y0HA9U{*C~+HI7uKhQqlv0&XC>p6C8{-0dn?gpGDge_M5BR?67W~GSd)H7F6MzT9g+zaUOIis14$sjA+DsOB=Lo z6dny!q?xp{s^Oi+^HHLYcPqZ_LFXl%2Jpkm7utOJh?qB;4SS;QB*6|g_K>P)io@?% zZoUtJQMDiF@Y2lpfaavjwoZUmhhCOWUC$?Xjyvkq!`246VUs(bAqpCs5{f+%flH*Z z0sLnJYd=sd2APnel};nMK6Vz}#gR*;rLLruO4sA|YdEE8ls_Lv%kX|E_X7*0R4f=I zI(bL)M8ZX)sIR^idlEo{GR!!c*nIwCmzJbe#XeY`zmp@?^Q=KxyLc_OAaS#YpP*lK zl{Cuyvb!O1SxZJ&yRw{UNca1QPxNlvZt~jNHez*HaXn$ zSb#LrE>kbZkEpD%>4q1{R5yOgF{>52AK%r(k^4SV(3)jj^8mf9ta#KWdAZ_!nep2{ zzSPbSU#ifS+_4qc1XK*Z^E&))fF44J%=hhyL@JXk| z4W^rrk=cv~SlGF$!r2oXlYJ^3ot_jTxdNF6aK>}h2{ZkJS;mRMjJ(=`nmoPJRS6PHYER$|2-AkngROl2QKu1T*$oyk*H}!Nrh6n3t8_nDW z1Ox_G^>0(ayG@F+pR8$XaFJ{p0AM_ct*K4Vn}PRAl#ZbMCmb}8RN4U2=bV!sJV|G~ z6^rKBp7JLOb{^xOBoe!%RFkdItp;&X-V-I-IVjX9fbKG8adp5&@?XY+)l|fTVnL~I z?(>o2gmQWgHTLIZHS&~E3Ql?Yw&Dof(3y^)4vx+G?^S%GY9WB~7M zfUJ=hn{rKoqC&W!auw4xZd1zsN(DUM&0KiDaOHxDHKHL{9^*{j2kmZ`*jgLs!u!tL z^D_31pxthTni7t_^rPG7F>VVU3rTgbq6^85%CVQeHv~RbYkRj;g&02K9TSZY+!qJ) zP{whOa7?sX@w(1L=gClCW00<{&<&@m1OJf=p^GHt8^2Av*yNAXUuRrkAzH#;-xH86 zv7uW+w&tKP4+r#~@GfyyPh{7(eOKD>(+B^I(s?*0lpyLHeO-H-WQM1YaYSr&%HZ=w zW*-i!ntWO=L2m=bqJ!UQa+5|1W6r#S6I(aPKv9F3g>47;v*EzO(scR$jek_}rmpo| zf9=yPovN5xOz#r$C)7(k9=i)Oaud*NH^Kl~7z$!oUidZX?^r6vEx1QuLfXD6iLwE~ zJDP^`1Ks{&Ls4jfFaR(cv_PKPT4Y)#mL658WRcSU0w?L2RBwu~&3AN6;28~aT>bW- z7P!{|sPdrf*4-|DdaQ5H8qad@FnY-B;UrvUCOg?44-}B!r1a*OVudZSo0L|#5qy5j zI`(qW9;00;Fz8{m@YU^GqO!7q$iCKz;uXfvahxJlruwwx0JePG{Y_sOnWtQExx*x8-x_0V zPPbliQ&BYK_w53xgD?;_0v?(#}B9Tm97Zx3W2#Z0U-s_p~q5hDo1qPHPPW#Q^2t7Y75mF>uS?xF2zf_ zYoQn&k7(3~Z^*vFa8g{Kyi&Oa3*y|BKmto2-^{O0=^WSojO8i6A`kJ1%0<(bgQ#7> zKF4=b+vAhO;;rKo4b>qf;|5G`Zf@U zq!w^p!~aGqrY9{GiMgux>9*_v+NigLNV~ zOPu<|Nr}PiYro(eC^u5%2?dwk)W+5*L`U*iGd+}U?(jLjiiAZblheBhxg5{18#)E1 z7;j^;J7xS;Wk0;H+L@|<_r={=v>aE1hPHsALE_OOCX%tLR%s3X>OQ$iJfF+M*E9^c zd=U&Ob5N}HTR%R^#D!^Kh3SipQu=ZamPFyt4-B1?zCWZY)Mr9&x)Q-1ujoJGclzKa z+2gyvcia_J{KPAGu7Dy_vxYU?tvJMR{~9M!NF`Uh9@Ey_#Vwb2PP+<~LhWidFqfJr zKd|cbz&1P=gKsgtC^f$vEVUN4=F4}ut=r|9<lL&d%}lh#S0U|n zP-nWlVOM#V6p+e({Fx zotgzP_w$Ok$j&9JY(1q9ktC^=pF%w)uS$vj$&3HGNXvnAi+i$(YrHr+ldoNk8BR@Q zc&uP`@b%=g&Yn%hSS5UM8LA)q`G#()zKuvd;*Gr>8g04P=j9h>pO5!PHzrPYOH+#4 z5MGeJ^CBMN4wl2FB5HflDk};HSiUjGU*n6waqnFH#vslutMg2ToAJR7Jil1-_!o!k zB!hb^S`{67N{KHfI9`MvzAB8<>6y2E)Gua_%&k$z`SNj&jQo9ao*_ zT(~!Wwst8{7--|`Y~4(>Ydy6k{Odahun98++`-tZw5$6pca=(y0{{%&3fofTNhWRjK^M~njEbOri!Ffa5p|D zzw}9vvlmV{8MIu@t~-)9A~LIapt33HUGJm#3-w99BdYTc^5U-QCqcXnmvogz&+=r3 z*@ChhLQS)xc2!Q67f6F-E+g!3krJ5N4S|oN`rx2!RBmj``fAUN!3SMld$7A2((%w@ zxidX`LuiCxo$lP(N3pz|E?Yj5Ra8kjjS+GyM>8%yjq*?M|LY=}_UgAzMssziI*Hf} zyF8koaI7ejmG=Gd-n%;Ve;*C#0WH;2$`X$m5`4V1oP~t&Wi5y4nauZ*vP+Abh8ojc zpYl5!u7BMzpJkzBFSz7UF>{-0Ft2h`b7|;haJUSsD~d1axx-=T7Ts$^UsfkX4b{Oj zN;6=y`bzkZ#RJw4$Ox{V@tYhBhQ90d4>c*3JRNDGQHU0EMV(13*RMUqpR75=XJbQe 
zdoy;ot&ZP0&n9kMCXF1$Wp$XX>I))(x7AzzU{P)Et?aFD+)v19)KBqF<^rO>Y8wCB zar?od5ukxmg|ry2PegSFIU{~8w1#1ZbWHQ`<`QJ>?UifrHJ|INqW^w8ONh?00+Dm0 zyK2FNgDI!k`C^4Z%1bd}JWDhrxF_5|jaBQ8;iotK%!Hq8iaJ93EbvP3R=#B=uZ-ot zSQ)JTki@AF;^AB(6*0Z_k!b6y{+Z=5R6}s%7dPwYm_wr;iXpGNmU}oiAm*y?2y*s4 znOg+=OUVjiGWQAI`Tct`XJpb`r&vp$Tv(WVA{1 zSGybr3t7rGf3RgB9KOsbOanzjl~3hVEiZ>9l+9LP;naH`;C=p#GVII&(*1b)H7<^n zVluzuouA9&2~r_CQOI@Y?9lA%q9KwsE_T~vPM(sqecuM@UJJSs0tNE$gg|u3o{!hx zWo2bq+7F~6-+m^DR&}u*(3Fs%(LY`$#;R9UlKyz2-Z^}wM%v;#YoBud_-L$gnt^FC zhlSK{4EiTWvrh%J92Np{BOwhwr*3LUZ=PPmZNhv2S($z4W@BT6^Ssrj7D||;ii$dO zk^;yM=Q2>U3YtAqO;!jD z4)Tv3Fx>Gvum$uUm;PK;hSCy;L!%oJfX`LYXu6*!fz!lJ02+1bmq-MD%QESgB-+Gr zUZ#@)_U2Ze{$PB%;eWs8|MCi^_%o;{W@*G5$iH{qpG@KZb|I#MdPli9hee|Ce;o1u z?~5e|GJY`Onk*mm&LK9{8VcS)w4l{qSWDl*C~Kr+^-Xh?sVkpXIObQAS4s zscJXN#~{aS0#YETKUqG|DCrp>#zYiLuYmUc0@PXrx@!D;sG9<>G+sKUKOhDN?Yo;= zRZr5BggnZQc6&G^G!fLVk}qg~&3O3@=qIQrKvU4qedZ(iAQXRbw(0r|ASu2ls|Q>P zwTK5(zK8U-_5fesi=BDu{0Mz#Ml_YA3;y0xd_(3I z0M#t*?TZ4n#}ShQ{fL(YFm@0dM6C>D6t6UP2bnZKgSVllvN9GlxMl{(@bK_Q1*6>W z(_u^5kCE31?9Eqj5JS5l@of5Xp(REK{HpQ9D^pTEr*yl*Jia(B_kaHsF{Diys=eA6 zMA~)#6anZWYFga?J~e~>P1M0;G(|x4s%a3ABqSSs&#jFUco9RDRsylFS*7O$%_nx+ z_USpFz&}ei!u^=DU6x+tq176>e0p)kWUnz^A_3=Je!-qJ66OP=bey>x{Hblv8CT?v zz7acwEjFgIlP|{nFx6?zDBGv0$hyjk!4Lt)t7p-UU_s6yLMX&aW3&Zi74kUb_X@us zUl||i)irn+*VU8XoQA)at`Yq+I(da9_F|3W9%ok^t~Uf(_z2ok5In;#UcXIp36N+2 z+1YJ!=vrg9h4f#!lD|IUmq~I5MtH>CrknWRd;FPLGzF5Ye6zrmJH`Y%p*3y$kA(CS z>ORq}_Y!7eF~;cfuUdgWhuyCWv3Xo@ zX3iHfSv>eXkH0NdXD*Sgmo8SevNP}409CHj?P#t8+Oo(Z46K;9I6OJ=lQE_3@EA8t zS(@EPMc*vT``IsUpAyc*h;-4CNG$j4#qf*~5VHu>b1y{ z>E+Ct*4*RU`y6jHEv1U3YXLptVRw9o+$9ZfreE33Ipm=-K`vN z8d>zFfh~mYvqi^6&h`s%$C>=$ApK|3rNzMcEgX-${+~5>1%gB7Ple{es=&mC^FUEZ z`wMO&zpZ zXXzT+9;@pD=xR$uFqJzy7jt`kQ98#cfQ$CTEK2OGK1VZOSCS-xF6R=l?7 zFp=EGfsD^Vh1hN&5{9#V4*J|s$bsFM6Y!7l;ve>p>J`|{FULxP|9#6^!WDl`9UWas z)sv}kS3xO(pB#)n5y&=Qb_H98*{Qo&{=bL!_h`~7`yt{1Kbwayei48EtSGAZEB6`w z#n6z~vdZ559FS*}&_|COc6n=jGnoD}F@L^-JQh5uz*3F#@Sm4{USP^2$n>Vyl;Tx? z2FjK^(}K7|W9p8<0Z~oe0lw5@XFB^_ON$vJ8FLLk%iG3!W<4_9*G-dxZNiwiDTzm>~} z`29l(am9gt+O}PsMSC$DPxi@K7JaljIV<*ihZ6QbTZp_gOel4dN2EZGOQKEnmTAXk zR|GY>M?N;{jCPwy{CIjteHNyz7TrURe73W<#(yZ?|9*(e+d#?jMmX!o{hf zezuLu6GDThn0|e!8rdo*78`C=qxLGZrhFvR%)e|kc30*^ev{XS-lIZcJU_uN-Qmt! zm(od4`amPM&)O~56d5?TkIZ~u(RkRgMN&Sq!`J+SzzgF2T&L`sat5D(2!+%*zExT4 z536z{v=dpvtA;t~NI}Q49{eJkN!h);1{LaN2%wmYKb>qeYc)3eWzw}L-f8uqq5DG8RCc{AeBT7e|_J! 
zQ}G)$+pjqSRZA02;+5DX;QQjo@lW9eaf-0nltD1iu5g49TLsY$1T{-6vj{$Uk8Gt@Zl@h9AtWGoE!0s?F5!6bI~ zR$}JUdc^C@2DD4)?8|3!TJCAQt;+qLql?8e{$!B4LmK?;fl`z`%Ks z<%aC}J?!8h0w!}2l8)DsJ%v#5JkCQe?79##pB zlx?@Rr%QQU5#y@&jy5O?am(;bSZOXEX?z5afT?9c>%-fom_@Cxnfnd9Mw4h!G2=&b z%xwJyOP6GAAy7}w$iGOQ|5*!oCAc~3V)fb|9*2UGlaMcO^93=#dB5Rz&R!YHPmDBg zIrIV4$~pLmArXQfeFR7(tsr~F;Cc9+EkVm9bfy$hwg?J)3TOGB_yYrVjE9IOt3aMf zA`2Am;Q_fR1NZMb+K4cd&mm=Hs=js|r`rkw%JlC#0*2f3!rHBdS}j0OZ*l$=s;-4U z!x)Hd#^-W=DG+`~x}`kWIF^g1&`|w#X^|E>4+s|e_}p#gP1n&abhO*Li&ieOc4XFz zG4$_@cmMDg0xtkce=<%u*;rm7w_g0jj`Leh!w$n3<%fi*n5zO= zREahqaY&Tx(BScpKnt8QgLk28533gCmjDxt5K$EF4wMgC!IQIyT%I}ifBVLopkvKc z`VoyK1ef+<2trUXN7Lj4sIK8Ei5zDl%Etpbf+>9&aqvfr4I8O&?11DcW)$F|B|969 zgWMo4B&EE({CUwiXzE6UP*rd@Xa%jPRa?Qxb0Fjvb^z@hZOAXS5oNeV03=RyY%3?- z=wn&|)udMb4S)zj&#lN&MmIn?(Dn+5ZEu0{yHOwx)k>&qOO4QE?$7k?Ps;w3PDzQ6 zEwGZ7w18m3MTuSE&?{ zl?U5=v!K?N5U^&%adIj@arJ}%Hkq)yEf8!lN%c8Ez0jyydHx9~ZhW0_c*g7p7F@@z z+;b~9P?6RE{4chLrOv&=nq*I!t{SIy`HsF)ZeAxfxFj_6-kB){f=1!wZlmHDB_uACbgcuT+jx@;)veO!mX2i;3k$e)*!1L$Ih*nnh50+`9xWSzsnw zdXo59v{1BGceaz})B=uqw`8Xj9{8cS;L*tcv>tEXFHeFnmv_znbmG9@3hE~VG7O=h zM4R0NW4cs$@6+9Ppw2tSZ8|oRcU&3>2QbXuIm^)z#uj9a-LcDRthryVF+@Qc!@u| zgP>I304l>9Dhj_d$BbKCoz2J;j3ut$96PRel#8L5ElAJ{R8YAW=a_&0R*i`BF^~Y! zD`|FP3Gk&`2{iR^g>bI8b=^*cG;^_|HC&w7K&5V9Z%(#Gl(hSQHnzDU#|C z)Woz7XAe|3Ejob%LXf2VHZ#Lbr99fNO#TGs%{D5abiNHVhi6A< zIJ_??5e0&$tzf${HDN3Qsuy14<@pv1+Pj2{ZE2)Pu`8lWIo!7>$v|SZRjmGD71f#< zmUNYwYz^IHq9cWMU7+n<yRy_IbrZ=@y_(K)+NqFrnzEdDQETaEX;jV zX4*ejl8R1AHcZB?S&NBXaIxA~etl$Ny{(behSb1&{Ef#SbXnOO>8MDy@iNr;n{y#| zHKc)dz;#~AOe7Sa0!V42JUF8ot5n-NB{ei;RQg98#thwvd61XDJRCE2qp^we`s%As zbC{Q4&r+BvdjTS&0!=#%p)qa(M8#092}a!4q2BhKn;cBlg1q}m&> zW89)WiL+zD1PjWIExaGCG6F<3S@*GDK5RxK z+3Q@ow&z~NeZR}ghZHI!{OKy6Elw*vI+ErWz`pRX7NSX}^H;-lw?P}Yi+R7~5FACO zJL`s*Skm4O9z`MH4pp9FBG8xa^k#gZG^h4_`;yPw-s?%Aqa*m zosei~Tp(mE^oj^+?h8V*(XSN7e!9{4Cdwaq?YtK^v(m&N&y!qM8)k*#Skiz8ysdij z{^X69{;q0O`B+IAOO4-4^5S}`99}1Vt6t6OFSx{A9Ea-)^xBLIY&q^GEEs%Oe+N+Aipry5mCxHvxz$K?B_oFYIzae7M1 zz`*o(@TJcfK^58pYJ9k$t)lGqV#F5OERi@ctZ1+z1=8>hFO(DMiD4d@-uJ1#%z&Lq z3s|TbbTKtn9k5`aA81oDWK5w*o3s-xfrYEpLxXR2)RHma1dv>vfalTbC9khfsA%Y1 z07fV}gn;)^myHi;s7%bVKNZZO;h5e9x0s;Q_mx_<;#+1=N6!&+dxF+r8W*C~(`5V! z+fP^ix^o+B4n?s*vf1q|g0X$%6|Yh9Sorl!WO;YY9~7EfBVtVY`4(Z#?I)@|W4{rp7ThM`!E0Fe#6G16 z*G?`4O=1lCzfJaj%3-~=<)C({&G0t|iwe-wj4K=3iXQ`tO`}#c#egZLOP}};3sbiA z`8AJR9jG&&1A3;B=94WNfxvouMWZsVMhd}d6~;f>?yvMW^VR_M)9j{VH<7b#i5=kV zgo&QbV&9+kzxisp#D|*ea)^2RzTqdKFQsj>jTdl^j684_a-!6ba$81gJ$_~0`cjUa z36zGhiySY+&JZagDuoEbD#$%si91ah?$q)u45~gwp{VGd%>UxSzUfi1t zp~@jXQ8oZndn3X9gHBn?HyEUdbnnb@&|#!UoTqPbQpZ_t1`J|$P(zpRv@XRnh%{0w z7h_6W=9DTG5EU{EyO7p>ptAa4TzN5UE_EL>MfTEZhfSH0Cn5bg8`?xkAuHkVTAQdV zR`s^(k@T%R%xWT+Oa(M%sV=2>mM{>zSFxP;v{j0w4{8PB!WeB^U}aibpaK+Elw`*B zI?ioTSNrOM;xb>S()*;chNJX>aT)uV3R!yZWEuWKZZO4XK`>+c@&iZTR^G$+`I83N7Ca+q$I0~(t|y0)c( zoi*th^<*90B^Ml5-`sW$fzMhDu1iHX;Wj*ZZ=PE1= zDemu@G|CzlpEjQA-=R!&>`V#%1ns2Cxg?jQ|ICo;&4r=?Th%d(k!%Hi*aX!bgYM=V zkSgBBL!l!s57+$bY)G4#ucnb zoMo@2kwOJw)4pCDL}9zfkf39We(jwVuQ5>D3M{DEryzoxpVHHxZcWVr?MDL8(&Nnv zXm27v4_JWofbTb`ifbr)zm-mhTFnSYFl_rM<_Z=#xH3>-tMG|b(y~2I$7rZxI=z&`w8rL zt+FfMr@@!5D_;O!V->+3>taumoW&+1I<9bHyK9|lRp5}E*Q>_2dbjJ(c-n~_*zfgIR%=T5 z1LXxgpOM*8-Tek=m?Xp}pgamiug8Nxcm`*T#Io~%c(2NVvB=-3oW-?hJE?(_Wtz7+$EO4zJl;$e*r;n^HP z@vJ9j>7e-&EEv#+(8@%AbsrvKf_pV3TI|}*JM|7?O?S{y@!wv#IDnb?qVx~siOQeg z11>L>v&Ufez;w1!JI2FB`5HSr$NI-UVR|tDyqmpM^lGlQ9Sft!iC0m^$8qwpkDU__ zy!R_#N%TeF#+|Jy3Ine-cprt0jX|PNvG50W*D09i*=x>q)rGb14WORia? 
zq?0Ry2$DEdL)G-P+Nx05wtV{tNQ2l!rKTl#=dc_@VK36+~Fh>EYIG0;e3kU#B=LG)P0ObCW2RYHM|>hb{T zWF5!1ytEma)a3#G8|Sw;P47|bI%Wbee)>;MHfbaySISn0F21P0J;TjT{*y59jf3>QFvK9iOq__@E*W@*oL=ABajUNF4AD+rEQxWREQ?{^gAmk zV7p~z-9lL80l91=8zPZ!j>x{kXQwr{x;@)s>Vx1e6cmnQc0LHcqF7I*)mE7_klkYg z!u&&wV$ED4ZBm5^mLA_!+7zNH_Q998m9Zye^CvE7`kL?|Ipa)D!o_eOpn=wkkf^t{ zsz-nTfDaS8wDcH?5#yc`fD3bYnD%JcfB9A!>jmgYg9oJ9WJ(GdVFN*Li}Y*55Ylob zfi|eRHF*;)C*!R)Q=r(L_TdW(wi3xJO92>Gql5&yY7&RQf+nH&R7h3LC$-T-o$ zkWOB0E&qIfV7*?1!g)k4#aU{&27$rw;o1iKfZAf@b@KHBm1{-j^;8oWTyJXdBSx;p zl$^66`2x5h1Gn$)8YT%hy-LP6zeIlw# zPF30Tmd7SxTrd6vm|VPL_AFpfkOYH{;ttL(+B=lG{|E^IKikC?sXr&xzvLdM+`3M| zxVPMkV2|K|D7!B9X(!tI8*ktMR)H1Lh!`OXbxn2`x=a8v)qD+#juN-S1j@QRfW{8)_NeP*4i5C6 z6mb!CrrVP5MGk?3YcR6|)-Cb`#e5A^$p6CtOh3Nn&IM2x#Y1@_i?-iF2B;PJGgoy{ zJ_>!Wns5C9k^wCQv~NjIn2uiFfZ@1=j9}N#@L*#@m;sJ8hVTzZB=g)ZK&!D<7@5?S zJoFBrgv5krgvL;+&*zWzwHu+Ra#I@=%$eFXU>Zfrd!l#_qWix=h!O07I$yo5yj6r4 zgz3Sy&Z4KV@tv}@Tw)a51#bYf3ED|f-u;4%m!pc*`6c2o+n`rZWc$g-pa}9JYBpu?8m#kljPnMCV#R| zmSjjroNxdao76&lwwrIvU0!Tm`g=_by(v7Yzp(*8xxv+j(*4}}#>N<50Hyrhz*->= zoA&~dSFluhxz!&QPJ_8d)@ZS(@z!9HV$6SyXaU2)l?bB17T@ zG^rrsUMSIGu=+amvg6h3#MMEVq=aUtUrz_VAhfHEbzYP^dK}JQBk_&QA~LhxX98uJ zKJsMZ4?CaJ_aOXm^ubdg#NjBB+yjJPPNd&Ab1&2-Ck1|b}r~gw3v%&)-jo|BE(x^gzVdTCLIFpk&mnM zJmMF-q{VISTd$)F3!0QydGIE;&W%#BCL^WxS^R;YL1-bvt*d)#rMe;af-%@J& z7RjC7InpI+m8913Wc~j0*o_$r+~bN`KYGS6)iP_myGZfZOq7};FzH*uFtvF?D=LBT zV@=(Gyx0&bKRd%k`>l~?N1?Vq7Jeo z@u4gCp9R@m83ko*OJtc0kyLwAfDlC~bCvy+S=gl3SXKPgz-#lif&1)r)u*!)zMx!_ z_-$jC#`v*SX6#Qj4V`qsR|Hb2gk6~dfNzFF_qcnBVKKVj9@GSN;%73r&V+n&)YNkf z9LLX}m&1dQzb^WkR5EbKc}X5hHKP*;j$&4mcGn4A46f@p5ts}DM2%QcQ;JTJfp z*URC(p?wEe`c1_%9xmqBT&uIkq045t{`)u~p|%B{{2=~^)35XUSC#sd{lI6!foNEc zLP0E0h)M!|Ds6o`a}EKJ0O89mZR6$g`(FN4QJmgnaEBz=(2q80s@&(PJT<9wnk>ZZ zjlrXWlyQebhU@p!>)8nJ?2PgfmQv5Q+-ZMKrSe{+f&6R_{u!S%fL|A$^T|v<)yv-m zG$9Ngw9$EnbZp_o;TqNbZ4Ex&!|hPr(0x`9XEe?K%sz|9{>ix|nfz!ZI_MF*j#2uV=_yv&BiQ2A#Je?JZAqQ%mv zk>DkT)JWM2Zq;jgG5whX!_ECW94*T-_V+iVlc09KAn>5mf^V8`_TDEkVqsJ6ae z!~sDXq@@)_Qjl%|5djep5Rj6RMhWQ@kS^(z2I&~O1ZkvmC}|jw4k_>2o>$K~zTbDB zd!NTghRvQmJJziAk6)DZ7BEqEgN4MTJn3_m0JQtOPBYTTWt(72P+9at2TG=(%TE}b0|xhKVAay(5`Mk7Z%=!qVuMi?SAtBQe0!4*we zJ zCh`LD8s@*=Ltq+MOg^r!djkE&HaoDJEE7rpO({q#J-PDeQ4O}L}c zNcLomz@p-&O@ZG9aTwJy{>5LF)Y8k}W5%9KxS<)eo3T5|*ES?SthTnWakI?Oa(3u6 z{(DgoLOuS1awN3xk}*})bi zz%K=R9LxFGfnAPJOq+cG0k&owF0;TPg~_umB2u8Gar- zQ!ksM$4y7?!@khrW0MazX}h!0>ysDyWJYV;;naeCkAu5C;3)%LqV~chNWikA?(GFs zx}Nq$T#u=4_)oeonwj77K=w6zv2k)1rBK&pK}jPpFJApb{K- z6*RhDg*&!s&~8hyVq9;#j2z8YNO_i}%oI4rVsKp!!KT)Qls-#pw$r^j=*{A`I~ z-!!&00*@YHynk@Gvb^kjTo8l=D|HBLm=>#wpx6J&>z?hqUF)x?1tPE)KM3Mc6<&NV z^`0^P9uqTDx4(+L*Pq~rJI(Hx?DtK`_M!W*zP8ieVET!#6ZBIV=SSX>ruJFhl1}`L z(rD9hJ)bKMkP-OLoC5$`7eG`5eYW7McDe>oI4IS{n#w;?Q_(U#2W=r$1@F`RaNqZl z^sera^LJ|h1xOMDV$jrH+yHoMirfVjvr2Ok-b`50cBv8|Fr6Vcs2!3bn1gf?>iG|@ z?~w~Tl`C3;E)|{^VTn{W^{N+(4#wloGM2I)PR}o(TTFIY_^BoauJ|@Nb;liF_M{S4 zc2wSCv|H;@BnG=fTrc8m>BZ29^13O3xH>%(A z)A4jK%E*m~MM7;xFcrz(&?CwStKC75<{()$RBC1d={0Jjq)a zd@8a^PpD9T><5KbJqW93Vv?XKJ*fv`{>?Vcx5gz8+OrF-v^+{W9M+W~m7;BA2T1IA zF)T+bwWWL|T?Jp~(rmo)mch|@@56A=D-yA9Lzy>z*Ay>dcCcE% zN>Q$F4;FBPg(hiQ7x5GUU%XJsr(aJw#+1spte&4|UH`Q|Kr@N8-+U6YCKNWpTF{ds zDvCkd5|ic#9&$@`+`1pp>K4$A4!hYsHroF|LfJbrY^}Rog#lGnj$GTtwSS4gz*o+; z1A9fen3&1ya1M!1X--=l#fGYOhY6!WqF8D2a14WU~xe?%v07fm#Lku z_8L4)?hd0pNM1GB5NJo=rUj!busl_>Hazp(PvDguXCswXIw~2<2Zzse%;`Ro!YrwQ z6PlCF#&z-uwwhIXP=E{t8um#e-h6V+Nl;%EmfXi?2=tlW`LC&?WO^`pVDZXDc|C82Xv(>prr}%I-F@TcqK1 zz&-zW5&14CA~O%+DuIG`INweyk&O+-ky5e5JC=tWD2T$$m$S1~C-VHiq96!y2SkL} zE+6dMBC?hw^2`l#C42?B_hMy6D{yq`UUWZ<6fw^XK;fNzakYtSO^i02NtOuywBUg_ 
[... GIT binary patch data for images/model_quan_fig.png omitted ...]

literal 0
HcmV?d00001

diff --git a/images/model_quan_table1.png b/images/model_quan_table1.png
new file mode 100644
index 0000000000000000000000000000000000000000..53bc672246c341dd46dbb7a269ff2b3d1c35a05d
GIT binary patch
literal 128197

[... GIT binary patch data for images/model_quan_table1.png omitted ...]
z5DdF8xDSI}w*CCryT0tvp4a|3!z3HVSCp5&@hC`MCgF0p80Cs(&E61Ui|cvQpo6Qe zSoWte|Iq>voW0D_BX(fpb(=yb=^C2)691Tnv0ygA-qlQPYhhjj%1d79&PKI!#Kc5K z*c^i@aihxb?t7k*V7zjxX-(H6)RHA}(TNLbExz+UBd7aq=eFpPqJ_`D5I%sIf^vBocKbU6SiksaG7jj%$a3yN-xK%RPl{Y65rxel&b3; zyf2vkGj&TFFS94+7Q^eBVz_&n40s-exMRlRI1m8Hf7c)!gZhETCnLi+GuacFl`7I( zn8;S0aQ(YgY}3q1>U`KjYt9jO>x_EapULsyx~no+jgMVr7skARUIL$8zb--h`Beb7 zx|w1CD;o(2HN_^!6B;CK&&3R&62X31{BdMys<`^>bPvJ(D#9gtvxlONxUmU*Xipu^ z#x;B>HUbo6G5U=;gBTCVx|&HHAB~q^2r(06fAgLAd#KEOO=(wxCjIel`$_a~8$ddG z>U!k&+a!wp*x(V9SMWSWyoHs zJXxK3RJ^#A)O+lvmEVAe+OKXO2hE)NXlc3&f;8k>LU?(?+)>`g$DUJ{|-+6p5t>r_xIUOI{{*+Cy;7te6yR@ z#~C1lR)JuC<8jPPiFsz#9RBHtdPK#|R}Xib+*RF*Uca8%`pv(u>UEa#=9Iolzn`1n zc-BfezH{g$6LvLe^4~Kpc;da6eC8#xuFG~tHHahZ(bweabb0G$wd(31uJC+r>`N$F z?N{UOErdu!0+l0aJ}x`^0Cnm6ot{y+fnxE@f@8)NS|ps---MY`B!7M^a3o8Uy*fYU zQcA+IPPaDaoy&$@8Y1_oT)p6XMW%Vbp|&xp6v5tCBv^d@W$^;VhiznrqSIB$?>{oq z%7U1B%C%oTF~rLWmec%&KJ9otpEH^N?CB$i)MYLz|5{QjW)UN&wr`Z0*w|wW#ROQX znL~hDR|@iXZ;r~MpohF%h73)nO=@%T zc!&HfNN%l>x!u>GDf(l~%w~OO^#vWMaeK&VP&iKAEmsMsjkQgU3#lwT+pC##0+Mxp zmr$Ans$TYETOZEKu<17KK3>rKBIv%~8o{aO{TlaCnv>(-V=R0*t%$(?=}qRy@)m}u zx(mDPRu4kgDjcPPxA2SXNFOspkfjAl*qMRe0NBV;0yDqoyhm2F-+hy3#`iB6EOD`E zIWUK&CiVe?N?F5~;8wEaFRSORS@G>TO&!&4;Xhb@!fbIrcYUC(-Bn2kVnb zEjJX3dvo;JXK*@dB8^)E-=(=Jml4f`yPliSF}~p4hEg0;(4(AkCZl;7NG5ol#&BA6 zy2c2#Q@oY7EZRXMejnTE=G*cX9Qt+>E8I=)T!!V@ZeMuGt>eSM0qTY`K45c zUui!@AcySCYO=C0Ma23!RS1rL*ZDdk1@dU_#@DnN(L7WC+#I`i0NiL`jk`G^HxoTb z(c+x=n_@l^Sf^2+V?m?up@V4^sSOnTa&0C_2Lr#VNZ-0+m&$c&TNbNj^PyO?t*8rz zZ3T|18QrIbBq9YCBZ{Uw_0}BHk&aFsZ!{qJL;aRAPw$Mk-;0PD$sgczuiFXoqvGfe zL8&b-M;o6O4HN2Bh>F>zp>meDAeGWbwkjYbNf6d5l!^nw={c>{S@^m^ZkEo+ z^CIH060>-91K~&YTX2ux$Q*srn-VG!{Bi1zr0uZ1K&$#LBiL|d+5MIh1bWU9xk&(7>uo<>i zd&4Wjkp2A`Zm9pcc*tEYCt?U^@4vv2ZY2HzmD?t=WL!?ALT_Jfd>R+ED;{OrNk}B4 z;@Ka8#qPnv*EUnOE^9s`=Kl#Vq1(hsYRV_R1v$0ymtd(}6DJWzEDTogvEe+~ zoz#vNyAjR7&lRwqb6iPz7=6SdrlCz|^llS!O~%^jTM5zSR~h;uFVWu*3g4HOUMn9a zj5McY1uA5hudG*S8;=}DjDo|@0x)+%SFPEqKV7V`B9LHLZAb0DNTWiYw#0chaJqeG zN1R{Ju`a|FCz&ovJyGrPx>R%*un0wLdI|42-Z`f(d*pwQruHuw!dVfw;q`hIEjIC% z%^!!KsH9-dbLs&D9KaS0JEJOkv^krNM9Vr2m+#qJb+hLiBQPbxIH6LuuA+|A?~h8) zOBYgny}JZ3Ex?Yj_^;p;AAQ#E=RId%7l*kBmRNhm7zXR@PW5)eT?8?t!~2p7h#^;b zx+Zdu&aK^NRL3(@en?yBa8bJII*}ON8(JAO-gkY&o%W`zq_D9%Htn-8}+cq_;^n0DexL@b9e47J9p#R=>DdYkR<8?hA$QFV8z|Y zT~ck5HP=wP;`+B|#^~Dm+vP3ak24F^-^yq?y!(#&Wc(z-(Hw_0Jvxcz`}XBeG>BwZ zEN&8^&I{j6d0p(i!$7rl`Peba;YHq*GpXL?0@i!BuVX13g#wzw)UsTU8tM6wv1x#Vw%GX#OmV?t*ILj-z#~F-~aBPZXs>wZ@2I~Cz|sr|9Osu zzf+zE`DeALOrvT`$Ej!C>0iY4E&{P22!=SLU*2;WLsg_!ym)N4JB>8B`AmsO6v$s> zwU=_aDc-7W2<{;J@e2vX9=M-QGS|-)-2DX*BtC5Te2pcv%t$~oS>3`rb#onknG*C;=tXG--BAXe9!1D`3!#5U1P2YjIJF-tpCWL z`BqCQTa@*>!2Wf`Ov6`(v1e)5K(ygI-D(8pRSQVQSb$N$6|I!miYW9F2VE?}{1Yut z!QSEEVs<}~sCH=bfjPm2iu8lT?ycO|0t`mCW&-@U;H-V-bm|Tf5VdTxCzpST*f)0* z;q!0F)^@e@8=Hzbua#P-$|3@)#L4XEECz?&<#E^t$jnXYP-x z1EH*gkPIAXBVa}J;}U(6k-S%gyMKw<^zUyNwe`J%+Fw8s*u5~%xN{$26l1=N`Q%-{ z-*R=w5RWZfj=WqwDrWq{YrKGSiK3FW-vX0Km6^XZmtM38{!lnxj{06Fp}s)mt6&ZH!}=NjwYRXgGH*a;kY74*mgJMMk0*=TZq zAU)7rx|L9mB=luoYsRoB%RL@!?p&e?eQw5W)k3S3+DRC8XPWMOm-pvHnhR?=iNi;& zu!0focLpq^!N}*2TMs#FtP!=bzI*mmd%Tt%d-if1?Th<(^oTp6=f-6kN?b!~+|~`k zmL>tPG0=-d+~IwsU`(gLVDxSFS4ODkcfi={=L70w9A}!nNx93$!&aw=Xg%#tD8;HH z5f5e2d5S3Sb;*!ZAcwY_K1I?q7uNzsQ!s<+t^tD)#ZVi@a-R);zivvz7#`B$s;)hD zo8kFRZBTu!F%IK;xG96*O>(>6fp6NF0okrKuWY^b3X%}VjYAo`@o_dL2E;c=H0kiX ze{r;CI`dkNg&m}cq8shKj}MQ(U!(9=J2^Ua;I;B5y6FBP{i$d2o}jI&e$9GXbN&|s zeIz_S#GS}+p?xq++3F86idHkt4Myzr598Z?e5jr~e`6$nC5hF~a~aVaEJ`h$*DL*T zxueg#UKpudQ$Y0Ly5k1K$$+r?AbxpC$&}w+ry;<`2XqwQq;dQ`g1vus=W;a>o?OJN 
zP(&yf#ZE}Wd|5La!L#av^VIUYe=+?6hZhH>O4z(8NQi7whG12)qLdbB&?yLJiYSmC zme14xkuCn=Eg7}$kh5M+qCh?#+x5;rg*1G)MWKX2#H5RS93uu_eH#;|@{Z=0 zt4KL<{Zk1u*XbR0~JW&2OeXmc#yDob34In*yY* zgNnsQC7ncsXh{sxgXUFa0-b#}N+9symh2eYwcRn(ixjG52Q%5vecEz>U9Bke>$23& ziYKCYzS!)4V;oifsk5cEVh(`e0UO6*_NhVUuI7l`=VD{ZXH6EkPW-=c2_Nv9;_-&~ z;VC}Wka2O+--V&vJ@`V7)?|A#ic(AB8+`Htiy@&asP!Q|^HqX%IhViNG(GZ)ShlWhNpN3m1qr4Yal%LPqPJEbHX_2@(@r%X-7TqgZ4$!1 zE4}kTsvpGy?iLfNUA#A>+CqCmp@K}t2O|)7kK>Y(pCTFP8PM((Q%s_mw~@sg{*+S| z&cR%+ukOGN6)@l9vZKs4U2q;1*p)!>j!{(8P_HtLO&T9`H(rQ*WNA#_9x6?ud`{al z^V5a0=a%E4JHS4#LW0q|!k(7Or6$t1qL@ja1=j==ds<{#gMQk!X4FBN&FdBK-?V#8 z&v&y31M@ZduOf=3hkBryd1Z&Yh@bd4EV`q<%^+y^{N890BY@FRsF6=(3gmtk9QYEE z{4T8~P$oOhFOR^3>@7Ww zM2PF&7X;fRq@0X=$DkxhML4e1o#fV4B1+K#!;9kO&q^{JRlS8;976@0kc6%|O??0J zQ6jT4#mtA+psLpFj{Pz$ICpY{CubD5eJ#!C;{6v!%7(5Z4=q}QE+k75wGkJ3pd0bz z@I(j|WR-3F+Qidr^-7v)0#g>zXT1G*5@U(1tC_w&CRnj@?b0hYyQaaI!=&!I47(+i zEKGB=i>$ZU>2c8)`WY|M2P0Z$`-{JD(ara;aYeW;loPjolO@coiM3!`e?iJn9b|N& zW7iJ@tHAFEVp!z2_Wob&y=gp^Z`=2sWnr0@d7dgVx6HE)p$tXJoH4A-V<==+#s)L7 zNK%GOiIkbjtdOLTNC+kK^lXR!d0wCUzMlJYU+0VG<@2txT5CD>{n(Fv-?rcH`^zEU zPawf~@N-TV>$Y zW@dcM(4-`Zat>|n@rSs8BG9F;hfs3F>MnTioiE7Iz1sC$YeIv-io(oQk~;36@JKv4 zC$jGLWJ4rB(cZ7;W%KnO?xeMZwPb(xDgL#plXhR7X#{JmjcIn+MJ3G7fAahbs-`7{ zFV+Pqq48;Y>Uu#U>XSe%t467e=&o-J>!uD!xC_HnymqM4l%3+<0D*gdo=C)2!9L{u zuCpypd7X$>i}tDLvpuJ-9ZFj5?wb)Hf!GEBHR!&?WZQN@1hx z5%s)qYEn>(H3C}j3ctrlao)_S`NyR79i4CK4-bE_6<>Jt` zeXkoADXNKDs5p*ge102mRwML=g}v~L>6m(U#ZlYpFcPhM5l`gzo)l8!T`GUtiJIj+ z4d)+`u)VNQ85jBJk2FYOK>`t#*`GmKDXYO7dx^!5^^GFMf^1YbW-X#s*%VZUh1{m3 z6VQH0e=?!Ldlmhj5norEuE)_am*GR_SJh-o94ASUZCLt}f?UF?VU%Oip45q7UoUD- zkhnc)BsCw3JtD}b)a|G0y-~JFNW07xtoZ>Wl;5Gd{$-|yrA3f=#_Y4h)a`FfzQoSy zE|OV$`wIadGD|y$0k2rn9JCGUg-GIw9$Owc5)<>zz#`@h!N|IzIu;7xnyw7 z-v+D04_H6^07?t-$Ix(U7#s<(z?>o6Wq$uu2ZJHvyCaFZN-2F|DfcM@$^aE^fsxKa zSIie;&#;Bpkgy8{oF66+7YJ7am4pGx%>M$WFhOk$m6+1-cs@$)z8ZlsymQw8MF&5C z_u@rIg<-KM4YmLCUy5DP#|MfRri`bf|8Flld;@7}crBiNy40HD|9{{A{5miveBO>` zws;#1ETdU?GRs0}i`V#@l=J_L2>t0v5Dnv+_SUmAW!X$_*oH&$|Nemj$?oPv!6Q#GYSFIFX`-!GxFc;ao>V zKOcXy5Vo9u937>$$A=488sAk20#EhYN22Fu@X-y=NJ;y;I^GA@TZx;BGQ{s9+H>2& ztKlbO(v~QZ0e%|W3}u8Xls0~YS|z5pb(v8bgr}HY(92yrPr{{_VRWhZQXRHf7=@7{ zIOBsrY+d4VS=b7`z&!5}rSEV+k^_47F=FOG{aevRmj0<3=3-3q_jmUXB`yeWwCr7~`jf}eYQ0z(f>S+E zQsU72d9qf`0kq0RNDTw=4UAFBEIbf~>jTeEJ2Vzg7zDqkCzLw6rR|On`STezVf4YK zj)33}A;}ul=1n8ZW>04fIKq*tB%<0V0Ctt0!-X`5W>h%s}@t%zkzEu7Fo?h0YG>fN;e|~r9H5+!dY)tH~ z#&$nwPMlkw2wqJrdv;GzQ~iC%Cgi@4 z`<7M~tU+Lh4rrRzsbN;JR3X657c+AiTRy*9KQ84_?|MT%;-xTJSd$QZRo!uIHV*=W zYcs)z@7Uuz_gC&&Nm*^yrz`gjy6kgW77=J1b<}+NyB1b~)lLpiw@jZ@k6=1g-gW}d zAzmM67xC%l!JYx&Ou8s?`$$6(Y%fH_7HP)P%)+I3p-y^P$u12NpwgM2^{ejmz}QgM z=BZr&ivx1^Gjmu1R!8S`6frAFunX=*prx{yVKB+Nm|XV1gg9P4K%1c&noXYr`V#GA zETU~GNmZA5^b2n8PV+dvzUe)dcJbzY;S;637<9kVvNB@R@P%og4bl%;(d=?2wJ?Pg zg;|&watKxSK>+8mDzI?R!ZD+yLq>Pff-pz=;vQ1K-fsy@*KbqtVFF^AU^gCEvKR$0@6@ zTggD=fWS3isip3@Bp3%#R-H>?=8uh34e3#9VQMOa5`iWf-HQ)D8B3YA8}u+BCl|b; zxNVoI6#A(_cR^kjb(ye-2IgXRQh1d`jnzB07x$jOCV$O6?-GX@vLZ-0_i?TwR3;(g zBC}5AgS*>zWk0(~`GTj4L%KAc%Q`Z{n{^2&*^X}&!aF6qMICCtUU$852Z>)D#)>{4 zw^e}?YQ`1-oZJyHycmD+0#w>>5cBD-WNIAH-QgVwyDRTMf6*6u73^cbq>CEcYNa#6X^>ckj)6g4}Nn(NIbtTyezCE^4r1^F$hLdsH7% zxe#`E8XfuS-2jT&?}(5Oshc7SINwpbSaTV4 zsT-L{$k>u80uX7KfGEEQJ%|ptcFaY@m7mUv)|>?aiAn((q#9g3mic7*x>5A7c$Pir z?+2iHH7H+3LyT5}KB}U}kn%5`+Wwa9{gJium>dbcVD29S`QXOp-zTJ z&Rg&jU(i`uc!02^tx^7<9u(}pux$9ACAlyHeQ?F|<~m&d7ni{8C;p_=J`ZV>~nX1zu}2jVZVL=&)I(WhF`Vy@Ys|12=J0|5bgScI!i}&8IGsv zJMdWaf#Q0g2l%RbKPm)ou8hN+xYPfU_2#DhmHeAf?d^qZypu2B^85g6%ma{E_4)uJ z>v>c9Te&yKr%OoT>-|FG 
z3ws7nb1L@LcMwb|*L@hP{%@#ri9)|X15poN@$dUU_BDBAX}bx9q?r*5SF!jBSOZ3E zRUJG&!@Tr`!!3RZq6FSns;=liITRE>W|qDr?9??8h|N$A^6`L0svk&P4)yRXc`nSQ z7bmNGmD9iKq$1RReyPU1?t%HAj<92Wn!qG@#b~kD?EEKU*f+eZhefU0j`a{5cnn?RcZcBxpNsbCc52=bVwAD5{uWm6&$UPo$5Kc`25rI7my=fQhJys z<8{5b96RaOyK@Z^LQ;AHZhan<#+(0T2#`*BR(TW$^hqmlQ278S>4`4O2Z958E=cda zJInmspLN+54`|093I3I8Vj;xxT{GIhZkCz-Tv%k$7f?gu5f9yh65EsGZIa2T zo|6vj6aB$;U+aCHp)|UD(aL$ec36^!J-zPuPPfhey%#1=9B}>yyD)`K7xE-Fgyn4r}c;zj2r||&-6jfCOthQTE#swFBq~UfJ zmW~$PplCG(%yBIa?}bhc1h^%K#M58SGkZox#cGStMtQJIU^~pE}=HR9~P?A|Bc7A@3pPQo2?n;S&+`*9LQi zBpeyr`>BH_O6($_P?Z?vKii#}>wVwsGsnUXA=L3mI8I1s#e$ugeP|0b8Z#C(csrSu z;9u*%r{AO(w#Wm&uVm|1=f!HL-alCj5N30F3CI+b*G}F&IMmHVU9`_P(sc+HN$GUU z57*ut$k+4*1KmHMD&*Px^V8^S+s$%Oh4*f28>*4?1?{Yek(S&ueff-~Ru0%*QddK^W z&Nz>UiLzB$BW46G^E}v5h&2^0K_p#0z*BprmgV&(fcP`x)St^WJY)q{%fkF9>f;|>CVfU-mRNh1#{6V)~ zCz8;rvXtBwqHGexK!F^2c5s=sA52v1qn@W`umO1dRpN5TxZdtF zKnG=lRXWHxT}BW4Yl9oVkdy%1U!MuB z2hYy-$%tMbR5pfo^Jzlv9P6&|qnUc&A7iT%rw+ExBYSmbPI`P^2v64I9FF_7%_q(3 zJt)HI!At>LT7H_iD$2vZBwf7#7kWs_!=0O>9 z{M(3#nU3)$f-+&Ph^qBY?%d(xXSL5oz|dl%bvDNa%*Im=XZDLPT_@1}Y2KR^z zVpBzICDcuFpRO>7n2Fb@Zp+gvkCO|)mUuW~Rj#EqGvMthBs>8;HCtLU(R_7if(lM4 z^0qTZ`kW==K-smH`j-Qx_=F;9iSAVdTqg&6av9jSxEamC!{0#)a#2OzLL7{-bx`qK zWEU^mB}|RIMs)j0OY_q%0cb}1Q4!1^&ggzz7g)PjT>a!2SXa`oOR<|^h^d_<#sU!T z9@z+V{#&?}?+R-3(>(re3G#jF{WOK1|Cf1%V-_*5Q2Qe0m5*?&age`g%$>D`=lI?7 z%rAxXs3YO0zS1oYb?cg5k-iqyNTBb#*J!_DktvdAX7{8N`naUb@UbvkN!quiA zo!i-!gW4QyrS^r7empcDLrD4~l3Rfk;o2$wq#G@t$rk{`k5`4QmY$CH*WZUGh4+RR zJoxT}px+l#Gt%C=851qKAS8h^6PaF0H8k@ciC+|==;iaEc0ywt71eKX+%=HvO7yB*6TgIyR2<_{kCQ>a5VISso9xr0#xOM zwp|&Z5vb+t^qj~Gwr3-o^ll7KA)3!k~ek{rFfh^w;q5qXJ6i;4=cUkSwIo;g=^I1&<2l^CFbsc~cpPgw8 zz3HD2W~n8%5S?R)fq#aCy(d2!9fL{A z;K-!36aFSiD(99IQ{XBgfE@Q98(R8|xZ>?9^iF>9cagx!C2_$4Td^kCuKWu90sjjJ`?;LwUmf~8#-moEUz4<& zi3TD0y{>(6F3BK!b2(~w-j9mr?0X+RtNa^3JY1kj+ACtpcwKuVN==OJOnln(U!%Va zWc)L9sMVu)_!W!%q1$cYy7TJRLL62wNzI9b9T$27He0>7xgH)MsVH!bt5g`t8anM* zV+7J@Iq{WGZyj1w^1x<5$}r1tPQIl=eRh(X_`!7{4tHK@TDmi~Wmx{9=JdY^04?)c z4cP)osq}!r=~e}Z_Yf%_g(KIW2#Td-E~Sxis1+ccp&&W(XqxqoZL;cgNpF&H1zGT+ z(Ba>}e%)y&vgoM$FTj`NvgvamqSWPM=p&!Oef^07W&7vYruqV1Hx=u<_PR}g{gfRc zI~Y+FpQ$Bs<><`Yu!YJ4b21qATK#^$@31FzD7a{_i=4|}K2wC7s}~V&$;h+utf+q; z_mtCT@M@uzM`2lc&Se-;1stfF{RwSRZh7fQLdIhwUz_i0HLlXo1%i z4YFk1ouui#H*n8Sqpx^7lh}%THd=o5(xI37;)LI2%n2&%|`JZE;z?$Yh( zMUiK(?0+zhc(0S1=`O<2;o3p-_OIC_*mA!U4#nLh2pZwJf5U`QD18=c&tvK$nq6Qp`T=IMc8exk+pb~faYbKDgw(I$PQJ+4OhI>(xDuW&{q!`GxOF0Z;7 z>Hzmo=`kHnf->xx`~S-%pr(7v&h_DC4PK$_`D?y!+b7W9`0l)8WiS7U9MMgv&t=oP zl8vL~4OG*UJz1C0rJrC(be-Jn>4;W2rK;eR*o;ssMkt7#y#g4+|HOu%S&Uj9$ z^|&0(zSOG#&a~!IPCHV4Hc=}tz7aS{6o?1!zFhbnaA7ael4cAA%X{skEhzU{ThuZ& zN)MD*TNMa8hZq%`q1ipk?Pd&5SJarS_G@ey)NGTsxGEN?2Rl z)Gn(C+i-auL9zre!UKPA37Y-&q{kpO0qyTOJ3S?Pf<#1_|bv};R1Z_jJ!M3 zUc36)peQv>bd~m-8gVg;bFS{5x9axE!vir_EC}H@P}1MO2jeYnDP2+O#gl(r}4RvkBDZ$>Z|?sx=}**Hk&$EKDR4QE@sK0@5rHtBH6fge(MF z(x%bi-qSga2e0GGo#bQSVb&RU@`yWImc!V3LTIcpn^z3}hc-lRan)KJX0iLAK;<>`^rLbbCd^El@xJw2D$0tO@vS)6wz&y%*6xl0x;;kCGaJ!r z@+k8H2Mg%FKd!8eeOO+iL6ApfGUEq=rRTjqeXj{4-!|YRJK&|5=FEFncf9o_fzhrR zZVuwkq;w+#8UMIGY?s zv_Z-FZP1V^RDMorfdwtCUewqx*nF8`|99l5y?XdcWR&4sXS~2*46p3jJsnGik4!j4 zudC53tv4SR#}YG;m+;6rc`xXU`a@mCBHwhPF{CW*X`7G-+gi%WC;MPsDB1x&@YOHX ze1Z|yt*jPnQ8vtD^B=^2+4YIUl;p+Mi=NT6+dH%_{FlSTU~EOKdh{lrElnrxSSYJ5 zgW!dVACqk37WB{JqqImUxJvKVJP5puQ4S^{U&jkZ;5Hf-1aH-;=zF~yeLTo5s^lFo zgBCE0v%MSdqd%@7Vpa?lt6p3B#zvTP-1N^})`BK7q~P6ipK-;??2_ZSMBB z-HD_^5a@#|(ME9PGMxT}Z^~OEWf$5vDUoU@byD^SJljz-bB4Y1FH=nMUOrJWq)}Oj z`ct)XYsMyDoLKUVvdMO@&0O0{Y!Hn)l@RIc5Zx&hJ~t@rn4s4ZB|9lgtjCQ?qE8tz zuWWa>c|D`|vT8xG%MUTR8tIkDxh~b+ry^Q+m;!D$|_Sg 
zd+(Yi^vT80^NC3<{8ue#+Q#ZfZ4Z8kyo#O~mf^cYf-GG2drbegJA(y3KoTqt(s*LA z9x3UG%AzoE%Iz0rdsM|*qii(29v!baGWh+N0(E2Y4=5NLER4ly%Ke(OO`5eCB2q2o z+&wGB_su3t5_%DKAf+_Ez-Bi?`m8>bt|1@x6qP7^Tir<+Ci}mShe@N%Cx22J?}j!P zYUzD&4*uZrRRqljHXEzo(>}c4dH>5fqRJ|OX|%0HD$$PQr+5EU?JFJL5ZbJxf6Bil z(OeJsyDqpkpgk(Zgt;ANX=fqbR^)!of{97N~+O$|MUnvJ#K#-MF{koKDf|SFL6D(@n z|9D5lAQf{YYX(%(wJp@-LratzyFc1OjYklJ>U7-a1dzt6FyEIeExld`J)r5dNPlMW z){4$n3z4ob7!6^3CJB!oUxrB`@H=`bPCd*mA?qm_){YA9)8g`tJz09|=?9kU1R*xE zkB29=;zqC(6q~}_Oa$tvg$Q|yUQVhrqeoBEId62nkgEs6w^}9jcx-6h@_OwgQ<+?){fVL<9SeMZ|0_BKscp zEd^>#_swKF8b~0KzLi7OxfgBej1UO1M~z+tGGzAiMh! z(K6K574UIR%A*M55_iWjl3Mnyl#5kv=idhxZ|Vt=xdpv|3MrOSmvt41Qd0Fi*xz#} zjQT|7ZE3Lorp<_lgxacB!zle|^s8~r?o3a7jPo=_z@gfc{r8=xt_Fdqgvp4C_q{51LEu6JV^0mf-Y`$= zRP?|g$iX=gGf7z~znH0m`^ltNKA3PkT9SuxT!!oRBmiJiPkDWF*aCibC7Qa8ehL*k zet}`^`Z?kWt}4pZsVL$@CrNZJ`Bz`EP&5uBeedg?>Gu>)DuINbbFM~YpTF&YDAMrA zwrZ@KxLTVJ%sP`hUO~pU0nE1Ob)%f)T;96Je-^T?o+c|nscT#C! zMU|jvu6$#Zf}!BFij5~yae6Y-Ocl9A_^iQ$W`1=rAk-?J&QiqmF{PExHrHSOwSBAM zEG3#9$Ns=jVo53Qu+EeeQ%(`St;H)grbd{W3x(#j7L#B`_7_h5V5$u_iNa5LfI18@ z@g{POJC-Ehy5t+tZvaf(S#xl@h-u2W+1zUW4sPMM4J9oW#4k*AHF#Yy(z1%A6LmG5 zoRobw?fNMVPk1G_ZD=utTHQuww?Yf<4Epn7dBQC|NQm_ci={WzzR|H!KsG~}RRdlL zp&MLolKc0ely;Rvo_>S$M2=U4qeW{9&47SWaF8TE6N)MlsZ*hlKT+GQAB zrOO{83<{*m+gp8LpRBeQJUv!Dup*KAD-h77GX5Nbc3L?zx%VvB}LFKge| zLAvt?+*C$OMXhS@pZ6TS!z`vl#Jhc4LmPI#!DttQH#O4n&_f>M<_Vc$L`@-I)BVd%6$!YQO#m}K2(S!G;||UCHfBiUH9@i5 zw|$z{ws`;q9cqgNygEb5%N)tpG3HmK=lcF!nH$J=f|AbNy#f{VGwmBHadbSQ6MM96 zLtpd?s8uyl)Lj~D_1E?GV$RY$%#IR~Vx7V%f!_J*2jlZ9j**GEe(%qY#J!#3b|drF zdK zGt-{AkObwZy)oVQ)7{e{bs*5-<+SEY$-~TR%lUax&GC@#62# zNZRKF{g)X4TR<6p{Lm_5$|z=*zeg?HjF18n>uGU(@=ec1+Cwr82KxHwFx8?Bm8vBEb5QFiJ4EZ%7@$p z6ot#)%zKB z2p~`z=~pqig4q}?Ml-PYzS>FPUyX6VZ*OHdR1|Z_xTN%UE6qPX?C&?Z@1Bb+itwLr z!k(cIKaPLM%Jso|-kUkG?(!9P8q3&Qrs3I&QF%b|^!v`;W!IdG73e{z$=#ZRwoqO$ zA@7(W4}Af9kWOOs zL5dXNc$AJnS7r?!5>Ls0c}U*U#ZycBw@NSYUJS7H?{9mxJX-1S)u-h$kGz+yD(<*N zSRKb$(2obJYrZ(0&$MFd|J_3(`()MU-&%lV@RPuXFejR5Pjh+;l;+j!iJCD~m5%kY zb)K?Z=PKG{N)20SzT^Jv9m%P?v68v7xk+4N=~3T7A?|fz7zAa+tK6i=f06KPcl+YF z9;8ON#!t{>U_s{pjzyAJ(3|17+3`WrQ}}H_G`Uxt|0c*QpYgw#Nj4%k$V8~os(%^m z3?;hRSo!M4!-a^~H`fJXv@dFuEF#GoV~?N?;IsdQ=myV^;ve-iD$N+5<;-q9%ivDx z9;buLURx1Bk70QAdA8gogWXA4%`W*Ea%?WT;t^ z6QQ98a;^$3^p$n5!Hd$v=A2_!CcC@G?rzr`(?>3zv@lo^QfO0-v~~IJpt(M*gC$;? 
z%h;K}W^uYVI7}@iszHccI@s@TgEjU{L|XiyH!Jghn@I&T$NTEemi^aEDy~-}ZXrn+ zhNj7fPKhufn(6oqQqI8NkyON=xwI8{r=R|WMtH>tyLzY8jNKGb^**$!s#GU0r$eH( zlP2Me6}_!>qCdJvCc_7})CUPCB}NGgyoE*hpVe18-#Y*b*5s{5LKH$gf<6Q0^14$H@J^EF$!+znK1ai%6{I9gS1+b*B)E z$bGnwZ7VIF(tDjVC?b`s$^8|lk8ubb?++)wT}3B-mzG7foC_h?T1RHfPQ{R_lbVR8 z*0h+oB&dJBC(rVi*p%6<6BmQIWn1|2NT>MC`iVO6osUiXjwEb#HG=Up~wtErH zw?~B$&xdW-W_eKH#L0$5Ns%)`>$?yb?G)H=mDiH}gRWL#<5WucB+ z-BlEC+V$bUeQjc1=U_QKU`xMeK=GpD+PsAiCg=M6ll9l!KxakzdSYMrZfV-S8}H6( z367dE1w{pWk7GGg4_gnzadb51ONITP%T@eFv@<6zoqHrbvt91VcZUhQXo>QEJ)se6 z87kDiq$csD2WG0gx13*VzikdBCHe_bVW}@_{N*cusIw;flZ`L|Dz$n?W8E%fXI-ZB zX2*zQmkXrFKA^>h@><#DwdLOR9Bbc?X>J9+b#bCYXXDA@o~g0Ie|?n0woyXbeS*XM z*|@Zj#mrOkCn^tOJNH)a=bdkgRvV!!PC;gz*@4z6yW+VltRPRg?$3-Yz(VTw*A1I} z?&cu^)$4w2b8tPb>BvEY=gs`+iCVy5BQ2?r^-m{8@+A#Vf48B@!SC3sOl53dadMz` z;5KL?SPG$;aqn;KH1iFgpN2@=tbj8rTvA_~l8>NC75b|sX2TZfd!|atUv}xkj#Bcr z-6)+YYE~76MNzj6B#)grmcn;LqTDP)+ICi9_7s`Zc-3DJ4)IjC4R_0p?O;xiS2xC} zmcj@n(J&`}3 zJkgM{<@jHdvM2%#`;P~X;0p<|skmZq?Leay$R@)hdCD$L$)S@9Q9>fr&DbQ(M{9Na zCybCOHdX*f94aF8U;OEZufhjLS^hHJs2$o#9niK?EHZFAk)MxvcFNCUaG%l*Nmq~T zD6AXR-J>dccmKra5bCsyQrAUD#G13)$niKy7hE)OQ&f6%r{bJ3b4&N9SiU=2b#acQ zH-wieC%_p#bJPn_+CAbQ(M`#Gnkt;{FvggkCXx?xEh|l^`sTsD$vXvS#!Pokuth zY)^$m#JirBut@0$790GXYd5cwxoc$S5i2t@CQ0O)SPEYyHW9)dWu;_(>|%S{pDJVM zdg*(1+F|27OdVjFcx;#H(+ahEDd)Z#W}d|m!|!Se z(OZ=HacH)s-hlxA!;f_rO-~AHWV8@ZO%iIds7W|V(q+^`Ijl%5AH4QOn(mA-VG1jr zo~8%amwr#aeeDY)_KY53s^aj!S>x+1T%l+LMYdaEX!gSb4(O`~&S1}25(dxM8p}ng z6=92`)s7h-DVES6+~vJftM>42a}g+9^x;q(u3dD*&>>3gm^!a}xc0GdgD+TArpBSa zBh#gx{n&pX?#6^;!L)%P(n|lIh2WTQH$L8Q?b=SwGP?gj3a){@qjn6RiD;?wi8MW)_=ZJ7B}5`w1J1 zfqedWTr-%45idk9RHa=(?BxEByJPTLF%G_=u;wI?#HA@kE5hsy_My)uETVrNj2VUu zXh?_?sfJLJ*&r@Tu^K~zv3y;yM-)?qPux8CqSjRU4pDzPM*qADZ{rft3~%H1@ya>~ zul_N6d;qCJ4$U7*iP5 z3=&d2u-Dx|G_;Tbny~lp{V0Z?Qou*hYukb;&ZM3I_A!h544oz&P_x0RsUr>biW>`{ zwCud~@hkvn1|L;;GlVJi zLUF}eOaKfj;dS#^)VpJ0YQ!C<)<-%PPuf#Td&T2GmAVX~IEm zW&YwOoc^WpcZI;dV|eq>^sxwFdnB++>VZ-~8ngKxb-sltS`>xET&foz@{&Prcdi)L z1;beQRuC^J1-V`K5b~h;mAhK=6)Y}QFoi(!Ckqa8XwMkg$-YU%kV$UR|9Ie39`m+l{Ed&7LIu=ymsDeVCetz_YEIaZz&mmfy~zx z@*+6-4Xg;IfzmlSqD5mi?`9~6n=i}}5tEdqH>Y`BdIFV~>a;GcfjEu5U@Z0#+83ekK-sa@$)hEEfu&rx`zu|pcp)o)B(z2yP6X{J+4n;Qf{fe9`MfhK!P#L2TMw; z;>>NsNhlnFg*t$z!-~m`SnXo zz+Vy$knE zWYnEh-8l!)Z(7YDZbQ(BvS1UbT5;M%>UTThb>4+NNK{wMcSQQb_9RvEDE9y86LGuz zh~|X;C!>ej;5zBdpy5)?fQ$A;6iD}JcCAXVkl9$2LR6O2DoE!>RoB76x(MCe zp9^+1c!kgeov)vOpY|1rXI)pBy{vEyko3*tM%opZ{{d#J z4Eo66v05k^*U~lUdA-_tb86pc#KLC5`{Ev0tgb^r=UDWgemJQ+JN4yUEpOa*X-F1K zv4Q%IgQi-|_^9^9y{mDEhoeRMXSL6${8a-QO|6gfAXLOiBk{LV-rr6md5^D{R}iSN z2OQYCK=@Ar~V9 zQKTV8F#Eon&X+1bIb~;pcz=T6f9aMts|ily9^^uq3p{1Oi-L8`3H0l=8uySMhm*^b z8PbL8NbWMsYfmYV6pCOTmcyTysnJp`&J=q(Y7uJ0vnQ8VAnXK>m$Ptwl?Tt1h9IDA z_d@y&4-$hjTJX~E=Y?w_(PITS0FRBvTK(IwaTu!UE3_=ysxZ3%Qi7~w$;$6J!Tm8M2VN9e9zmQ^qIDnCrjsGBF60J`lh~h(%=E|qygOunal-=Z^4N)`gun@bFuq|0U znlLjWl?wzlg-a2LJAw92*wtyux95nEJ(+TEQ2#b50ORQW>U#k(n|*+PT1#n4rV zjmhVp^jnJr-pHUuq&d5zk#ac&kllzb%?e|)9o}&wj%vWci%a}(DKwxv!@}_O5(x#{ z149TJy+!^VThD0ivG~Xr&H5YHK9KOn zad2XP{YBLm?)r7AZA*j#Yt!ED~!f*sNAt)}6lA|8ebjl88 zr?G_W%$wW2@Wgz^QD|JvKIFr-WgoGuE{Hfq7L&_zc*Or zU;fbcO#WUdfPhcGr0^z8pL{?VaxJaYE)Dy(RA01`SS#uD)eMU$z4#?~D5s`76L?$B z+#5`2e@)-hhk&vOv(IhBTd9~aQB-wK5q#=L&0a9jY;N7(q;`-lZ+>JJQr9wNEXcQc z=4qqG8Zhqb6px^tJ((9%@x9HTiTQXb-MMLoDYCQ>=Li02&nFiqnwtEzqIp8BzEu|S z-s0mo0@xOkiNJ(BM9+S(g)MMd$FsIQa6d^R-SiZ4PTuWu5%hd?5@q-{h~Tpzyz7&% z6$EN7az8W-1<+<99nqkV1Lvi+Cr|hc-4#jukN0=kO+S=TpklgQVv3&aw{II9(~_TJ z{ZuF0;;E#+rE%LfS@{Ft3wEoc|H!4@z@Pf2xs4=Jv^=@L_~OG;0aB7y>Wsvp*p;Zs zT#%TKiPB&Suo0W=qvgBuGlI#m+e+mN<7MNM 
z;;uj9kf<@OXg2&shGkNs6TbQb7L;LeXpa#VIIZ#+q+}9&Fmd!BK53M`giM?Y zgFmI5oFZlK;AGkK74YS1#0KLCMclR;_m=L;Bx(lKvu@3|C?Y0G&84~xb(H!)exsgU zJ#a9Z9Ou~jIK3;rR$sBJFPwUO%tp}r05~-@n^Pk~rPR^6$v0@_Y@qn;n7g$FVPNJ> zYu;AM1eMz&0>SGnA4bGSa_s{dZ+!hi_SmQUDQV|SZO57p{4O*oq?C?i+A`^WOCw)M zD6VJBvVoJ3du3h7(IcKYQIl+P5s0{baIxJa3lKw0SFjB4Sg!q9`ImT~-9_sg#p6B9 zJ9C~hh_~EFWIGLXAAu1s6w+v98MqKF-YfjFAw9&}Gja z%&jzz)w!ph1|Zd+V->xxEJF51+tmk4xA!anEji=UMaCe&lyE|N&)ZQNL;>9?C(g(( zX>@(Twz+ls9GO+MXgXKI%7^J`07G4+?I-;Dw_5Zpmt+3xgiDzIHa}}%Zn7(%4;@P` zOhr`gH+VL5gE3LJ1*8*p!DhqX&-=G><)$&2`(yo@;-AF7im2=zHmOcUkz*)G!J5Q& z6LUUr9Sln^dQ@=j!Ua~Md)Hc|uk?6b(mY}lWe5hcNdKLPYgaBFJ)*qjWxketbGNNi z?RFQ3KbISu@rtdLZIvW9-q)ebIL=kb%h5_OxzfREGfZr$r+|K$?icSl@}NOAUuJEi ze#V#9UGgl#NEb^8qYTY{(yZhE1Uc+$bGWCl(Cg>O55DxU@Ti}OYup&n%2#OdJed|W z07(s=?s5U@Q@Y?5)`$9OTwiuwo6ld4nn}F^s;1t5KT@jjI7-iiu{&Dm`P)ftD-P!k z#;MjR&GbmiMB8!)W%veIk^OF`(WSgBcEkXcArNG;{^pEpRdP(I95cE=p+;tjFG_#X zJVK%17i}abC3Qqwl|syxjc^;QfCQ0~-8~7umYSH~iOe|1sh8hJ9g?4Ocs>qVY&-H{ z8+IFVh&yWw^zfpQ)MYkN6q_DDZa|GZEi;jWe^FKujU?VyY@Qo=^k)cxDQye&VAQefE?LS*-5chBG#L*=dM-# ze%pCUl)w28oWg50sG@jl(s$1uAAgC}`392O%GR*M)M?;tIf2^Z7N^Lr#eE(gutY>s zb!+_6m<%HKP>Y^!#E6)3Iwx_BnE?rT0VTB}sHDP-m9%vAr@WQ$ye5@=~)(Ks8`pqQg zkJ80XP1h<-jfq6+@E!{3Bka9_-<)|_)iQ_%s}wplh)I2)gqrJy z>>)RbVL}=kz5sHy)Rr~j#WRp(24U2_;%nY;-^hTY#wXr7+-e=oGjX}_X8}5>32GOzH&;cFUo%TAf)@;b8a^{{@x{nR_1AmxH>?2r~D2OKgHuTS>zV*2Ugmqn|&IvwtfUBHn)p=FUs z8l)|P`h<__k!R=CaI6a72@1VtyjF49fh>V1`oLx^az}A0ih)Z1{P-wuMZR?xDGQ2Y zwb7P~fq!Tdsu!ZyL@(Bmg|neft~iRGFvz+>!ld-8#@O1hC8RTow+-?hDmLHm>yF<_ zmFn}BFjnO=#1M)01%3=7J1Z`5F5fw$DJLvhq3mA3DKpv#C(*`?$6lIqmU$$r%sc0s zGNiC-4DoICdBW>CH&5M`-2DSKl8cn#ox0rxEiSJmgyrHfLrPq}%#%H>^=Q|X;=KUo zO4Dp4>;mBSr;Q!S!^e2<$S9IVU+C+yxbM39wRmLy7$PeVePO5Ja5ub2IonQv0XzXQ zIt#Ta?YPzn%bTa{+c69C6btAmGBeNSSHwzDESPgS4XMzYoRIuGcPey7#Ezcy^V4)Hfm>Nexn$2+662DsAoou+4&q6h$A!*pB86v_tYPZ7F#sK)nCLM56p;O7t*kd99aJ zYP(%ojyKJ^7Tj_Q7~HZg*LQroxecRmyu#4yjRD_#DC$zFj97A2Yx0mmfzt046${WP zHZrxa+cPLgsPF+9(?#O&&NIk3y%G|Z>c$L7aZO)OlVaF`J7GdqqPtf#Iekn34aGt+ z9lnR~s1L=010d3Byt-=S`*u+O(EImvZy|6+-0m){7{7%ENMqiS7$3kHyw8<^WkFG2 zF^UP#({|^YzS0I~=x8OD5S;P7FZor&`BKV^ISj?XY=FLd01sbuzh7j2@}1U?qMa^8 z+DbhEr6R@g8C+&qUVT2J7x4>IU7ZtLR6)|Ipk$c{5@qws|AzY8dIw??M8q84tK`kf zd6d#9uQUhW^wMPDFH|3GF@wY*sk~F?EQ4YQt!C}K`VnW@{Cp#Ar=jjuy;Q!Lc!xqH zW3uk$_FE~R?`PsqmbWxB@G-5~nzxzRP5DAu=125pvq(3n_KeuaN)QrsFQ_E?^6w2* ze>xqe(3kfSi(mQ5q8rLB`tu$=9@noJU_4`SvF@6~);iqTmLd`()MHjIx&@#)zZlAA zD2__Jo2C^x(@1^2+?uY~R59i!623CJIsz$lOfLDCpE(pzp2$l}m^Eqyds+ zDoO8txFf39xSzQEDW$7$S;_P=!nS051}>x99T(IGk}n*5 z!IPS;@A=FGB2e@ZSgQ+m?k%Uk(VOs?>7m8A>y|PiT+5!Ptx51)33gDQnvXfl<0_oL z^KUJ{6$*QmFaq&?_i+b4GxE?~+gaR_-g9g3y))l9HjF|O7$QO$62@7Mw!A`2M%3aJ z6;9;#7mB&+sM=B)8KlJt1>LtUZ z--r_lJh^mmo#RK{-i5|(HD38;wNoBTar5DMWG`2)D$i%if6V*=Q4>r`Cv971=T7!j zhjOD4Om-R5m36@3b+&o?mgPMgSU%n*c(D=Ag@tr@H`svbZ&MouA#jZM63&ldxqSc9 zUMoE8wRdI6X_bWNF+sGjx>}7D`St52m@C9cBEYS71rb++zRre+CKp-4yDoN>e6A&Z zRL!Rff@}92a_N1$(YEg=jz~H6;s;su>dKEcq9aI71cnJEb&ya6gx<$mqrm_*cPJpP zokI?=7Taj-n`nkgWv>Pe!(|`YrM@*L|28k$5rc8+QdvweKvW;xpRma#Cf*R;Y_<}8 ztP)y3vpajF8dM4#Rw{Ia>?5B!y|8}roySq+QF2gS6@RA~xsD)ngGADx-Xz0F`_ThQ zA^=yN5`t72^@eQPnN<%TE7b}Aq=wl>a?zw{vW$Z0qy=I0nZ99~t=);G`U_19?m2qD z^8Pq;5ZOCzE=!~tcAyytUO-ZSL+bia=D^XFztImLr7c)tt}Ts!LI}nD{W3%8r`aCW zT5d(F@q<#&f|=UnkL=v1>skcvpR5p{u~QJ%xqbO5O8s?G>X;ZBO70foQAsc!Fs3kh z?Y)uZd|~>_1rZTVf(M%2k0C(2?ntYtQjg8JPX1nUG_glEr-K*;J(|dUtfunCpg@d- z3X};s^f>}Zaj(zX&Dg4|DN~^->Wp5(!}_qA6`9cXO6SZ4kFa0%8f4$(Z0*8a9|66g z{V7>dh!FXdMuv!(DQUvFC^>K16X^3NXh`ZJPFTD*GluXX`A9=dkJpLWD82G>`w1G_ zKT}Uju%VdzK+hGfGfEY1d~??av@YcFft%!qc}y9B6WGGKJ=1r&gIKM 
zPEZO^UdDh4CBn+Jo?;m#k$ry&U5Vx(y?=nKMhbQIjj0J z_-yaHN}@<{f2PC^;zj?{NZy+L##W`u7AF#t54hb>@p5-q+~x@;|$pEynGX#B*v_d_Cj`uhM8*b z_9WQYKC)a2rZBydofl1D0pUwhUY}Ij_;uX~0%ud``Q=C;&rlF?znJb*KtO-U>lX>` zZ?H1ln!hcB7vc*ytt$taWJImpFX9O(2yh4jh*Q)cN^cjXBRIZyyUv1HEz`l=AYpMJ z)4d9nAP~Dl9oL=|M|m=W;H5R0PyIbYxkruC1dH96C+ax)pNH*jBVF^h?q9qQy7Xyb zdQys^zJwn8it&#CCfSrj-QjYLu?(zRZt`4%_pf9ogttv1$o41D1}DD)XuHlrPLas+ z?x6<2iF|}M95C|>Rvx}7E02<_8{|$JH*b$9O&Zg+lHbBee`rj`!6vKfc@VFpA%nyCLr;|D$I zEHuQJ|BgVvo}Rr67HWRMlSoNJ>^qWY5WiG%Gs!_))QU%glhzCo?T>WnKP(k~p0RsQ zi-sm2ZUde#k$pb`Z97&dlkb5McWIMl@t1XW%JCD};48E-TO(*Q<*Mr^TP7nL#5s6P z-V!q=UsN~#Fg>o8dk8s=wCF9u#DRvHTs0c|yykM#7bj~E*=%9k7I^4bkyv5U)He)v z7=4g2He)g6a3J7Rcgn-^P_yx;n{KjulM>VMJ#=jy!{qimqtL4;WM~xcc=W~@^PamDu&N|x@+#3EGS`cT9(XtL3`gPXYOpn%G^p-GTo3FGRT^;xX22&(* z4q=$0z0=}u9sIdaI;kBU-!ijUD>nxgFpK2jKyZ|1pz%d&iH zATrVC(67Ts7qMgE54|E638|RgUu!_>@scIok9ONS$Q*mCNX!dBP5#f;linagkuPTv z3<;~Xf?*rs_k7l?e0qNQsF9bfqD$+RJ9vqlb$9;K8(&N>6!*Q2;)nJN@GyI>HW5@c zT%4<9Q)hVf^6!Q41m~WmN8aD|_p=|$*1dBE?$6BM z9@;~?kwrL5t*tW2xwqP#7d`Qn=xzFbNKcS1nYWyhgzl?7Dkk*1YQ5K!_=BEmpS2Etx4Aop z^{JF{jRps?)k`k9i8!srSI@bF_RpK~=6&bg_fJPu>T#r*v(XNy$>dIscARzRY+8b` zh#kcIEIo@4>T^|6@gi7@{K+gapCodA8in4V4k!lh!a|mrIswtSN|h%(6-bc zH|?{Qd=@No9~K+vZC|_1IjY-yH#W4RkaGLft$0{Q^(~n2HyVH`z$kBTVQSu3OmT=S zx1V4T&T6oRoSosxLo_8>T4lX{t*A9uapc*LIB3)@E72+oy^jW!E~iT``eo=Q2#^E$ zpXxqkQPex&QR7uKTdr3f6+X#?QcZ-ZndF*iKFO}v|D@He9U#sz;HU-U?J9$1`Nl)LA%U53Xm?6j2|&n&c;O*xubZLOjYmxHOnEnLR@y zCrCxJ zFH)tBX9G|N^P1wsVe?G-y+wNvm7Qjtcj&pXTko;@zweF?4eD=ZG76=tCE?$0KSi6d zL0ExapHT|wo_8w)CWg%-;U&+~d!j3qF%G?2RIcMH6EzDZ3V!*}dxF$*0DHn9!<46Bw@&aG9pcjgS=yymRAWUFD#%Fz=kH;kI z5&mpHtF(w>7W4~Hd$=q*|5g>6=?#6)=H7n3m% zhzZU-W6HyjAgNo7AjN*|g^xq_bQ_1<^J?|ccCB>BL}HE(A=oU>SandjMkL-CB^v#d z;V~n`Mp|GRJW2^=AzV%QPO%;D#q~_V$8Qd&O(N;0DMKu3In#Tq52kNEuFzu;#1yz@ zp<-4@`*Q>-E{ZMcc64ROZa3PfT-?~Fb*$ZMcNfFViEukyb0uVD3gEI3(h+*!024ET zM`2>d;Ne-(y}F|$mxu?SF)bL7H!crvJfF&XBeb40z>b9=gkzPxgDaUs=d@SzPYWY_ zVu-rSB_-!g*~e7LE?kOW2B|?2zcbGR#NmGjlf<~zd8{bIcDy88tKR3@pdYGSCvn06 zR4VthQesW6FZ0vAED4KCplB0*>Fhq8lOUrZ;Pyk^{wM8G3*t$pkhJA;FY7@^mJ~I* zG+TD-B1OUk!n&&nW}nY4pnR4D-6d7MuQD^sg^)>PS(%d zZv~fyO&{zwXNijHc9J$XAdX5G3m+o~(OUv^junvnVDSVX4x5LpbbAlRfgmoB3iQj} zF8H;$an3Qu-SSBC12fU*#loi9nV|h2#O)Day7RrV$M?^$rrJa$l<`-9+I%jZz%4Z~ z39V6Uu2P9d0E@Y8cP-n~`{(xeDT;RZFJYwnM$wX9K*yD19j;xIPahtBF8Xd8wSVn7 zhFla}#DE{3PF$xCE;p;Zt%w8(_cI~$YhJ2cV9P&!{)2LH9moA=q)`7;+B;<1;ieJg z=I>#wBwX4cldFc3@x(L~%-#DMFFhM16q2LXP20O}KUy$>?J~b^Hy#5z4~+qj6*Ph-BY|>%GKSeoeMAgJ|XJ^ zWHCNT&F7G(3z~MFplxnPC#8YHU#$r-;{zi249 zB1#o@Unq0E=Dr)dL8RnVdk6D&po_#4Qm{P2;{68AN%B9e2$MW)4QG+Mp=23B>3MIi zx-p2es|6DE3i-4hny4+8Orv%&=Ppw*T*m$c)QPU8NcWR{+N|^yI*LUMIvw?|)u=&N zg(RRnZ)R`e2_mn8&tXUDe&;8816PS`YSdN8Bs)i>B0QVROgN+h)M(~HNYUiTsK05_ z^lN-<38~L9=->)$rjeB}xIf_GsB-jbAcgQFma1psTMG4E3R7#oB)gN1l}m@_j8%K> zl(*~?av!;L`zK@~9-Lm#&!HC@!kD{;W?8f;T%y5I+ih!H252rx^mOgbgoWAH7apM)OYRBnc4ZX4P2>gw;dBopud?Qv9HSKY8N!%9QS!Lzo7X0#rHG-#hc>{u3;s9w& zWYO4xMT3N=a|M)AWL%gvYkoSrX$zpuP+nPUqKusN5x$$1jzz-W@$=T-0THy{^6=Fx z>Y2O3=O#KEJz+Jp1-UGBWT?zD=tWI&X!!JGNM^ez|LK-ZdZNa8H1YnIC4)Lcz;%h9 zfVkH*uUT6)_A#)o#J%h6tZ!eu?2M5xp}MzKh$AbWNp%(`$F^$rp~iJwu~&wxd5bTi zWGDs0bj%}ynziUS-Sr#?tEjicn4G5;;{%PXa3)oeVd{DyZcVNtCzZ&gK2k>>Tg&JDl5|sBE?KyIz7@`9D=&pB~Vz;AcoFp!~ggJ-vYJT zOo#WSglJRDZEJ$g2ND~%LX>Lf%h+(a6H468S&Lc^*@#(ub$rowlNcY+&wOww=v?+o zyj}RzQixROy<^<+Q+b84w%AM5vuA#-zoT=Vhvn;ClxtjKExR5MGvw{}ytu^ED|e^% zw5ucOBTd+w!eb$_ArjA$k^aQs8{<&R@xMiCTjdwZ%0Z*67?BbT! 
zGey>Y1<~w6G$WJKP8jV5wHuzg^+q>!%}(kz!zsVVC=li@ZxK zjKX7cw?msL{NFF~8D{WZ6YwS}__F?yIaVRXvL|W6U#fV>>z_-RCxj?JWPTQK$-)Xj z(E@fww3kYkjtwmnwR-oLdkU!-hJ@j9Ey!P*DAX%|zumP*DO{n87Kb)1HnApLC9wf7 zD2(OucGow(cGt~eJHls|_IrE>h# z>?$20PxL}D!z9lz5#aVc_^=^|meR=NeMUd?57)ylp*KSd0O1slJSmCQ{O8~3O%oV!;`$BEtb4)nk+K2rLmPaf)!MgrzcQ(oj;rJX z3o-SgCd#eT5GOjCO#qF&Y40f}(H~a)ebJKCXz&Y*u{kLGb!=nzS|Ou(n(}aB+516E7hP?_*ALFw?JeAXj)n)Z+B9i(Y*X4bmaEWXAKrTQm(2 zHl3&bBT|GP?xMgb3aw*-u4OxVH9EW-l$0gWBZ=^zeS~NC?<-8FDtvM? z!VR=%|31$oUtpp2XA^3PB;R^y^?TJL=4ovV+7ify`FfIXAKjd6&nnjA74_+diRO6y zwvhue&_)eC4FCFhKJ76sF$qxNQV?prxTk|R4KBk12Th+Lgo{m}v;Sh%dI{uX`A*rOLjjM6)`4vA)yIK#=ILD-yos z_~I@-*PZR~3cHc+HE||#oCqBjZLWZ-9&Q;1y`nj+k40CQ)g8;ImdkZbeh6!zNi!29 zQVsRGocpTO)Oz3EzX@mANo{#`oO8`inGXwQs##0uk%-A>Fb(TUGKXGSGwL4HTkN1~ z^ygXIK|)P%9?cbIOe)`~F2jl@ zk6&L$7#CR6A-pRuoJ)8SebomIhH_9qrh!xR7_+ZsKY}TScH-lQ^)`4PuBRdeY1sYl zO(B!p75sctK#K7S8f=U^Kr*)4AoCDWBw-VyE06kTrYf2e0k7nwCp?_MSfe(k)zCvLpp zciG~60vV9W^?JnVJU=?({?4^ux*uyjiCL?q%x7x7td=@stv1I?wYdjo`}55p5amL0 z<(n7%TC4FhD=|!Afl#f_KAC42i_YK@JCSN6Ef5W*tMN?I%?W@!0({r%%=RtJ2UO=k!f?nOkVC6^#U+m6~ zFmHwYtR-_Jy*T6rT>)pCwd#^DK6&p*GLnTcA{8D?wV2*qm=+uWRk6|M4VBBzv88Hqi zJi_*&(lUBZpnxU_%AE}G?v^rx%FqjNa_4QlR{gr{>G6Im9PE;Hp$;6T7juU6&0~T+ zGEOrf;YlCgSiyr)q1E(w-A^_es+N`_ULbRM0jd&?Qm9a}AWapfwjo_d3si-_Ma}Th zXE7GW8OAwV zcYFP0bDaBeGZqUYG6|0~J{14l0*F)90f0$&2hcBEKmu~K5>k1L(0C=X73hxFa~F+# z)zqdV#M(lBTe+L8t6z*WP6~uEVZ&tB00Zb&$P{GgesQy9%!so#`k$?nQWPR?1oS}VJj1L{4WbYHhwS2uvHg-f={`+h@%@{gpt^^^tZfC zHh}Ovj^YGk0}jQ(%wG`3es{jbQ81(rB8TTx6s@_7G_T=|ISCkF>6~zRMIKdshipcS z1un%M1@HWXGQ-x(zTi(>XpJVKq6X=-4ZG0Oc^DJU=y+5%Vz!*g+Ka4RW6a0$aT6z5Md}T5G^Tr~ksNc*tid6S#`+Fg!Eo9LU zJBh01FI^Pgz_(8xyF+nk>hK5Br%}k<#nv3zxuT4xZhEDkpX3j)hf5}u%`4zi^7ndM zm1rY2oBVg*AFw`It`c(3)w@sTJT4Hk(#r4d z4>$6foc(Ul6R=&DxGBGG-M0HRs~>s98V!EF_7ex&PBi=ux~d{%G*5S>Y7Sy=Xd;29 z%jyX{bYdj5ykA4wJqy!H%{!Q6Av^Y-d94v^;%0VJ)Gmxou|0rf6C+8x5tW?BJBWe` zZN+XIL1U3tBuMheSpe&G*=3=}pi^QT1tT<(v7Db@8}*e85hd;$uY;8r5OqCY&;Dm7 z#dmxl|2?vH&S8fi37Q&_QqbIud!53_WP~;+x1A5Wjyzus7a59_f_RMiy=um*1I~z& zb*%h!i;jywY4@XD?Rm7r;p30#hl` zUdA9>jrDMl0ab*`O)sZp$8jVN25V_P#Ug2u$(ghe>bB;dl^PDEk5OQJ^Xn_@V&ttZ zr@ktTQIO!A@Yb_F!IiN6_V(>|K4{@zfXrp7faWXX^~;#MHQXOU<@yu~nS`!yyC22| z1-6!Z>X>|I(U!on38Y#|@LWl??uDa-6H*|$=6>nReo#V=Yj@D4MK!-Lxa}!s>D$rT z@P_+%6VkeqzpK``22p#|McIuNIX1Khg{FjSP<7s`_reLR3@xQK#;JbTc~#me zxx}0yq>Zn@<^BXokd$3U+ZkmsWoV$YE6^-91w^TAuBx6KNSGF(Oi zSU&4u4nScAcy>88Yq${-uZh*<&iurzPF*WDWgujT=U{U-P2^*m8d*kn>uNW7N2*Tf8@=hk zzgHv4^Fv&UjdYG&TN! 
zO0Oq)a+#k87y4fzy`|d7C`L5e(9Zhona~4Ka4osD>tKfINg@7-y-04_1NFWA479|- zT`dH|=nz3C8W4GX(H!!Nk|l8_aA(cQlA%Rtqf-JSSRGCYGF6+B^?xABq_A>%ugzX- z42D0QsImQJisYMa?{6$cLiw_pwCt3+nZ!tozfeo2xyg9tF*(t7ZrzHvg93jXncZYn*xC z+2iF?ZuZrI6pW?5aZtzLX0lAUkF884|BZb=)3lk*6_$_h$U|=)U3noco3{u3b3QA9 zyrW1vkY2!Qx;0{isI|pR6Bm~aa2GeXif43~16VB>5=qMXeYra!gm508r9sM=M}x`@ zIgCM4Ve9F3v;@{dR!cbbuaLBPqx8`qgIr)iB8iptwIYX^NC90>$f+)e7$fDayKs=KPVyXg_ z7@6ZLn|Uu6pe)kB$8Cfi)sRwST!aDr<PM&9n82dhlH3*PXSkp_Is^Htb}4Q= z>>HEJavma;bz7aexMb8w;`bVnVT&smYLA9BnN5O~kA3a_XyTd}7M88Z=#$A{jUaK} zT*SJU@=b3AjrsLen4Y<#@N!pE%%ivw!PgeAQYtNmw(V+U^V9yfw0QT8*!oSkiP|H* ze!|yU|5Rf8OOC^BSIS>@ec$|w&V%L>A@c6w z3uLdL8XL7Ew;nUD)8-5)vCJX3Ux3d!{Q%M#@geFBJWN4;4}R)Z5QXJa@fyV~+htU& zf2Ik5K&j+)+ga5*FCi;)fJ>bDiDAL)m;;x`2?sx)^~-My-@{aN?a1%E6q3m=FDm1+ z$@(p}^=Zf<3?=u6KjgW40j;yFL%AP%l@4;YtEk`cj9O=-0FO}~Bl}jg` zqRf@q2H;e!I%DWoU~4KhBKagS6c}6RFRHA*g7l)o+z4VXIn6^9dIv04fD(aqUzBpG$sKdPN3K^>>KHvGL!GsJDMa5E)Ix# zLgoHeG@PA#sJwqQ9M{tU2-_Wsl6*Y{m5unXFq)IQ3}`Zp5VaZ1WUK{I~_LQD0Sv2Qy6!UEGtloQgAA z&mSK#8-IVE=Q<@uPhFPtjKqLRRqrmw0nbl8vK75QJC=*c_w}gsdnJ zoEin8e}P2oO9`F)!uGP>?0NKggkUhjbxP)%!GnGhiu#z}y=vchvY4tM9gDQTED#3l zQwfxTM4`_OxJGZ&tWCMW$lOMxQ%HU%lh6||vv%xcy3XIQ2;GOu(Q$AX$w#5t^a{`_Ptp!wpSfhp8rcCI&l11F>FX2HyQ`2s;9~a(f){Ju^xe@$ zh2W!*hmRD{&cO051u)s$sk;;{SEvcAqLCI;l{L5arfK(tF;gT@>^T#TAC zm;mm_Q6d8{2^cOXF7aE|L_Q++w-R-BHUIzJe{N7QdMTz>iFZFSlty)p! zs9i2heRmJvsiVX#|3nKC4}W)CYBo&rF=j#<)zF67c|G&}@S%=Uy@1)DYd^Cb1l`ka z43?K_@O-asBNrOEzlPR5nI)x5=iXTi=^1w_OxuOrJFE91ZeWgq^d((XqI2#suHr2$ z!V$pZ@;y~?Qv)vWcC==QtS=sd*}rRdC;^ zrDG(Vxl&DrDh~ZzDD_{^%mqJFA|=9fzV!Lt>nmt#c{qrGf#eCSDUKKkGfK@4)2z&gTkj98#@H=$VA!cYQoyzy1w zDA=&_%Kqu;bE%N$As7UBlflwFX+%R)*PU^EyIn0GtCmjAIjVgfJ38OPy9Uq|}YiAZ)3*UbS;>^nZKE{WmgC!#rGHes<`}N)DL`Z}i84 z2W(qmp;xYhrp-*NXOxQd8qMqaLBsvmFD3V|65^?bPR%art8IEpzbCyrb-gb7*wP@+3caO6@z2;aRL^WTI-;CN2feC9KpgnSiV;QS()i84}hk zjY)hmu10L)7sVOU!ao{K4OMpYBC4X1e~4X zx8elTdEiuPMtM-en0p*k`W`ZPTn{Sakv#fvVKhv<FWuI4LYL%`v^Z;_n8yKiWS$GV*#o#@i}iE30#CLf>!7I%i(su*S4 zU@_rLa+r!qmM_BEzO&&_5HuanmP1xXwWP=K(En#0CP}qB2D}f%~!ze7-S;!(% z23|k{mK#Bmyzl(X@q9Q^Y&^Sy!ko*TRyjeH&UT{0-J$wKi$6e@><$+N!LQIqnx2Fa z5QSqCV*h}Z18J|;zEai6B7eC^LzOT*n?B=yy6rQfbc=6G`*Rcqb)J1~SpjjEWFl@d z@KGC+4Mko-lfiN{-$N{dacgqJy-3P&LH@=Ah()IoUAcijIZbauofM_O@ZajAi zasV1lvxNL*$QcWx-4TU!?bvBuZUG``T<+R`X%HwLUg(*4K@{+fh?P_*L^{vUIDRKtWhFSS?L)vG$~nas{e6etbfF;w zpuH4PLj`D@ZROP}ba;?QOm@kI`mdX+`nXs`GCCbjr;JoR_P%x%!Gmg?6c19fn75U> zG~fB8JQ#B;I})3kt02D;vi(w)jEs~sY$t7jDPqM`Xju{$r7Jqtu>uX$AXd7R$OLwFckyESlD}F~i#9hT6XE}f&J|KB#{JiNVPfLYV zWR2;Fri~DEI`Y=*I}8ieY11JDu7lwNhqd%e5A+RA$0pr%YsS3M3Vvb%Ulx=oNrR=7 zUrI^eKdfo}T`Vqc#~Wm}X}|xVb~o!2TDMdboq9LK@NrIHV0gZI1NUdS!@6v4NL0M~ zxu4MUy0IU1n88xC{OGtGfuQ<}CKq=5fXgepS&8&(#r}!(l<^2<5Ur*T_Aw(BR`Rc9 zE@S)i{fKq97OT`XZ;TC#yV2z9u21K{yV_yb7ki;))YQ7_6S9%3Qq1C?Z0DPw^J`S{ zj=xPx>zQqHkQpm7Q~{Lb8@utE09NBU9u5T}##2g8HuCODhl(#hKW+G9PA3;nJX5r$ z^2MZwPNmpX5)3bmDZ97IaRsKo_B);=tkDi9*VsDN9gZsIXpkvx8P=IT?yE?5F`LsR zSxkI#T}=>2eMEpG{&AkeIt5o1Ko+@#v>B`DDRTY z^|gu`iU)J>;AD8Q&2pPtnD1w-Fqj$rWx!wh0l4Z#H1LIa}PpneQBLzwm)nGUXNpppM4bu+c#EJ502|OPB#sIKnXF8jEs9qhC0<07@GR;D;Svu<)@i#-N$;>_gCJX#8L*_@8K@I)iFC3tj{8h@#T(7 zu@Q7`wE~B&PCd0(Q#L;w2!G#3z~hp8*WdBg1f54vu=YVKcGADb$9H+boHo8m1hpmb zwToZ{j>L1%o}^oaA|xRZknj1{9wZaHl`L@Cl)NS zTxZXeR|gkv&5{$wigD~6QzX_}emOFIeCsXMG-LS<4W3SI50cq ziEIa}PQk%&N37k)Nod-+c5Bdwkd+0+w43|9TPrE3o7?hUQjhJ*hg-{X{~+V08cKTa z>mK`RbAI-E`zs+yGQ~w%?M=e6+4uOtSv&|KLY(AT+yJ3_g`9+SZmUlYYHlSeWmyS5 zi6CGmqflH%W1RTYbN zHTK!!JH0W3k3oKk1s1GFb}aZSz&!nyHiZk#d`ZEx{q02>*jlJq z?KEp-qv0LQg@BoMofZ4*y^g6hXp{TNckaYRsCj`xTWGoYo$l6;Z|}l!Zv+&~2P~3C 
zz2I{1^(8BJBEaL~Tk)$Dx{l=FG&$-&%A>uMrNx=(gqFp=icdlZSQ=7|t3f64F&4mW zoX-Z9dM*Z$f4FJ`pRI~Y4(D0VfoMGI@p^w^=aK+*hlzAK3X2Y7wwn+E+r~*}^!$xQ&YgHrVFs`~&1G*XTU@#f}Wpe1cN~ zTCb>hlxQXz+|n%sFXY9gX;X(T*?EEXJj>W%YSid7>usOOg}y(5WfLN#(_S-UFNj$t z4G+P-hBsL46w+{8oRh<4bz@7X)Wysza{Mw^m2OW4QeE%nE7TKadit}428iarx1nr8ju5^Gw)&W`8@4MAXszCsu zq=w*9>R+kkrOvWpelUcT5bOBvUmHjW4aMm8CY9@K@>$Qz?FB}}nt%4GtNnvU5tJ*O z*)HnShno{a+xjeS1lpSJYX`JZ^0J7fQtUE-U%YS@TTsfp=4pV4%eCjD1_K}QZR*QO zg}ab5gqG`?#Y{k^M~AC5Yxn&Y2OcyTm;L4mLn4Cf6XEqUK}jhbUh7M5{JGM&OatlY zc()&s-T#4MLblYeh)2=87lJ5o>7o~n)j#jNzb|d(WS*eHVOTpLGZRyQZj8jdu0Liq(8{OHJGvcbO-UTP? z?B^wVz2qx`u3x*N^Mm#*aD=!PObD7<7!9j9JPgH)8B?vcf0MV?E)&`l{%SSkNU+yk zTy!+%t3!^Yo3)5es1NBsvWzD~g??|;XcT9ZLA2o&pM7Ps{)*t0RCCi|8DGfOPZsGA zejq`%QvxAX%R{KGO+M0`&ZiV`+N=?OYn0OH-@y^TW?S?hQcxAU~)3y~NF<$6BWg zihvB^lI(5%YV$!^M3ZjfhOB)4TYxssl}z7Zj@$vejSa_`k6*1wC2E#Sy@_6RC(5T{ z(qq}H$K-@9nofDC?|wZ8bC=XmYVS@k$`s+xMZ*@sL1R7Y8rlPAAKmBmujMf_@u9L> z)mJCUdc9VNb%5>9p5I(LV!3H9zpm;T=VDGUKZ%5Y?Af-w-F69~thiX8+vo$wuB;Av z>_2!kr_$6cEuoIN_+9!t5>OuBfcYJ1!Sf_xw4^;K=kMJY%S&EY7C0GGviwFZQ-(Xv-|JWDRdo{y+4Q$28bieNFFl&bi!v<(=+Zh_WNG4 zYidSU$GKMW7UT7qmAw0(oM|>3ou&zMsd}w)9N^2X5#9FbQ9SH#oC{u{K}DafuKvLt zV*9@=K)+~*;G$V_`II={C-R!BWcOCSgWEfD7I;<8jk-f#Bty+ZF5TJm?9AdoIzYsQ z7=5gyy`yJ#$5$QQZ!GD+y7NW|&M5JISBVjiG1v9QuHfDSy)U(*KW}wXUCp6h$Ief8 zGrym=m|oOy0rg$J?Eh&-^{|; zL97#duKn&2fK7C>ZWBr{hHPm^X$=2dVsrnac6!6O1zIYej35sjWwL|&QCjc$ zLym8R(Bb%#i8tRDem)_dxU6=GiB67NwD7mowZkUwcKxt1`~$RN&ZV#RDfh-g3|4gM zK%j-6PvxBIeH-Yk#>JT3B+r~mr|-hKvt-AWOvV$PvQf5_a%!St6QV>T=fcM5Utl$5 zegswP$=NZoKYRUAf5~kf&N2?6C~rnBKOw?7^3=YdyB3Z15}O90e!hKw+3NK~j?=+0 z>qMnzktKHp>5gtSNacv{gHG-msh(V@2jX+c%5e9m=L`v{ls%zWuW0HUIY#6`EMOg3~G$v$pz!QDCx<@ZQl!7uQ@gi6$~TM z&5|)XlvK=;%=x^g27~B-ZguLKw$KdEp%EH;fMk7HY&ccR3yd3o32*`gKUevC6IN#r zK4J=@{DZ0+##zc66oWe`K;J1zt+Gjj^Mvb$p_mB_0k4+FSt}I(8{w=4_Imf*SD*jj zHwgmD7y;+j!waWi`d2<2=<7Kq+Mp053HI-#Cm^*R%ez*Qp+2;7ytF`p76=}ciBh(z|{Zpr==pWt7TNz`IP?a+Z*qhXzkFX&J;r< zwiO6_`C*$0(sQgB;mP>8^AFxjPha9N!K0ZYhl}8a<&Wl@!|XJ`ZV)&bgtN>utf@=| zqTm>?vA&P_=odoQEz2KN0@@vghHxe2Pmf)E)Gi>Tb*a_QMX+R4#J5!Uygl&B)qs?8w%sf^?ICu0@GgjzXf@c-!`<#`XvV0aq9!ykeTjhSEld0P8JsLBdbg$-c*j zw-{7}Nsgm&Y4pC9m<95!j|#4se5WU%674?iXg`<^jr(unjs69;>D8a2u&VtI7zMm^ zYn@ZvZ-f;ko4{m@%!mT~5G(Y+3(GPc=lMr$!hi+h6dV9fVYk_>I!f(uaA4fWW*KlCwDL2~~K zWFYHhpA8*W4l29M-@(xMS`u<0n^D^krcG>%?@8Qy{SsaZZvb?|eTCQ={ahdEL5y*K zWap>L-*Y4m_=$%2Yf(C_Lm~4HoB~z+-_!$(&-xH0__5;)?vOBv}TNAnRb zLV|!#XvkI;f=}ZFg=%#BbY-76^?+N$UlG)V`{7@KcyC}*?pV%R_AT*0!|Y%FR8oQA zd_Vfx`hP>)O7O{oFt@<}gtqgLS+`E8e*Q1h=$~KdVWFvFXI8uwRr?Ljx`_4dYe^kQ zDNR9LfZ=Pku&pf0)$1Hw0BM#ZG>>E8+gE&+J^Ha)cy!ed=QV~#XG8gr<`+|Aq zkD}4V-4=RYP~cPc-jRt2p??S8$XY==M{O+G>#k^hK3OsPV5r%PSULk1m$5~ z_o)!rVQk%Wz2809ory6rQvH zai4aM02qWq`H4Iljs~xgI`|C!{f<>Ax$sF`{LUd4^Zs-UEI8z0T^**l7Wkc}$2VbH ze-Hn1{n-1YcMY;xQ0`QOr;>Q*U^tlz3li*WpDl!X2X*uk?@E>#D1yu7gPQ>#mdT1t z#cyt88Wi$Y?4KIC1hqFmzuE_+vK_N>iph-sAK!e1cKaYy*$L3EIV4&mPP?`Mwc=2C z^(xl#a@l01pCJy-AnZ1SmS^VLdX|ZzO8J}j)F+Y5956^Mc?9S1Mtc#uCnfozL+SIi znYtPw0F?`~3(IUjc#J4rV)y~4RjX|VBNjIN}bggl6mk*;PJf6y!Elc0C_y7ta_Ydsc+pAUp=#^9%jfokB3+F%&^!3dh222Uu0e7F7gCW-oSlt*w)*SO2)*e)Y*jYAU#uWh)#~{T$jTVOU z-D_lm9NaAFE>h~S-CJmT0r$j6!S9e_1IZDy{iQCmbzXtFVGzakd;pIB1z3SH4Hg?W zw8H&o*Z`Mq2Kc`cGObF1USopB09&m9RQW0m$|JI|s2d{8s)F;utx&_DQj`UCg&N|{ z`~vJ@*#EM`+dwL`f@C$eAp8hgnJE;Mwo|--*C5_>9$@bx1cIUzn{RJlB(jYjLw4=m zdB+#6ax0aaK5%|rQaYmDRs{FQhh`a1Mit$*Ci(UtLs0lN6UHwS#N>&(uwE1UY?jKK z!SvDp<3i4nL5URrdQ!&qk$lp%I!09_VzYRa_m!SN5c;+gIZjooz&0TfH@U#raS3K8 z5AFk-Rpi)Fih-vLN(jcdNyeU76Rm9e5|jXFlf^R#Z4_E-y$-a0ixx;AcoScN zwdMEt77DLKO=T3gKOeK4NPc}q{N=HbI3F8NM==B**MnU!o@NUa-d6D2ZjrZ&Wm)W= 
zLCc9%djRh0b{!xOmIpiwUW1QE74BFW*o42x8%QUEcjE=Xh+`=T82x$$7jTC@87Jyr;m{C}YevGjLCmMZ9&UH8yRvxp( zMDvXq3}wU}Uejyg4cdoA-k%cXc1FiIBJ4rRm38c}bTnLBihBvo9n81uj*KX+e?mV& zW43haWyA(~1#I|BR-r~wKB^g13ksvGxPQiMqA2mmXKAEnww7yA9Wyvuk*Ew)x+^EY zUh8UrStmC~i#ND(RO?$=9sVjWz@cEBQLNv@Gpo3%7LXHrk!W$3bsURfq}tWmxQ*Ua z|95W7>r0B!Sb9S+8Yw@9q~Dq`k5}R62I)x#xsc0SB%;A>398A)Bm8g&1%ROlEd>Iz z$e8;};fi8hD_cO4cI`Sq!uiwucwoU;XCJ&gGgaX=#-PiARHUgJs{qUVp^{1OoTY#X^^j^lGGC|yc5*H3?>Ng=%qWGK*dvF--*S07)fEZvQ zn-s+_uHE@2$%P$Qmyg{s^pL==&nNhZwAu>5T9sy9U+~1T;t`Wl)Q8xu?-D6_e03KM<0=D$ z+{hM}X02#!m0o=t4Q4wruh+gsi-9vN>WBRb)(@DgD-SHZfsaF_e6TT+Z|&4@?w(dF zLW$pO6nR5rl!6DINyYstwjvKB->OUg@JvU4$Lez$IUq&YOmgTcZ?4avvi=0>x(iBw zZ#Zg0S+gdGkw8EwEZ+}XIPIbsMspp%RvA_Y=@XD_kX5VUDY{GR) zJEN8$)A$NZrN$wJ&E~b0z^3Ui#fF$Cs;vx}9Nnq?5baL*<%)BP8^&sLfcPLa**EG* zglnI=?d)S9q!AH$b86?>jO%KDMUj^gDz;-jVqk!^eS%dE@~EOmoFZ-2wr{7NZxfD^kF>@(Uc>9y}RcxM?r zr0`ruOVBGog4CTi`>Gs7C{xq?1&@)^Gpr(Y^Sop3qgs;!Abfkc!`(r(xUoj~R!mqP zZQ-9W7uC~yUnCqEb2yY;F5K2(>o2cf?s74ZQ`#bdpcZ)r)PG;WTF3v#*jqqlxwh@P z62eP2ymU*5NQZQXN_QwJAPs_qprq2>Axb0C5(CO+?T-}iN2*KwYw{(=EchzYHTI$O;9SdLo2^+V#uU&1(SI=lvr*~cyj z8l`>Eo0vsVNj*XwVd*zz@^rDz|3*@8ko<(Ok78y1TMjMtvLO@P^d-%AHf28 zd2cKp`i>Jf)Y|-r69pJoqA%id%Jx4)=(R35mY^#7gygBw2?~UhM&fhIxpw|gAe8BB zDmajWPGL~orCs?ia?myl!+adj%yfjl4F|Df3ifz4?DYapluU4&Qb}O*R{Vt*&I+9M(k`r`wQYj zoS&`Wh_RI&x3%)N$E^S46fvZRJI9p4Y3E}Q`rsz4zRiP+>6v1isgeH9 zdcBNjO%*nMH}$c)0e7v`d;B_y?*|AC8s`NY z#J6g{J3n4DxKFvpYPZwx=FS94fNxK2#fjZ*78qZ*kjEKw-D|Cu{8EKvJb8L#v3ZnfDDny z^tJO)N%9P9nLAh0m2$CpE7(x#=@)I#AES7=blkiKJ+S?evwMKcjp~k^5e^ag2U)m9 ztb>nlt>#_dIosxr&kwxlsk^s-}|r(vSxUck#*mC@;_&B^$5jKmmy8r$^c z5OX;nvU$R67W(KO~d`34rzHlWJ?GRhBshh|C~03Vh5=jQz5=}!tP|qy z=GL`2Wc+{(5GRD&dv%qei*Tv#^RvkU1L)pY~YqsC{j z8?IX?XKPdCJh6H*WW0v4+v|J2Ck( zfo5+dG(8>UJtjq+-gnap=;Ri%Y{4BTXe}^x3C$^x-fa1Ydr^>d6!&$n`~yJ@&N)yQ zTBmt5v_mUBgyNeXO-~*Yqr%|7niMG}+4a!5^q7i?<7=LvZKTuK(+EVRp4>xtmud1Z zjG9Y3xFy4YnR1am<;qA;2q$rzvtnyMeV1$*y$^@Qz9JAAJzi#!-iGe%Yi{8>p z0N$v+`ZMmfN>L~jNjjlDU~@MP%E&vIp6TG@Q9Pn`c$qsy;+^AgK>&IuBk4#f$Q$@k!V;{FJx0aI{Vz>b|8_hcP(sWLwY_f^S8Y=-*}0 zm&Nxf^(&o+SK)cVojcd++HTD*5t1lsJUNg88>@T%I=I$1a~-e3L9@ICIR_{kc zkFQkE9R^T^G?S_4z3Y2v?l6A%HjvmWiP!y0)w?6fP2Qz*rjz|)DH=wWrj=@b(M#p3 zXt~vlBZ^S`5n<`WPM5-G#?@>(ZrICwFpu=gxqe%_9Mx?aZ+*d(|KlL|$o(&;!QW@M zA-n#PjS1VvX%rPYK|jkF@#^k|;ADzECOcO2GEmZS!v4FTUBS=6$aq&W9+bOXxb3Xyh2)H?q4O z3ANoRU4lLf#jkBTIw-<2fBB6R5XzkW-cO0-I=MDXA?<-&wn$kVBJbvPbg_QQUOS1| zNS5loIJxud-pxFMOG0h^7jh~``P+q(_Cbj*c4`i1?xlD$(_8I;NgnET-|lP!*?ay9;y4YkRqaeye*( zx9FA!C&eQ};y4Q&w!Dzev^P5elyiQnc6#%#_S>qV?NBF`fO>ZaL`aaSIGkA8_~*rW9g z2{H?45Ksl~X$L#eX~R0g0{8o^MFxjKj_$nUhc{kJPhqSOiKksRG9~%m`FY}bj%Wih zgT!c+TKENyg%L!w;eE-vuNh&`xnl+KSURLBWqW3K2+(EJt}1A(yX(n zpe+n|mj-*Gg_`O%dB`I60E}-u`H=;R%WRuRtBpzJJlZG1z=sJUL>Bd-=Sg3#yk5yJ z*Dsx6-r{o!kFjy^O2y4*LR*$erIf0%x_0b`oljslH@QfNiw;Fv0o8BT;+(_iMCx9o zD6jklDy?76`Fv0cbW-{n@-$h;(r=BBS$2*tRDGUE&dbIHS`=cs0TqHAFBqU3B;G^X zDs7SMvraOw;G#;`zITuXj!YQqH3WqZr32%rzRY|P4OrAjdgeJ*ZLy&k}8{TT(U9&BSx+Vr)qCSP?4E^@AJo45yUzX=`9{5Yt8hZdtQwD?dSnrUD! zbHf_c%ntc_6%Aka9+=lL^n896tI5V4f_q1;1cJ#yWu9DPfSd%>8*Te+3&ok!uriu)*VD_7uN{z3i*v??ayoMn70R? 
z+U0c4WNn`)O#2mE-P`^3P+a|`fF(N1BAz5x4$bUmKX@bP(2)MhV8lDG!?it4*%Y8Z zCCcrFKC+}=wY*{sMxO=HYt-l}x>+Xc9=JPb#kiJs*cJHvxYc4|CGX9ZtNv({I$90H zhm(pMeFXsHe3$lT8AZ!T!qfXv!GHrtC3ShiZe-WT^IH=nMy{|QvdV&~;A-J@?I!2D zh}h<;=LVps&}1|{e3p$$4P)q%%J_RqIz<{e+)+jM8I9{(i}zZ-$D^Ony^jG@vu8ymXv(Yv;XPm&sOMaO3-%Vjl{S_(jCjV?p?4N zDc$+t{L**(HbrAQyt6&u76>l4TBY1Cy|MA9r(ns;&O>IY6h#ujdQrdA`)*L<R%ZXefTa7H!$+E;NC ztg%0M5pG&MPV>>ODxo#Y3wo{@hJ)|e^p2VH7}|yIcXvy3FvBV7z_c(6hDJn}8=wVc z8gT^Kn64)5sZLqMiUvfqLL9EB;%k$7P5OM~Z1yn2ycVEZbfdTBClib_H0VTRPY?8; z{nj>SdMtIBm=KryNB#EG;bP11d8mE04mIJ0iy zSiZ1{msWLcf-NJn6>K?ZC!Zkh0<-9D(!44ykcoS;ka8irs?~sdr7Thw8a6;8|gCzNsMs1{7gWTLR zn4wg@_@dvuYT?(t+0VvCFGsYXJ!0oZ(E}}{1<@N3w_a}61;rcwh6_xiH!f$@Rf+uV zD`3m($s*4b3)0b9|5Nx0%_JR&4y4qVwaWw0sfX!W1t>g5vl($VFSjm@lJQaZZd{J z|FvC9X1$Wn+bB@mfPY|@Mjb=x^wY-liy;DQnX;;v-o50?*u19qs*^(pt`DiY+^=&x zr7SUBs-JY_<>0@4HnpeU_hh$cB_vS;W)(FW_IOvj0~Mv0S`@f>ljUyud^E(N5vCR# zmitna5xX_`PHRmQu9L;yV??dC zuvD%nbVJdjCG2EQbmwEx#*H>t&u#C8h{G9hDBWBO;p&AK!J$v44sOa9h;+*jJFpYz zdluHzHDn&P>+UVT2^CyHScsh;M^eTBZZY813@_sKn1!+?`bZphbXkMP%fy3@nMNoP zIEcD;EGFK71H-8FkHxSfi{J+PD|(iW#&C1FR?5{ri&Taie*oCMpx@leqhB)hi(oTt zk~@!2-Cen$6BW+-u4OHl;Sxb8vABN2^!+5;uSt9l#SVDO{`0M_|LMXiM&HJqbL6jf3 z$6N7_piMkOhuY2U2+)uvq%K2*uhJl}H+qs_jB|sG`;*{a3Kz-NaWAuUtGX*2tc^)zg-MEF|yQhQZ#NRHrtdcu7KY7eC9t@?Qu_J}6?KQ;O zwd=iy?4eW`zP;aO|6Lco*|d=bv=CZY+ANoh_v5R#=KQBNTVKW(--HnjXm@Tmf47SF z1R`+GcHjE|qD^d>NZhA^5bNVIsq8w&_^chddLJE>{>3E zwC~O}1F+@dkEYL+ZmDant~7@RVR#Py?NRw*WMJ{Ds|M~vMq2tr#b}2lXTl8d#3^F! zzEM)+Gw%y_Z~z4{4Y5=i@GZ$dxrNh$MFjqU(A0Mq-h2 zcWcU_RB*%gK>mx$k1|99uo&^+`249#RNV5gCGKV4AMf2})PJIN5Z~Z^Xr0XFGH|ay zQc2Jaw;(7VM6x$G7tLPVZpbYa3RI19&4h%6?^QQ1<2##tznU0RigI0(_PVFT&srz| zmTHs3ylh{3k&sInF-IwX=Fc>^n!#UJS#+=D_9b9Y5YnM_Ka1z0zIe^Tv`2&RLI)9GxF_OomY34xK+41Ig zH#&Th2w&H|x%$K2K@)P`NDj?8FT4)2V^V}L-}+5brrUj@b=@Ivl3#p*u6(^SZVz%P zty;}{(^AP?dA*XTk8Ha*_l{RsQh4~{YDWtc0NVkS;jGRqXC_LP^CWvt9+wwiznPIh zWb5DtG~K;axpD8Y+a;7<=`ct*@OE>SAOf+jM^;*3ykF(XAtf)el(&b}pw^9uZ>YLV z6#~1!M_D%i*VchwF44*#k10B5M zRJb+z+VJW}b>qv?%Mj5uxPz(sy#Y)dUR7^hB>C+pUtrR^VhtuiVzK?u%g>NbuxwfF z?uZFt=79S65mDN*3~m_V6J4F|!sd;|Lok1TGlj-^-7$*bPAO%Qqv{PSo}&x_JTwxp&qC#Rbk;>x9WL3D>BN zg)79hbY=<}p@)kIudC@jHpax5NPD&O@?E5BAb?>mCNkiuQ(gSu1^h z+g?ijeWLrp0Ri#;dUa=3BmEywoCrQ0aHT%>(sa8icfxF`3oo&`@M0o94SmEzFFXD#btdy@Vd@1{~ zZ$lx!G&1v{$$jBzxJ;MgFx`VB5+$sq6BB~a$^^8rXZc%pzMLdXCA@hCKgZQqk+sw- zma4Ku6SfmNu{8}e>@Nsu-{wEBxXg&qFlN=p(htp`h)NA~evOnN;|q$=&dhb2_`L7M z@fX=6(nX(vFq7V&Vn5B6E>(nm(;6G~t1sq-HI@=YAThF1w(ngZpXX`&kWQ6T+_4&F<1}AV`AH`xPebW7- z6nLM-c5DHg<(9YY79_1%yUQpqE0N0wR+m)Qy_0?^ZhkmK<}iQ3Ji-{{uf~u;TkM-2 zDi5Z!=H5wbfa{)cyerORtq|tN-b-7vSVdDOuQd;%UI@p-zuTUextKx_LcH)b*J0HH zm*h|f=d#!T%IK8c0RPg|H&ZKrivJ;!{s&uD78#)E-q%MR&@XSd*!CxLiVau(GOzwu zX6GV#qJpgoQ_`8rvJ<&hirwAmI{QXh@~Px|zd++Abg%kf|I1b?@KaXyvG28)Xg_uX z6v0*4WlEqPPU4yHS+w{KzoqlSa7z@1u+M#%bF0l08~(L%{U3fbPqv!dxpd}uu8c5X zepBWNQ^e{~#y!VYkQpP&B7yDMCUM~02Jja)N&bZ|EV~6i9;N?*)ZHD6RXW4Gf8m|F zA3spkdV8PnBi6rtUq8$c84<8hBdUjB@0`(WV+7}ff8KKG04vee4wi8jMJ&I6f2K-? 
zis%5?!up5h3%EM&!v!H5&in|yDtrVE;}4+|y`2fp93SUXID6mPSI;0&F8~a$5Ln@P zRal_c_RNbeUb8i@*Q883XFbLn}z`JFd|@Q0Xa)0FlL?j zn)TcYZfVG82dphT%Y~QkkH96f5{{K6;{tMoi{uip?!+E{;cV;;S=ZU%_A{L*nq*vo z41Fr{1~Sgg(<0YYB=W8tY@WOvQ@`ny+p{Gmom(pL+g;8d6iy}v z)bIxx%Z+1%v+`k>OQ8ys`;U9@BfhYQILCI16QAMRKVrtP%M28nPnnL-p4J-ucYu~n z=QgrQ1z;Pr|I5mHByR-2)9(KcV`T9ljw_?n+?tlJt-U?bAXuWIQ&m`p_xohCx-pF7 zX+l8j84$Dd&SMFm!GhU!2c0YpsrA{kvx-}WHf`Sr6t)bjo*6z%zc~gnEgL14ik7&0 zwMP`2BA<1IJAzpFvWRCcRMmAc?rXvwjwwjr)^1YIwM~3$FQo!XuNweNyt62Y1Xb*F*vPua%8j)kf0eFxJ z3exG-%&b&%?nm?@j#dH0;4o%rj#2^pp-z|9%zX$yijQn>!skWis&oDecW1~p`p<_uJJBMo3d zi`W#4@uX27r;UCI6XGI0oV&oYhmeA_4U854MGF4$@Otcjvn$cb-5>5%Y4X5eFf=mI z{AyC+9Hc#b0%H5?@vP_)D0betYgMTHBunI@Irf)p*L`oumY5~~!Ry+#wcN#rp4zwc zF=)Z_UJzV7vPh<#0A=XuJ zVetldKy@aTj9vAwg{lDVI3t4DSSKC9;Fe1cwAs){gRE4=!Zcx()}|q2bs*B)*TyC$Uzd zQ+gdOq?*9wid@5yIW~AIZ!Lr*%Gt-u?DesyYrxJD)OK(2?e8uE?!(0kO0UkoC)}U% z@9&&{;XJv@uzF=(xQCc}AJO@cuwU9L=QzCELdM-Qhal>MqfeAvC%?D8qs)$D6sSCH z*aVy5#(DqXLHsnQb|DZoc4xs%Tar+cf6YBcVua>Jn>5S~;#p10kXOrFw!-iy1-x2k z{&Hty@6*^8wF}@9Jp`{7JgysH()O_heDR(ciN69@BU(jmGOta*PV2^eUxVH1vwgEV zyC}efW2%XV9#_axh5W`#ohw+%))=Un|a>*OSzkoRBcd zi+m=u)3o;iU@~7b0qvl%#AKS~cC{Z%O-+Hs!M+GL74ysiI6a!!U%+_)zI1P1RzK;8 zJ?S7=1yDr{@j^O0`YwD_3DE6?PS4ym8IxutTNx>Rkc2udu-GjU4D%MOLLlTuIcOr* zKLIn&EAM(}u3fr%KSo{>d4fHDQ4TnD^-#{x$3}mdC_%;`c`Rrdg!ya0iBq(HncE}p zN){r3f>BFX;)>p%MwIQGK_to#d4^8+pFo%WXc)7t8XNlRX1;}1f{%|cTF;k}tf?T6 zF`+Bih<`Odc4-+dag70HVz~%PA=W^7Pk9_ike3&>#n&wg#RP+l$Bi5|9%IiQXwpTl+A#J^xqRBSz#7~zNRHA z$OVt;Y?*grKUgcQFhzxh1@Pv~Tj8gb7N2FZbo9fcFVOXaC^X}sRo?j3O1xAyy<77e zH^uhFPy!cjUaw{H>knJ$T)uu}MQl%>i5zStAL|?a#Bc=V3fH=}MmfbKN+lcYFk{^j zbB9m5@rT(SP8@WgAX?KbKzj!w1`Q)&p>pI+X+^xY=isBv7jaxH4tbxS z`Ai|fV)!s!W2@DIxP4zxjMB@_S!y8xJZ>UY*fd_;q{lCptD5XwL$V_|9&?GYT9=~C zk6`W0{GUrzn@?`Y0m{5oL4(_s+;kdn(-J#cu<5_aLOwov zOU0M}O%|@vyYUic4NnjweRXzHab)LpjVy0_%Yxv8QFbk8VI`k#Kr$rV*h`M0I|lx; zYjo@NUg+ahW?)|57*8^f@2=a_uOsMNeSW@Qonib0#2++ zXWSgQUye&~u@{^%oZ7i|n9kHImD;v+dd9nX+AM?0t6LiKEQct#v_ueXqcujSpJIFM za9tnivw3Gi&F2?CE$p&Ii3jcT{;}2~y!S1hU`7Cn1ghb+K zhf=7iRL~cM^8$!%%g_qKDo@vHh-avRvt$aRmmp5uoq5`O+V2CJYEt&kXULeOB`aJJ zJF!5XaMrKN^9u3EWUK1qsmvw+>bc9cMdiw#w2kz{M~+0AHx=H&Q1`?y%%G}ZPN-K!z#b3 z5!cM$`y=9N44&cBa6Jm5S%!*7+G*6PRx4vL6Gxy2J=82HXq+Y2yj4u^yr8kCLM>6s;*N>NOUxtc%0@juhY{nP% zRP) z+H8Qhcs8y9XDfhoFE3GGox|UUj`CYN44vJT7xJMDv6s;%eV*SD=#!8b7}syq_y-m< zudTaq#-UI4>X9^{Arp)5!CG(pbx`1DM5vIAA>YZJ6y$(wWUW+RjmyA_WU8$3cw2x? 
zaW}D*CZEQ_BV=3ZceLwZfvaEFD0L6U<#f2IkI`lCW?EMw20-IrPAYYQL#C5-MbyK( z+d7t(hS)bdu>J1o@#_F0*AF@F&4Pa?6bfYAFIuiQmwy>AHD6)Id!z3^x^{6~;k&*h zn+`AbQv?an%aXaRR!;ipDMOc5odqu-o7_~eqV#C-UMOLEap~L!&yG5OIJsqL8EI9i z#e!3Qy;tGs&wE)#88V2iIaBbaGbEXIO;9ty_tbWR zf?HC-g{(L!63%;YM<9H4|9d9*pVD80O6U1xxOax)&wFs!&OxL`u#1Se4W7GX9sTF1 zsXYsZXZ%c|QP^XD%}@}(^9LAME;)i0i)0~er`q>VQsNjBXjH&S@;NSUB$c!pc;?jk z&CZRkz4z@j|4wqTZnHN8m&uWadh)LE>jmRhBBa#z)?Re&cp70H<+(>E!c8-BK9tge?;W#dcD+u3(s>|`luZ6Y;#-5>42|~CBKp$Xn>+uO0 zC44Oyl);B9ie;eH0{VcAcj^_)R9W)n{erehL-k%ycKSazJ<{rmbdbIx6k%W8=dBS$ zlHPIF_yO14jis23ElNk*Pmq)xg@y6t!QZ4XbyeM92oNw?QK;C}SgrjG7@lEep@Fu+ z=M~~La`D14CJ>Gon!F;e@qV25n|o(^C}DX%kW@e3^0(G>Bsudj@3(lHJx5Z^uTZqt z=d*5o23^lT}s~a`LaGUF50|3Wm^BVG7oo+=JzZl;aIbw;4W_2 z_IYwx-^EUo#V>aE<7G?*ibpSDhY8Ck{gfXGCRFoWhp24WVQ{Y-tVPK$;iV>n zkDWz+T30IE^%t6ASAvBGYeF?y z0#T5A+A0cJaaRaZTQzhLgn5yZ#1in6VD?Zfw@YH-sS*>nsvxdCL}XC)XwU6q7+1h< z8x-TgeQ$^IQ}}H*_t5=xk3))pfw$@GE{0E^N-Z?L3h4AhLfQrq9JtGrYy$rqx#Im6 zS$!@eTViQP%};Bw_h=C(StcoHb+rxogec&Q(0Ub&ipO^gvJu8NjT{w)pS5l*4s}|* zXT>a5El6D#b3OzEc$dj*Ui+(c!FCtNi^Yg&(!6koGSmPT>@U6pUm1QJi z|8++}$a`89QO^yeLPqq#Fq22qV@4E>g?q4RW9QRy91a(KIGWg)5A?n$5f=9=e)8Vz z&ZUc=k5^YEwraYTS?dMptkGVcJ9x9<#zgrH)3N%a+wWdTd(~Q5_ef!_bU{~bE7yB1 zNw~{@WtW!KQ;3!u6}LY8J-+3lJ^+RI*qTmmQZ!~0v=dgOZQa@KH#Mm z!W(ooXreqwfr492GBU|d@^jYHJ`TKB{=IwQBF zD#*ZIVH>q8sSs9XxsS=|I?B>O#u|bmh)DhknJ5kO6ydp@eWP%xh}}W9u6q10Ads8d zuh)o-!<=XIGfWJ`vPDU<>%0e>;MvITDZgkdi~f>p#`(vhDP$ZAje(n*hfE`hc264S zegY3ZFQH9-Ea}R{g^DDUrp*ud%H!PXfkU(xns|n_qFr^*!ZITGZJji9sp$g-bI#@Z zE|xE~x(n#pt#~eZ#d-umvu#grm^wS+>oW;#O?$U`tVm|3ZeGqg1JX^kD_K<%?3MXO zl$d|LV4{V>K5C;IVcA?((~yFFj3{hx!;n^OsYnulSOvL_FPzu2WIqJi<`LO@1YO z*QnBj?0D`;7TW8xuv@SnfqxuT`O78I00D;-f4t(CMvr6UpP*6)=6v}Z(E8)gzXdlJ zr8vp0eYPNbgc;(M1|2oZTaRbZl<>9f5B2o0Y))P-d;ofN(VQ=I5nmK4$&xe$xr-a% zW6?Q?_zd@57P9LjMbs?LD{d!FMea&0I`1{!w4Hte_|9eTmb9?WohObv)$KAcsb`= z=&m^&4**@(G@=Thqre6&@g4G157_NrCeUv=74u>*~?+fIo zozAED8*vSO0F2gsise(0MoLHibd2!J3egH|Hrvs2yLeo=R-nD4x@(^ettQqHu#C3& zL-0+Wbd$JP3I~sTFyQV$;0T0gGq|HJ{JAhjv=jky6L%=R9c;U5!qtq}&Zk$iZ$E=rQn>$*47kpT8LL;+2{<-0&{}B!C`afQkLw(6Bb-_8 z6)*9BN@^OcUx%uraaTYRCdMl`an~Y+*2!^*))R!8I4?ZBY}am(fhuCx*=mU!^9MWj zkjNab)9YO0J4m>83n@2(-e$ZbG?c4j)q$Bg$ls<1&*ka!!%MP;$lp{h=(tEm2APZu z1n=~_Sl!weA~SvTH0+7m$@6GF+PB+*MTB$(odshsk@Tyha(Kf~k&tlucG3hzPh6>tC zD6}gJ_ctu;L3np6yc4OgX39sdytwR$xyIcqkj#fGqi=Ad9FFLW6I6K+(vGcN%cPF{ zUb(fAAIt9pa@+MWHRc!e?*#XX-cxy1-7WOH)pSfiJ9;F9(mc6}0boSBo}w+QLHqPJwAUi-#2|6i+d@Dv_3xpE9ZmBM z$%jGg3QVTB&tO_2{>h8v5Yc8^Itx~o5ZukUZaVKR)ZcJy(MA*E#X?%d_X{?b2&CWRm6`Q5 zj2Tm5K=AM+ZB1Y`(^ozHbPzMooESSa)xO<}P_Exga**xt$rSv%?TH?5UGBC_*3oO| z&aOiZD}ZQQojz6=T8v~%h`HcwSi~~BdAl!5f@m?6MMn>Z$fnkxs>|dsndekcVsTzs zf+h86tEe6R_z(>DUcYtw2^^{T4v&EQy!57}AWS~PzK2?npwP9;(@AomKhOEF4wmjS z*P>2gu#e>Ks{LJ8xF4E#W^om7S^+I@X7%aY#lPKmw2XH9hSfJG8K%)*u8Jb!0I^H=i zAY!Q;CrsGuflIMeBBy(Exl8BA^E~5UW+3&PB^o<45qX?k?Z^z_;pr?}w6~ILJaGsM zcub$9!7p87w1#3_;)=J?rR!QODAS{hy$#BBx%Z?G%2BI|EGWs6dBr}YY&uAn9!crS zbd+2bz3@=r$r;5N3_p!=PRj|@(wZeDXm(zzV(XDjUHC{7@h6wL0)-`Rc^5R}_l30y zk0q<22~v~(25H#gQ>fxmj*8y?An@G0sZnZ4kbekV%ZxK+wx8YOGMwvG0w={4>M+=6$ z%$vj9$?mg)d=fMQ)Kq>?!yUfAkyp}6c1ew_YAe4{js@m+DhE_bgchd^Xj9P9qnYo= zVoakgklwRYRevnS{K*%tpYppE*15asRyS(U`;3#-ru9ARX8sUpPn1u)k?XCcd6`1m z8?h2`{%tiZ@F%>~tB0f~N5GCFo~v^5Qqx@B+t$5CZ4fvaps+=ozI!PSn>hE?BkrJP zSWnCs$bZG~-3L9^k!|O7`+6}Xqy>Q)L;CL61R`)%o;+#YmT|oGx^a3}>WamEbLF6| zkAEPiKAb_Rs~TBGUr|>P!e99DMKA4tK4SNNFO__suYsfRk=xD~piGIs<`2qb3Vdw~ zJZ>a&LZ>Y3Aba?`1 z!poXgFVgt0z-KksBNvnjB~11xapPWOdg#X%i$0j->DhOZie&sTc9(J!x}YsibIrB( zv7s{R?ihS8kg9=^J-O7OFgBd%v?o(ZTAn69YQ$9Jdr1en28vMgh_ZdLTAj1A`$+S` 
[GIT binary patch data omitted]

literal 0
HcmV?d00001

diff --git a/images/phone_list.png b/images/phone_list.png
new file mode 100644
index 0000000000000000000000000000000000000000..e2efc37b003baf0a8376e80dea1b32923f5fe558
GIT binary patch
literal 16772
[GIT binary patch data omitted]

literal 0
HcmV?d00001

diff --git a/images/run_benchmark.png b/images/run_benchmark.png
new file mode 100644
index 0000000000000000000000000000000000000000..e3d349b936d8082646569eb0b38d0dc6bcadada8
GIT binary patch
literal 136012
[GIT binary patch data omitted]
z*B)AHk-MY&FdqL-pkL%NW~?3(dsgchw!|gXYXhF=gXs=$yg8=&jPGwy1l4Z*+Sg24H=;{Cw~6>f{S8UYAs%sVtf$ z#LeQeoq7Nar<9Pl;l%pXM@JppUD%oF!IiPyw65`yv5UX|5l<~SW`t}%xosDB2h(bFCSX!!r*EpdI@p`0kU1*}K{beyvp2074gz)ZOw_A@|im@xTku`+V7&1q~tic!eQ zMg@Bg3A&fB3-O*_m*c1KiG5S9NmKB9gb-@OsibL4t(Oiqm`i|;o_zUCnEAu6ssijT z=dkK6_Q@w@gG0X~yYmIT8-VRJ2PIhIMq?yF3WEk@Km0!H5) zi`I@gEVlxY-T>dio2mFeIHXGte7t}P9srhkuOUCMsi6m2>>CO|V%;rLI$WGc_g9)V z#D2yUXjIg|;oBbkqu`Xp=HWS0G2K?UW#x`|92 z0LETSh@?V5hy5rsv3~Xc9K0%s!Vd$iAzbXW4(*YnHs%ZYjy9^k$-6(D z{|+-bewm7=i8qPU<~}`N$349-09?}BPH>IrPuCho@JIFYdIw(M+*~G(8y)sDE%eC> zU3Q++cBc#EEb}aI(Vd|nRiKf{)-Ev#(1iv1RfQc3$14HmVb;thqRCuR%v{xPMiNz}@QLGnl2}A#-CEKg0_$x&~$jiryzVIfb%c%)W<++F6(f7tTX4 zpfKJ!w>_9)^_rfhV<&UeUAYzP6eu{yPK9Qf2=5S#(K6u}Z1M0**y2RS46fvb!o^vO zWf>ZvcTv=Jgg`Tj%pYa&!Dyr^aqBHw{wM4SMDDW_sCSgdV%%dbkgtZ-x*cO~VxYQy z_B^*1aYe+39T8+bQggn{)_OBqPiS_9Kjw54_+UKQRQ~X88R(-|zben{Fa897FcZ|U zVA~j0_o_)4rd;!2XFPGX+c4?0B6xdW#X67YP1&yz>Jkw|4`y{TP;I*MVp;QO9q}1V z6%IMSF9K?)>sW`VJ1+e>&pA8DnD?di^*jVVf^gFyl!s!IQ#>5dm7G4Y8uLSMn%ODEFcuEP_wb3u86s z&F#G7ZPfWl0?TA2x!oL*=bFD#D^IHmtgGiIa~K)uEU%B17G0GQ*gr(N7%kK^oNCxC zw*-dR83KDVMNCn*36y4}GS3DA|8uD4;Dx;SC|zRI+3-Ej#o#*qasQ9S{HKTE&q|lO zgTlVBXj!T5&Yym;{zgs`5C{fH>mIWP`#>3dP|pMLZC}hud4i)a>}D@Hf;QWk6XM9@gI>ZfXFd^KDj%mt$p-)(rJ;a14^R%Vq2{n zB$!x##Kwq31d+~}c= zY-p%UjUeiP<}GHNsdB{&J1#CKXh%JFmqslJ)R;WihFP*xmAI?!f>>aMK$%F5HJb^m zH(1s!p%mLBW>81vf`GH5p3dEXCfJ>=f`^()z|x3Dl_@7;kE$N3&htm6QQyg3R(ZhZ zFc{(p3+s*)Hh+dPVoB~~@$)HfudlNCkpkC6}{b>I6%<3>dWI=TQb~ zq&eU6m+N?OWj|FLP5knZB{ZkqDFvk-r?mWP+_F6uw^v0P+@b?D!@h#f9D7eTDr3)F ztU%8(d-P7Yo?_`<5g(iku%2DEeoOUMA3MkExkheM;$C+Uics6nf^nvUH)^(f^+CNP z|7_J*ij6vU$q$7?U9?StHY=4oZ+9uA2>SwQnqt^h;&)J9?Rz>eF`!j5V6|7Sx6yNb z_|xn~!fVAX>AC{&jx?C$ob(KgF*3(G-=z3)HRLx{YXc3XQ(?XZq&EZIR^mm5# zkv5u*SL>^zm%+F=rQW4Nai;nsf>Ta6{31&(P~{D$lstb+w1{AioyO}6V-T$I|7PWWcH^^fcpE(v)7DUASG zUK@Aho2&;=U~;cj?d!B>!ZEJ=t`7EFlOOhwsl5z%WjJpt;Nsd$(P(J}o|@Ej`M(2S#%>yWu8kNXh4yq zH#51054Z#2gze@GE>?%RWBHw#-&?u(rJ~syl-eocik4f0?6V5vpK9bw<;$n{i8m=N zhQ5UI@#iBp;$6Se0!3U6l&b7AgGBY$$766RKLTa>boKMz4mCCUUb(7mB?H}uOyiOy zy4wPVa;;48$Me&Sg#ypmh*}?8*mt0=pM>mN%c3!kH88n;O60Kko&cxyopMZf=8b6b z-utB4>wlW}OKQzFp7>rt;!1fr7x&Blsg&CLNbalk*AiE!Z+*|QIceJf=F+QD`Pd;@T zcbY!OZu!nP-&%mMkFVnlo&Y&TB+&(B*K3ZLq>yX1QFZ7P3g+<6ZLy>AVbiy^_hT+# zF4RCGV!zvZ*vUm?HVZfJujM*lqG$wSacr?btOjd?*EY=pIt}#ryzIiQ)?-RTSg?=L zAn~bmKWaJ%!8rCh7_NL(t=6Xk@nSx?EM~eLnx3D#Lst@40cJz0506{%ko0c!yjS7_ z(f|j=J=XgrWnscYoVEW-H7k(0#dL$pIcG0kKUI9^Q7jkn!iB8;|y3nLz7PT;ax1?A8x<&0dno{ZL{8>oJl zkL7&`j4`+jz&sldYgRGDMLI<_gQ9_vMlhiuNY<)ET~c@Nvhmvya!I@nARUzQ@XO{! 
z@4MO09tHsJGdY87w@2S`LR8|l4s28~ku|C`!fHo_dqOaK;-~*QqxWUQ+>s2}nXLvL zltqV1G~P_CKnQCWsMl*d>0MyYu)Uo1E7ILgBByhsm#DS!zG|9S=a@({a4!n59=-F` zqp!7meSrWke-|Y&cdY=3in#1Fq~fmit>RcA$E$jGkasCGiMab|E z!Cs_J80*9A4sld6eCx7t#S&lMIH5poXU53neJPtN!qHTL@^?7#B~9@!Un_s=&~TjW zs5h0pC<1j8dg{ju!>Vef8^$=s8`+Alp4;s$%Y#Ad_XC+W(tzFegB@Ne;{3JE2YCx6*hkbbErvO=FzO_``ZM=#-Mkvf_k$6; z^C+JKq}KZMD~QFF20MLH6g8@(Y1Q$tg%dY9_v(1uevKZ|B1qHJSiv7!m9aUd_WKh% z{?;^=TUWpRY=hdO-o_qO8L;vLk&+D}Uf1A#)8$$P_WD|2)K2l49MxsE3yx4G*vi>!q}&sX?I&bDax((@K`Ik{$A$uJ;PVaH&aZI z(8Nu9HJAJ_6QgywfHYbBn|JeU_}s$cxBTka9fD^{SSIF`kZo5t(M;QAN2!LJ2q&!A zrG{jxNl|j!Gk0X{kCWI{ahwj&GV`!9W}?whmJc|@zU{E;7`wM} zjmOD1GWX^A)BZ^hwQD#~`~lKs<_BXkdp#MXl|in_fgIG4#h}rtt=`QLDRYFu{PtSn zjV(PAgOlvf;fPXz}ub4%)Z9{+hBOf zCB3-eQ@2<|`ED$kKYhA-?nTKsB|xhL(?1fvMBohH{&u2iMIDcLUf%gYNyS42j!ZDG ziER)?<^VK^=z*DFstzu+E4&Ulx1p_9inBe%P*cRW3poKJxep>*YFEy_KT^9E8}*M|&V?Rlv5)R+SSF^vs)t^$z@j65HPkRHAn(aCGFnZC!O2+y~mHme#s6<3at}8xk2f=0Pb=Tw zGl0%45C%5pPj4GD2}@`@B#2cN?22wQ6I#a1!seLvACtHxFhP})X;6GSA9*q;oh0N^zgF&45t1Wu6#;Z+MwhU>K zt}hp83B!pirRuHunG?>C?WY(`n<+;*?j1G2ah6h!|@AY0my6T*LVF2HAnn^4k)|YTTe5BXztPf=y>@0mqE2|%Fqk0NoN)NOFhG$}6^P+pBDMiWcM@wZmQlqvazcm?0_30Cb>}UPU0+=5T zl%Wmw(RX&kL(fnPclI@MWySa=TO7NR=V|X%J#W!s_o`06dKtA#o32ATx|b7zXNObom;GL0h zBtsLPKT)lWIo;JY61!ceWqT~Je`JbqXVd}NO0K+@MmRlY7Wybn%;~;EC6Q1$CzJmi ze7=|};kEVZO4n}#ktG7tqcW&LDsDxs*6_Ap!2q@TZ<^3MuAW`8m9o%Z*HFjTV6%#j zn~NEYiE6!UCt^Pkz$E_2nm8g5A@z&&iDoySX`alDI+qls>T8i1{3k*L^2ZZGf5dVM z$`q>OsOPd!cp|YoPrhi4!>x~MNJ<*sEHO!s6fd=6-4X(-HWjdOP$!2@=Byk1tq0AH zD}*}JIRX?O+Yo9)JAF!-vXb?jA#LF?Ku#-g`L=peeK{i9E`y}~*8VV}(GrKFqmQqx zb-(wl2$Bh|uq=R?eQyE3X(Yk@OYkw zy}cuhh8kLb2888=>1ZDTf%4r3+cYK#EQ&(w*KT|f0+$`RGXR!@F|!e$%S@`fv?Sgz zO$&U*_6^j!25x`Qlx^N^7gG5)O%l=BngYs8qUi_$wiOcQ-uL89T&PQSnzP?0t$@U~ z6>odJ0H5?8*7=+ifK^$x-eIIrx!v&}J$^VTr;88mXQ_*O8H_za&6fPu!Se@B<*jWT z0Mq?wUL&lv+k9V8ztPNKtN?hTa;E%Vwz(q%*cJ=(T}GG~bn$lqWpB;nz^u^H<59i) zPTf1K(VSvmK`e`VfC$&c&nzSuwv$!XZ2d$-|qf2`@d2Wa_4#Qmi07U zsQ|xC2vOWKO|JJalx$zGPhAgdF2L$8^@`H0@B=rOageKX5-X^aE~3{z5AbH(S8 zK1(c*M0pDZQ_^XGkLeWmBv(ip#fA^E_1x8`7vIynt{1h&d>nqf>2hbyM_^K^%nFVt zk8Px$qU=EY;*i9DG*5L-=`DDW^ohO5H7_qXz9yaJMszymz~qe0umhI zM&lm)zoCz|??p5LDt_q_`W_!pD)<-lF=qG~`nU$14*_m2k&+nl(ipZC^>1zZbb37S zfO`lyZn94J0zIqlkejzD^^SOH&Gvl}31{fY0FV(K|H8YIUSU|~>;F_pj!BTksTGEY z1XDEn4y<>5`%)rJl@+&U9GFsQ8!cjS@o$Kmc{+#7P`knv!n-eYARr^PLVFk(r-k|1 zgFH3$q2@uww+71RA64CV9fDgxvr+p|B{v#LtRx~E)0K2pgQlOj$8B!x@fRf~#paKh>czpsjys{Y{AGYNvriKOpm?ZP zh3%RkIo6oJ$IRPxz1|w5)eH;Ylq3R-AW1 zk~1ij!VT<}BDaROr7IAv*W&PSc2ee4Uc<%r57wLXGwY2BOCZ*z)`%`)`?8A>Z@mL_ zbhRMrHTw)eWn`N_o`)1$EdR(JrLqi1_^X0LWPRMhuL> zBC9Exqws z@c6!0v^jHfV$M=nk|CP;744rHa`z6{oJb0oVv`Rhvxp!1BC*HhVnD|Ruh8~kQz*=Q zKjfucJ>IWWD-%GX56G<)5DC6K5eGll7AalyW5jEzDQ^O!nQA=GT)EsHPl>w{dsXzm2X=n>$P=afb9&AypQrKChlByWl z9}liKEPF-`yyM0@cKCwy7JB98qU80B>R=_zk@xJ-|uK z^68c;)4IbYn>;W`g#H_p*gz^pCEG0iC2>X@K@X_FBPFU&fwuxEC08WdhWjlu<%#{C z&y0l(_Dt2pr?zoGU3j=LM}_buGlkRXwU*g+@y^izcJdh6qb`7N^)yZPNjdxv#6(t0 zb!Vd8QGSBa*Gp;MyLTd7ZaZ*koNhnHU0$cV!5e6PUl_vtx!I(SFb=_=%I9V+(NWTQ zIM}mrjw>w(xI6%m?U`0u!S+Ne-ILMf=^gA{>533LCm=A*ICrfpBrEmQK$Y%kuuu3g z;_~ZK@Xu3bC=v@GCbn<{{3|h0lwZ!oLxCvf5Yy+wcZrye;O*&h(UKsNu@UerPz+F7 z&Q&=Elmh@Wq|h1w0lwq)(}~lI%8N!}jC9j1aXz@;kaPCL7giAkYNXrPt!oqC@=DeN zB&ryhd|6%|@^fh=vkR+Wy?TPEI#_z35#JPh@T{VbaR76p#S#L?;wit}Gqrg5E0K9? 
z+@|hy6T^3C3hLtX%`)SgU6gX{?=oE4-j7v>qwa`y zGN1K_o+;l{UnBIxzIsz3sTwgY&Lo+j4BfB7Q?8e>m2Km-l6#Nc+QidT+e7TH&vE);~*2uXWBlDO&zkC*XOkKzR~Z0Y1OU)&2g-LG<+K(kA@Qo{2vwtfizX2 z>STWq-nAE^iAJ-1-dn?y4(0R1g+{T1qzDXpmMx^Q2vkz@<aJ6_6|Jbn<3>&k_8vJ?OL^#j5~s`joA9mBHbRBk{RMX@2MmQhh#x zrfi6tiFBWO7k`6+Aj1Su)~lfa0BJkleVC5e-HP;@r%g3aUdacJpYdJgv!rAlq19XF zYu0w*`epHJ;?*<)_H~mPLnrSxT31!{Isqz!CMKCG%$Zj;i`)mUS4*KEz%;tmp=Bi3 zvao0hbh`Y)Ix_5deN9@3KcP&Yqn6CK&*Z8xH2+}{_Kj5p&q}oWTmD%?QcQ91kIM%8 ze!0szQ7=MzoZkQk)!r;erz|x*Y8;C;A83RYEzSpshH>aJ=BvmRe-UXwK8+sV3iq2etE%Sxln>+5N| zI$AgC0A8`8;)yZ}nq{l&38tJMX#{YJz5q4r4e$6}6ns|d98pWs2EIdUO6!*4!{Zaj za8qJ%RZc&4;ZKlZ^Q%8WhWf0Bz;J6$=z2S`)=`)%GiC`}`Z!1n)I!as*CuuoQ1y2| zvK7}7n~NLGcRP11V43bm*POL*u!aY76sML>P;e?9OmX$pvr46PA_)5`nQYSGr5!SW zHB=ORp=^Tx_>-NV&4yOHF@4Ewz@sY+J#_Xw9gSn&(=N$H%GvfQUMHa{Z;NIOli7Ld zhEP}@)Db3^%NVbR08G~VTinl;HkevdaxyG`Q++-HbuZ~8|AIEm;`>NGq%)YHclfKl zl~HvyKh*K$yR~{b8&GAbg5uA2Qt6HrGkuUVQ3z?Q)N6hJ^6B=4tf){ey*T=70O@&;mcS2-3I%Yi11$xbYygeEh(t*l< z0wq+&nPow9rXe;X>OuNop#fbW@5izc^F1suy~r}LPJHjlOkT~}{y2gnbG5l2phj^V z<8&1|5ec)USm-z89;PSk)gFYas@POTYR80(1p{&XEs%mDKPZ|2aBZvFAN=^zt)pAG zm0Us%t95M;4%z7z*YO}H+;oFn{`!hBUb5I=N!b2{PcNOmau zC1EniWHpq9d+aY~qt%+r%ZZSlafb&bk$B$&Gmer!z`vLDkM`RX)B}vd#(TDCx}Bl1 zjv~%F&Oz=sofbcJm+CLyyB%`K9H(GRv<*V)I+3cUBO}`!Jm>6pa&k80^!eQjM?fO%BQhJ)A!&yO>(S&L|MePPa&l@J@F!LyL z+LxT8XPlehlgH-UGi&*@{-0@{Wg%NZ>2HXJdLLqjriMNE%x*iI>qOio;*mFKv}QIq z2DYxaKCw4O8^t2D9WWrBylqC{>Cu%z4F-}CqOG63kB6L_9L^HjGb(b*HO=k4Ec}ix(l+#U{c}2V- z(WBtp;w%$!_Tk>%DKBEpZ4%ihJsC_AhAU&(q_gxV42IBCPBvET13NOr_@3qZ5(N5Z zD6an}Sy!RuUF)xM@*&Nt*gm;>bPwu5VNP|>-dv{3g6gO6iKo85)wuj^s_8yz>)BfR zvQQ5mnEIL-0OeIl3HghDR*eDypVqRb(Ry80a`(#&&s(l;;njUT_IEqJPj>5R)X)j_ zpm>Y236*#DX)1Z~uPPm?n=1mpJ~V26k+_KW6JjW^jI*aF8_8kT*sT`eBm(2%H!SM= zx65|XYIKq#I;MiNHI3bIvvRl2J_(xaw%>@-;G^p{j7J#v`oYKI-D5E6o$ei-9RQbb zyBsag^bP!MGN5Ob2D7riJ15a(+CaGX=CYRsq;96!r(lD1maunNTkB%%0COV=z;;#v zq_vaXSB|nW>DC9sWgw|DVb%_ZbbWAV+(f7#!0LpFO8b`VP~zLxt7>tPa|(!-1vKGg zMSAZr89q%t(W#!ewdE)D2wD9QCY7@vlDOjbx-V1Too)@}vtfP@v#28MZ4hTes(1b~ z45;+qQT5|@*fb4omrjb6cblpb89AR)V-+3FewJY;{Hk(9hXH3uu#;?#|o|D?pS&c1S&2G=Lag2Rx* z)>hYDM=)1OVF`O_)Mhr4OwPQ3c@FM%_NrOxjI5750ED8BBwm|)$5kNR{cxslylnOf zo$}O$df2F;k29C=HF01>l(oQZq$jUy_+cL@L;70?|J`#sxJoc5!H4Pdw>f zdp6LrdNb7R`OGM+INfL6yg-S08~t3tydS0(0*q4dc7Akr0x}~8x8_e<`R~GvojDE_ zIbz?_fntks5zv4!6V;UI`cQM%&j}V5qP_f+ud#n~*?|@O)fU>f4Hny!R%FRA|-r6vf7Y=BbqOd@g8Gf-^sqjxvH|FeZt|Qn-$Lr5=y^`y9^j#n`S$K&t|Qr#c~hhP1L0DX$>u z4C?i%lBwwDFh77vG9hxXs$^#VGa-#}#|4@Pw&fX37~J{~G@(cRzo7}WrV`!~FF}K$ zd!5%x22jphUj~{nUP5R8ylVLz_hsMeY29yEkvo;B=AuU8EvS*oYz+0N@*@D{LH3d$ zG62SiyfI)3%Cdk@teosV3k^eYS|;i5yRksvK_&b<`2Y*W`Q>97aewj*Fz!>B_)=Sr zD78$@g$fz5r=3AQCJT@eM2p$&@{P!4&77uT;eE^7IfCG@Xx^=}DG&4&W;go3$=c9s zPuaC3g;u>-a(5`8!d&<%3vOl56i(83JfQEUb_?v0&B*L?N)$dyCQqXAzxGzYc_Ic{ zSUW6V2Sjo}C!P6akmJ?d3O?(mF}T%_eqrN zlU#GPM_;9i$O#~TPUK`Tq;dP4S#Uzm0zYBW8l{?nu3Q(V*KR~^1H){i({;PKkk`C< zPA%EM;Vn{iG3o<R_ff;$3ScGFoG4geaQPrA~sGNHM~AIlzKkO9$M;`3~%T@ zHE;k&18D^?@2Gy8YosY(q2C~denN(H{6*cIEVO~&og)*p*K0L_DK%>CrcFXSN_9QR zh3vICNhL50k32D1D858Kk;VnC`q1m4f5HRn zB~`6~!j=SM-^7xqdgFzZw2ItuQm)TWYhV3eA%ZojPY}V+Z;V?W`=u-q&vzqn&H;eJ zV(f{wdnYRACcA#X&G4ev?AVW@#d2L=1)GUwKp}44)=oQUYh&2TK(pya42vEfKTt0e zeWlHmDfOx*trX)oeD~0OHB$^FmJw z%R`Q#%UWnQ603*_TT8bx3=K=s`r;^Od5>-tMkiuTda+yJD5FsUiekDRL8yoTX>fg7Av*)wqAI0kLpE(( zmNGGp@U1f-Ow*ARPY8+k>@0>wk1Ho}EKaMZ?}{Zh=|o^%p9q?0N})X{<&2{!Hn53} zenyU&PEftqxabaFp&32g_l9+DtmpO{%3$FgxRQkUoel$)lr_u$Is>Q^p3i`(xE+e+ zBfdKxoSjJx)2-TPgZOY1=99X}hXv8;Vu0`&Ox+b|S(sf5qzoI!mOOI;`i8RZVGOVt0}+XX%QZDO^Krb;}1lD_6}w{audHp@;Q)ACR0DON?cEW(k||KJq3AzVO=HXKlRiVuI$Xai7?qos0#Qx}#}96H_l 
z(7MC9#Di;~Nc7j&bRFddg}Medll0p^ED!Eq8#l@2`|fd>8E(e++E^faSE+o55UB*l zmJtifniR`F%?66W=cmHEkj&-Z8o7$t`YfydodGnY&;Ae52&uW-lEG=FHr0NtUuetj0n(zo^afW_-a3E ze{gf&z`)=hsGzhoTI(~qH@G3uF3;Mfuu@?&TdcDXD$;5#nq!1foCJn+6d<5&A1*ZK zFI4gA8_vcd48N8Jy-ql-+h53+{)uIBwvIQ;kg5`JSu?%HE*5CJ<^QW#*s7gqdOrFt2sVcH7# zSILm?9zg!WefZx{fvDAD#;9mVx$!Ln?`_X*d0nk^z@@q!_aDm|r(`fG; zDlR=?!!lG1t}`Mp;lkvauozr)KH;ZUcxC&*v<(6+gTAopLzjTB(|gaC$^I= z?$1}c7<}O6s)Qf#JU{Ls*rB(+;K7#=T3>UK2AUu`#8NS18pUbrHM<;@^EFjmJnja8 z%sJcb9}`3KZavoi2mrJ!@x#6R`Wp>EK*`au?xI!T@HsRb6qVrlbe)+Pa+#f<;i+l! zPIvH74GE7sv+crax}e)B4R4~G>dsWg@H8@UufK`4c>IemuL>hWex?4TC#&A+*F~L< zTS+X}p?Y0PkKo}_yPV&>qvXQrUJJPA z+eOu6Ml=8XXDZW#%(Q*7E2^u^W@}Q0Pg~VgjH^X5>o+YY)k#!0jRT3Zao_J6oD_z+ zcK4L+r$*KaZaLYUe7>hP-hFNn1l>L^`9%6|Tru?WlV7L7@Gu}HNB2dM&Q#0&oSQ7< z^cyikAl~0{uEs1^-qL*xycD8kw{~=H48&zsIIP}r;$hFYjgPLpK=j)B&U1gnpT1(n z(|1$IKIcyGiADF*v_&*y%GWz2)3kA|l!bpnKmv?Wkh8R_ z#JeOykhs4pTUM_Hnz(w~8@auvsiak}oN`h_6XvtX0&A4US~Fc=^DSPlS6Q>%jMZAz ze08G*^ZGHqzao(ZYNgai3%ewZ3Q6l#BeG17O z4+4Go-ErJ+-D7)brH!QALuQH6fdbr98ni!FFHwE8V7`(%AYpn8ezgt%1TX|mmvpz- zlM#N`msgSDTho^$U+4Y55WzZ|!UZ$)1%iq$sY$G4;lfW?p#!3=&iHIme;VJo21l z(RGyQQWAQ42c77@GX!U?=^N&;yFI0csN%DSu3yP@JUs5VhIpY%BwTvgqoRt7L%p#J zzFP7+@>x&41ZL4Nb5~b4FSkDaxOX+{NlkZkUI%72dLcoPAUnbmhO{87E;vC|^_~K= z8y#Ctcqo{OiDg9}#*`JWr@S^Ar8{#qJ-((3XkFr|2zr&u^0VGH9YegQx=p%IPUEB7 z_4F-ofrW|t+PBu3FLbYzc}ry8W`%S7{;Y_WStH>LbtGBXl>c#CW1aI1?`<__s)Jsa z(`NI!(d{F>hSB9#VI395n)FQ5Mu%MTDh{ zCbAr#Y46V>r#;uLIjS4nvJOeN0o8nk1W;Dg46k1Te(KYMA=qCk6YPz>2nL_7GvP0GtGf)@gl<|@VAy(NVWLUdG=fIhh^CIR3k~IDYqH1p=a+#!q!<#<(t_aZRegp9>4gJD&rOhq#L-eWO!)yO9 zmBP-oiArrIrxP)1kqu6L<))G|lY7_jM-SDT))8m1+%SD1`=p4R?G`~i_mc3U+2nJH}Gpl@Z~GLjvz_!$S+_KW3*BTjyYS983R z#pU~w!Wvd}S0s;9u3gV~7ruVBkC~9F_DyA8W@ybCYbG3yxy+5Z3?AE)|3RhsPOy~n zaZL5LE6+D^BV6x#?Yuvd`!hdU?SsusuJV{y$Vqv;NCY`0W38g9V55>+S#b@Q6Io;) zSxn5_=A<%TMm;`e*&nSU?o;xsA+rz0p zD)LbCI|g4zA*M4gSK70NY)KRGn6+^WZ9MKTm&~4)b@TyaosH&9kHLj8?(AJodFxyD zetc$OD;E7<2qm2_7jH`G+S%RRa=;@V#O{nz$jYN@<{Ix0e1J1XFW0gH2_4#pKq7#dm1vMYR%p z*kRgj;4=T!SZxq77>^4$@|5q@Or1oiTd$P zA6_JCOI#c6HtEFSWJ#H8URzhX<=bJsOc;xu-H@E6P7r}|y~S+A=gP`JV*|&FM_sX# zlZOeEcsCb3PN(sbO5)%5tG<$>txiYCi;|)3tui?Gw(v=>{x)%Y;qSTfvbX-O6Qlc0 zY5OghXkH`P6?KA~C|;q+!x4t(t3qDy=zVk~i|dZ6Im-L6CgAfTjR>bx(e>6e$&hk` zp>gPe*Kv3-yZn{4_;!$mp4^CJHixoF!;ToO{R_zG2oH8cZ=` z{_cBKY)WZ&OC+kTvJtEg;BpFI&yTU&yfbmIiq-4UiB(QWyypMPlao&4_^wkBPKJRT zst@fhH?aOO^zw3MrtLe=&&MC}MBvmD-xmEC10Z1%&*Nk2juvalO4NcsS&AmDG0WK^ z=Rv0jZib^V6he`kFg@86l07lm2uBq=84O#esH=Bjy~YJ8=U)*2h9~1Ahri}8c^%x< z|9){kY1>ThTb4ta`8tZ>>0Y&cn`u|P%&Gslm$M%MaV& ze4j8{=4)X#<*QP&H!yu4>lMTS5q0QS@hFVx3$*3TfHV%67xqpEm-h@H@QHFJh;gHA zK3I=hUqEf6^N**u*mpq~j=?*q(RymCIU^?Rhvu9x<>p}OIroj@-=2E$?=URdkHM5; zIs0MdBeZkkCxDSuWTbfbTzj?*x<_?5<=r!0ibXEWLy{}-^Us^@*D>+$wz$(dKzoR3Vi#$%i(cEO&b;HT$_W`dIrgHks|)2zNBes8 zoAg#XZDkHLvTPQ58{14E?a_vD_qe{lD^n2I%;q`D@g z287;@X1|8@fAREC)`g-Ac0483c|1%TFg~K|>a0?^CXTK%p$5`Gm3Wc(Gt=4chTq%q zdk3zqp{13oUJ#kzUaxWP!Y867mW^3F)P}cxY4t;Lwb&a;=seAhpcm{5_L>rZZc+X?lT(YZm_=$%P$OH?~1rm-q~d;v{` zA3(Ps$m?^?aPca(d{$+6GMe%jtZ@=WAPNoj;RVzY@k?Oiix$JX-Is$(QA*>;eYCnT z#T|Ko zFT>!c=MUT8UjkYi4kA7GdAVZ-4${A_Ca4f|VX?pBTjRU+(q#R}+U%1hOqG|$yFM0w zzfjd;QJD)XY!7vyi0mMv`k{WtcCMFxnJnVnAf?`35d+fo57wxI4dz)|89hEQZ?NF3 zA#eORD6ax?dcUD8bf_O&<=pw5Z+BW6NXDuW{G}LfkgE4r_xQIV_K07r5RV4mfgH<_^ zHmyj|8!(fEE}tgD?{h#QdHYPz^m+Qi$jI*Huq$mRx_F0hjn%QOlG>_1l=K{E@$9dU zq?uYEgSFw76=mgtGrQS7`5$9_Y`) zNOfJE6?zp!39D=f>DoZ+8R^YOiY-D07J33G_W%vPg-VsDOLy0M!{He~ zIb5s3$yu|vZPCASzDWTrb(K#q(wU399XfTNEc~MhA#k_)9dj6Dkt@hw{0Qeon*e^^ zKn7f`6`wl%)0_Fn10wK3u<`U;l#QqKhi|I1Vb;i*HyG%c3I+<5J&eCzLIWQY@Q6Sy 
zz`REZW;(U1e_UTqb{9jKLNuaD6fYOs~3aP`Bzx%&< zKat-&H=%()UcEIkU|2mPO8P;*TJ){w#V$tcrG1HwCR;YyTVhJe3zXcvT(6n=t4(5< z3sTPV@};oc!F^ne3e}|UTq6Ef8J(Zg+e8EPBCk@CBnOG>*5G&!3~ z3DmyY@pYtvY(ko?Wbx+e^8WDFi<0yGi0#|LjjB|1>|y*TO1WW@@{u^X^Vn z(wbC!B&XvW-R-ThUIJmt;FUu=`rR3@w))RvDc@Nu6e@QjxGKEP7p{N2=lzD1%OjaTIe)2H>RMOk zS!>B+y8VKbE*8yH%BMom>;8G9jBjf6dYl{aJ!PLX zyocE>&_xJ_QnKc7%|^`_!$s5Lz1!Gg6?xpcih=D^j}9#35*WDsedo;iy4%JqwPGTb z7KB#0wl9%QdvecMZE}Rtj<^29$LrS@-`2tkpe3Z%leyIxOB!@a{GsC!8IvLGBPV_P z*9L?5uH|9|7>7jcSX&f0hm7b{p{)<%^2c(C)aRYC+m6PJ zW9>|#}Ey6Y**(=>`?#K zV0$GXxq%)4QShx)kMdx}Yyyq3Th%q-cJw&qE6D4AvpoCx`CvDx&U`8;uR^{%(ed}b zi092+Fr~vG8A);Gmlf+BErF1ygo>>0ICI$7N|QGoa| zsj6~QxVg4fj>kA}bNX{usQ{bXT(=QV>~>aUd|BkMFaw9jjJ|6j^Aq{Vs5IXdcddnq zxw(TxABiD@+(2i{r$V)SL`mf)%RT&od)>~o(wTq>1&hfN#<$x`!DCF-+c{`GqK&eV zdDsc)1CJLFMX^+-(!;Cevg7SGQm76_p+DZu`O3J$?GI)YE21h9ROy#>%C}R4Q)ZL> z7vx{CR3V#p3f|a)5gUwB6Z(h7s{+k7bp4{0url-wY7~0rdT&-)YV71Jhcg6-h~kNW z=Qh3}%C*Qm0XFKW_x^AEH0#YJ@0(my3;Sxf;_|_V4=8#AiFRG1t@R_&QEZUDv*;X> zbWU4n_(iJ<*dx{hF3%?Iv2WaSND~47;iUc_hvW>JJv9go2K?pN^OCo<1f_E3u5Vr0 zPA|wY;N{Enc{wb_3(Lu!=8Jg>Zt`hM;nOT;8W!b7?8P>(Wy`d>Uwlm%Lu*)I)b(9F z%otqm_1bxH6}0~Q_V(Yxp|E=K31g4m)!IUv_BW|=m__K}Rf^nsYq6I7#KQ%f>zm6c-enOL8Z*jNY$p;DO}HvU0lT<`-5 zN6Z8neoPqI6XHsv)4a|Rd2Ge^IT@TjL(r8hFIzVmJWeEs!z!I7HXjY9V2*m2k@FdB zatV~cC5Z$T^xm+fVSg!JKBaAraL&Ya<&E7N7T=jFOp^@WDJjkA!yO6~p`126Dq@)& zo@**T44>)YuZXxNf>HYh3mA{qEtf zR^boX8`Rw3AbW%f^W?nV#*^gb-+GN+I>0neS6dlTI$LVI`571d z$&Bt-?p{5D%P2fV^=P5zsvhg}O`{dbPAZmMYC(zz6$tI*xZS~0*Ds+I{F(DoH;|wG zM7I-@eIS6hscBKIae1HLCEa$Zaj&X$Ap=t;RUf5ko-JWS4`n|;*KDq=N)uQ~D)I!F zT$Nw#Z?-dcXNMJ%roBgId!QDrXjYt+BlsSBt50IF<3Qh!Sz+dU_G`2@qv}Gq{>xpc zig-np!t7F>Bz{kAKZ#5PO~s=+@^U|q$*jX1m2n)vU)8`7B62=H?BJ|^%3+)cCn|Y= z8rUMA#%S7rdS<=gmb8YZWS#m>^Rs+e-C7Qwl7q(1cxjxFYYfWj1Phg7>A+y0L4BTe z$U#~4v6SL$3W}K5XgOAY=xtQP8%q&k5tIwTnE6C~y+bv^<|(-mTJG zGo(^_8%-G;j$hjPnFug7)f!LwIAX_{XfAG&-(ec!xI9e2Lmq0ZS$TafwF&z7JnqfX z=>;2^)$Eek$nJ+b$$9OsIZN!qJ+`2M$TMe5Q!0F;MhtArom%x<3`#^!k^;=%?CEMP zCbQ&Q7dGSCRmTfDBq6QWPL!IpsbYf{$<=Q0NIhJTQ#2eyyz?dvC8?YfAe4;E4|5)M4MRGOgJP0koq((gC)NMyY|S=j10Hs^O&g7JM*24_5W2W=95iNA_NWI;$f zeu-uqHL(C-;x}7vI^qR6P6=|VRoplqNv|*G-JltCJJ2_Jp^4FhyugWK%Vmp}?8=;2 zWM|hHL$d0MBySh5_BDU!Y!vEf`6+)NL$+q~IIyojYM|5lI$ zr#@(;mE(IGU{p6Lo5-KRANRh(zr!4bFc{q>xUq4#CMkAIPEoH7scTuVk&@;Ge9^t= zw&4JC(g?iFSE{U(t^J8jn;7(nLq)T&+ifwYe5#myYdhL}q^ZLez^Bd?;B78M&!ev2 zSEX}g!bFLByFZ@h>1l;ZtB9EoQ%~#2pMd_8 zm%C1_80^#C*Z;;_=X`TdsNbWFE$X;7lm0ZBIZKaT#f6$Lz!jILOwT5nojQv2TQu{H zGK4!iJu7L(OBJO{!kw*Is-IT?y5ezzK=Vs+Zd(Ide;;#42#r7D`iLH7&y2_AK4k9( zIq6T?;{S}E3cf)jz(YKpcRmsYeTkGa{2?DY-E5SQy-=PotR#!CX4^CR$=NDDV`~5X zSgVg#KRr#35mvb;&lD=AFlipoc>zI?qly>>IgIWYqi>}?P}!f?q68!Ro;9ec#>U7^ zcB()@CpQso3s-{LLd)^=%wqO4-qWqTAbOP_>8(u&D8EvzPj`<~4Pi?TPs}ETJsTvp zQ}Y=wvn2OJnWtyHY+3TjPZ%nP`77`e3w`@{R0<&4cHxC!qK_SB=c~M*_Sa;!=+z<| z=EQse@zi2wg}s9iyS9YMN#qKVk9h5i7hsLy^SOo6X}4Nnk5oLgQho{+y+V!Efh!~g zRe$&?{rbZP!2w{1Xd`Z@w$-UO$#gcT{jO&CFj)qS>o>IHnj870@Uhl2Zt;RNt9j&_ zxk5laX>M~lp}ZQlzV>AjhFJL4UX8;~@f-I`Weid6jj1E96^4z76XHVVe44~6+pCw& zTfAz5jVs^gYLe16g2K`DIv=QBJp7L%|37^TqI~&Nu<`M?_nSWW8Ix2u#m{ld$)cjR zpXDV}dNTfI0h9^c+us;2NfW1@6-BZO;=@9U669rN!IeTV^8B*nWDxY`F0k=*Qkd$X zY%3KdZm~+StvkDjG)0svcN91$2#~<;b=&D`(X`*~s=vXBPn0jy%6DaP*^no-zYY+w z;`F$7Jtw)>w{gw)yeOKG6*rom`2^=+Lt}grX;X=D4Lw;V(|&K9X?eLtFg^DMVjSGS z=Rtj(%4&dKYbMsT*dwdeC|Rpdc(G=I24WCbblQ9EW699OZ$b$|GthsFDTqx;6gJ7s zMtxC4Eyl^|+uH1wCHi)k+XZX9o33X{^>;z(&xuFqguxW)Askj^9L)j23KE=Ezx+!Z z^UWj_%lq1y=9yGn0JQ| z-J-(2@N2T#WT}$o80o+s6jwtxG=R8?hDj7JAVG)@gDSn|4aVA(*JK#ajtkEzzlH zeXL3-*~gvhi)|KjQgsD11|eo#7h(e83^Tc$TU4-pp19%O6KFoUZc9aSt1LLysA 
zN``Hfm3k1}X0?LUVl`DHwGFGJ@=MG{GU9B)Y!ZKXn1s}oYC!)E15SP#nJx^KF?g|{ z$?WQB+c8M0o4I`k0BEjbzW6CvXcE`G-=xwx>^iI?I$ljMexc=1w|J0XlJU4VkX!U1 zH=wHZMzkxpNMv2q)dyGQ3(gV~=10q1njl9<>Bsx(E=h*RXce5oZx!P6Et4!yfpYD} ztA6jhq$%s=k=x&cmb{pEavwXj9hW3E+FaOkbImkuloi?Bww!wu>u`BQ9Zo`3_iBSx z!G~U}pcBgzTSYjVB`g0R61w)7O%2t}Lnp2cinY=H;uM(x}O%>Obn?~nMHi{9UumCYH9j)?_0Q`QvK ziAgE)vd|c7D;ot9rDvkwiS{-~ZBe2}6|Sw95a3F+5|sk+0JeI!b`x&bN$&7yxS`pJ z@6R&%ANwCjAV|yQhKzuQFQd}RN18=%G0+m4?I-ZSK_A_w(>%GOPRhIftDY!Dim`s@ zL%ttNH4T{W2hr|C&m{NVUY2hZ5bOaw*03kv~m4E$$UR&iKEC{qD z<^rI_RRa+AD5Mmj38JNM!aheAR8p1t+7jBlh*I>gCH` z^+;%t3mz_ew|q3+Ae97H627B@$$oj#Bmh~ihMb}`mIiEp(q79f!PvJjceq4VRcKP**7#< zy1ZlX5K+WkD-TXPi|sDyv7cg}bx=ubPs|7aYT;Kd$3xV#cF*x0Ox8>=i#=#fPLY8w zjaTDfn%K2ZtJ)_+i!gUQQsq0wUm1z9%xbKiwc2;hC<#C+0F5%#+Xzy}dw#6&c`>O5 z&JPYkVq&>2Y6niNIhd6dIj{BaB3Ind&BrpHt&2ae5Y@r7xwRa)5ZtR8U!rj-pYz>- zUW{s%699X%HL1XthE9)ryQ+D8pH6Bv6sD+myw#PIk#c)i^w){k4i8}wV~UJp19ID5 zV}RVUEh~s>Gw-KdT$d6Sr(f?n>U?I)bHL?2W{LM?)>5=IT9O_h+2Z-?Ubz4YAu1?``leK=H=9vlRNP3e$ zR*z^}`|6d&5~#BOB{eyJhN|u2y7JdmOBxK1G=ofBc=onBl;oMiX(`+F;mFatXkU%7 zdEee0ZV`W1h-=Bba_$3q|4t9YT2~mG$A38_#o*F|KbfZ==cKLnI;+6qh4$ zZ?TS8z90$#BNpP6D8g=Gp>xf%=3ihaP~^9hKe$|TQTZwg0`GEtubn=W!hAg4ZHJ8& zB_T)Y9-e?%EJ|~Z*Pv%K4p9d)t>Xt=#+xq{2ic8E8qjKbU7D><{JHjUp7W=6`>1ML zHgk4E9<*U1^F|>p*Mr=ShfKL<#)Y;iXDJQi_Yd(VuN~;_yRtZo>QP|& z=`jbh{#Z3Ay04PqE)8l(kuN)Uq3j|fQQF#`N>IRSwND<_X*1C1M)>ct2_GRuE0yxY zgNw{WMW@DILNhz29NbIdG3W*9pfoYL6L^!WsE@;jVZY^c6`QGCIN~8dx;*Z5WHPm& z;df~9W`CtH?qYwoIei%KVqJ{UZshl&Q4U}u#tQx-G^{F{kr$r=E}{$6t8)vpEgXZl zyx1!5`4z*ARV&giqw@wLD6|_3i_Jr;5$)86BM@z!?^Z9%VzB+Ow$CSjcvK~orNpuA zG37Ou01ZBB!iAuqxE1-K&e<$&iLZK+q*J5eo-KZe*>q0+ttS2fpwmV~9GsS_bJ?$z z#N@f!2$R%*QM$II{Q6jD$eZu#>f`4DNI{-4V1MQmvPncHwe$TkKwKpd?z4ynac5<- zmF-ex$xLW6sZ>TdcGu)z0of zeONuF9dgR@|9Sp<_^{ju-iD5tTNv+}K~DTXtJSwsK0M2q*5aw;C(1U7{iO>Y8KHU3IUs$FZL^0H;5YLMLn4legoa4{L; zbf~}MYKRxJyA!2&H(0l(9ut|qXm~4Q7XBLW>5_3Vi9Mg z*PggZPAja7ur{SgP(su~AGm9FAh2Tt=2BLp?!ZRGu?q0~tR&xNzyFo*lt=mScwe4D zL(OW|a1g>NbFg@ua1*^Hn+?cfe=Frut%K*gfDDg_BfR$EdMJffb+2MY4+?L-HdDh$ ztRT{u$5$E%hD`~<_ge`I%_0=!!_7|u8^3S6%jL_POhtUPH{CuVlzfa`W4c}s zqg28iNxl3zzkI(CW>^8Ix8RM+Tjad|Bct?ZCQm*T7TB+~y6Kp(+31)s4E(%K;cq1F z0(GfF3z00gR^#Tfeim}<(4?~|gnoNyw9KBiAQX9_R&Dy0iqhl%BksS0n%w#*P}qi| z2uhcZD7~rl-la+CAYDOvlK`O!2uK$YNa&$M=)FfkrAcTKI-w{STIjuVAKr7$_uc!> z-1~a&KX=AqWF|wNXV_aabI?VidS+NUKGyEmO^y2qf_L4#O$3n<@Xp<_J8KGC zCu~sCU!OkPNVy+1sq+M-)6+bXZR|~>Z6ZDZ)yz52m>BR67RdN+>$0(KbyWxHF+k8H zJQv1|v8I(aZ&${D4d*%1^ehOiNo9XlsUsa6_;bE{1!CcIaNl~v9QLa2#`6-v9#?|sUF=_Q#py^|MV=Y-1 zt`E!8jX^LC5H0$#Z|)$|9OG*)P=@!`YSZ*Bb^-2b$E1iQwz1E)j|`vr#*yfuymDa5 zIUyPY(QC;@&m3RR7*dJ3UD?BpCOdVo{vd)sLX>0xr!)6ATdU+0zUxwc&vAi;8MnVb zH5PUc6cy@F^T0!XZS;2f5sA4-4c?67K>|BI%v-mA?U6OFrlEH}^`8H`mJh#0V^4ZE z8cWZ+2(3DvUXPjSt9YN0d|n9->Z!=+%P7=(s+yBgx04Gb zK8I)#ZJ3hIiuzww1DFBXA5v5*@|h|?54xaUxWpIJ(3Y6a)Z5PJ+*YisB<-hzlSZ$( z7(8+OtWxO}t!+=g_po+hE;+scIE^?dC<8rTQ{j(hQR zcKB}pDBK}B!RS#dOF3e?Cbs1%Y~AfC)qs+p8{6|(Km}DZeBIk!n8sp9A(As&qs$RUu?z71941FnFY$|EM1!R^`-hqT$@2XkGadvmujW#GHy3P7(63Tj2Wmq- z+0@#aPbkDf^se*7ND z>*6Pa%94qC?IJ^C*C_x-dScifGjFk;Qn8v(pvUrdhZd%xv6gVZ@zlzkdw&Q5CWs42 zWE;<$o6t!CXVCc(A$vh>8qif?i`}<-fo5oD5wPB%{1sl)mR$d`$#z~@)zA4vS;p8G z&BuSAws7-K5Jh0p>wv;N){B}MdmvnAK5l}KXKL0K?F_W`)EDO=es-c zd08d1NaS5`^`Xj#8*|ctwuC(*ym^u1O`e&4x@_Z-c30((QmSyH$tIIqY#9pattYEd z8uFy5#%oiBzWjZ(e8!EeO{&^U9MMvvYQKh?r^RaW02$f?(3zbWkqai3AN<5|%2(;O zNt3&sH7hh;5?n^jO=5BS>FVQaR^i34#8Z*%+ub#yirT<4w=ME$j?Nn^P8UidGoxvy zGM>>d{-`3va69fYWzG5{zoFLp(cgjaHB$q$YMFT~Oz-i6v!sE19Mgd!Bu>pAxD_a| zqq5ah)J%YsSaU00C<7iP9E@Xi)^0f8L%649?-Wwp(Uv0qbg~x8HC#?g*4_{2we9Oa 
zqhJ-*Pc;_yy;f^?%+j|qcG?R~H2YcMrLi`-YeXTX;?*8lks-KNV`r^LTbwD$rKDoe zr>l*|M>etecU?0erc^na-Ozg}k^vm46s5=>Otw-2SGQ~L^`QFlG6dImm%O#4d zIdJbn`XwOrKnIE~(l0 z(0lnn8G(Z?=MvcgZ!qwH*eZyjD?m&uHqW!1dnFxdd9M zwKqo#CJAou04aja;BGiaUG80@0bbAQ@nD2<4qM6qQ`00DM`J2f*64sCPm47hALA%u z6rw>SD`LGE!ny~r5cl}1Uy8eRIgU$mEfEJqUasReKT?UlKXzL#KyybZfmF2@mDpH= zXr5NwBU~Rb$wOeOE*?y)zJQ1=pvE(S&itl|HnC$@;*8L6nyLsGuD54xr30zc4b=oR zkf6fftD%)bLPOAwAm`8XKV1m)OIu%@u|sj)WBLyeC;_dA@s2h0o>4-6i!jRfi6vL9 z<6v5}Ndur%F{^KfCd&XSh3|gN>`w)vMk|ey^;oL3&E%UkC{ku~0iEDuc2WOYw{)O-%JM^g7xCVWr%0r8K1 zALe|JHaJi6>E6Q(%&n3X;0=}AItd>w=NM>n9n6wr_8)+^t0(dQm@Esv>|no2PoBu@ zvNqWLW@t*nI`Iv)Lq>Rl^R(kZwiY@>+521PDkE#zLsnjs9N=2HA|0PNQD=!(THm7; zm?Fk)_<^hlK9&|vnLavk2fMQ6KKQhW9B%zo<)DAW-rRX<3EH zd8g0oqdk?T{1zkvzkPP>oj&g@DL0YRY?P&rc!zJ{Vkkc0%bVmeCNezZkBZRA9&9b2 z$~2bW3jpjQG0Q!lA@nMD^)p~hyQ?FGL6gIya$Rf2>eIjiHox@n%vSn1QwV`h50J8}s@TsqnFk&>igQ;4#3Q#5sYxKZft7 z4L$d(hZF-x5OzxE_C`ttB|{0~FmFz1ynVHQxfB0qGhTOJ3NrFRLyClj-&p%LU2SVw z5d(Jf#!Ju}L$s!2M9e@Z;`_C5{J8*l8zCbvjBs<5iFsBH3xdU4S(*}Nr9J()~cOBhkLXO7+7eY9ma6SXNpMDWZUxOP1v7;7q*rvf+!8Q4_Udfayhh zoUgQb+0CBULaJ4M70{o5 z-fa8^patAPsAXCa=IA8-hZwj{loMT7NXKX3@pIR!>D_#tZZ*%lGqpvW2I>r|Gg)zx zIpXd3wM5uZ+$?{)aZeQ@SOOs4sTX7GqTfLswt5d~wgk?9VeCKup5Qw^+1`U>CYlL)6m=;*p(JRi-wnS6zgwR zDFYdtX99&>>~GO{Q~}h1mMpla>nW}K6V)WW!8zec{T%lpVe-pu|CbY;Ap3jKo;;FOX8sJZBfaxIP)>hr<~C_<1@pT zb>-)K8#zsqmJib!d|4!vv`+@J4J@d|3NloPxd_v4yF^J|D1Gi2VkvSAN6xO$`7D2x z8%T@Qwf@t|$i{aqBp96Xzz#6up$s}d67R2!hO-dSDN)5R6xZlJH;<5;IxU=sg(*%+ z>$##vOS&2%CLLdGdVl(nO=l+?vi2J%j(g?#;ueM(xn7JYb94{sJN*EN`DPU?!jUW* z40PgRATK}-`h~m3!$}V4B})6%xc7?i7o*J@<{*4+$NQOpQO@f(n|@RdNl^OG= z`^3v%`6yS7YjTB0AAm^w>BFYvbS8=Kd+Xy&HwNy00Y9ZL*u_j%?(78f$;pKy1h$u)!100 zW0=gFy90|&LO~4@Jkiu{xZ(8tXxeOf98(>qWk42HUiR&&Ug{WLhlICwKLMEJO*RLt z8g47!bVGe7>zrh&yJD)sJ}Ho~2J*UC#cQE@fU;<&mkPoBA{$=U?68fm+Fab>2 zBAu9jFnvXsv}uvI_O(M!mMPUy^E~q|Z0cIO=)BPcDTW zp?5XrCEE(gV|UvX{n?xKwo?urgee;9L>WDp@z_3- zWYjVVvNm!bz*iQByFtrj-L@dC=qzM2NshRv&M{W&bbm-eN9@Tb`77?cgSDG#Q*_qb z4Y_C3kYH7;PgpEdncGhON8N8ywi$r<6riRA(IdlxKpm^leTPNwDYJwPX$x=XRtxK7 z?qPh$uPJL_Cqp%E-;X4UO87mj2(fjV-XpgzpY^SKDR44WHDX6irh~DquUn;NURM!0 zI;wd+@DQ=~M#04;9D|$Geg9PMSlfSr@t_%N1Zfm6e+=>>z^1K5vK*^=Cv}2iPT&Hr zhP>=17m-6zsgf?BwN|2G zm1+WOp7R)J7~qb)ladwmox3L7ea}zvI;PdDiaqpSCp;<5?r^D}EgJ>OzVVK=3k;+m zsiqRCKZJ5>R@eC+y%X=ehD6?sfg8BgteuF$7K5&ymYXVt>9Q$5@(Ulr&sbzo7UoPB zzxckl(-D~bn5aUlY*V1(;*iQZOY`0zj>rDX@yMSQJu%PiYf?ljOke$^%Y>?{@XPPK z&tcr7Ym#4NQ}2#2XX>e~d|&?l#-#0-IN=H4P<_1IuJzK*+^9X-i4ke6%s~88nRvqN z83AKi)hovI>9UXcmLng)9AVx5C{;G=Z70SwnrloEr&zQXnU2^e2!oj1Q?99D%C%!x z7C`gJNSxX7?np(AQawY7IBnANB{kaa+ znSNDf2wePKMbQc5OwT)Q{+ya3%Umv+G$Se75xuGN!On~)Ng?hIr0eT^$Vu{u#`@E0 z;l0-emqmdI`1SQVdq8Q4;3fW+tXW4d3aUIXMyfPUMu%}&t5K`b8P?`b1ztP$n-=Uj z)9v>qrm|PKVa(JUZT@xPDMVe|=cuU;g(ytW5=c;pGBOxnE(`mjg4T+lSuWP+g@wJ) z5)IiFc{hjF74_XpLEeYpfJ>2)Fc#ehY}op*7BF-TAY!5P!6COILh^~1FgXO4VCr9F zxBB0#0~q}av4>D!rN&KW*@0GWGwVZfNc|&V0jO@!TgD{7_{}I4#}}aB4OEtYG}=Ek%}a8v$-1QSUJaB%)8Rl6M)Z|tYj>bw11-=6 z4~LUFa<$Oa4BEZZH$t<;M!rN!F+tb5c`OaRI&2@2YN0*F4%meTj#GvwlLTNjenZdi zF%wswSNI_@WL;K{_0Kvg$x&3~#|a0LsIzLJNcbLBQ%_JCwMw9Gu{$E!SHI}9QUK_y z9_~G)d($QFp+I{BS!eHP%aYuwJrT>1fEP5AyObckJt2~7rcS;?D~4@;bxVm)r8Gsr zh7s{BkqP9nuJzs?Lo1mn)WM3NrYN>Q`~Rl{{7)&{l>n5|7tSnWg(*$q9#}-&RXo5+ zrb4{*qfVfYUMQ|gL}KT7`W7HNDNtFBC}p_6h%t z>-sM<$$Lg2N*2+{wmuf8HHl6L*S|LC7Rif#(vI@3MgG++E5 zXOsVW+kbuqURvbOzUjZ8@Bh=6eh)|jQv~cZkV|@ZdYey_ceh74V;FVI^`cbFTK5ZL z>Z53$3uKP=rT)O7$K1|W)STNg&B$5rk1z%T;BamK8E@kt__}V2|OpAWy_h0c8bzi3LsQlhP zn+I@i$3SJ5x%b3-kA;JcYx{tOxRr$DCBFqztpj@8>Gxk`iuvkakM-qJ2I=Z%k9-Jk zhQvq1o8Rci-1!%4QWvHyzll 
zOnGL-!*V*+w&!9X_bgFZF6E1i=|v1iTAv6|AgMRWoV>F!GZ zw1f5W49i&K7pu(?g2o>aaipJc*8A<6cD&~6EI+U`^}drigFz>nZjn|T&!ebEMkMqo z0G{GDr|91f<^QD``m6DE9$vF#f!br5@YF278s10L_gchGty0Mj!Nnownw<17WR$Xe`VksoS&R$u%e^SK)1xOZ=v#rn(dS_k$x)qr= zH?17(PPoRL=noT}&y6F z3Vs0RzgA%UH@z320{XH6Y{`>%8GeKwyW>hUBznK+a-YO#5sSro8STn80tplx+`em$ zY@ug>5vg!$owlr`9X20&FI&Lom1tl_l&`{{6*B|#kD)`j0$PN-3NL*H4m zS^RUttaLgIr_*fdtV)dQHV4mLQpJ4AY8f_brk5Z7@y!3>OS0ZL<$&UV4JK+H`u?H& zN_$suf`dDDPdf-K zXv%X-Q}`pY^Q!ajyVz;Ie$)Zi02%hB13;wSE^9IpW9VT1EYH$P8)SNwVeFON7xuTv z+L(M?<(5-0IJl$3?c%E(dCvrc%P7Y^ML~ciWgxy(e4Q-;BFH@I)pidd_F%yciB-@0 zx)l_FLDo~FQuV*r!T-nAlRV`;pf%}B70d>1cml54i5l)-1Q(6?vJPl!McBteHMK5* zTgz<=;;HRYBs;gnknOnoVqa#+-Xoj7vg^>*=U=e=VL{}?T=YHNeI6D0?~3HZcDrv4 zu|H!47kvTTco_<%xDC_ke8mv*aO)vJ5_TOx0 znP8G9x@==b3fk0@h+6{A!y1F0M=Rv;7H|+CA6I_ntXEhd+qkkiT<6xWV@Esjp5D@O zD54Kp0HvEkA@8nWf}Zc=)PU;p^7s*FjoZQ-fyW-QB5L9+UF$$fLqTT6_hOR(jebrC z^6dE54`4t7ww%b|Ck@b6g*#xNAWd%W-(z?GJ_<%|>+px=u77sHnft(#3pq$TQ%@p8 zNjglL=@b`VGV1G`w!4jbXBnH@Q6s>$6u*E)I>~ONM7fYEIjwUid1y_@x(g{yv65?P zH5iXj=U!7um&`ER0j}ih10;jH&ud^EXD(V@XY~7b9uCDDL8GROfPQq)2z@ z*>hkY0~}nt6I07}y+Rw)2!Mj|lsGS9ffgI&+;m?aAB_MUAav{;OtK?uA}*bPKke@AvRjD3gQLZXZ|?(r8G44!Gu| z8e_=F6AOIQnKt65RB6c-Wkr@}1E+b2rlJV|Ti2m$uAGWH+2ENv6u+TI-r-C=8h zI5H<4Eux}!=dh|V^GPFF)VBVHSsH_khqua*kK+1%v2M?mOKtv&Tv|Lw80;#jn4FJ#^Ou1SkmsAP&n@XF0wm=0w8 zCwZhngZ-{)b__tR(}8sU2zvapg?>idctH+~d%0!}kh~1w5jJu(yurmJO#E6?50pM4 zdiD)U9AvOWBCFVMmamvRk)WHms*#wI@Y-Pi4GWO;gOnA>f`;1et{xGnH>J$f86oxo z^TK@&9`-C}5%(4Cg*Q5ELKde=E40%o)5j{y2eVxIvMj_i3W8yootm*wGadPf7*|4( zn*d4u*b2NmWf=ifkp#x0a4x3w68+uv!YRo_UxMBnrc3~e`5^}XmupoB+IjYO(aWmU zA(}d@Y)ZNv%*;4_g9U&XZ6mVSNF%5vcgNir`nF73060d9IJkK7-$-73Z~=;Yb|W=U z@hUM%WY*shS6uF-iK%sXqE0Ft<{>EF3J{D&tDeZWb3Krx_`StN#p_VmeD=gi-p$pY z>g7&wo4FqWc71hitj41G$(3eAAwRWYwB_njNlqd&bH>Rj>D>oe)?|hVB8KYwcAd)z zh5Jjrx>~$eNH3xE6v2BZ7qc0r$`s(%&4*{coxYf8K>?O$3sHb~jhR2%ekn%d;)oaJjO^WI?!%*kpcPoEa;$j_8$-jN*mpJ%4+m=FMXF8!;zJDvu4m zVC{U?M+yHXtW(w`ToTH*o>p@MDGY@^i$u_*xRuS9%N-7Aq4La6f>lT}2p|#E=~+;- z{RMSV;bI;8HC8Nyv}A_wRRYGOpY4CBd;O~$zW9JHleKP85+I`M<#d|81pw25{z&Be zoSI>SIeBCmh_jAP)Jdt~(N*~m+W^WheCdB=%T0hSF==9iK1%8TR>b3o`P5)iI$4E@ zn?x6AtON1^Sd#LwSxmDcAdA%etng+J3a{P`6j3H9%F~F~#Gc%OT77Z@Q;;;tbHxl7mF-a+8|Pp4TmxM0Lvd zp8%>>)F;nA!35J@7F=4myS4g%2s@qWR@3U_?QzF`lfkYKjea`rzcYOPH-qpWUdGc5 zzkv)j5`&+^yEwo>N>TjsaOXXq^z(?^Pp820tuu6S=7fiIZmwF%zm0v2UE-vu7Upyp zmMG4>;?APwGFqNJb{UbjpiYlyx9$<#$n7oZN_a>^j8bk=B(A@m8klWfWW*Y#9Rh5$ zit5@bcGTh0O0>9;y$$8d6opEl(46J`^7{@DZP8O#lWgX9cy(G?DgW@d4#%67Wezk;-jV1Y5W@xHJY_LXY)a@)FghUBQL)Iz-`@abojZEAx z2_EBz6yQ)q?o4JTNPDT~-#ClzkIE?B^~=zT8P{>9G`B8&(qLsSb@;Y}g(}IQ`r<$) zP(kmqUWnhHEt^Fx2f+~@5DINxtDf$GMSJVW zwtU}-4wtE)=XUUF{<1o{{|x86Aj?0*)c^TdzyS=%f;0x0A@(Yn4|FsxwT55VujZ9t zd_Sd2_&bf|tB*yil#se^NSZ`fs;k*xWZGGx=f>;ev4CTKw;Hcn_fGf4kyWEf{J_FM zZp9~|Xb%V{oep4Z&NXf~APSjt)s-$a+y$s|)RsEhlm6%Mr(;0yrw_*iUo2WP#Nwys zDm4vkyuAOLt+`0jaj6pEFjw<%&Z5ibD&p450+!Psh=U5OyY`C|on)?_RrsaC4>c+K zd8>g=IH5v3j4IHNNpJT2D%-A|;5#&K#Nn9_)Ff5dSly)@C$Pf-n*9Ol3!og}?6v!x z%Mi6FM+bX;5K1TpeUvF#R_w(G6!P9{*LkE3!1##z$@ML^Ag$R(0tm?I00#W?(7>J0 z=B(=;gFLY16Ng0*{Pb(-jEKmvDJE3gGZ)kc1^y%G1un5*KmbLosm`&>nZV7%|A%wn zua9}EU(N&(i~BlAL4>49f!@VYSM92x75rF7&xW0uSpX-c@2ItI)j#KDIQyjNs;{VZ z3OMhWz?lI{7XV!SGxUDlgp9_eFpj_QvUOS4{7;sd?%cVnW8&Q2F|_`Ob?-IT4&h4Z z5ao_l5%$(HRn2AMgJ=#P0Ep3xTuvpFB-u?Rla-0nctcSTp*#~Wd~AE=hE(h6C-_W} z&*yt$20D~&>@IMak|Ynqj(`>rqBD{UxClz8>JrgtKDfKfOXqwIkgK^*KgjdqW#!lP zDL3IOsVdI<_DZd$X^=XE+T;01rcM~N&9IP?TGnjyh(p$#{hJ`%ghrd*Qp3<*~GpaPu!NN3@_ z&|y31_-1ffvCSI)%G7AYmEfxPSgjbps=ax&T&mP{wAb=%>Fb$|bWB-MUkjjND7s21 zn|M{i*Z`1@H)k%hBcEAee^b-%0sJvpM$%l>8Fa9%z$dh>D2j9JnZ8Uf32J!&`FBc* 
zvEwLKRQ;e>{h)ui)yO*WsY&Un)f$DEqWnaLZB8hMkTj;H{75nmC?pc4iK|b5q6^Q* zGyY-_Y6x=@9ztEi@L>O{c^H2_z1n&Q8?wKl(006=dl`=Da_}(b)?$g=BGDi}AnRm) zZqUbL03>WJ^&C~F2?!at0&VHTf0%DXfZ$m(V zsjjU^q&aOY;^0L+W^=NAA2kB(ZT=@V0o3M(RutT`pL>?xPS@FY+8R<;C)FGqT`k0y z7Fw48!tUrB){329C1^PH18EB_x&+wXeW1dl^L;U0sH5a;+X_?X`@zwIqPKf*xJ2`@ zfa3#&Ig$bx3aaA+bGuS^#Ig>l=b45 zw-6%ecIbwFmBxBr%E517{9Nj`gk&2dg`9`7v==3o*Ih9{b9QI?A556vxr@)Jta%y( zqw$@&cl$oAPO4<8P!Mp+&<6Cx&E9(|B4P<|sb?%{JTzop$NpMNdI*RQagWPm% z_}e})AYA+V9!B^)-J;OFI$z~e^~RDd ziHCr7p4ZTGejtDYSOIh66Uzq+3H0Ft0y^dDyzF#Sw|np6?PbP>^#Mwk#VSQSrcv%0-n#MzfR5!WNn3ofW}3JAPt0(! zPjq$ltmACjnfOFq%^pAt_@Hf8eiY@Qtv2p7b8Hdl7V(df~ zcO9qtQk%`z>lS;C87XG{B>nSAIPL(*tRB7o4rph}Z%hBz)V%*W`O`@^ks2W5T#CLl zd2VWBbpz_5}|AlCq!Yn4P)U;%A;M(EJM*jT`>G* zlMNx)xD{9Qri_ZX+wlsasLOhlBWX;c{b+%VneXA%kzc#{z&UMquPEwsdAWG>`~Z>n z(~M$h!`#io9%hD5_)zopX#_kJBuuNP6NG=f8boDhg357Y{stcZpfooXgMvP1>U2~adKl~$nq8&!eGJCmhzoxKYp^+tL&QWckzKj%(^9Ijc9#s;z zghTh=dXMh*I*gP#!C#TMXdwhkmE^g)cui`-TGa##uY>XTQUqRWzM6bguJy^gK_n)X zh8(5*6K|-~D6PAl@CnYPYufR2W!zNME6lI?G%Mk+KWqYp*ER+y`t-^|@Dw_B z=kw`!%A+P%{l+u>gfq;RAzP_JjxQn+Y8rE$mmnhqKF_bHKwQ$FDw;mY?_6~3_*dS| zs`XO0rIb_z&B-SP~5hvw#aE% znxMePI)uNX`ZqM=l^dt3nJ+$os|eI+gmRDKyW;StFS9Yu*5xjB^IJ3Fn}Bzt5(-NN z>{FP9n$|Gg$8hV}GB}iRU8|Q4#mSZZYqgI2yLlZzpHFu3*jla`FSvcNtocToWsDpJ z1nKRAc5g0&)=WX+DM{O(@ZR&X0Rszs&r{g^>s0j47(<`Fc>|5h7BnGN z!7I%ha2eK&zW!={w)up5s+dzjLB8Mt)5@{!4iDPw zdVpY8`{V?uCiFP{2F#@|N{jMX6GGI}C3V?@#dH!as}FLg6jofu-&t7mPB>nZ6%l(} z@P{+upO3X(vcLL4(_d4nQVl~#xeE+{YM1R0h*@iBKT}iZy5W~HJ(<-A=D^JB2wsiV z>62YU`&X}uM0BXs7$!=!#ZsPu*}`)Y^e&Bs5DMyLg%?0tmE<0Pe7TxD63A+UU_lvN zkp#0qb|7qABOx#BiNntMqurlb%{|HZSYU%|wOyLqAc$sDi zML$12fYGeB+oM5f4AII^P{_7aV@M5CppXIUoR)7Rc}Awz9eek=GT{vPZ^O!*G%^ti z57M~W5xt=VpoI9Xa8I!U7T{;NsIDzq~CxHC_c@OW)fS~ zdeDRIG$|p=G6oC+2^363TfyUR{%bbiTf0Y`|OQM>mno11#tiYZ0`O z4Y_2Zz1hkx3moO<0G98Z$+vWcJYD^MVx%8l*si9=+5Z7N>Fwg2Dr|mVAUvW5pS0Hf z8TnYtONuYkO^MPIP5cdV6BEuaGt!HeVVKIAi$eonA>+o*N=kxYq0E7yJxL@J8w05_o<9*`;so!5?L-Yf<8&Hf?S~GsyB>k&Fp~^`&J_{6`L2&~9n}b$p8T@u2Ho9S_s9ni zc;x18s+t>@+@T^&fac0qewd#W6a5`{r=1)IRkpCG&drOCbJMlt;Srq_9wSL9*6RtJ z(RtlG*(BB(SK%$SrX27yQ#xF~GYIP8E0;4o>-Y&$dm}HIqA<{cxadxzX|(o5O>J%Q z)0DfovUMjuInSCHW;R5z|MEaY9?ZB*C!m)mq2APMBx8E1K*p|d#OBfXL`lMJCp#yv zG8tKYXK{P9XSaUqeADvm8iNk@`}wFFqtJ8iaGER*mXMH}G^*H@3WTXvQ*{qA!`9?T z)Bj?bBAWxV{HeleKfzCcX#3tZe8%@z{^yU)YqxjV*b)=rwp^rzaJP!3rlNkmbBiZs z>(M)?w-mFnZSf;j>5X&cfik$3@pU)$;R%k+#L{ctx75?)w5=l*Zt3ckvw50R$@ip{ zjSQ=3+9pGFjdinNJUz^%l*KW9%!U z{`YGEZ_ZmK9TNzCIvD#*SNCI)#;=AVoB$+|lK*;fo714(XGEe+t$SISF(PrB>-4-t zKu3s`R}rma*|aigq^0YsTBK{E9%EUT9bXnqNk`5uDlEkH)HH6yUf#1|DZ;(f(XlT7 zYl|P-6H{I4j~^c#_QG|G?7#v=lZ<_zrNLT(y{PVIAYtDu9Uf>xs#J2i1jz$Fa6^&w zgT3Cy5h2{LM~w$3lO*45$e(l)@HUc+KXNkD?)73s@tR*))*Chr!3pwi-aJetnlxbj zcJqJs^Z#imyjQ=odz6``Ui6Ip#(n84SO4@!1~DVsB412k_#ZyRub6zq1(IEck<c;&|s;hxu$sM*N{D0MmtiTg` z$-<=^KW_t1n66>@;!k_(KcCAWUaFUA9IN7c%=K7a zrCg_b#D2sSPhA-V)o@>{ics;|+|1H1cU&;HFr#O$9E{a+&(SL?n|uw4Gd!D;%~=_xIx^N zcbX{@2V5cF%Jq3-lfEm7XdITNNqP(#9GKFL4SZ2Q^Q>~0@?^X>e2?&7jGGGPR~&3Y zJFvny5y_F!Bd3XaryZjDo8Qo^A7Mvcjp)f49{(eU>Lo^4y`jU4ni5mfSH@0lWzAp{ zUB}dCB5Dht6=5_%S$-&*^y>jg7$0wyV&NbYM)& zv|>!p5d&&UaAt>^jfF#tq9xYa*ze^0k-B(l_`GRCZ;g(vQzI|tfX>fUd$oEjik9pl z-cdQt4lf6LVs!_z@4l({)2*<8UsZ2rURW&6sU}L$XYRF2czXBthR?q-Utybnb`qWb z$*3MD;j0`Iv$4LIE(oc4q3Q3>?KtpDw$a3Y43Jpv87qg;I5>DfAl>;UC*0GX3V$&A zzi?j54+OhzgNOb7Vs^$qOwo;ddvsq@rY9-qXo3|*II)TgIvZVIR3Ctq)KqHuAWY9& zEE5MiNT9ft^@ROsA|r%EB`+hRbl~;td8{&xgS>f4&^ve&_HFSbZ+gSX9?5&2y2Um- zfet7Wp~^ z*!$%yc=Y!w=%utD&%X2N{GH7EyuRSO%=Rx-UT5EVdBMx*G~qq-sRjZMkw+d8zw_1k z`(L)YmIHy&O>gU*A?^pR83`iFR%$&}GpkZ{8wshk?wu#F$i?o>TzLzm+8N#H5F*3q 
zq%zkXmp!(@f&lK~b8?vtxrmFtdIc zD*CLyIKZ+XLVQO@;dA;8Y(j4abK#>9Oj?GRN7>Q4V)l$H{{CC~Qu~uNz2Vz-osSL9`)o+97y3Z8TxVj{0nmule$@)xXk9FLI)MGsO0STTO&tvR@do_dJStw0dH&z5m^jZ%acADeaak~U4i)${B{N-E;5-qZAz6=11Qoq)(9_+{Y2=aC=&z)b5$TdVW|I+FeBSD5pCwRmHr+^&eSM@U zxF~aDW7Z#2kW@J9U%B{RM^6{0M*SzdpegB+tBi=^vmSp@#pgt-2}LQXRkn+gUP}el zsWEwoDA>`EYn0}gkb-|bAKM70JEfUh)tU_hU>227wM$S4?hnuyyxKyCNrU3iyKxwBr!LzKQpT-R=?a5LO@RKI^#;tCK88i0l zI>8zywQhZ#rK3eTg*nf^$DJm3UKw*5MRn5o(B13Q{NGdH>J4$mP5&!bu3TIg-(v^= zlb5+V9h#wInR0NU3bq3ai6|$Y0|A+n4HCuQ4ef9Xq}9bt3o8@0wnFXUR^Dz>{oy#ItsUZ1xq^ zR&8R)D>2;^hV>3C?p1b;efB~Q#6E#zKhAw1Yiv<8f##{G#j{7jXjou=o zy1-N9$szUD_hT*az)PF4O2KT^F zz?q}eOerfYfC)xMR> z$aGmQTg0R;sDU3NO-;H-cUrI%vGc6z%IT&(eHci0UXPUV#w&4m+;D=K6AjcU(eWo&iX%yU0M!ZnR*X9(i;;-YW$$-d~eqqudoSO4CbO4BnG+ z`?>b4!rH^5fSpe=_Uw2(%2|aQE>=hGLvMg7o>$!yek%ss4sFhd*v&pq5wcrKKiC(q z9vdC9?pI#VYOAtWFG~~ifA~u?yJ20XYYkHa4SjykLume@|6`MV=I-+3grkq^0%vA7H%z8s@h>F%Q|Sv*-xE>Xd3JQ1BQr&vKc;e{ARxI=y({mq_sP^X~e`4=1@u zwCo-A7oYWmXeTO>ECeVx@~uppOm`cf%q;`zn}zW(as!R^fTwK&6=-xY>-q7%8=+cS zB52J}RiDr;7X#c=z4%Rcd1he0zGC!)rW}9LuzbIb##U)^MmI4bl5B+Q9a>sWP5mRo zqY;4{RR>>Wyc9Ql6x!7C9PiBvX~~K8SxnQ4xyYVxkA_|j?1w9v#&4S)S1H*R#))lrgrmDry52gbsF+yv)k;x>;}F&-m9ku zO&Qqukj*!$wUr+bQT;xyt(bm~X(AWx40WYck|6vI5Wk2Y$Cfxj)aUX<3PWolmXFjN z@X5MSSN3;Nr*vyuCts}z+P@_!)S|_COE$6t!TT?%CEPyku1*aSTCR<*J+nR^1M4zq zTaEyeA{NMFR-K-#ji>(>EoFJ^K&cls%N~KF-xZnbKS$U5P*TQS45bcD%_hIX46v#X zGuoO=G)t+^jE*_|(Pkwi$GDa`Wpd`DdZNeuW_gmZ>h;e4k0j^k3XP1bZofSPK4*{o zj>9WJF|k>Hg+BE4AqOi_Ksjr)pE}8FYL+<`FKmBrel$IpKVVcqo>zwV=H`rF>fJRA z@<9&UXH^ZyqRq){XM;?OxSM8GcB9ZC>j9Kb>0^)8!`qZPRd=jw`Bz^W)mx4f{n{@q zZ7oG(YTX4oWTpL#TKsa}wXo5aZ5^=pgEz--?>AG)GB{J~uuv4S_A#X$jCi-$Z{{{Kuxj`cTR7W#TsPfU zd-(ICE`{na`ZlU`=nHTe(8PL^A-Py_lU^$*p~h*HpF-#4G<74w-6_0NE6v?cgGZgS zOQ7HBCx_~KpBP)HKuK~AY^(%rqCx(wKs2#n!_aWTVY|U6H|wT<30&0m`TlRZI8uZ< z&TC|J$8PQ#ypFr2DxtJ`K}_+NMNCFl&5Akb?6~YA(okUieL4c~8n?9-3|o?t85~X; z#U8PBC?i5}r);XifJSr zKaMf85$HHJq#HOEeYcw==>$^#TBxn1HjtI2*TZLRU*{XrQs;;?tz18zmTOhn`CQSj zg@;`g-$43Qh_PZku$_I+4vL*S0uwDqR$UHP<>t*t*h;{nl9k;LhRKgAZ$^&`nGzHD#clj3Pq08ur$K4I1+T<_7+Z7d44cgUS$&qduGs-KIL%_ZsvPZnw)vM2kx(pt1~Q4ogmAR>9Kujc5{FIAw%EUvzNMTUtk<$$smn>(R%xX z7RmF|JAs00Yi~-ItYHlxyYIamn4|efLXJQ@DK_hAgQO+6fh#sssJ28RgVny^M}Aq& z=19iOJG1^YLiIs$NV^HTSx!u-h@ARn^^+_qcp+X=XACV9cj^+fj2a5ux74fA!6%jZ zh7LBmNMHKfYKY3H`pd9l1}`)o0YYNF-Xq}pKiGQ@pr+ULTX>5F6#=&j2ng5^q*~}5 zks?Tk0HKM1^xjJn0TC4erAjYS1EB>7p$e!#kPZnDLKFzSNG}2M{c!KI&pzMX`~E#= z?wxPun=`|VgOHHlTb}oM*0a`nX;}vh@=_ZzbhicQLA6Bf*RAxJPTWC)8NC`UbC2?g zjk#};mL*;yUgg4T(5WCYVHG%++9L_%QTOi?y=3KfjnvaVgj~d_Ji5y7ixU${wW=?t zw#JP)vd`qkP5D_^sxrnRaV2gn=jMw_)UfgP!u2~3sB)ddUO-2OOA^j3_4jXxOc!$Q z?u}iV8>WhtUUw^VaB-fwZ7ZqlMo^%(KK?VAH%TvJA=n)(J<~0*4~BaiZJ~WOBt z&K=HL=4k`=CLqpsp6$RY*9VyW_h&G}o=*j(BzpCwM&fGCpL7{tr>rBD2`dvJ?(HXp zShwfc}c~7=LqU8Nx+7j-O6%9hwb2L|M z;P#Y5DBRQ7264%{-{gh^qiXr=qi)+wvnYG9l|A-uf@q~bv0uAMaBxHc&FDcw8B{PvG$}DQwD_`qdauf>V1qOET z31fWPZ9=c+&7S9};C=?vyw9WcL>@iDTF1{`E*Rp(ak8{NwD*%Byzg#(7CB5TlZaXvVupUAYn^+_!u93O}8 z<`I+L_+ZGSv!{}0GTPwa0^Zv4shWF)@WOOvFxcJ_%=cv|>Zl+>FLM)S1NOf@)RRZ+ z^$(8~51v-0o11<0Kr5Z{M zkBA!>3&rDPB|7%YD4*Iv{Wx;!iNrm)M}vnQWbd@k!8#`NgxSXk?#~m{q=Wpy?ci75 zP*SyK!>$t6*19Dl(W`TAQ0>w!2V704l+=@F^UhsY5%4M*6McR$kX;%RRyj~hfYPR& zDCA#VVK+9w)?AXsbP34^!^d`lDtkuEiKzn0Q4HCTYVP8I5o!|M&&<^5^`JYn7}J$p zzlWaCLJ{Y;-As^VTV0Hoi1p+o8FsRLxQnubF%}5amn@85Ul_UhZQo-K=f&Xb$kMGZ zk!oJPIf&wocH*gMXaqe&zQOiR9M>^D8gLl;rv`fpmZK}H_>*c>?~6!J&8g#~7pJhx zJ+m6&x=tOr*Xz5JS^AbzfGDf4()Y$R0711;ufd9T9p>5b{=_0$iC|S>J}Bie(bTBJ zv3;XWO}njsyw2atVSd{agVE88;cd}H7nqWRGPY#*uOcpIa-_WE;ZA_BCYd|AerBSv 
z?#HHDKc1#0d6TBvB{^X6>tgc1Jm4@}g&nN=XU;h``Z`7@W3bKMNX9_6`Ia=F6-3U7 zTm0NMoh7ADP>|`q-Mxh-a?S&91<@|b_WT8=FG$Pm{;Z2p9I@nm2bWs8qKGCG#rD9U z+^r*Ws0X_7F+_)pS1lfa=U!_tV=?Ave01b<5m-1U{=laHrTvN&Zf*_|wmH8T&%|eX zg;;z>(}HbX|FBB=Mok*qpY&iJDh%Mu_kOS7E|RA0LmMk&Dv8gcxeLq9T+3=6ceYtp zzJMdlM=JhIosX%MOFT|;gg;EC6PSB)1>O2NB7RQ4u;$SVeI_L1G16h?n#VO}0B<>f z!Jxv6WUkW~Y4UepyX?5-EZct1k$xu$MF(yE{KwX061xOnCo3tNAr@IWQj!iGV4EUn zPPKPQlV3?#S>gFluF-4CEbQ_+>o>H~9bB7hqd!)^KK8mIw__vZuJ&+_cP%R-yp#F>PEwl;tHe6tuOO!&aTT?v-ap467DuWVVM!9|8Z$y4unUz_ zqOvd@9@~vJPkbv2+_-Tk>{ijT2{;|*cW|lzE?-WCxU@4Z#H)N_`qxjsdl@P9o7DZB z_zPW){(JI1`SjBQZ}=s>sl(!!ZfM3n_nODdbsAfD1%E2rXu79{zY2okq*lWRl3%u4~WtL~w^yGn`}ONC_0C zPdCL8S5Hm%VC>tq=wzQ;)=9SB7)X^bxMbH6#ka<1$qdm*UcDg!SlmY?N0Qq_57u~wLMGUGXiPUR7Ze_JUO zGAh8>Fr57@f~%EiJf1PSY?j8uG_a#29PHk!0&FW{{a&tFi`u89A^NWS46(fGvY*&} zf0f(?;^Rb_^ImCzEL)L_Wr^%gYb2A~0EY;y)*q6IfkMp-KI`heDSfVCysIezDX1in z*CrPe+j&X*x01f=T$or1ai^@HL?7#R0&hhxlPqZLb8vPWa?pD!+O#CCXt-QmCXu2L zD#QA%R32je!HMuVq>=}^3AWP}r7n33vg?A+8X5mM@Jv^N-DhP4i(o(+h7vSKkDD1g z?O`!@W@hQ$d<{F^vfN14Fd*H6BAFMZUA3kJz&y&)QlR(MODnA;k)KLf^e?{$y0)*` z3gl5!j|T(C5`_(TrBmZ!eoJmPtj98e$%oDdnUk5uYw;PXBAOqa0)%xfTYCOV(Q{v`p(IJibG0Jm}JD9#QvIN3t> zUfJsxwzbLckIr>B?sOgJnAloz!4EVf>l|a>hrm=jpjt~rvp{}Cf!O=gUH1ny^>Cf^ z7{pMi(bSw!T>Hp73m=v4E;hncLS84p{m$tC0_g2!*M_=-CO%Zdb=j=d0v&Yp-u#+o z&6Ne*ZS|Yq-eq0XjFm?NF7AY- zwPw1n)Fc732EOI)oo4m@y%Z1~B3FJ|t!8P^j-$4qI%I|Lg2>>>K5~QQ9dfojS+pzu z30P4mzrwy1<))q}V@$^(&d8#10Vh3hllNzud)#?c%nY%GsrqW{7a^!G+k0x_X0Cnu z79H6cDWN@S0lq1Eg&=hRF$wFd_Xso8T6*8j*rrSWI&NWi!(`~H(eaEdc4fv$e2MO| z5ipzN;gGFXo?8rq2Jl5Sob#KV@J6*tHOFL*TdxqWA7vKhZbdw@6|D?s{WW$zH;OU{OoR^8kW6}Cyr5~e3OhD^Je%G3|ZXx43|xlf`AXUKa-p0 zDqza9Kpd4W`i4sqklQ1c)W>WZF3@Se=E7u*USrv=oJ(h1-No1Tn{rF^zVuq^0o{na zI4(kH%Se$8bp&qyvFdBfbUqUn#TJ9h>vSKSIv25}RD}F)#R}P@Tg0o)IWXuu)Vi-t zrbe|4o&dIh=y;0QPj%k4Pg?0|b!lufBGS%3thEV>dw3c{DAb)KU<*PH0he-n!VTsg z<2==RGi_%e7thhwQ1DYKyhh>>>?!@!nnP!N{WNA{aaeus`a3!i-&^`{y$AjTe?$@% z6Omk*H9Gi6Hs0;9B;$~weaA=(;QbPt5`DXeLTpc5;UUh;Hx-0lSn~G= zyHl@5wK?qWsbbRttm-BQN|x_|MMCq5X;9Z9q0TPv?oMLEdalVJ{5hwc_KDaVQLOwT zsmMyAgVs_JX!IdU@pc+_hP<*~NUOUHot6Wa&h+QUp?Fy9Gjaj)HnRuHcFnQvqz)!O(@ z%jBq3u`4jY`Ie!@hbG+WT3uaRqJmO4tHcnU|dBG zWE(dCW>$av{u733=xx7fFK*7LhyUPBwFw-2jH?Viodg6kAIK5w`)1b+XNR}IMz!8d*N#UfX=Wru)p2p}S zbd1b=J+qQk-P-58%3a%6wHPwS=@K(cdUcMGnw10}rg78GW8(^m=5ez1;`|p3&#)Ub zNuNLd8DsoQYr6m7XQ#n)vc}cd#X^Uc1D>CW9y66d;}6!8n&>vAP_Cd2xW3o!;9Uvz z)i3vSktfK<8w+Ql000m(S{Ks9vn8p@5zx8)$&$YMv|~bv$y?Ei5u@)cx-%L6EzO^X zY3CE#eX~wuhTkF1C8;YwmYR?Ixy=cQ!>(K!qBWBZ@u*9b7iU$lM4SW^B;@R7eumL% zbLjH>9uKaIh5G_dS^~x#20-tFY|02O&W5=xxM>?7ZBwpLoh6PXn%UM8UaY!ZoJ(Q& znFJ^HEbr$OmONrFS z42U`Feskz9Nt82b{Psy9Hd#NNmKz+$2zh(zisC#N+L`ydR9o{C>|CHDZkxd+q_#3D zS|j~<5d^sHB1Fo(@20(zsjTQM>g#_hh6*#z3V< z0JErLspJ7eD&1zsGN>({t(G*|-tTM*0`ZsOLcSf{ZQFH{B&pGU-Q|C^a-TVtRV3eK z0la|ej1i|{>UjCpL}*)Bg?Gj%V`Y69S=RcklMa(;^F7JAL&RwT&jEo4^>3fjbaI#t zafeYR72FTTIcXuzZ%rjk*V}dTEMZ(lcg9~x?tX$>gJ&cTlJFOS#sv1KXeL1q0e%rv zul_4~Oe)r1H$*trUAcsP>7==U@lCb(lAE3tm-PV50+|Z5m3X2?CGQpS^lVC#tv-() z&$L@R?G^v^ggp8fw_9O8HxqT1XrVzN4{U2YL`S_a>Ubwl+^}WuNJB`Nr~TK?I{=>V zBMC%@LJ_ApMs;_SK`F^2G_`|0s&4X<9!n{8qSG|uzTC2g`ZHVnEdFfy^8Cq51G$tl zBYp_z6MT_) z?v%XD0Xc{hav4!!XOSLt!c=68;&^dwJhZ#g8Phl+Nl7v3Eqt>6F3^+?PHufvtG5C6q9dl_o;STXws&C4O%idIhL|x zXA3v3HMa|KhXi=XXgGmY?;_;v`|Cf8CRbtMH;PSaZ_X0S%l$8Xjta&9RD7cj0^#&#ndzBYVa#<&&2*HOh{K1RYJM4R8(rOlz}4$U zWftLl*bI{_5Rh6eS?cnPc__pDtVddPT(exOIknRdYQ~R|_y#O2A`8J0`B|VdtQq~9 zV`GSLn$-P+LDYHBmDQ-X_TpT%@_Zko?HjqP>ZP1=FLlYXC>pH0nSOx#7(FVs2=SMY zo;4D8y+u6!2=-7;SjfaS|g+Yn4Q>t@iTH%=Y+`3)riM7|s zr|%LPvPOtJguy}sWtM@$`OkW;tFf14%`&>@$HQRnS=8*vWw`uZe}PJS?3=x7t)oI8 
zHvJWAZ)hnN9>?OITE%OGI$BsJQw`->R5mgJ(;Rf_Cwiktjuq2npTQVC%{Gq`5IxqB z>pqI2$bnmdfc2x>!0s9VhFr(*yccYU=EwVR;xBhfk+4!w`P% z$NM?>SLb5`0-G<|4QIRM&wCl8YG0{m(%Kc3z8S3k)hEX-DtcMMW6Z$L;5y$)2dii~OooY{?Az^((H$Dj^cxiw(@EUi272kAXfX_fn&peYhafvjAzZIuoDRA&{wY}^UhczZtNwzK0l2|C_^n6Zd`_I)TQ--` zozqV1@0o_xuYwB5nMZ#nBLXU|pf5~qGZ`jRYaS;~UVfwUG{fX71NO>4_^SWI4vEzR ze*XcfnuF?OY@!_C?4U+Ksn;hZ`;-ps_Z<1jA(xw?IUC!t=g`LdZ_#$Mfuk;gDop)< z_=x}do{55TS^{e&S8RUr0e^e-8-O8fa{7|U&koCvS6MXx+2)tV=P5rs4nMela;JfG z>Za?^v!7u8{qZYrcBqb#?oe3Vfhqhq9_%JS5YG8vasDUg>9^1K-^u!8*q{D)vi`Z= z{=2jOw2l6|v;GJ}{|$rx+#@IdduII+JpX%U{pqOw|2n}CRl1yK0Vx1i+Rr>X&ZU7V z=(o8yZ}hk7gYOgPY_%GT2lCCNJ#{uIFBT~b{fDV!-|4flyok&HlI#ZQ&`60uD=O?q zt`Bq{(*B0$HZnJg{+5`u?RM%lhp=8vg~NG?0?tBQdxBbg7#2wd6c;K6KpIx_*o}+> z>fZB8v~CVK$1#CIoJ%BrA`fyG!25x=`8x%}Mz6dg!`hWR+UTzf0X~iTH9q#!RbO^i zS;Ye<4TJ0NI56O(k;P2!f5=eU_<{hRlMa5==v9DY8*BendqWp4@9RSs>`sJH>Mw0{ zIlX*7iFFy9T5P}irx5?AU48lKhbZl@OY)E$(~ha70Ul7%=&=!i?7^e6wa znGhxYwQCz=K0IEx8ad#lAW21v%_E}>&04++oEZ^i$5{gd3b#$eR`)Ih+v&Dbn#bPpB zlwWy?0MA_%l7rSg8(ZtTlC`%)DzF<%Cj2c#0)0qzVABPpozoZ)fR(bd$|}M23&u~> zOD;eCVpLjf$khWdm1m17!tHh!$$TG|Y_FgRXus7?I3TUMa0s8q@#yhk-OMPiRVjnr zhPH|P-#c3Q!#%@I_7C8DBm})~+Cz<&-8&YOXAAP9eg(4Pf>&cIS^|}CDXrlElEEA7 z{?gPi{I{$$mqxD#?vg4|@m9XxUnLTXZgw$;qIBYm3@1udYf=EN4SOwxy#Ok0#nkxz zc*j`cK6;W(>QZAt&tl)EEEZZ6a&rn3u*<9je}bGj!`#CKdc-~4U_L+16TZF{ zcYl$xyQ8a9@EGW8QtpyW#S!{*hYwFVSY?iAe`$jD*FV@7Ea8$)1enup=UK)o?p+|x zn;j!X>oB++l}8U~9B>E%&yF~W;qn4$G@TSONXkQK2HhfjM5(=F3}!otL~FaaHMn1yg&=R%9%k z<;`01e#m1BmP}bn!wRsD#}%vpPSU#Pd3>+v2vF(xUJ`Ojxu&UJr#VSpCfG2E%mJ)g zM$aUnC)EJwyDv4HfqDp_?oZA>lxRJzp$d3|_GyJ^0F=WR;basObF8G-)a!*I6mcVN zA*}P6??Lk?t7N0}D^x(SeTZ{uYFvCD^*(8-XiC6LRjarHmyls%FHebLl1G089V4Bq zEoJE_In#U+%!VxKp_6kXuem6^wzXdkup(b;Iqq&M;rEul01V zCJlCixwxsd=E~B0u*{;7FE#*&C5f3=1 z+jGMNEQvK;Tw;|m^!Icuq0A7j9zvFerdh;eB-M|fQ~##D8^n?GG+p>Ix50tn$_Knl zOeQ3tW9%q3sUwG%bd#3bh?j?(DNy85y4xOUk)AYJ(9bDxs84&`F(V0pq=Wfp>g+gG z_syNeN*53!E|gg8+F^H`1V}J)#3xQW*REaPJ$5wAsvpaCVA^3Nt_`$WT6^Grx^_7eWp-to z99;`#mT?VqpO9k8a%@ z>cq7^IVhVbccIAd%hmekh^PTPWeWx_ZX1%4q>m$O;X|ZBYr>q{S)Urt8oQSZ8jE<9 zH6>cY+E{TiL=L+9MZ8_w)6jddJwpO3Asd-W6BLTKCg}7tWG;U~q&u+3=}@4r^Zf|K z8V&kH4(vsR?mA~KH>F<`{ajB>U2Q75hwrQZb+UwhdoCOvXBBEiXw_feVT8oA)ZGgk!G}}ZAnkL&Ub>vPFEdAbm!feGOfbRyjO(nd$WuNb9j~`J#&p6@> z36+M*ZaW4UUPR{q(BhZ^P&dz~*X$B1{@zP7&&xrwp6Pcf4rq8(LAkw%&$ch(tS-uX zF*~euLi53$PB1|Dm<<~KEl$ckKb;M%iDhR5HrM=E>E^d-b4>hV?YT5`=q49Y6yt=i z2`CA?H64DB3>;PhsUz*%waf1EJEAfvruoYa(zWd`f*_o|ZZwOAuL0W5tDuhpe;tgU z%8CE#JJaxMUxPIbf3b%S$gYW%H`jA&0o|rF4V!wXC2$AZCRv^(8zUS$0Jl4T>JCf1 zC^tZ1>{hMY{KjX%U6_6JI=6ui`b)ieRz?bgTfpRjHux-0YwA!EKi;I_vnDQ9j9YYL zqI=vAPgu=4Dc6l%(INbXgxvgtD4er{F%mZYNzb;%}y&T?5)Ek6)k}6`u10n%xD6ZsCAPVVBW1>0IKrZRF&frE~*81ehRC{?7PH z6nNvTwWJNXM$GlQ>oX!~k6y7Fv_dqm$?h*`qtf;7M3yXoMxYDJHT5ufhReYiWu0a? 
zlL4}r`T_xa*lGxHJfrKUtIE6vGYIwwrD+44nkRoh(0d3#!*?SxqAS$E8@n$WG*L>j z7V?43Hx5_=10%lQk+C=rqm^k;@?}mhec5<|!~9y^Yt5mL<{?ofznXpKavU#dYQ>!t zoH#GCg8Mq*@GN2FRPwgN_Z8vhj_qgad&1k>EZ?mEc)RlDwciQaWvRsg5wTIPUK8Xp zdbQRox2_MGy9k>~;SN75d5cC=KZA^5vtHFYoGR59FF}c*DG{l%mXg$u)h=1~C7%{bvv6%C0R%FR_iFGZ8(v z-Z7i@oi*&?`yd3JJZm?6KhJY8W1z7A$#b5x9T?vNG-m^?%!}T@zH-L$-P(97^J0AZ zSyg$c2njdr!Q`nyBsmkmD*qrWu2oWU3bE*=uro8)Li{&eJIHkG(SQ$wqq3tsN{`7Q*Oj_)(qM+eH^uf_ zdeG{;qI)@Dbr)##bOa2VW7>c;7i1pI%bS1FWgr1bmSLPoKF%F5Gw~QVHriv^vl%g7 z)5;O^dvnsNMy^I4ZBzNs@7jJ?N+q#zv#J3nk^bhx*xfy1qwk25Wac*tKtMtCK(*IB z5*n6kCU(iiKq@g4yJ3?P2StB4X!4(#(e{Ik04c1IWGTX-x2}M!qtJ@wC&=EHHQJ6S z>ZOQB!*kBZ(livjgEv~?_qHO|&x4x(-R1hTtDh_bLKgr^SBbk|YP(G46J%J6JTm1o z&%P9tFtzADx1RXO3*7kQSk6)OY4NTS?4Sg$-Z%JeHx;Z{E?t3LBe-J~IHG7j)>Gjg=W3sJvUAj8U z{8F_296U)FUi*q2)Z~O3X++>YxHKRUf)P=j)cZvzx*X>;$#UyCK2NR(o;}`*`jCU^A~h^Pl3tAIT5I0aybz1bp8wd}@>ugRWp6>odN_EpF;oIZ?n~x8ov7qS z=vcip`_+5=gsl?F>$l2If8N|8 zQ}(=6GyB7{I09hS6gjrQ0L?9SUa=4Hgn-F@!Gvm;$HTHOYJHqb%*$skCN~grk>*BU zqRGcd_Zw$Do#4SAL{o_el#HzO(o6qmW9E-VsBW~qy^rfnO~mKOPAFPqNza&RYFwkI3A6}!eQ@L%}U$%(}chw z5FBq91H%HVxbM@^u|<%9{*n)i)oNkaPFK#x3h$^o?xu>U3*IQC&+XPfyZ~&!d_w=A z(vNafvY2c2c9mS(YBvDq+~;LI!M`n6oqYu9jWBL&u`CY#KE>Aux1Vp(9}KqK0g8?E zf03wYRAT=rQIVCgcwGp#ZQY))?XNszCqEJfXcY4xT1&sDcj8RniRroiA6Bro!z# zw%cx_BB0Ip|Kf!G*^u?WKd^~hbsHnEuXTEfMB{~oaA+rogs797$dr#P2(c>fHIR5F z#RZ)|b(mQeb0b(A>jA+oc*r)JCcS>OsnuPKV(WPJkk>HH#ORDP^3bp!Yyt}x)-oTl z;WsP_eZ6XOtE(YHw%S~CKNxC*+nPn%2wz*5Z+_TCl-+sb*jGB>O>33bciL;(Gd}H5 ziz2`8@1EE7A`#ITA_63#qql?cdtDEM064&QebXqevoEnkck46i9H2-!OnU&iR3Pku z3t8|npySZdH=7^LL_EB7`(uR7ZvvZj1+xd5pYZ^}+j}-h8M3;2~#ha7oBuC?@6vuQ9-L3wHO$|Aoe;2Xj4sQv)E|0{XUTP01-dOo+le zTy3rF=qtbKJ6scSXL&xSY-5FH^GZp*7`Bh%D4`AE_eMB_;?FGN@ce>?@CW^IFJp46 zLO5d8w)`x%wnE&Op{wXK1NQtP^~IH}Y|%)|$REOe|3&%;>E|;2vj*XJr%?n^TWyoA z3_<9%AM5eE<*zD0UCS`a$8F=HzObJO5pN!OzePMx)4z;!F;wq$u{9!-xj5oScA=j~ zVD-BoNB^DtT{pG>*>n0)NI|k5yCg><5ihLcaB$+C|^z2M9#SFM|(*hyc@N zO}4R=3}z>}fZMow*Wu8&d4dUuSk4aIc9=&8mZ&cnGO_#!zr=h;EAQ0O42T0a>}m$} zdCtgFJ-!Sojz?7MQty{!I~B_`ei4F0RV5;~$%^7>+d%d5ZYT$-IF^IFW+a)f2+s|? 
zRj4J>flUzO-s%gud+x?^QUCo;;P~U=Mg{KQ*a52Vmvme|OcE-15-NuQz9Sf6tIgO+ z?|{HpDLm;U^UPWtitSNmf*V_T1^q&t5X_>(sbS*Y-UGx%oh!4D7^iJR1+V$aW7a)^ zhXSmQHBA+LND%TS@<3`M;T_A_r_dY&$gaz##eIskc6ts|SY?mKGjv`G(Bydle1DNcMm!|RtHYM1T~$`;INky*Tz8-zjJ13T zgfOXJSbxJ4+T?;ffpzUqS^r{H0O%Yg`(gcpMnkgSlY}tdx+P%0?>lPoYGX|;&g8~9 zvIse>0P<@~!<%og`YUiKUj^ofy@RQ%d zMBbj|KQu-w6TGud zvfK&wQdrn6YdAl-ziC%w+KVTD&+=Oz%sc)5B7Sh5Mt^gStSX$A75oguIhpPWH*~1o znr_G6jWNKe2^Bvw{gJqfEE?V#ECS?uUjeqbeuwReML%^x8(blmLzQEU<`hLMv%88O z5LTZU#qtjg>f4pwDFN+#l<};3!XacVEbZ_zP0YCkz&N!82-BfnfJ$$nsuSUK>IrLO zW%(KZnxogZEnwA4|Fg#EN5v`ND{DEuXz()Jr_|-%zAPBhl~kX%(-OONdQoQtS~TYz zWh`lLw-%{`@^->)0X3FUf%*O63mGg_?G^bWv>y{a0}H@mwiUjWLwDx9^Z>`<}s zq|Y!FvCNpbL3(VIcenE+3dp2`sZ_ivi7e6h3=6bXXRaMWuPn=OC6fYOOtq@bF@d3; zDDIkfzY46eeOUx<6ek^sdRV5jn`WDD32?GEn~xpzFg$tzPd`V0eJN5qlItYK=sBs; z9L6Pr*9MrPtFOB<>=ZkSZVfnae2G`sQGR^wCQvs1E>!|>u)br}S#w-Bv@n3oJ|RCl zo45~1l|HJ9Hxzk%JK-UOE7CTF?ypyBfycfD`kAUx;wltLmUbgB$iV~A*ZUGTc$E#K zfrJ@QYJ1t^z&9dTrZh-~O{Wr3R*4G(;_#5CCa#en1j%N1+?gMU(T9P6NRKfcppPY{ zuZ3Pn5xdzfEoFAtx(W^|(I6W}vI~|HTnq~{U_(i^kBT9MjD2-(RYDH26J{#GM~1?P zy1Cu)Enxl|e*ri+0|pe22VE$8Sn8#H`KuvwV6Ub0%H5R8qq|BJVF>Xns&6xIyi{FvUVJbPA~eVInw+M0ZSA<{i~us znEU}&3*3haOq%rJJuH_-nuf}V5ye##$cCJ1-^z=_P7ec+cU$&b1;Rx-4#^uOn#l|S z<2NpOGM7{UDy)4=5k0j)Y>9Czo0EZa+Kv6+R9M!55}r{%`jj7NnN&o%eHt5>W^n%npwUu~zH|hKMtfypyWPQ_mw_UMOcr5O&8jwtukY@+>6zE45!w=6e1bR z5u_(C9zg3q_06p;@zOr-4J%+}G5hWShm4Xe+oo>p7Ms~jyK_H8410$k2qVE^Fz0UR zJ7(?xz5J8N6GWzg__|hYeFKwMQ<0n{<0$`Rc)W-2QTbUot#OsI(t&jUo}}EO$npRmys@e)6JZipKxe1>1Hz z31Hsy!wBSSW~rXkKsc;)hxiJ}&r=0V(N$*+_4Y5!4b-|32_Xd0qf8|x*Ub4%XPtia zdpus>m*sYCXR^@6s`EL2Xw@g;Wp1u}u7U^JnZ#RCzA9rW2QTqY>18Z}1{{3%zP8LO zZQuT|cJ<#xApcn(?c{y6v`LL2eAuUaE8fjnW|1I2=y9xRajM-y&$-oZEovHi@ePR% zyHM`h$WBH(!mz}PM`)5a2B-`Rz;QfrmPc~dm;^sR>Bd)y8pQLNpRb*NDkE)W*eJ8K z`2|1)=ccmq%%p)9@SX)GpiHLE=whyJ30<|!#vB*ThDUfrfhKuB?{gx}pdW5{B{2D0tWd1Io`nS{& z;g8qo5O*_RUVqnZq&T^O6M!|QOv)NdN%$;gm-UV)nfuM_{y=Ik6)sV$f*=G zZP3ZD1>`B2bm-ssHy@(pE)z0RAia}^Em_-_9VXsAH3i35!8$z zn#_0uSA1lbgKGI7)D|rp2fOh1X&RpWKLKC=bMgEW4PPK{E1CyO(`ckc>nB%XBoLQ< zrW|)eeS@J3sKXOQ+&p(rP}C{qJPkKz(kU*n5TKqVEvNgob?P9Yc)A_mF@vVudyO6} zSU_3BvW#WWzfjP_R?ix*Umh&yFc#= zpCpRWi3Xmhv8*8Nt`pyzCtvoJI-8cQe@`DvVI)7rR2i-1MU>-TQdu5^TiAs>@_yYX z0-MvHcL2PZR)?E8Go6>2@2CL&fsUHdZJH?^E+Su=Ih`aaRXK`B6q19TbF;BHy?KzZ^9Ad>w`bQ%f>D~4x+ z1t*L@l_U@;+!mlF%IgWB`4vNQ5V;_6yF+DRIDsqP;laSd+N-xAWQQktNPG%xp}KDO z;9*&-0K$qYr!yyZAwP5(pf9A*sMx!~C2)C*rVY^k2wKdJ^?9eHzG0e%_-n1FKT;Vv zc{a@52D~@a5(LmU*%sID+~&5KsQQKIQ;4g5=1H46g@lzjm#s#ChCARvLoref8-25t zO6KWi^9ws`cU;=<4A=X>`0PxPN?vvV4bE`5uXquU!4d+jr>1FWye24RFRrQ#RU)C= z5(^_?C7V}QNItw)fO(|*>ZF~|Xi*VZ;Hy}m9EVY(Q(hq2%5QSw1mZc*!ZT}WbM0Ol>+Nqga(z>kK4Thw_KJ#O+oAR020CCm|9i;T=9EqdRKvz>9{Z2pu?=_?Oq7SK||bY~)-P|N2p_!X%r{X6FN` z(akI00z4YtgzSENIN;n{AWv(>eP8%IuEo(Jq`>ML5|^Y4xgTC4xc$$ zJ?gDlW)x@?h029Um{YWqSw|QfgELI(PNM#Xgb|cp{1lkmEWTzqOlCKg zu(v?Smvu`oc)STu5LD9_jCy>f=FmBWJpStJ&^|x6L7@_@@xx&OyL8Z7XCUdhxn*hD zd${p0K%Ql7aXV{u`TxVh(aT=}h?e(%*E%+B+)6|sOGn4*j;{7jn?G2~jhM{rZE^DY ztRG80bVkjQ2LkQhA&%m45^A_@zdL%oY|i@7E(XaukkG4eLbX3-VTdNEF+Imh^iOKX zeOX3M+T5Mi)A5LCf-PbjXEGgKbyIHtiQo+p^K~6WmQUT+?8ckXAM|1YJ$8z|Sea+7 zK4~pEwH0W#-kA?q)G?mUfi5+$ z?%||f?PM9ADEVGlb+AbPM!E`AOqkRetrBR68=SaxAU67&yFf5T`%o+qH7nSyc$k z_iFD_rWepPyZ*OuCbax_^5?t-&@<}l>T;)Rp&lzv%&G~Mp)PYeY;cI1L*MI+Boh#Uh#2Os=rSz6_5_@$#l$|wcn%y#!S<-&*FD%MA%RqaCUCi@-mz@{dGg*J6&;V7X zfr4|qu>bT%7chT}+5Dmo83*Uo`ftotuf~ex2FK)0Zm&?GGjzS4fRSH~Akqn7u=8C? 
zrc-qFVnkKilXLlt1jA6xy`t9EA~W`&PZa8Hkb`e_`ogr`S9upp%h23d9KrluuepQn znyfw0wzxyP4%bJmtiuAj-kafqt*Tw_d{)~a;z zJ?$QAL!b$tJJolKy5UxxTIEIv6y)~YjJFSn+LfbS+aLK!UpP$ME;0b37!+%sqb0Me zoMn4a^zP5aW`%_m%t>Hs%v+bekC~dnmYruamDGJW$C4zlo|{E1=;|D??Ec#MEZc>< z)FQ3?tP2{|PI~J{Xed>@KW#x&kY5nezlyd>h0e%NCY2~RSRPX{m`0s>-O^tSF{d57 z!s;{U*W%Za>MQhG3&>N8M|PXCoZ`d7X$B?v0sapX-pU;Ooy}F1{-F z$MdsYXu)YlchsRp%v4woHW6^_Olc;C9jRK2q3}H*uQwC*xsM8@wjJ96LdM^#?b;Jm z2CO!Yqu;A-9ot*)qJ7OIolqaWzkN*nD&vog!WyQ1N^WQz|6oBtB!mTH$BcTRc0XZ7 zwkufs@8_3!hemo*EFu@jn z_iH@cq_#<7UpaejA-{Ce{jjjyAh-5-BtymU;?u~E3ivh9`m(D`BNvc_3+ zm&l{L#phSBuUYq$MJJ3V5<5}}=5pW&Bie1Ls_1cxIAQ<%A!%Jx=|#5QW`=pvW{20| zxCy?nafg*#3y=w%hJB(2>1h6$s2Mq4#g26CoT-A^7l*<_jJNtBB_og-)ai(Ts|QOl|e~0Z)~na(Dfg=mUFRzrXCV z!-!J82jno7Rj8kK=n~G?R;UBoFkp5DnX>8<4(N+Ox=I6ZNgwZHJ2}(5e!liaWR!_klDq9YiUGJl+*BShOt{&I)gP z8%AMtYRq(~CWa8hDL$VUeQwW}gumbPY~x1wCt0kwr9B&_c-+WSLJ#qa)a(I<5T~zL zmx?cMT1k11Lxq@c!47+3xu>n(TTL?&C^LVO&_}-`ddjzT#FMX28yqxU0#*mkj$|;A zO@NX^o8x1$(Ghh&cJdK@mo-F=Czvd!T7=dAmaLi-qw`i4seZq>58m+1SU8X3k*${T zMTR2cbeWX5qCD9vsSVck+glp1CtMGLI>JWxTB@G&+DDofjVO(6K`k%>SZ*WVCgodk zzd__Q^QIT>X#r}OvAyShs)+_7_|(c)n&p)0QxQ;Hbw2po>3jrX4Fou9r2Nnav!Nyb zciheaam!4zeV?Fvi+1~lzrlu1-Kcl$QgiEL!^fI@_#H9w;^Ne$RCUnNSJ`sFn#$s zZTerD=h_1TMcUvY;22|87WH?(j-)7eq-Zro`~8lJuTXb@QHj>DZ^M1TD_LLtNJ&4b zZd3u1jG#X8M?FOMdkm7b<6*#d$=>Y-Cq53hv51djfdyJyE! zPB?aahbAK)jibQexNKGhu^g|{EgXl2MQ`Wgtq^tVWEa!?+#Ig{_g0qUr|$$t6!b@E z()U-+wCatnt-ZMdrue5oUhOU05|@pClNdyih$CKtljW}~yj;2+hy*Bd%& z&7OFHkkGU`^t7FY8FIY))tk$Va$5=5&y?kCQh5@AV<1z+vZ`MzR0KaY{!v*Wp06IF zEGMBEh=RjPhKd!0<9*hi6?>Gs+8tok6cFU~do&A61krXuOyv_t48g_f!Xv!HtHfwP z)Ata96kN5IIoTMfpLNmby5|P zHFGXnJeW`WAoYXeCDhy{Zl{HPLnD?7g}&?+no0%gL(9-qv*Y_x^ZRKMP(Guj>!w zj*2Y|zKO$`s#*sQ#N{^p)E*EuQtTLA;nCy4Tv*x;(vG5t_uo5cXh$TMpDRT_{SAn` z6#GS^k1c;L4;H^62^wakx#sQnJJzDQm;bD$WFTO!SD2`MLGe3czOD=ZxrHn685luV zi_H~n-iV3?{euYGFwMumR=nI_k~l0ehl@EU43@5GvGfnM4Pbu@Sm4h!U|m(I1a~r% z$S0dV7aK^5o==ZF#Z2E8A`qSmV3f0oa4&QU(a{0;oWk?RHF`=X$C48WCsMoH=AFc~ zf`pBmMVA$z#w6QdUYH}Th)e*q5{3pyACxj@;=d@_o~?AKw5#-5OB~Jm{^V6QfqldI z=khVQo=DN+c}L4m+r&rg)?Lx&EeHt>$!~p7sx+fm<89UV34jn?&OdLK`=cmU2J@>D zGqUuM=$8af>cK2i!FLQ z=ws=vpX*6QWkKcT2i0vp=i*V<`%?zcy^LJ|C;_Of2H^dkt_6$3g4WxNiSt=DInXRB zD{ppC(0m-MXb0_{0S5C~$H;1vtfL~>di^Wp!+)EPkp65EA$G1|N=55ac8=qBJQQs> z$5;>h`-2VbStYv#)J7^4PU@du$=IO>5M4^DZn*woB<0B~>35$*o1cGbO8K`^jg;y+ zXG!bEXx4K4eQwJHJm}|wId*q3c^$pzHk+W=^M|9>N<<-AUX6#HQ{>NshEM(&D*t9v zWC7Ur=n)v!(*M^>`AkIh%Xi70$^PCR>uv&X`8jhi!vDk~DK7>vu(1djF8L2}4kZ4! zH_Wq{sGp@YidkrS=&1h_!zXF%&!5OabtlzE5m`_)<(hHvFH`@MtN7oavqAUU=&5+3 zw=eb&bDRJBYySN|Zd8D~k*3j*;veJjzx%SI3xUDrq(TKeLfQYL*RLc1-zqqiP56&U z`QNYgKYxFhN&ef|h)>^|@&D-cpEQB*&Yi07?!Wtp{}xjJ@ig}^;Nk&K>FVnLc;J8d zCm0zR+f8_L`lo^0|NOZB_5S|98~^`q{C|yz|2|&+*Q5WpSNuC9(dI{tCj>PEvId;J Qe*r%-5(?rKq96SJKVe@tY5)KL literal 0 HcmV?d00001 diff --git a/java_demo.md b/java_demo.md new file mode 100644 index 0000000000..5cfaf9ba60 --- /dev/null +++ b/java_demo.md @@ -0,0 +1,112 @@ + +* [Java Android Demo](#java-android-demo) + * [编译](#编译) + * [准备 demo 需要的其他文件](#准备-demo-需要的其他文件) + * [脚本方法](#脚本方法) + * [手动拷贝方法](#手动拷贝方法) + * [把 .so 动态库和 .jar 拷贝进安卓demo程序:](#把-so-动态库和-jar-拷贝进安卓demo程序) + * [把demo使用到的模型文件拷贝进安卓程序:](#把demo使用到的模型文件拷贝进安卓程序) + * [运行 Android 程序结果](#运行-android-程序结果) + + + + + +# Java Android Demo +本节中,Java demo 完整代码位于 [demo/java](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite/demo/java) 。 + +要编译和跑起Android demo 程序 PaddlePredictor,你需要准备: + +1. 一台能运行安卓程序的安卓手机 +2. 
## Prepare the other files the demo needs

Besides its code, the demo needs the JNI .so library (`libpaddle_lite_jni.so` from the section above), the Java .jar package (`PaddlePredictor.jar` above), and the model files, all placed into the Android project. We provide an automated script and a manual copy procedure; choose whichever you prefer:

### Script method

Enter `build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/java/android`, where we provide a script, `prepare_demo.bash`. It takes one argument: the name of the architecture folder that the .so you want to copy belongs to.

For example, run

```
bash prepare_demo.bash arm8
```

The script downloads and unpacks the model files, copies the .jar into the demo, and copies the generated .so into `PaddlePredictor/app/src/main/jniLibs/<architecture folder>`; in this example, arm8 is the architecture folder. Note: a demo built this way runs correctly on armv8 phones. For it to also run on other phone architectures (such as armv7), you need to add those architectures as well.

### Manual copy method

Next we describe the manual copy. If you used the script, you can skip the manual steps below.

### Copy the .so library and .jar into the Android demo

1. Load PaddlePredictor into Android Studio.
2. Copy `libpaddle_lite_jni.so` into `PaddlePredictor/app/src/main/jniLibs/<architecture folder>`; for example, the arm8 folder must contain that .so file.
3. Copy `PaddlePredictor.jar` into `PaddlePredictor/app/libs`.

### Copy the model files used by the demo into the Android app

Download our five model files and unpack them into `PaddlePredictor/app/src/main/assets`.
The model files to copy and their download URLs:

```
inception_v4_simple_opt.nb http://paddle-inference-dist.bj.bcebos.com/inception_v4_simple_opt.nb.tar.gz
lite_naive_model_opt.nb http://paddle-inference-dist.bj.bcebos.com/lite_naive_model_opt.nb.tar.gz
mobilenet_v1_opt.nb http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1_opt.nb.tar.gz
mobilenet_v2_relu_opt.nb http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2_relu_opt.nb.tar.gz
resnet50_opt.nb http://paddle-inference-dist.bj.bcebos.com/resnet50_opt.nb.tar.gz
```

After downloading, the assets folder must contain the five unpacked model folders listed above; the demo does not need to keep the original .tar.gz archives.

Note: input models must be stored in naive buffer format. You can convert a fluid model to naive buffer format with the [**Model Optimize Tool**](./model_optimize_tool).

## Running the Android demo

With the preparation above done, you can build, install, and run the Android demo. When you run the PaddlePredictor app, it takes roughly 10 seconds, then prints something like:

```
lite_naive_model output: 50.213173, -28.872887
expected: 50.2132, -28.8729

inception_v4_simple test:true
time: xxx ms

resnet50 test:true
time: xxx ms

mobilenet_v1 test:true
time: xxx ms

mobilenet_v2 test:true
time: xxx ms
```

The demo runs our five models. For the first model it prints the actual first two output values, with the expected values on the next line; the error between them should be below 0.001. For the remaining four models, `test:true` means the model's output passed the checks the demo applies to it, and `time` is how long that test took.
diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt deleted file mode 100644 index 937781293a..0000000000 --- a/lite/CMakeLists.txt +++ /dev/null @@ -1,159 +0,0 @@ -include(lite) - -message(WARNING "Lite enabled!") -message(STATUS "LIGHT_FRAMEWORK:\t${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}") -message(STATUS "LITE_WITH_CUDA:\t${LITE_WITH_CUDA}") -message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}") -message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") -message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") -message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") -message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") - -set(LITE_MODEL_DIR
"${THIRD_PARTY_PATH}/install") -set(LITE_ON_MOBILE ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}) - -add_subdirectory(utils) -add_subdirectory(operators) -add_subdirectory(kernels) -add_subdirectory(core) -add_subdirectory(model_parser) -add_subdirectory(api) -add_subdirectory(fluid) -add_subdirectory(backends) - -if (NOT LITE_ON_TINY_PUBLISH) - add_subdirectory(tests) - add_subdirectory(tools) -endif() -if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND NOT LITE_ON_TINY_PUBLISH) - add_subdirectory(gen_code) -endif() - -if (WITH_TESTING) - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "lite_naive_model.tar.gz") - if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "MobileNetV1_quant.tar.gz") - endif() - if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz") - endif() -endif() - -if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) - # for publish - set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}") - if (LITE_WITH_OPENCL) - set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.opencl") - endif(LITE_WITH_OPENCL) - if (LITE_WITH_NPU) - set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.npu") - endif(LITE_WITH_NPU) - message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}") - - # The final target for publish lite lib - add_custom_target(publish_inference) - if (NOT LITE_ON_TINY_PUBLISH) - # add cxx lib - add_custom_target(publish_inference_cxx_lib ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - #COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/model_optimize_tool" "${INFER_LITE_PUBLISH_ROOT}/bin" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/gen_code/paddle_code_generator" "${INFER_LITE_PUBLISH_ROOT}/bin" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" - ) - if(NOT IOS) - #add_dependencies(publish_inference_cxx_lib model_optimize_tool) - add_dependencies(publish_inference_cxx_lib paddle_code_generator) - add_dependencies(publish_inference_cxx_lib bundle_full_api) - add_dependencies(publish_inference_cxx_lib bundle_light_api) - add_dependencies(publish_inference_cxx_lib test_model_bin) - add_dependencies(publish_inference publish_inference_cxx_lib) - add_custom_command(TARGET publish_inference_cxx_lib POST_BUILD - COMMAND ${CMAKE_STRIP} "--strip-debug" 
${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a) - endif() - else() - if (IOS OR (ARM_TARGET_OS STREQUAL "armlinux")) - add_custom_target(tiny_publish_lib ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/lib" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/include" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/lib" - ) - add_dependencies(tiny_publish_lib bundle_light_api) - add_dependencies(publish_inference tiny_publish_lib) - endif() - endif() - - - if (LITE_WITH_JAVA) - # add java lib - add_custom_target(publish_inference_java_lib ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/java/so" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/java/jar" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/android/jni/native/libpaddle_lite_jni.so" "${INFER_LITE_PUBLISH_ROOT}/java/so" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/android/jni/PaddlePredictor.jar" "${INFER_LITE_PUBLISH_ROOT}/java/jar" - COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/api/android/jni/src" "${INFER_LITE_PUBLISH_ROOT}/java" - ) - add_dependencies(publish_inference_java_lib paddle_lite_jni PaddlePredictor) - add_dependencies(publish_inference publish_inference_java_lib) - add_custom_command(TARGET publish_inference_java_lib POST_BUILD - COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/java/so/libpaddle_lite_jni.so) - endif() - - if ((ARM_TARGET_OS STREQUAL "android") AND (NOT LITE_WITH_OPENCL) AND - ((ARM_TARGET_ARCH_ABI STREQUAL armv7) OR (ARM_TARGET_ARCH_ABI STREQUAL armv8))) - if (NOT LITE_ON_TINY_PUBLISH) - # copy - add_custom_target(publish_inference_android_cxx_demos ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include" - COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/gflags" "${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/Makefile.def" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/README.md" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" - COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_full" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile" - COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" - ) - add_dependencies(publish_inference_android_cxx_demos logging gflags) - add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) - endif() - - if (LITE_WITH_JAVA) - # copy java mobile_light demo/lib - add_custom_target(publish_inference_android_java_demo ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/java" - COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/java/android" "${INFER_LITE_PUBLISH_ROOT}/demo/java" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/java/README.md" "${INFER_LITE_PUBLISH_ROOT}/demo/java" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/java/android/PaddlePredictor/app/libs" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/java/android/PaddlePredictor/app/src/main/jniLibs/arm7" - COMMAND mkdir -p 
"${INFER_LITE_PUBLISH_ROOT}/demo/java/android/PaddlePredictor/app/src/main/jniLibs/arm8" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/java/android/PaddlePredictor/app/src/main/jniLibs/arm64-v8a" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/java/android/PaddlePredictor/app/src/main/jniLibs/armeabi-v7a" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/java/android/PaddlePredictor/app/src/main/jniLibs/x86" - ) - add_dependencies(publish_inference_java_lib publish_inference_android_java_demo) - endif() - endif() - - if (LITE_WITH_OPENCL) - add_custom_target(publish_inference_opencl ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/opencl" - COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/backends/opencl/cl_kernel" "${INFER_LITE_PUBLISH_ROOT}/opencl" - ) - add_dependencies(publish_inference_cxx_lib publish_inference_opencl) - endif() -endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt deleted file mode 100644 index 7767458b37..0000000000 --- a/lite/api/CMakeLists.txt +++ /dev/null @@ -1,239 +0,0 @@ -if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - lite_cc_library(place SRCS paddle_place.cc DEPS logging) -else() - lite_cc_library(place SRCS paddle_place.cc DEPS glog) -endif(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - -if (WITH_TESTING) - lite_cc_library(lite_api_test_helper SRCS lite_api_test_helper.cc - DEPS scope optimizer target_wrapper_host model_parser program - ${ops} ${host_kernels} - CUDA_DEPS ${cuda_kernels} - X86_DEPS ${x86_kernels}) -endif() -if(LITE_WITH_FPGA) - set(light_api_deps ${light_api_deps} ${fpga_deps}) - set(cxx_api_deps ${cxx_api_deps} ${fpga_deps}) -endif() - -message(STATUS "get ops ${ops}") -message(STATUS "get X86 kernels ${x86_kernels}") -message(STATUS "get Host kernels ${host_kernels}") -message(STATUS "get ARM kernels ${arm_kernels}") -message(STATUS "get NPU kernels ${npu_kernels}") -message(STATUS "get FPGA kernels ${fpga_kernels}") - -# for full api -if (NOT LITE_ON_TINY_PUBLISH) - set(cxx_api_deps - scope optimizer target_wrapper_host model_parser program) - lite_cc_library(cxx_api - SRCS cxx_api.cc - DEPS ${cxx_api_deps} ${ops} ${host_kernels} program - X86_DEPS ${x86_kernels} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass - CL_DEPS ${opencl_kenrels} - FPGA_DEPS ${fpga_kenrels}) -endif() - -# for light api -set(light_api_deps - scope target_wrapper_host model_parser program) -if(LITE_WITH_CUDA) - set(light_api_deps ${light_api_deps} target_wrapper_cuda) -endif() -lite_cc_library(light_api SRCS light_api.cc - DEPS scope target_wrapper_host model_parser - ${light_api_deps} ${ops} ${host_kernels} program - CUDA_DEPS ${cuda_kernels} - X86_DEPS ${x86_kernels} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass - CL_DEPS ${opencl_kenrels} - FPGA_DEPS ${fpga_kenrels}) - -include(ExternalProject) -set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING - "A path setting inference demo download directories.") - -if(WITH_TESTING) - lite_cc_test(test_cxx_api SRCS cxx_api_test.cc - DEPS cxx_api mir_passes lite_api_test_helper - ${ops} ${host_kernels} - X86_DEPS ${x86_kernels} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} - EXCLUDE_COMPILE_DEPS "ON" - ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model - --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) - add_dependencies(test_cxx_api extern_lite_download_lite_naive_model_tar_gz) - if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - lite_cc_test(test_googlenet SRCS 
test_googlenet_lite.cc - DEPS cxx_api mir_passes lite_api_test_helper - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/googlenet) - add_dependencies(test_googlenet extern_lite_download_GoogleNet_inference_tar_gz) - lite_cc_test(test_mobilenetv1_lite_x86 SRCS test_mobilenetv1_lite_x86.cc - DEPS cxx_api mir_passes lite_api_test_helper - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1) - add_dependencies(test_mobilenetv1_lite_x86 extern_lite_download_mobilenet_v1_tar_gz) - lite_cc_test(test_mobilenetv2_lite_x86 SRCS test_mobilenetv2_lite_x86.cc - DEPS cxx_api mir_passes lite_api_test_helper - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v2_relu) - add_dependencies(test_mobilenetv2_lite_x86 extern_lite_download_mobilenet_v2_relu_tar_gz) - lite_cc_test(test_inceptionv4_lite_x86 SRCS test_inceptionv4_lite_x86.cc - DEPS cxx_api mir_passes lite_api_test_helper - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/inception_v4_simple) - add_dependencies(test_inceptionv4_lite_x86 extern_lite_download_inception_v4_simple_tar_gz) - endif() -endif() - -if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) - set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${fpga_kernels}) - - lite_cc_test(test_mobilenetv1_int8 SRCS mobilenetv1_int8_test.cc - DEPS ${lite_model_test_DEPS} - CL_DEPS ${opencl_kernels} - ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl - --model_dir=${LITE_MODEL_DIR}/MobilenetV1_quant SERIAL) - add_dependencies(test_mobilenetv1_int8 extern_lite_download_MobileNetV1_quant_tar_gz) - - lite_cc_test(test_mobilenetv1 SRCS mobilenetv1_test.cc - DEPS ${lite_model_test_DEPS} - CL_DEPS ${opencl_kernels} - NPU_DEPS ${npu_kernels} ${npu_bridges} - ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl - --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 SERIAL) - add_dependencies(test_mobilenetv1 extern_lite_download_mobilenet_v1_tar_gz) - set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set_target_properties(test_mobilenetv1 PROPERTIES LINK_FLAGS "${LINK_FLAGS}") - - lite_cc_test(test_mobilenetv2 SRCS mobilenetv2_test.cc - DEPS ${lite_model_test_DEPS} - CL_DEPS ${opencl_kernels} - ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl - --model_dir=${LITE_MODEL_DIR}/mobilenet_v2_relu SERIAL) - add_dependencies(test_mobilenetv2 extern_lite_download_mobilenet_v2_relu_tar_gz) - set_target_properties(test_mobilenetv2 PROPERTIES LINK_FLAGS "${LINK_FLAGS}") - - lite_cc_test(test_resnet50 SRCS resnet50_test.cc - DEPS ${lite_model_test_DEPS} paddle_api_light - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} - ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl - --model_dir=${LITE_MODEL_DIR}/resnet50 SERIAL) - add_dependencies(test_resnet50 extern_lite_download_resnet50_tar_gz) - - lite_cc_test(test_resnet50_fpga SRCS resnet50_test_fpga.cc - DEPS ${lite_model_test_DEPS} - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels}) - - lite_cc_test(test_inceptionv4 SRCS inceptionv4_test.cc - DEPS ${lite_model_test_DEPS} - CL_DEPS ${opencl_kernels} - ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl - --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL) - add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz) - # lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc - # DEPS ${lite_model_test_DEPS}) - - # lite_cc_test(model_run_test_image SRCS 
model_run_test_image.cc - # DEPS ${lite_model_test_DEPS} - # CL_DEPS ${opencl_kernels} - # FPGA_DEPS ${fpga_kernels}) -endif() - -# These tests needs CLI arguments, and is not supported in ARM CI. -# TODO(Superjomn) support latter. -lite_cc_test(test_light_api SRCS light_api_test.cc - DEPS light_api program mir_passes - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} - ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) - -lite_cc_test(test_apis SRCS apis_test.cc - DEPS cxx_api light_api ${ops} - CL_DEPS ${opencl_kernels} - X86_DEPS ${x86_kernels} - FPGA_DEPS ${fpga_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model - --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) - -lite_cc_library(paddle_api SRCS paddle_api.cc DEPS op_params tensor) - -#----------------------------------------------------------------------------------------------------- -# The final inference library for both CxxConfig and MobileConfig. -if (LITE_ON_TINY_PUBLISH) - lite_cc_library(paddle_api_light SRCS light_api_impl.cc DEPS light_api paddle_api stream) -else() - lite_cc_library(paddle_api_light SRCS light_api_impl.cc DEPS light_api paddle_api) -endif() -if (NOT LITE_ON_TINY_PUBLISH) - lite_cc_library(paddle_api_full SRCS cxx_api_impl.cc DEPS cxx_api paddle_api_light - ${ops} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels}) - # The final inference library for just MobileConfig. - bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api) -endif() -bundle_static_library(paddle_api_light paddle_api_light_bundled bundle_light_api) -#----------------------------------------------------------------------------------------------------- - -if (LITE_WITH_JAVA AND LITE_WITH_ARM) - add_subdirectory(android) -endif() - -if (LITE_ON_TINY_PUBLISH) - return() -endif() - -if (LITE_ON_MODEL_OPTIMIZE_TOOL) - message(STATUS "Compiling model_optimize_tool") - lite_cc_binary(model_optimize_tool SRCS model_optimize_tool.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc - DEPS gflags kernel op optimizer mir_passes utils) - add_dependencies(model_optimize_tool op_list_h kernel_list_h all_kernel_faked_cc) -endif(LITE_ON_MODEL_OPTIMIZE_TOOL) - -lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle_api_light - ${ops} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} - CL_DEPS ${opencl_kernels} - X86_DEPS ${x86_kernels} - FPGA_DEPS ${fpga_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL) -if (WITH_TESTING) - add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz) -endif() - -# Some bins -if(NOT IOS) - lite_cc_binary(test_model_bin SRCS model_test.cc DEPS paddle_api_full paddle_api_light gflags utils - ${ops} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} - X86_DEPS ${x86_kernels}) - lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils - ${ops} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} - X86_DEPS ${x86_kernels}) -endif() - -#lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc - #X86_DEPS operator - #DEPS light_api model_parser target_wrapper_host mir_passes - #ARM_DEPS ${arm_kernels}) NPU_DEPS ${npu_kernels}) diff --git a/lite/api/_paddle_use_kernels.h b/lite/api/_paddle_use_kernels.h deleted file mode 100644 index 75756736f4..0000000000 --- a/lite/api/_paddle_use_kernels.h +++ /dev/null 
@@ -1,209 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* - * ATTENTION this header file can only include in .cc file. - */ - -#pragma once -#include "paddle_lite_factory_helper.h" // NOLINT -#ifndef LITE_WITH_FPGA -USE_LITE_KERNEL(feed, kHost, kAny, kAny, def); -USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def); -USE_LITE_KERNEL(flatten, kHost, kAny, kAny, def); -USE_LITE_KERNEL(flatten2, kHost, kAny, kAny, def); -#else -USE_LITE_KERNEL(feed, kFPGA, kFP16, kNHWC, def); -USE_LITE_KERNEL(fetch, kFPGA, kFP16, kNHWC, def); -#endif - -// host kernels -USE_LITE_KERNEL(reshape, kHost, kAny, kAny, def); -USE_LITE_KERNEL(reshape2, kHost, kAny, kAny, def); -USE_LITE_KERNEL(multiclass_nms, kHost, kFloat, kNCHW, def); - -#ifdef LITE_WITH_ARM -USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(matmul, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(lrn, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(decode_bboxes, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(box_coder, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(elementwise_sub, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(elementwise_mul, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(elementwise_max, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(elementwise_div, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(fusion_elementwise_div_activation, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(fusion_elementwise_add_activation, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(fusion_elementwise_mul_activation, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(fusion_elementwise_max_activation, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(relu6, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(power, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(shuffle_channel, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(yolo_box, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(argmax, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(axpy, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(leaky_relu, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(relu_clipped, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(prelu, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(sigmoid, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(tanh, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(swish, kARM, kFloat, kNCHW, def); 
-USE_LITE_KERNEL(log, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(exp, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(conv2d_transpose, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(pad2d, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(prior_box, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(density_prior_box, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(negative, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(crop, kARM, kFloat, kNCHW, def); - -USE_LITE_KERNEL(norm, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(sequence_softmax, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(im2sequence, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(bilinear_interp, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(nearest_interp, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(logical_xor, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(logical_and, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(less_than, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(top_k, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(increment, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(write_to_array, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(read_from_array, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(reduce_max, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(sequence_expand, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(sequence_pool, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(shape, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(fill_constant, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(cast, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(affine_channel, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(anchor_generator, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(generate_proposals, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(squeeze, kARM, kFloat, kNCHW, def) // for x2paddle -USE_LITE_KERNEL(squeeze2, kARM, kFloat, kNCHW, def) // for x2paddle -USE_LITE_KERNEL(expand, kARM, kFloat, kNCHW, def) // for x2paddle -USE_LITE_KERNEL(roi_align, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(box_clip, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(reduce_mean, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(stack, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(assign_value, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(hard_sigmoid, kARM, kFloat, kNCHW, def) - -USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8); -USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32); -USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, fp32_to_int8); -USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, int8_to_fp32); -USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, int8_out); -USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, fp32_out); -USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, int8out); -USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, fp32out); -USE_LITE_KERNEL(gru_unit, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(gru, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(beam_search_decode, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(beam_search, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(while, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(lod_reset, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(lookup_table, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(is_empty, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(assign, kARM, kFloat, kNCHW, def); -#endif - -#ifdef LITE_WITH_X86 -// NOTE all the X86 kernels are disabled temporarily for kernel are changed. 
-// USE_LITE_KERNEL(relu, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(scale, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(slice, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(fill_constant, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(square, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(elementwise_sub, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(softmax, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(dropout, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(concat, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(conv2d, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(depthwise_conv2d, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW, def); -#endif - -#ifdef LITE_WITH_CUDA -USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def); -USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device); -USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, device_to_host); -USE_LITE_KERNEL(io_copy_once, kCUDA, kAny, kAny, host_to_device); -USE_LITE_KERNEL(io_copy_once, kCUDA, kAny, kAny, device_to_host); -USE_LITE_KERNEL(conv2d, kCUDA, kFloat, kNCHW, def); -USE_LITE_KERNEL(leaky_relu, kCUDA, kFloat, kNCHW, def); -USE_LITE_KERNEL(nearest_interp, kCUDA, kFloat, kNCHW, def); -USE_LITE_KERNEL(yolo_box, kCUDA, kFloat, kNCHW, def); -USE_LITE_KERNEL(concat, kCUDA, kFloat, kNCHW, def); -#endif - -#ifdef LITE_WITH_OPENCL -USE_LITE_KERNEL(io_copy, kOpenCL, kAny, kAny, host_to_device); -USE_LITE_KERNEL(io_copy, kOpenCL, kAny, kAny, device_to_host); -USE_LITE_KERNEL(io_copy_once, kOpenCL, kAny, kAny, host_to_device); -USE_LITE_KERNEL(io_copy_once, kOpenCL, kAny, kAny, device_to_host); - -USE_LITE_KERNEL(fc, kOpenCL, kFloat, kNCHW, def); -USE_LITE_KERNEL(mul, kOpenCL, kFloat, kNCHW, def); -USE_LITE_KERNEL(elementwise_add, kOpenCL, kFloat, kNCHW, def); -USE_LITE_KERNEL(fusion_elementwise_add_activation, kOpenCL, kFloat, kNCHW, def); -USE_LITE_KERNEL(pool2d, kOpenCL, kFloat, kNCHW, def); -USE_LITE_KERNEL(relu, kOpenCL, kFloat, kNCHW, def); -USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFloat, kNCHW, def); -USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kNCHW, def); -#endif - -#ifdef LITE_WITH_NPU -USE_LITE_KERNEL(graph_op, kNPU, kFloat, kNCHW, def); -#endif -#ifdef LITE_WITH_FPGA -USE_LITE_KERNEL(relu, kFPGA, kFP16, kNHWC, def); -USE_LITE_KERNEL(conv2d, kFPGA, kFP16, kNHWC, def); -USE_LITE_KERNEL(elementwise_add, kFPGA, kFP16, kNHWC, def); -USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def); -USE_LITE_KERNEL(fc, kFPGA, kFP16, kNHWC, def); -USE_LITE_KERNEL(pool2d, kFPGA, kFP16, kNHWC, def); -USE_LITE_KERNEL(scale, kFPGA, kFP16, kNHWC, def); -USE_LITE_KERNEL(softmax, kFPGA, kFP16, kNHWC, def); -USE_LITE_KERNEL(io_copy, kFPGA, kAny, kAny, host_to_device); -USE_LITE_KERNEL(io_copy, kFPGA, kAny, kAny, device_to_host); -USE_LITE_KERNEL(io_copy_once, kFPGA, kAny, kAny, host_to_device_once); -USE_LITE_KERNEL(io_copy_once, kFPGA, kAny, kAny, device_to_host_once); -USE_LITE_KERNEL(calib, kFPGA, kFP16, kNHWC, fp32_to_fp16_fpga); -USE_LITE_KERNEL(calib, kFPGA, kFP16, kNHWC, fp16_to_fp32_fpga); -USE_LITE_KERNEL(calib_once, kFPGA, kFP16, kNHWC, fp32_to_fp16_fpga); -USE_LITE_KERNEL(calib_once, kFPGA, kFP16, kNHWC, fp16_to_fp32_fpga); -USE_LITE_KERNEL(layout, kFPGA, kAny, kNHWC, hwc_to_chw_fpga_fp16); -USE_LITE_KERNEL(layout, kFPGA, kAny, kNHWC, chw_to_hwc_fpga_fp16); 
-USE_LITE_KERNEL(layout_once, kFPGA, kAny, kNHWC, hwc_to_chw_fpga_fp16); -USE_LITE_KERNEL(layout_once, kFPGA, kAny, kNHWC, chw_to_hwc_fpga_fp16); -#endif diff --git a/lite/api/_paddle_use_ops.h b/lite/api/_paddle_use_ops.h deleted file mode 100644 index 890c57c4aa..0000000000 --- a/lite/api/_paddle_use_ops.h +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -// ATTENTION This can only include in a .cc file. - -#include "paddle_lite_factory_helper.h" // NOLINT - -USE_LITE_OP(mul); -USE_LITE_OP(matmul); -USE_LITE_OP(fc); -USE_LITE_OP(relu); -USE_LITE_OP(relu6); -USE_LITE_OP(scale); -USE_LITE_OP(feed); -USE_LITE_OP(lrn); -USE_LITE_OP(decode_bboxes); -USE_LITE_OP(box_coder); -USE_LITE_OP(fetch); -USE_LITE_OP(io_copy); -USE_LITE_OP(io_copy_once); -USE_LITE_OP(elementwise_add) -USE_LITE_OP(elementwise_sub) -USE_LITE_OP(elementwise_mul) -USE_LITE_OP(elementwise_max) -USE_LITE_OP(elementwise_div) -USE_LITE_OP(fusion_elementwise_add_activation) -USE_LITE_OP(fusion_elementwise_mul_activation) -USE_LITE_OP(fusion_elementwise_max_activation) -USE_LITE_OP(fusion_elementwise_div_activation) -USE_LITE_OP(square) -USE_LITE_OP(softmax) -USE_LITE_OP(dropout) -USE_LITE_OP(concat) -USE_LITE_OP(conv2d) -USE_LITE_OP(depthwise_conv2d) -USE_LITE_OP(pool2d) -USE_LITE_OP(batch_norm) -USE_LITE_OP(fusion_elementwise_sub_activation) -USE_LITE_OP(transpose) -USE_LITE_OP(transpose2) -USE_LITE_OP(arg_max) -USE_LITE_OP(axpy) -USE_LITE_OP(leaky_relu) -USE_LITE_OP(relu_clipped) -USE_LITE_OP(prelu) -USE_LITE_OP(sigmoid) -USE_LITE_OP(tanh) -USE_LITE_OP(swish) -USE_LITE_OP(log) -USE_LITE_OP(exp) -USE_LITE_OP(conv2d_transpose) -USE_LITE_OP(negative) -USE_LITE_OP(pad2d) -USE_LITE_OP(power) -USE_LITE_OP(shuffle_channel) -USE_LITE_OP(yolo_box) -USE_LITE_OP(bilinear_interp) -USE_LITE_OP(nearest_interp) -USE_LITE_OP(reduce_mean) -USE_LITE_OP(stack) - -USE_LITE_OP(assign); -USE_LITE_OP(crop) -USE_LITE_OP(prior_box) -USE_LITE_OP(density_prior_box) -USE_LITE_OP(reshape) -USE_LITE_OP(reshape2) -USE_LITE_OP(flatten) -USE_LITE_OP(flatten2) -USE_LITE_OP(split) -USE_LITE_OP(fake_quantize_moving_average_abs_max); -USE_LITE_OP(fake_dequantize_max_abs); -USE_LITE_OP(fake_quantize_range_abs_max); -USE_LITE_OP(calib); -USE_LITE_OP(calib_once); -USE_LITE_OP(norm); -USE_LITE_OP(layout); -USE_LITE_OP(layout_once); -USE_LITE_OP(im2sequence); -USE_LITE_OP(sequence_softmax); -USE_LITE_OP(logical_xor); -USE_LITE_OP(logical_and); -USE_LITE_OP(less_than); -USE_LITE_OP(top_k); -USE_LITE_OP(increment); -USE_LITE_OP(write_to_array); -USE_LITE_OP(read_from_array); -USE_LITE_OP(gru_unit) -USE_LITE_OP(gru) -USE_LITE_OP(beam_search_decode) -USE_LITE_OP(beam_search) -USE_LITE_OP(fill_constant) -USE_LITE_OP(while) -USE_LITE_OP(lod_reset) -USE_LITE_OP(lookup_table) -USE_LITE_OP(multiclass_nms) -USE_LITE_OP(graph_op) -USE_LITE_OP(sequence_expand) -USE_LITE_OP(sequence_pool) -USE_LITE_OP(reduce_max) -USE_LITE_OP(is_empty) -USE_LITE_OP(shape) 
-USE_LITE_OP(slice) -USE_LITE_OP(cast) -USE_LITE_OP(affine_channel) -USE_LITE_OP(anchor_generator) -USE_LITE_OP(generate_proposals) -USE_LITE_OP(squeeze) // for x2paddle -USE_LITE_OP(squeeze2) // for x2paddle -USE_LITE_OP(expand) // for x2paddle -USE_LITE_OP(roi_align) -USE_LITE_OP(box_clip) -USE_LITE_OP(assign_value) -USE_LITE_OP(hard_sigmoid) diff --git a/lite/api/android/.gitignore b/lite/api/android/.gitignore deleted file mode 100644 index a1d6334395..0000000000 --- a/lite/api/android/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -/bin/ -.classpath diff --git a/lite/api/android/CMakeLists.txt b/lite/api/android/CMakeLists.txt deleted file mode 100644 index 7f31f7e947..0000000000 --- a/lite/api/android/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -if ((NOT LITE_WITH_JAVA) OR (NOT LITE_WITH_ARM)) - return() -endif() - -add_subdirectory(jni) diff --git a/lite/api/android/jni/.gitignore b/lite/api/android/jni/.gitignore deleted file mode 100644 index 1299d2738c..0000000000 --- a/lite/api/android/jni/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -/PaddleListTest.class -/PaddleLite.class -/bin/ diff --git a/lite/api/android/jni/CMakeLists.txt b/lite/api/android/jni/CMakeLists.txt deleted file mode 100644 index b2f5671a7b..0000000000 --- a/lite/api/android/jni/CMakeLists.txt +++ /dev/null @@ -1,52 +0,0 @@ -if ((NOT LITE_WITH_ARM) OR (NOT LITE_WITH_JAVA)) - return() -endif() - -include(UseJava) -find_package(Java REQUIRED) - -# We are only interested in finding jni.h: we do not care about extended JVM -# functionality or the AWT library. -set(JAVA_AWT_LIBRARY NotNeeded) -set(JAVA_JVM_LIBRARY NotNeeded) -set(JAVA_INCLUDE_PATH2 NotNeeded) -set(JAVA_AWT_INCLUDE_PATH NotNeeded) -find_package(JNI REQUIRED) - -# Generate PaddlePredictor.jar -include_directories(${JNI_INCLUDE_DIRS}) -add_jar(PaddlePredictor - src/com/baidu/paddle/lite/ConfigBase.java - src/com/baidu/paddle/lite/CxxConfig.java - src/com/baidu/paddle/lite/MobileConfig.java - src/com/baidu/paddle/lite/PaddleLiteInitializer.java - src/com/baidu/paddle/lite/PaddlePredictor.java - src/com/baidu/paddle/lite/PowerMode.java - src/com/baidu/paddle/lite/Place.java - src/com/baidu/paddle/lite/Tensor.java) -get_target_property(_jarFile PaddlePredictor JAR_FILE) -get_target_property(_classDir PaddlePredictor CLASSDIR) -set(_stubDir "${CMAKE_CURRENT_BINARY_DIR}") - -# Generate native headers -add_custom_target( - paddle_lite_jni_header ALL - COMMAND ${Java_JAVAH_EXECUTABLE} -verbose - -classpath ${_classDir} - -o "${CMAKE_BINARY_DIR}/lite/api/android/jni/native/paddle_lite_jni.h" - -jni - com.baidu.paddle.lite.PaddlePredictor - COMMAND ${Java_JAVAH_EXECUTABLE} -verbose - -classpath ${_classDir} - -o "${CMAKE_BINARY_DIR}/lite/api/android/jni/native/tensor_jni.h" - -jni - com.baidu.paddle.lite.Tensor - COMMAND ${Java_JAVAH_EXECUTABLE} -verbose - -classpath ${_classDir} - -o "${CMAKE_BINARY_DIR}/lite/api/android/jni/native/paddle_init_jni.h" - -jni - com.baidu.paddle.lite.PaddleLiteInitializer - DEPENDS PaddlePredictor -) - -add_subdirectory(native) diff --git a/lite/api/android/jni/native/CMakeLists.txt b/lite/api/android/jni/native/CMakeLists.txt deleted file mode 100644 index afe051a437..0000000000 --- a/lite/api/android/jni/native/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -# Generate paddle_lite_jni.so - -if (LITE_ON_TINY_PUBLISH) - set(CMAKE_CXX_FLAGS_RELEASE "-Os -DNDEBUG") - set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG") - set(lib_DEPS light_api paddle_api paddle_api_light) -else() - set(lib_DEPS light_api cxx_api paddle_api_full paddle_api 
paddle_api_light) -endif() - -include_directories(${JNI_INCLUDE_DIRS} ${_classDir} ${_stubDir}) -if (NOT LITE_ON_TINY_PUBLISH) - lite_cc_library(paddle_lite_jni MODULE - SRCS paddle_lite_jni.cc tensor_jni.cc - DEPS ${lib_DEPS} - ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels}) - # Unlike static library, module library has to link target to be able to work - # as a single .so lib. - target_link_libraries(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels}) -else() - add_library(paddle_lite_jni SHARED "") - target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc) - add_dependencies(paddle_lite_jni op_list_h kernel_list_h) -endif() - -if (APPLE) - # MacOS only accepts JNI lib ends with .jnilib or .dylib - set_target_properties(paddle_lite_jni PROPERTIES SUFFIX ".jnilib") -elseif (WIN32) - # Windows only accepts JNI lib ends with .dll - set_target_properties(paddle_lite_jni PROPERTIES SUFFIX ".dll") -endif (APPLE) diff --git a/lite/api/android/jni/native/convert_util_jni.h b/lite/api/android/jni/native/convert_util_jni.h deleted file mode 100644 index ae987c330d..0000000000 --- a/lite/api/android/jni/native/convert_util_jni.h +++ /dev/null @@ -1,197 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "lite/api/light_api.h" -#include "lite/api/paddle_api.h" -#include "lite/api/paddle_place.h" - -namespace paddle { -namespace lite_api { - -inline std::string jstring_to_cpp_string(JNIEnv *env, jstring jstr) { - // In java, a unicode char will be encoded using 2 bytes (utf16). - // so jstring will contain characters utf16. std::string in c++ is - // essentially a string of bytes, not characters, so if we want to - // pass jstring from JNI to c++, we have convert utf16 to bytes. 
- if (!jstr) { - return ""; - } - const jclass stringClass = env->GetObjectClass(jstr); - const jmethodID getBytes = - env->GetMethodID(stringClass, "getBytes", "(Ljava/lang/String;)[B"); - const jbyteArray stringJbytes = (jbyteArray)env->CallObjectMethod( - jstr, getBytes, env->NewStringUTF("UTF-8")); - - size_t length = (size_t)env->GetArrayLength(stringJbytes); - jbyte *pBytes = env->GetByteArrayElements(stringJbytes, NULL); - - std::string ret = std::string(reinterpret_cast(pBytes), length); - env->ReleaseByteArrayElements(stringJbytes, pBytes, JNI_ABORT); - - env->DeleteLocalRef(stringJbytes); - env->DeleteLocalRef(stringClass); - return ret; -} - -inline jfloatArray cpp_array_to_jfloatarray(JNIEnv *env, - const float *buf, - int64_t len) { - jfloatArray result = env->NewFloatArray(len); - env->SetFloatArrayRegion(result, 0, len, buf); - return result; -} - -inline jintArray cpp_array_to_jintarray(JNIEnv *env, - const int *buf, - int64_t len) { - jintArray result = env->NewIntArray(len); - env->SetIntArrayRegion(result, 0, len, buf); - return result; -} - -inline jbyteArray cpp_array_to_jbytearray(JNIEnv *env, - const int8_t *buf, - int64_t len) { - jbyteArray result = env->NewByteArray(len); - env->SetByteArrayRegion(result, 0, len, buf); - return result; -} - -inline jlongArray int64_vector_to_jlongarray(JNIEnv *env, - const std::vector &vec) { - jlongArray result = env->NewLongArray(vec.size()); - jlong *buf = new jlong[vec.size()]; - for (size_t i = 0; i < vec.size(); ++i) { - buf[i] = (jlong)vec[i]; - } - env->SetLongArrayRegion(result, 0, vec.size(), buf); - delete[] buf; - return result; -} - -inline std::vector jlongarray_to_int64_vector(JNIEnv *env, - jlongArray dims) { - int dim_size = env->GetArrayLength(dims); - jlong *dim_nums = env->GetLongArrayElements(dims, nullptr); - std::vector dim_vec(dim_nums, dim_nums + dim_size); - env->ReleaseLongArrayElements(dims, dim_nums, 0); - return dim_vec; -} - -/** - * Converts Java com.baidu.paddle.lite.Place to c++ paddle::lite_api::Place. 
- */ -inline Place jplace_to_cpp_place(JNIEnv *env, jobject java_place) { - jclass place_jclazz = env->GetObjectClass(java_place); - - jmethodID target_method = - env->GetMethodID(place_jclazz, "getTargetInt", "()I"); - jmethodID precision_method = - env->GetMethodID(place_jclazz, "getPrecisionInt", "()I"); - jmethodID data_layout_method = - env->GetMethodID(place_jclazz, "getDataLayoutInt", "()I"); - jmethodID device_method = env->GetMethodID(place_jclazz, "getDevice", "()I"); - - int target = env->CallIntMethod(java_place, target_method); - int precision = env->CallIntMethod(java_place, precision_method); - int data_layout = env->CallIntMethod(java_place, data_layout_method); - int device = env->CallIntMethod(java_place, device_method); - - return Place(static_cast(target), - static_cast(precision), - static_cast(data_layout), - device); -} - -inline CxxConfig jcxxconfig_to_cpp_cxxconfig(JNIEnv *env, jobject jcxxconfig) { - jclass cxxconfig_jclazz = env->GetObjectClass(jcxxconfig); - - jmethodID model_dir_method = - env->GetMethodID(cxxconfig_jclazz, "getModelDir", "()Ljava/lang/String;"); - jmethodID preferred_place_method = env->GetMethodID( - cxxconfig_jclazz, "getPreferredPlace", "()Lcom/baidu/paddle/lite/Place;"); - jmethodID valid_places_method = env->GetMethodID( - cxxconfig_jclazz, "getValidPlaces", "()[Lcom/baidu/paddle/lite/Place;"); - - CxxConfig config; - - jstring java_model_dir = - (jstring)env->CallObjectMethod(jcxxconfig, model_dir_method); - if (java_model_dir != nullptr) { - std::string cpp_model_dir = jstring_to_cpp_string(env, java_model_dir); - config.set_model_dir(cpp_model_dir); - } - - jobject java_preferred_place = - env->CallObjectMethod(jcxxconfig, preferred_place_method); - if (java_preferred_place != nullptr) { - Place cpp_preferred_place = jplace_to_cpp_place(env, java_preferred_place); - config.set_preferred_place(cpp_preferred_place); - } - - jobject object_valid_places = - env->CallObjectMethod(jcxxconfig, valid_places_method); - jobjectArray *java_valid_places = - reinterpret_cast(&object_valid_places); - if (java_valid_places != nullptr) { - int valid_place_count = env->GetArrayLength(*java_valid_places); - std::vector cpp_valid_places; - for (int i = 0; i < valid_place_count; ++i) { - jobject jplace = env->GetObjectArrayElement(*java_valid_places, i); - cpp_valid_places.push_back(jplace_to_cpp_place(env, jplace)); - } - config.set_valid_places(cpp_valid_places); - } - - return config; -} - -inline MobileConfig jmobileconfig_to_cpp_mobileconfig(JNIEnv *env, - jobject jmobileconfig) { - jclass mobileconfig_jclazz = env->GetObjectClass(jmobileconfig); - - MobileConfig config; - - // set model dir - jmethodID model_dir_method = env->GetMethodID( - mobileconfig_jclazz, "getModelDir", "()Ljava/lang/String;"); - jstring java_model_dir = - (jstring)env->CallObjectMethod(jmobileconfig, model_dir_method); - if (java_model_dir != nullptr) { - std::string cpp_model_dir = jstring_to_cpp_string(env, java_model_dir); - config.set_model_dir(cpp_model_dir); - } - - // set threads - jmethodID threads_method = - env->GetMethodID(mobileconfig_jclazz, "getThreads", "()I"); - int threads = env->CallIntMethod(jmobileconfig, threads_method); - config.set_threads(threads); - - // set power mode - jmethodID power_mode_method = - env->GetMethodID(mobileconfig_jclazz, "getPowerModeInt", "()I"); - int power_mode = env->CallIntMethod(jmobileconfig, power_mode_method); - config.set_power_mode(static_cast(power_mode)); - - return config; -} - -} // namespace lite_api -} // namespace 
paddle diff --git a/lite/api/android/jni/native/paddle_lite_jni.cc b/lite/api/android/jni/native/paddle_lite_jni.cc deleted file mode 100644 index aa4ece6818..0000000000 --- a/lite/api/android/jni/native/paddle_lite_jni.cc +++ /dev/null @@ -1,164 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/api/android/jni/native/paddle_lite_jni.h" - -#include <memory> -#include <string> -#include <utility> -#include <vector> - -#include "lite/api/android/jni/native/convert_util_jni.h" -#include "lite/api/light_api.h" -#include "lite/api/paddle_api.h" - -#ifdef __cplusplus -extern "C" { -#endif - -namespace paddle { -namespace lite_api { - -inline static std::shared_ptr<PaddlePredictor> *getPaddlePredictorPointer( - JNIEnv *env, jobject jpaddle_predictor) { - jclass jclazz = env->GetObjectClass(jpaddle_predictor); - jfieldID jfield = env->GetFieldID(jclazz, "cppPaddlePredictorPointer", "J"); - jlong java_pointer = env->GetLongField(jpaddle_predictor, jfield); - std::shared_ptr<PaddlePredictor> *ptr = - reinterpret_cast<std::shared_ptr<PaddlePredictor> *>(java_pointer); - return ptr; -} - -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_PaddlePredictor_run( - JNIEnv *env, jobject jpaddle_predictor) { - std::shared_ptr<PaddlePredictor> *predictor = - getPaddlePredictorPointer(env, jpaddle_predictor); - if (predictor == nullptr || (*predictor == nullptr)) { - return JNI_FALSE; - } - (*predictor)->Run(); - return JNI_TRUE; -} - -JNIEXPORT jboolean JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_saveOptimizedModel( - JNIEnv *env, jobject jpaddle_predictor, jstring model_dir) { - std::shared_ptr<PaddlePredictor> *predictor = - getPaddlePredictorPointer(env, jpaddle_predictor); - if (predictor == nullptr || (*predictor == nullptr)) { - return JNI_FALSE; - } - (*predictor)->SaveOptimizedModel(jstring_to_cpp_string(env, model_dir)); - return JNI_TRUE; -} - -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_getInputCppTensorPointer( - JNIEnv *env, jobject jpaddle_predictor, jint offset) { - std::shared_ptr<PaddlePredictor> *predictor = - getPaddlePredictorPointer(env, jpaddle_predictor); - if (predictor == nullptr || (*predictor == nullptr)) { - return 0; - } - std::unique_ptr<Tensor> tensor = - (*predictor)->GetInput(static_cast<int>(offset)); - std::unique_ptr<Tensor> *cpp_tensor_pointer = - new std::unique_ptr<Tensor>(std::move(tensor)); - return reinterpret_cast<jlong>(cpp_tensor_pointer); -} - -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_getOutputCppTensorPointer( - JNIEnv *env, jobject jpaddle_predictor, jint offset) { - std::shared_ptr<PaddlePredictor> *predictor = - getPaddlePredictorPointer(env, jpaddle_predictor); - if (predictor == nullptr || (*predictor == nullptr)) { - return 0; - } - std::unique_ptr<const Tensor> tensor = - (*predictor)->GetOutput(static_cast<int>(offset)); - std::unique_ptr<const Tensor> *cpp_tensor_pointer = - new std::unique_ptr<const Tensor>(std::move(tensor)); - return reinterpret_cast<jlong>(cpp_tensor_pointer); -} - -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_getCppTensorPointerByName( - JNIEnv *env, jobject jpaddle_predictor, jstring name) { - std::string cpp_name = jstring_to_cpp_string(env, name); - std::shared_ptr<PaddlePredictor> *predictor = - getPaddlePredictorPointer(env, jpaddle_predictor); - if (predictor == nullptr || (*predictor == nullptr)) { - return 0; - } - std::unique_ptr<const Tensor> tensor = (*predictor)->GetTensor(cpp_name); - std::unique_ptr<const Tensor> *cpp_tensor_pointer = - new std::unique_ptr<const Tensor>(std::move(tensor)); - return reinterpret_cast<jlong>(cpp_tensor_pointer); -} - -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_newCppPaddlePredictor__Lcom_baidu_\ paddle_lite_CxxConfig_2(JNIEnv *env, - jobject jpaddle_predictor, - jobject jcxxconfig) { -#ifndef LITE_ON_TINY_PUBLISH - CxxConfig config = jcxxconfig_to_cpp_cxxconfig(env, jcxxconfig); - std::shared_ptr<PaddlePredictor> predictor = - paddle::lite_api::CreatePaddlePredictor(config); - if (predictor == nullptr) { - return 0; - } - std::shared_ptr<PaddlePredictor> *predictor_pointer = - new std::shared_ptr<PaddlePredictor>(predictor); - return reinterpret_cast<jlong>(predictor_pointer); -#else - return 0; -#endif -} - -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_newCppPaddlePredictor__Lcom_baidu_\ paddle_lite_MobileConfig_2(JNIEnv *env, - jobject jpaddle_predictor, - jobject jmobileconfig) { - MobileConfig config = jmobileconfig_to_cpp_mobileconfig(env, jmobileconfig); - std::shared_ptr<PaddlePredictor> predictor = - paddle::lite_api::CreatePaddlePredictor(config); - if (predictor == nullptr) { - return 0; - } - std::shared_ptr<PaddlePredictor> *predictor_pointer = - new std::shared_ptr<PaddlePredictor>(predictor); - return reinterpret_cast<jlong>(predictor_pointer); -} - -JNIEXPORT jboolean JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_deleteCppPaddlePredictor( - JNIEnv *env, jobject jpaddle_predictor, jlong java_pointer) { - if (java_pointer == 0) { - return JNI_FALSE; - } - std::shared_ptr<PaddlePredictor> *ptr = - reinterpret_cast<std::shared_ptr<PaddlePredictor> *>(java_pointer); - ptr->reset(); - delete ptr; - return JNI_TRUE; -} - -} // namespace lite_api -} // namespace paddle - -#ifdef __cplusplus -} -#endif diff --git a/lite/api/android/jni/native/paddle_lite_jni.h b/lite/api/android/jni/native/paddle_lite_jni.h deleted file mode 100644 index 913e9a4c3a..0000000000 --- a/lite/api/android/jni/native/paddle_lite_jni.h +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
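All of the entry points in paddle_lite_jni.cc above share one ownership convention: the C++ smart pointer itself is heap-allocated, and its address travels to Java as a jlong, so the predictor stays alive until deleteCppPaddlePredictor frees the holder. Below is a minimal standalone sketch of that handle round-trip; PaddlePredictor here is a stand-in struct, not the real class, and the JNI environment is elided so the sketch compiles on its own:

#include <cstdint>
#include <memory>

struct PaddlePredictor { void Run() {} };  // stand-in for the real predictor

int main() {
  // "new": heap-allocate the shared_ptr itself; its address becomes the jlong
  // stored in the Java object's cppPaddlePredictorPointer field.
  auto *holder =
      new std::shared_ptr<PaddlePredictor>(std::make_shared<PaddlePredictor>());
  int64_t java_handle = reinterpret_cast<int64_t>(holder);

  // "run": recover the shared_ptr from the handle and use it.
  auto *ptr = reinterpret_cast<std::shared_ptr<PaddlePredictor> *>(java_handle);
  if (ptr && *ptr) (*ptr)->Run();

  // "delete": release the shared_ptr; the predictor dies with its last owner.
  ptr->reset();
  delete ptr;
  return 0;
}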
- -#pragma once -/* DO NOT EDIT THIS FILE - it is machine generated */ -#include -/* Header for class com_baidu_paddle_lite_PaddlePredictor */ -#include "lite/api/paddle_lite_factory_helper.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#ifndef LITE_ON_TINY_PUBLISH -#include "lite/api/paddle_use_passes.h" -#endif -#ifdef __cplusplus -extern "C" { -#endif - -namespace paddle { -namespace lite_api { - -/* - * Class: com_baidu_paddle_lite_PaddlePredictor - * Method: run - * Signature: ()Z - */ -JNIEXPORT jboolean JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_run(JNIEnv *, jobject); - -/* - * Class: com_baidu_paddle_lite_PaddlePredictor - * Method: saveOptimizedModel - * Signature: (Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_saveOptimizedModel(JNIEnv *, - jobject, - jstring); - -/* - * Class: com_baidu_paddle_lite_PaddlePredictor - * Method: getInputCppTensorPointer - * Signature: (I)J - */ -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_getInputCppTensorPointer(JNIEnv *, - jobject, - jint); - -/* - * Class: com_baidu_paddle_lite_PaddlePredictor - * Method: getOutputCppTensorPointer - * Signature: (I)J - */ -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_getOutputCppTensorPointer(JNIEnv *, - jobject, - jint); - -/* - * Class: com_baidu_paddle_lite_PaddlePredictor - * Method: getCppTensorPointerByName - * Signature: (Ljava/lang/String;)J - */ -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_getCppTensorPointerByName(JNIEnv *, - jobject, - jstring); - -/* - * Class: com_baidu_paddle_lite_PaddlePredictor - * Method: newCppPaddlePredictor - * Signature: (Lcom/baidu/paddle/lite/CxxConfig;)J - */ -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_newCppPaddlePredictor__Lcom_baidu_\ -paddle_lite_CxxConfig_2(JNIEnv *, jobject, jobject); - -/* - * Class: com_baidu_paddle_lite_PaddlePredictor - * Method: newCppPaddlePredictor - * Signature: (Lcom/baidu/paddle/lite/MobileConfig;)J - */ -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_newCppPaddlePredictor__Lcom_baidu_\ -paddle_lite_MobileConfig_2(JNIEnv *, jobject, jobject); - -/* - * Class: com_baidu_paddle_lite_PaddlePredictor - * Method: deleteCppPaddlePredictor - * Signature: (J)Z - */ -JNIEXPORT jboolean JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_deleteCppPaddlePredictor(JNIEnv *, - jobject, - jlong); - -} // namespace lite_api -} // namespace paddle - -#ifdef __cplusplus -} -#endif diff --git a/lite/api/android/jni/native/tensor_jni.cc b/lite/api/android/jni/native/tensor_jni.cc deleted file mode 100644 index 59cafa1939..0000000000 --- a/lite/api/android/jni/native/tensor_jni.cc +++ /dev/null @@ -1,168 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "lite/api/android/jni/native/tensor_jni.h" - -#include -#include - -#include "lite/api/android/jni/native/convert_util_jni.h" - -#ifdef __cplusplus -extern "C" { -#endif - -namespace paddle { -namespace lite_api { - -inline static int64_t product(const std::vector &vec) { - if (vec.empty()) { - return 0; - } - int64_t result = 1; - for (int64_t d : vec) { - result *= d; - } - return result; -} - -inline static bool is_const_tensor(JNIEnv *env, jobject jtensor) { - jclass jclazz = env->GetObjectClass(jtensor); - jfieldID jfield = env->GetFieldID(jclazz, "readOnly", "Z"); - jboolean read_only = env->GetBooleanField(jtensor, jfield); - return static_cast(read_only); -} - -inline static std::unique_ptr *get_writable_tensor_pointer( - JNIEnv *env, jobject jtensor) { - jclass jclazz = env->GetObjectClass(jtensor); - jfieldID jfield = env->GetFieldID(jclazz, "cppTensorPointer", "J"); - jlong java_pointer = env->GetLongField(jtensor, jfield); - std::unique_ptr *ptr = - reinterpret_cast *>(java_pointer); - return ptr; -} - -inline static std::unique_ptr *get_read_only_tensor_pointer( - JNIEnv *env, jobject jtensor) { - jclass jclazz = env->GetObjectClass(jtensor); - jfieldID jfield = env->GetFieldID(jclazz, "cppTensorPointer", "J"); - jlong java_pointer = env->GetLongField(jtensor, jfield); - std::unique_ptr *ptr = - reinterpret_cast *>(java_pointer); - return ptr; -} - -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeResize( - JNIEnv *env, jobject jtensor, jlongArray dims) { - std::unique_ptr *tensor = get_writable_tensor_pointer(env, jtensor); - if (tensor == nullptr || (*tensor == nullptr)) { - return JNI_FALSE; - } - std::vector shape = jlongarray_to_int64_vector(env, dims); - (*tensor)->Resize(shape); - return JNI_TRUE; -} - -JNIEXPORT jlongArray JNICALL -Java_com_baidu_paddle_lite_Tensor_shape(JNIEnv *env, jobject jtensor) { - if (is_const_tensor(env, jtensor)) { - std::unique_ptr *tensor = - get_read_only_tensor_pointer(env, jtensor); - std::vector shape = (*tensor)->shape(); - return int64_vector_to_jlongarray(env, shape); - } else { - std::unique_ptr *tensor = get_writable_tensor_pointer(env, jtensor); - std::vector shape = (*tensor)->shape(); - return int64_vector_to_jlongarray(env, shape); - } -} - -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3F( - JNIEnv *env, jobject jtensor, jfloatArray buf) { - std::unique_ptr *tensor = get_writable_tensor_pointer(env, jtensor); - if (tensor == nullptr || (*tensor == nullptr)) { - return JNI_FALSE; - } - int64_t buf_size = (int64_t)env->GetArrayLength(buf); - if (buf_size != product((*tensor)->shape())) { - return JNI_FALSE; - } - - float *input = (*tensor)->mutable_data(); - env->GetFloatArrayRegion(buf, 0, buf_size, input); - return JNI_TRUE; -} - -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3B( - JNIEnv *env, jobject jtensor, jbyteArray buf) { - std::unique_ptr *tensor = get_writable_tensor_pointer(env, jtensor); - if (tensor == nullptr || (*tensor == nullptr)) { - return JNI_FALSE; - } - int64_t buf_size = (int64_t)env->GetArrayLength(buf); - if (buf_size != product((*tensor)->shape())) { - return JNI_FALSE; - } - - int8_t *input = (*tensor)->mutable_data(); - env->GetByteArrayRegion(buf, 0, buf_size, input); - return JNI_TRUE; -} - -JNIEXPORT jfloatArray JNICALL -Java_com_baidu_paddle_lite_Tensor_getFloatData(JNIEnv *env, jobject jtensor) { - if (is_const_tensor(env, jtensor)) { - std::unique_ptr *tensor = - get_read_only_tensor_pointer(env, 
jtensor); - return cpp_array_to_jfloatarray( - env, (*tensor)->data(), product((*tensor)->shape())); - } else { - std::unique_ptr *tensor = get_writable_tensor_pointer(env, jtensor); - return cpp_array_to_jfloatarray( - env, (*tensor)->data(), product((*tensor)->shape())); - } -} - -JNIEXPORT jbyteArray JNICALL -Java_com_baidu_paddle_lite_Tensor_getByteData(JNIEnv *env, jobject jtensor) { - if (is_const_tensor(env, jtensor)) { - std::unique_ptr *tensor = - get_read_only_tensor_pointer(env, jtensor); - return cpp_array_to_jbytearray( - env, (*tensor)->data(), product((*tensor)->shape())); - } else { - std::unique_ptr *tensor = get_writable_tensor_pointer(env, jtensor); - return cpp_array_to_jbytearray( - env, (*tensor)->data(), product((*tensor)->shape())); - } -} - -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_deleteCppTensor( - JNIEnv *env, jobject jtensor, jlong java_pointer) { - if (java_pointer == 0) { - return JNI_FALSE; - } - std::unique_ptr *ptr = - reinterpret_cast *>(java_pointer); - ptr->reset(); - delete ptr; - return JNI_TRUE; -} - -} // namespace lite_api -} // namespace paddle - -#ifdef __cplusplus -} -#endif diff --git a/lite/api/android/jni/native/tensor_jni.h b/lite/api/android/jni/native/tensor_jni.h deleted file mode 100644 index 34c35b6a76..0000000000 --- a/lite/api/android/jni/native/tensor_jni.h +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
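tensor_jni.cc above stores a single jlong per Java Tensor but reads it back in two ways, keyed off the Java readOnly field: writable handles (model inputs) are reinterpreted as std::unique_ptr<Tensor> *, read-only handles (outputs and named tensors) as std::unique_ptr<const Tensor> *. A self-contained sketch of that const-ness dispatch, with a stand-in Tensor type rather than the real one:

#include <cstdint>
#include <memory>

struct Tensor { float value = 0.f; };  // stand-in for paddle::lite_api::Tensor

// Reinterpret the same handle as const or non-const depending on the flag,
// mirroring get_read_only_tensor_pointer / get_writable_tensor_pointer.
float read_or_write(int64_t handle, bool read_only) {
  if (read_only) {
    auto *t = reinterpret_cast<std::unique_ptr<const Tensor> *>(handle);
    return (*t)->value;  // const access only
  }
  auto *t = reinterpret_cast<std::unique_ptr<Tensor> *>(handle);
  (*t)->value = 1.f;     // mutation is allowed on writable handles
  return (*t)->value;
}

int main() {
  auto *holder = new std::unique_ptr<Tensor>(new Tensor);
  float v = read_or_write(reinterpret_cast<int64_t>(holder),
                          /*read_only=*/false);
  holder->reset();  // same teardown order as deleteCppTensor
  delete holder;
  return v == 1.f ? 0 : 1;
}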
- -/* DO NOT EDIT THIS FILE - it is machine generated */ -#include -/* Header for class com_baidu_paddle_lite_Tensor */ - -#ifndef PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_ -#define PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_ -#ifdef __cplusplus -extern "C" { -#endif - -namespace paddle { -namespace lite_api { - -/* - * Class: com_baidu_paddle_lite_Tensor - * Method: shape - * Signature: ()[J - */ -JNIEXPORT jlongArray JNICALL Java_com_baidu_paddle_lite_Tensor_shape(JNIEnv *, - jobject); - -/* - * Class: com_baidu_paddle_lite_Tensor - * Method: getFloatData - * Signature: ()[F - */ -JNIEXPORT jfloatArray JNICALL -Java_com_baidu_paddle_lite_Tensor_getFloatData(JNIEnv *, jobject); - -/* - * Class: com_baidu_paddle_lite_Tensor - * Method: getByteData - * Signature: ()[B - */ -JNIEXPORT jbyteArray JNICALL -Java_com_baidu_paddle_lite_Tensor_getByteData(JNIEnv *, jobject); - -/* - * Class: com_baidu_paddle_lite_Tensor - * Method: nativeResize - * Signature: ([J)Z - */ -JNIEXPORT jboolean JNICALL -Java_com_baidu_paddle_lite_Tensor_nativeResize(JNIEnv *, jobject, jlongArray); - -/* - * Class: com_baidu_paddle_lite_Tensor - * Method: nativeSetData - * Signature: ([F)Z - */ -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3F( - JNIEnv *, jobject, jfloatArray); - -/* - * Class: com_baidu_paddle_lite_Tensor - * Method: nativeSetData - * Signature: ([B)Z - */ -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3B( - JNIEnv *, jobject, jbyteArray); - -/* - * Class: com_baidu_paddle_lite_Tensor - * Method: deleteCppTensor - * Signature: (J)Z - */ -JNIEXPORT jboolean JNICALL -Java_com_baidu_paddle_lite_Tensor_deleteCppTensor(JNIEnv *, jobject, jlong); - -} // namespace lite_api -} // namespace paddle - -#ifdef __cplusplus -} -#endif -#endif // PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_ diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/.gitignore b/lite/api/android/jni/src/com/baidu/paddle/lite/.gitignore deleted file mode 100644 index 870ec275e8..0000000000 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -/PaddleLite.class -/PaddleLiteTest.class diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/ConfigBase.java b/lite/api/android/jni/src/com/baidu/paddle/lite/ConfigBase.java deleted file mode 100644 index 51115b3016..0000000000 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/ConfigBase.java +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -package com.baidu.paddle.lite; - -/** - * Base class for all configurations. 
- */ -public class ConfigBase { - - protected String modelDir; - - public String getModelDir() { - return modelDir; - } - - public void setModelDir(String modelDir) { - this.modelDir = modelDir; - } - -} diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/CxxConfig.java b/lite/api/android/jni/src/com/baidu/paddle/lite/CxxConfig.java deleted file mode 100644 index 906293c92f..0000000000 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/CxxConfig.java +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -package com.baidu.paddle.lite; - -/** - * CxxConfig is the configuration for the Full feature predictor. - */ -public class CxxConfig extends ConfigBase { - - protected Place preferredPlace; - protected Place[] validPlaces; - - public Place getPreferredPlace() { - return preferredPlace; - } - - public void setPreferredPlace(Place preferredPlace) { - this.preferredPlace = preferredPlace; - } - - public Place[] getValidPlaces() { - return validPlaces; - } - - public void setValidPlaces(Place[] validPlaces) { - this.validPlaces = validPlaces; - } -} diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java b/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java deleted file mode 100644 index 5c71db0c92..0000000000 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -package com.baidu.paddle.lite; - -/** - * MobileConfig is the config for the light weight predictor, it will skip IR - * optimization or other unnecessary stages. - */ -public class MobileConfig extends ConfigBase { - - /** - * Set power mode. - * - * @return - */ - public void setPowerMode(PowerMode powerMode) { - this.powerMode = powerMode; - } - - /** - * Returns power mode. - * - * @return power mode - */ - public PowerMode getPowerMode() { - return powerMode; - } - - /** - * Set threads num. - * - * @return - */ - public void setThreads(int threads) { - this.threads = threads; - } - - /** - * Returns threads num. - * - * @return threads num - */ - public int getThreads() { - return threads; - } - - /** - * Returns power mode as enum int value. 
- * - * @return power mode as enum int value - */ - public int getPowerModeInt() { - return powerMode.value(); - } - - private PowerMode powerMode = PowerMode.LITE_POWER_HIGH; - private int threads = 1; -} diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/PaddleLiteInitializer.java b/lite/api/android/jni/src/com/baidu/paddle/lite/PaddleLiteInitializer.java deleted file mode 100644 index 876d7cebd4..0000000000 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/PaddleLiteInitializer.java +++ /dev/null @@ -1,23 +0,0 @@ -package com.baidu.paddle.lite; - -/** - * Initializer for PaddleLite. The initialization methods are called by package - * classes only. Public users don't have to call them. Public users can get - * PaddleLite information constants such as JNI lib name in this class. - */ -public class PaddleLiteInitializer { - - /** name of C++ JNI lib */ - public final static String JNI_LIB_NAME = "paddle_lite_jni"; - - /** - * loads the C++ JNI lib. We only call it in our package, so it shouldn't be - * visible to public users. - * - * @return true if initialize successfully. - */ - protected static boolean init() { - System.loadLibrary(JNI_LIB_NAME); - return true; - } -} diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/PaddlePredictor.java b/lite/api/android/jni/src/com/baidu/paddle/lite/PaddlePredictor.java deleted file mode 100644 index d022fd7d61..0000000000 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/PaddlePredictor.java +++ /dev/null @@ -1,192 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -package com.baidu.paddle.lite; - -/** Java Native Interface (JNI) class for Paddle Lite APIs */ -public class PaddlePredictor { - - /** - * Java doesn't have pointer. To maintain the life cycle of underneath C++ - * PaddlePredictor object, we use a long value to maintain it. - */ - private long cppPaddlePredictorPointer; - - /** - * Constructor of a PaddlePredictor. - * - * @param config the input configuration. - */ - public PaddlePredictor(ConfigBase config) { - init(config); - } - - /** - * Creates a PaddlePredictor object. - * - * @param config the input configuration. - * @return the PaddlePredictor object, or null if failed to create it. - */ - public static PaddlePredictor createPaddlePredictor(ConfigBase config) { - PaddlePredictor predictor = new PaddlePredictor(config); - return predictor.cppPaddlePredictorPointer == 0L ? null : predictor; - } - - /** - * Get offset-th input tensor. - * - * @param offset - * @return the tensor or null if failed to get it. - */ - public Tensor getInput(int offset) { - long cppTensorPointer = getInputCppTensorPointer(offset); - return cppTensorPointer == 0 ? null : new Tensor(cppTensorPointer, /* readOnly = */ false, this); - } - - /** - * Get offset-th output tensor. - * - * @param offset - * @return the tensor or null if failed to get it. 
- */ - public Tensor getOutput(int offset) { - long cppTensorPointer = getOutputCppTensorPointer(offset); - return cppTensorPointer == 0 ? null : new Tensor(cppTensorPointer, /* readOnly = */ true, this); - } - - /** - * Get a tensor by name. - * - * @param name the name of the tensor. - * @return the tensor or null if failed to get it. - */ - public Tensor getTensor(String name) { - long cppTensorPointer = getCppTensorPointerByName(name); - return cppTensorPointer == 0 ? null : new Tensor(cppTensorPointer, /* readOnly = */ true, this); - } - - /** - * Run the PaddlePredictor. - * - * @return true if run successfully. - */ - public native boolean run(); - - /** - * Saves the optimized model. It is available only for {@link CxxConfig} - * - * @param modelDir the path to save the optimized model - * @return true if save successfully. Otherwise returns false. - */ - public native boolean saveOptimizedModel(String modelDir); - - /** - * Deletes C++ PaddlePredictor pointer when Java PaddlePredictor object is - * destroyed - */ - @Override - protected void finalize() throws Throwable { - clear(); - super.finalize(); - } - - /** - * Create a C++ PaddlePredictor object based on configuration - * - * @param config the input configuration - * @return true if create successfully - */ - protected boolean init(ConfigBase config) { - if (config instanceof CxxConfig) { - cppPaddlePredictorPointer = newCppPaddlePredictor((CxxConfig) config); - } else if (config instanceof MobileConfig) { - cppPaddlePredictorPointer = newCppPaddlePredictor((MobileConfig) config); - } else { - throw new IllegalArgumentException("Not supported PaddleLite Config type"); - } - return cppPaddlePredictorPointer != 0L; - } - - /** - * Deletes C++ PaddlePredictor pointer - * - * @return true if deletion success - */ - protected boolean clear() { - boolean result = false; - if (cppPaddlePredictorPointer != 0L) { - result = deleteCppPaddlePredictor(cppPaddlePredictorPointer); - cppPaddlePredictorPointer = 0L; - } - return result; - } - - /** - * Gets offset-th input tensor pointer at C++ side. - * - * @param offset - * @return a long value which is reinterpret_cast of the C++ pointer. - */ - private native long getInputCppTensorPointer(int offset); - - /** - * Gets offset-th output tensor pointer at C++ side. - * - * @param offset - * @return a long value which is reinterpret_cast of the C++ pointer. - */ - private native long getOutputCppTensorPointer(int offset); - - /** - * Gets tensor pointer at C++ side by name. - * - * @param name the name of the tensor. - * @return a long value which is reinterpret_cast of the C++ pointer. - */ - private native long getCppTensorPointerByName(String name); - - /** - * Creates a new C++ PaddlePredcitor object using CxxConfig, returns the - * reinterpret_cast value of the C++ pointer which points to C++ - * PaddlePredictor. - * - * @param config - * @return a long value which is reinterpret_cast of the C++ pointer. - */ - private native long newCppPaddlePredictor(CxxConfig config); - - /** - * Creates a new C++ PaddlePredcitor object using Mobile, returns the - * reinterpret_cast value of the C++ pointer which points to C++ - * PaddlePredictor. - * - * @param config - * @return a long value which is reinterpret_cast of the C++ pointer. - */ - private native long newCppPaddlePredictor(MobileConfig config); - - /** - * Delete C++ PaddlePredictor object pointed by the input pointer, which is - * presented by a long value. 
- * - * @param nativePointer a long value which is reinterpret_cast of the C++ - * pointer. - * @return true if deletion success. - */ - private native boolean deleteCppPaddlePredictor(long nativePointer); - - /* Initializes at the beginning */ - static { - PaddleLiteInitializer.init(); - } -} diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/Place.java b/lite/api/android/jni/src/com/baidu/paddle/lite/Place.java deleted file mode 100644 index 98777f3111..0000000000 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/Place.java +++ /dev/null @@ -1,148 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -package com.baidu.paddle.lite; - -/** - * Place specifies the execution context of a Kernel or input/output for a - * kernel. It is used to make the analysis of the MIR more clear and accurate. - */ -public class Place { - - /** Place hardware target type. */ - public enum TargetType { - UNKNOWN(0), HOST(1), X86(2), CUDA(3), ARM(4), OPEN_CL(5), FPGA(7), NPU(8), ANY(6); - - public final int value; - - private TargetType(int value) { - this.value = value; - } - } - - /** Place precision type */ - public enum PrecisionType { - UNKNOWN(0), FLOAT(1), INT8(2), FP16(5), INT32(3), ANY(4), BOOL(6); - - public final int value; - - private PrecisionType(int value) { - this.value = value; - } - } - - /** Place data layout type */ - public enum DataLayoutType { - UNKNOWN(0), NCHW(1), NHWC(3), ANY(2); - - public final int value; - - private DataLayoutType(int value) { - this.value = value; - } - } - - private TargetType target; - private PrecisionType precision; - private DataLayoutType layout; - private int device; - - public Place() { - target = TargetType.UNKNOWN; - precision = PrecisionType.UNKNOWN; - layout = DataLayoutType.UNKNOWN; - device = 0; - } - - public Place(TargetType target) { - this(target, PrecisionType.FLOAT); - } - - public Place(TargetType target, PrecisionType precision) { - this(target, precision, DataLayoutType.NCHW); - } - - public Place(TargetType target, PrecisionType precision, DataLayoutType layout) { - this(target, precision, layout, 0); - } - - public Place(TargetType target, PrecisionType precision, DataLayoutType layout, int device) { - this.target = target; - this.precision = precision; - this.layout = layout; - this.device = device; - } - - public boolean isValid() { - return target != TargetType.UNKNOWN && precision != PrecisionType.UNKNOWN && layout != DataLayoutType.UNKNOWN; - } - - public TargetType getTarget() { - return target; - } - - public void setTarget(TargetType target) { - this.target = target; - } - - public PrecisionType getPrecision() { - return precision; - } - - public void setPrecision(PrecisionType precision) { - this.precision = precision; - } - - public DataLayoutType getLayout() { - return layout; - } - - public void setLayout(DataLayoutType layout) { - this.layout = layout; - } - - public int getDevice() { - return device; - } - - public void setDevice(int device) { - this.device = 
device; - } - - /** - * Returns hardware target as enum int value. - * - * @return hardware target as enum int value - */ - public int getTargetInt() { - return target.value; - } - - /** - * Returns precision target as enum int value. - * - * @return precision as enum int value - */ - public int getPrecisionInt() { - return precision.value; - } - - /** - * Returns data layout as enum int value. - * - * @return data layout as enum int value - */ - public int getDataLayoutInt() { - return layout.value; - } -} diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/PowerMode.java b/lite/api/android/jni/src/com/baidu/paddle/lite/PowerMode.java deleted file mode 100644 index 36bd568406..0000000000 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/PowerMode.java +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -package com.baidu.paddle.lite; - -/** - * PowerMode is the cpu running power mode for the light weight predictor. - */ -public enum PowerMode { - LITE_POWER_HIGH(0), - LITE_POWER_LOW(1), - LITE_POWER_FULL(2), - LITE_POWER_NO_BIND(3), - LITE_POWER_RAND_HIGH(4), - LITE_POWER_RAND_LOW(5); - - private PowerMode(int value) { - this.value = value; - } - - public int value() { - return this.value; - } - - private final int value; -} diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java b/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java deleted file mode 100644 index ac78800bd2..0000000000 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -package com.baidu.paddle.lite; - -/** - * Tensor class provides the Java APIs that users can get or set the shape or - * the data of a Tensor. - */ -public class Tensor { - - /** - * Java doesn't have pointer. To maintain the life cycle of underneath C++ - * PaddlePredictor object, we use a long value to maintain it. - */ - private long cppTensorPointer; - - /** - * Is this tensor read-only. This field is also used at C++ side to know whether - * we should interpret the C++ tensor pointer to "Tensor" pointer or "const - * Tensor" pointer. - */ - private boolean readOnly; - - /** - * Due to different memory management of Java and C++, at C++, if a user - * destroys PaddlePredictor object, the tensor's memory will be released and a - * pointer operating on the released tensor will cause unknown behavior. 
At C++ - * side, that's users' responsibility to manage memory well. But for our Java - * code, we have to prevent this case. We make this {@link Tensor} keep a - * reference to {@link PaddlePredictor} to prevent the {@link PaddlePredictor} - * object be collected by JVM before {@Tensor}. - */ - private PaddlePredictor predictor; - - /** - * Accessed by package only to prevent public users to create it wrongly. A - * Tensor can be created by {@link com.baidu.paddle.lite.PaddlePredictor} only - */ - protected Tensor(long cppTensorPointer, boolean readOnly, PaddlePredictor predictor) { - this.cppTensorPointer = cppTensorPointer; - this.readOnly = readOnly; - this.predictor = predictor; - } - - /** Deletes C++ Tensor pointer when Java Tensor object is destroyed */ - protected void finalize() throws Throwable { - if (cppTensorPointer != 0L) { - deleteCppTensor(cppTensorPointer); - cppTensorPointer = 0L; - } - super.finalize(); - } - - /** - * @return whether this Tensor is read-only. - */ - public boolean isReadOnly() { - return readOnly; - } - - /** - * Resizes the tensor shape. - * - * @param dims long array of shape. - * @return true if resize successfully. - */ - public boolean resize(long[] dims) { - if (readOnly) { - return false; - } - return nativeResize(dims); - } - - /** - * Set the tensor float data. - * - * @param buf the float array buffer which will be copied into tensor. - * @return true if set data successfully. - */ - public boolean setData(float[] buf) { - if (readOnly) { - return false; - } - return nativeSetData(buf); - } - - /** - * Set the tensor byte data. - * - * @param buf the byte array buffer which will be copied into tensor. - * @return true if set data successfully. - */ - public boolean setData(byte[] buf) { - if (readOnly) { - return false; - } - return nativeSetData(buf); - } - - /** - * @return shape of the tensor as long array. - */ - public native long[] shape(); - - /** - * @return the tensor data as float array. - */ - public native float[] getFloatData(); - - /** - * @return the tensor data as byte array. - */ - public native byte[] getByteData(); - - private native boolean nativeResize(long[] dims); - - private native boolean nativeSetData(float[] buf); - - private native boolean nativeSetData(byte[] buf); - - /** - * Delete C++ Tenor object pointed by the input pointer, which is presented by a - * long value. - * - * @param nativePointer a long value which is reinterpret_cast of the C++ - * pointer. - * @return true if deletion success. - */ - private native boolean deleteCppTensor(long nativePointer); -} \ No newline at end of file diff --git a/lite/api/android/jni/test/com/baidu/paddle/lite/PaddlePredictorTest.java b/lite/api/android/jni/test/com/baidu/paddle/lite/PaddlePredictorTest.java deleted file mode 100644 index 0af11efd28..0000000000 --- a/lite/api/android/jni/test/com/baidu/paddle/lite/PaddlePredictorTest.java +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -package com.baidu.paddle.lite; - -import org.junit.jupiter.api.Test; - -import static org.junit.Assert.assertEquals; - -/** - * Deprecated test. Now we use Android demo's Instrument test. - * - * @TODO make this test as Java Unit test. Then we don't have to launch Android - * demo to test. - */ -class PaddlePredictorTest { - - @Test - public void run_defaultModel() { - MobileConfig config = new MobileConfig(); - config.setModelDir(""); - PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config); - - float[] inputBuffer = new float[10000]; - for (int i = 0; i < 10000; ++i) { - inputBuffer[i] = i; - } - long[] dims = { 100, 100 }; - - Tensor input = predictor.getInput(0); - input.resize(dims); - input.setData(inputBuffer); - - predictor.run(); - - Tensor output = predictor.getOutput(0); - float[] outputBuffer = output.getFloatData(); - - assertEquals(outputBuffer.length, 50000); - assertEquals(outputBuffer[0], 50.2132f, 1e-3f); - assertEquals(outputBuffer[1], -28.8729f, 1e-3f); - } - -} diff --git a/lite/api/apis_test.cc b/lite/api/apis_test.cc deleted file mode 100644 index 3dc0224084..0000000000 --- a/lite/api/apis_test.cc +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* - * We test multiple apis here. - */ -#include -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/light_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/core/mir/pass_registry.h" - -DEFINE_string(model_dir, "", ""); -DEFINE_string(optimized_model, "", ""); - -namespace paddle { -namespace lite { - -void SetConstInput(lite::Tensor* x) { - x->Resize(DDim(std::vector({100, 100}))); - auto* data = x->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; - } -} - -bool CompareTensors(const std::string& name, - const Predictor& cxx_api, - const LightPredictor& light_api) { - const auto* a = cxx_api.GetTensor(name); - const auto* b = light_api.GetTensor(name); - return TensorCompareWith(*a, *b); -} - -TEST(CXXApi_LightApi, optim_model) { - lite::Predictor cxx_api; - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, // Both works on X86 and ARM - }); - // On ARM devices, the preferred X86 target not works, but it can still - // select ARM kernels. 
- cxx_api.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kX86), PRECISION(kFloat)}, - valid_places); - cxx_api.SaveModel(FLAGS_optimized_model); -} - -TEST(CXXApi_LightApi, save_and_load_model) { - lite::Predictor cxx_api; - lite::LightPredictor light_api(FLAGS_optimized_model); - - // CXXAPi - { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, // Both works on X86 and ARM - }); - // On ARM devices, the preferred X86 target not works, but it can still - // select ARM kernels. - cxx_api.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kX86), PRECISION(kFloat)}, - valid_places); - - auto* x = cxx_api.GetInput(0); - SetConstInput(x); - - cxx_api.Run(); - - LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model; - cxx_api.SaveModel(FLAGS_optimized_model); - } - - // LightApi - { - auto* x = light_api.GetInput(0); - SetConstInput(x); - - light_api.Run(); - } - - const auto* cxx_out = cxx_api.GetOutput(0); - const auto* light_out = light_api.GetOutput(0); - ASSERT_TRUE(TensorCompareWith(*cxx_out, *light_out)); - - std::vector tensors_with_order({ - "a", "fc_0.w_0", "scale_0.tmp_0", - }); - - for (const auto& tensor_name : tensors_with_order) { - ASSERT_TRUE(CompareTensors(tensor_name, cxx_api, light_api)); - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/benchmark.cc b/lite/api/benchmark.cc deleted file mode 100644 index ca7bfe7fe6..0000000000 --- a/lite/api/benchmark.cc +++ /dev/null @@ -1,190 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
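apis_test.cc above hinges on TensorCompareWith: the full CXX predictor and the light predictor must produce element-for-element equal outputs for the same constant input. TensorCompareWith is the library's own helper; the standalone AllClose below is only an illustrative stand-in for that idea, with an assumed absolute tolerance (the sample values echo the expectations in PaddlePredictorTest above):

#include <cmath>
#include <cstddef>
#include <vector>

// Illustrative stand-in for TensorCompareWith: two runs agree when every
// element matches within a small absolute tolerance.
static bool AllClose(const std::vector<float> &a, const std::vector<float> &b,
                     float atol = 1e-5f) {
  if (a.size() != b.size()) return false;
  for (std::size_t i = 0; i < a.size(); ++i) {
    if (std::fabs(a[i] - b[i]) > atol) return false;
  }
  return true;
}

int main() {
  std::vector<float> cxx_out{50.2132f, -28.8729f};
  std::vector<float> light_out{50.2132f, -28.8729f};
  return AllClose(cxx_out, light_out) ? 0 : 1;
}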
- -#include <gflags/gflags.h> -#include <cstdio> -#include <cstdlib> -#include <string> -#include <vector> -#include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/device_info.h" -#include "lite/utils/cp_logging.h" -#include "lite/utils/string.h" - -DEFINE_string(input_shape, - "1,3,224,224", - "input shapes, separated by colon and comma"); -DEFINE_string(result_filename, "", "save test result"); -DEFINE_bool(run_model_optimize, - false, - "apply model_optimize_tool to model, use optimized model to test"); - -namespace paddle { -namespace lite_api { - -void OutputOptModel(const std::string& load_model_dir, - const std::string& save_optimized_model_dir, - const std::vector<std::vector<int64_t>>& input_shapes) { - lite_api::CxxConfig config; - config.set_model_dir(load_model_dir); - config.set_preferred_place(Place{TARGET(kX86), PRECISION(kFloat)}); - config.set_valid_places({ - Place{TARGET(kX86), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - auto predictor = lite_api::CreatePaddlePredictor(config); - - int ret = system( - paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str()) - .c_str()); - if (ret == 0) { - LOG(INFO) << "delete old optimized model " << save_optimized_model_dir; - } - predictor->SaveOptimizedModel(save_optimized_model_dir, - LiteModelType::kNaiveBuffer); - LOG(INFO) << "Load model from " << load_model_dir; - LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; -} - -#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -void Run(const std::vector<std::vector<int64_t>>& input_shapes, - const std::string& model_dir, - const int repeat, - const int thread_num, - const int warmup_times, - const std::string model_name) { - lite_api::MobileConfig config; - config.set_threads(thread_num); - if (thread_num == 1) { - config.set_power_mode(LITE_POWER_HIGH); - } else { - config.set_power_mode(LITE_POWER_NO_BIND); - } - config.set_model_dir(model_dir); - - auto predictor = lite_api::CreatePaddlePredictor(config); - - for (int j = 0; j < input_shapes.size(); ++j) { - auto input_tensor = predictor->GetInput(j); - input_tensor->Resize(input_shapes[j]); - auto input_data = input_tensor->mutable_data<float>(); - int input_num = 1; - for (int i = 0; i < input_shapes[j].size(); ++i) { - input_num *= input_shapes[j][i]; - } - for (int i = 0; i < input_num; ++i) { - input_data[i] = 1.f; - } - } - - for (int i = 0; i < warmup_times; ++i) { - predictor->Run(); - } - - auto start = lite::GetCurrentUS(); - for (int i = 0; i < repeat; ++i) { - predictor->Run(); - } - auto end = lite::GetCurrentUS(); - - std::FILE* pf = std::fopen(FLAGS_result_filename.c_str(), "a"); - if (nullptr == pf) { - LOG(INFO) << "create result file error"; - exit(0); - } - fprintf(pf, - "-- %-18s avg = %5.4f ms\n", - model_name.c_str(), - (end - start) / repeat / 1000.0); - std::fclose(pf); -} -#endif - -} // namespace lite_api -} // namespace paddle - -int main(int argc, char** argv) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - if (FLAGS_model_dir == "" || FLAGS_result_filename == "") { - LOG(INFO) << "usage: " - << "--model_dir /path/to/your/model --result_filename " - "/path/to/resultfile"; - exit(0); - } - - std::size_t found = FLAGS_model_dir.find_last_of("/"); - std::string model_name = FLAGS_model_dir.substr(found + 1); - std::string save_optimized_model_dir = FLAGS_model_dir + "opt2"; - - auto split_string = - [](const std::string& str_in) -> std::vector<std::string> { - std::vector<std::string> str_out; - std::string tmp_str = str_in; - while (!tmp_str.empty()) { - size_t next_offset = tmp_str.find(":"); - str_out.push_back(tmp_str.substr(0, next_offset)); - if (next_offset == std::string::npos) { - break; - } else { - tmp_str = tmp_str.substr(next_offset + 1); - } - } - return str_out; - }; - - auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> { - std::vector<int64_t> shape; - std::string tmp_str = str_shape; - while (!tmp_str.empty()) { - int dim = atoi(tmp_str.data()); - shape.push_back(dim); - size_t next_offset = tmp_str.find(","); - if (next_offset == std::string::npos) { - break; - } else { - tmp_str = tmp_str.substr(next_offset + 1); - } - } - return shape; - }; - - std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape); - std::vector<std::vector<int64_t>> input_shapes; - for (int i = 0; i < str_input_shapes.size(); ++i) { - input_shapes.push_back(get_shape(str_input_shapes[i])); - } - - // Output optimized model - if (FLAGS_run_model_optimize) { - paddle::lite_api::OutputOptModel( - FLAGS_model_dir, save_optimized_model_dir, input_shapes); - } - -#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - // Run inference using optimized model - std::string run_model_dir = - FLAGS_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir; - paddle::lite_api::Run(input_shapes, - run_model_dir, - FLAGS_repeats, - FLAGS_threads, - FLAGS_warmup, - model_name); -#endif - return 0; -} diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc deleted file mode 100644 index eeba686301..0000000000 --- a/lite/api/cxx_api.cc +++ /dev/null @@ -1,177 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
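The --input_shape flag consumed by benchmark.cc above packs several input shapes into one string: ':' separates the shapes of different inputs, ',' separates the dims within one shape, which is exactly what the split_string and get_shape lambdas in main implement. The hypothetical ParseShapes helper below condenses the same two-level parse into one self-contained function; the name and structure are illustrative, not part of the tool:

#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical condensed form of benchmark.cc's split_string + get_shape:
// ':' separates inputs, ',' separates dims, e.g. "1,3,224,224:1,128".
static std::vector<std::vector<int64_t>> ParseShapes(std::string flag) {
  std::vector<std::vector<int64_t>> shapes;
  while (!flag.empty()) {
    size_t colon = flag.find(':');
    std::string one = flag.substr(0, colon);
    std::vector<int64_t> shape;
    while (!one.empty()) {
      shape.push_back(std::atoll(one.c_str()));  // reads the leading digits
      size_t comma = one.find(',');
      if (comma == std::string::npos) break;
      one = one.substr(comma + 1);
    }
    shapes.push_back(shape);
    if (colon == std::string::npos) break;
    flag = flag.substr(colon + 1);
  }
  return shapes;
}

int main() {
  auto shapes = ParseShapes("1,3,224,224:1,128");
  std::cout << shapes.size() << " inputs; first input rank "
            << shapes[0].size() << "\n";  // "2 inputs; first input rank 4"
}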
- -#include "lite/api/cxx_api.h" -#include -#include -#include -#include -#include "lite/utils/io.h" -#ifdef LITE_WITH_NPU -#include "lite/backends/npu/npu_helper.h" -#endif - -namespace paddle { -namespace lite { - -void Predictor::SaveModel(const std::string &dir, - lite_api::LiteModelType model_type) { - if (!program_) { - GenRuntimeProgram(); - } - program_->SaveOpInfosToProgram(&program_desc_); - program_->UpdateVarsOfProgram(&program_desc_); - switch (model_type) { - case lite_api::LiteModelType::kProtobuf: - SaveModelPb(dir, *program_->exec_scope(), program_desc_, true); - break; - case lite_api::LiteModelType::kNaiveBuffer: - SaveModelNaive(dir, *program_->exec_scope(), program_desc_); - break; - default: - LOG(FATAL) << "Unknown model type"; - } -#ifdef LITE_WITH_NPU - for (auto name : npu::DeviceInfo::Global().AllClientNames()) { - // the npu offline model is saved in current dir - // so just copy to dst dir - CHECK_EQ( - system(string_format("cp -r %s %s", name.c_str(), dir.c_str()).c_str()), - 0) - << "Failed copy NPU model to " << dir; - } -#endif -} - -lite::Tensor *Predictor::GetInput(size_t offset) { - auto *_feed_list = exec_scope_->FindVar("feed"); - CHECK(_feed_list) << "no feed variable in exec_scope"; - auto *feed_list = _feed_list->GetMutable>(); - if (offset >= feed_list->size()) { - feed_list->resize(offset + 1); - } - return &feed_list->at(offset); -} - -const lite::Tensor *Predictor::GetOutput(size_t offset) const { - auto *_fetch_list = exec_scope_->FindVar("fetch"); - CHECK(_fetch_list) << "no fatch variable in exec_scope"; - auto &fetch_list = *_fetch_list->GetMutable>(); - CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow"; - return &fetch_list.at(offset); -} - -const std::vector *Predictor::GetOutputs() const { - auto *_fetch_list = exec_scope_->FindVar("fetch"); - CHECK(_fetch_list) << "no fatch variable in exec_scope"; - auto &fetch_list = *_fetch_list->GetMutable>(); - return &fetch_list; -} - -const cpp::ProgramDesc &Predictor::program_desc() const { - return program_desc_; -} -const RuntimeProgram &Predictor::runtime_program() const { return *program_; } - -void Predictor::Build(const lite_api::CxxConfig &config, - const std::vector &valid_places, - const std::vector &passes, - lite_api::LiteModelType model_type) { - const std::string &model_path = config.model_dir(); - const std::string &model_file = config.model_file(); - const std::string ¶m_file = config.param_file(); - const Place prefer_place = config.preferred_place(); - const bool model_from_memory = config.model_from_memory(); - LOG(INFO) << "load from memory " << model_from_memory; - - Build(model_path, - model_file, - param_file, - prefer_place, - valid_places, - passes, - model_type, - model_from_memory); -} -void Predictor::Build(const std::string &model_path, - const std::string &model_file, - const std::string ¶m_file, - const Place &prefer_place, - const std::vector &valid_places, - const std::vector &passes, - lite_api::LiteModelType model_type, - bool model_from_memory) { - switch (model_type) { - case lite_api::LiteModelType::kProtobuf: { - bool combined_param = false; - if (!model_file.empty() && !param_file.empty()) { - combined_param = true; - } - LoadModelPb(model_path, - model_file, - param_file, - scope_.get(), - &program_desc_, - combined_param, - model_from_memory); - } break; - case lite_api::LiteModelType::kNaiveBuffer: - CHECK(!model_path.empty()) - << "NaiveBuffer backend only supported combined param"; - LoadModelNaive(model_path, scope_.get(), 
&program_desc_); - break; - default: - LOG(FATAL) << "Unknown model type"; - } - Build(program_desc_, prefer_place, valid_places, passes); -} - -void Predictor::Build(const cpp::ProgramDesc &desc, - const Place &prefer_place, - const std::vector &valid_places, - const std::vector &passes) { - program_desc_ = desc; - Program program(desc, scope_, valid_places); - optimizer_.KernelPickPreferPlace(prefer_place); - core::KernelPickFactor factor; - factor.ConsiderTarget(); - factor.ConsiderPrecision(); - optimizer_.Run(std::move(program), valid_places, factor, passes); - exec_scope_ = optimizer_.exec_scope(); -} - -void Predictor::GenRuntimeProgram() { - program_ = optimizer_.GenRuntimeProgram(); - CHECK_EQ(exec_scope_, program_->exec_scope()); - program_generated_ = true; -} - -const lite::Tensor *Predictor::GetTensor(const std::string &name) const { - auto *var = exec_scope_->FindVar(name); - return &var->Get(); -} - -#ifdef LITE_WITH_TRAIN -void Predictor::FeedVars(const std::vector &tensors) { - auto var = scope_->FindVar("feed"); - auto &feed_list = *(var->GetMutable>()); - feed_list.resize(tensors.size()); - - for (size_t i = 0; i < tensors.size(); ++i) - feed_list[i].ShareDataWith(tensors[i]); -} -#endif - -} // namespace lite -} // namespace paddle diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h deleted file mode 100644 index 2506ae47b0..0000000000 --- a/lite/api/cxx_api.h +++ /dev/null @@ -1,173 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include "lite/api/paddle_api.h" -#include "lite/core/op_lite.h" -#include "lite/core/optimizer.h" -#include "lite/core/program.h" -#include "lite/core/types.h" -#include "lite/model_parser/model_parser.h" - -namespace paddle { -namespace lite { - -/* - * Predictor for inference, input a model, it will optimize and execute it. - */ -class LITE_API Predictor { - public: - // Create an empty predictor. - Predictor() { scope_ = std::make_shared(); } - // Create a predictor with the weight variable scope set. - explicit Predictor(const std::shared_ptr& root_scope) - : scope_(root_scope) {} - - // Build from a model, with places set for hardware config. - void Build( - const lite_api::CxxConfig& config, - const std::vector& valid_places, - const std::vector& passes = {}, - lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf); - - void Build( - const std::string& model_path, - const std::string& model_file_path, - const std::string& param_file_path, - const Place& prefer_place, - const std::vector& valid_places, - const std::vector& passes = {}, - lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf, - bool memory_from_memory = false); - - void Build(const cpp::ProgramDesc& desc, - const Place& prefer_place, - const std::vector& valid_places, - const std::vector& passes = {}); - - void GenRuntimeProgram(); - - // Run the predictor for a single batch of data. 
- void Run() { - if (!program_generated_) { - GenRuntimeProgram(); - } - program_->Run(); - LOG(INFO) << "running"; - } - - // Get offset-th col of feed inputs. - lite::Tensor* GetInput(size_t offset); - - // Get offset-th col of fetch results. - const lite::Tensor* GetOutput(size_t offset) const; - const std::vector* GetOutputs() const; - - const cpp::ProgramDesc& program_desc() const; - const lite::Tensor* GetTensor(const std::string& name) const; - const RuntimeProgram& runtime_program() const; - - // This method is disabled in mobile, for unnecessary dependencies required. - void SaveModel( - const std::string& dir, - lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf); - -#ifdef LITE_WITH_TRAIN - void Run(const std::vector& tensors) { - FeedVars(tensors); - program_->Run(); - } - - void FeedVars(const std::vector& tensors); -#endif - - private: - Optimizer optimizer_; - cpp::ProgramDesc program_desc_; - std::shared_ptr scope_; - const Scope* exec_scope_; - std::unique_ptr program_; - bool program_generated_{false}; -}; - -/* - * An executor for training. - * - * Usage: - * - * CXXTrainer trainer(...); - * trainer.RunStartupProgram(...); - * auto exe = BuildMainProgramExecutor(...); - * - * for (auto& epoch : epoches) { - * auto* tensor0 = exe.GetInput(...); - * // fill data for tensor0 - * exe.Run(); - * } -#ifdef LITE_WITH_X86 -class LITE_API CXXTrainer { - public: - CXXTrainer(const std::shared_ptr& root_scope, - const Place& preferred_place, - const std::vector& valid_places) - : scope_(root_scope), - preferred_place_(preferred_place), - valid_places_(valid_places), - main_program_executor_(Predictor(scope_)) {} - - // Build the RuntimeProgram cache for the main program. The cache will run - // multiple times for the epoches. - // NOTE Just support to execute the 0-th block currently. - Predictor& BuildMainProgramExecutor(const framework::proto::ProgramDesc& desc, - int block_id = 0) { - main_program_executor_.Build(desc, preferred_place_, valid_places_); - return main_program_executor_; - } - -#ifdef LITE_WITH_TRAIN - Predictor& BuildMainProgramExecutor(framework::ProgramDesc& desc) { // NOLINT - return BuildMainProgramExecutor(*desc.Proto()); - } - - void RunStartupProgram(framework::ProgramDesc& desc) { // NOLINT - RunStartupProgram(*desc.Proto()); - } -#endif - - // Run the startup program. It just executes once, no cache needed. - void RunStartupProgram(const framework::proto::ProgramDesc& desc, - int block_id = 0) { - Predictor exe(scope_); - exe.Build(desc, preferred_place_, valid_places_); - exe.Run(); - } - - private: - std::shared_ptr scope_; - - Place preferred_place_; - std::vector valid_places_; - - // The training program. - Predictor main_program_executor_; -}; -#endif -*/ - -} // namespace lite -} // namespace paddle diff --git a/lite/api/cxx_api_bin.cc b/lite/api/cxx_api_bin.cc deleted file mode 100644 index 000e94307c..0000000000 --- a/lite/api/cxx_api_bin.cc +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc
deleted file mode 100644
index b8c92a8f96..0000000000
--- a/lite/api/cxx_api_impl.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/api/cxx_api.h"
-#include "lite/api/paddle_api.h"
-
-namespace paddle {
-namespace lite {
-
-class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
- public:
-  CxxPaddleApiImpl();
-
-  /// Create a new predictor from a config.
-  void Init(const lite_api::CxxConfig &config);
-
-  std::unique_ptr<lite_api::Tensor> GetInput(int i) override;
-
-  std::unique_ptr<const lite_api::Tensor> GetOutput(int i) const override;
-
-  void Run() override;
-
-  std::unique_ptr<const lite_api::Tensor> GetTensor(
-      const std::string &name) const override;
-
-  void SaveOptimizedModel(const std::string &model_dir,
-                          lite_api::LiteModelType model_type =
-                              lite_api::LiteModelType::kProtobuf) override;
-
- private:
-  Predictor raw_predictor_;
-};
-
-CxxPaddleApiImpl::CxxPaddleApiImpl() {}
-
-void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
-  auto places = config.valid_places();
-  places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
-  raw_predictor_.Build(config, places);
-}
-
-std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInput(int i) {
-  auto *x = raw_predictor_.GetInput(i);
-  return std::unique_ptr<lite_api::Tensor>(new lite_api::Tensor(x));
-}
-
-std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetOutput(
-    int i) const {
-  const auto *x = raw_predictor_.GetOutput(i);
-  return std::unique_ptr<const lite_api::Tensor>(new lite_api::Tensor(x));
-}
-
-void CxxPaddleApiImpl::Run() { raw_predictor_.Run(); }
-
-std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetTensor(
-    const std::string &name) const {
-  auto *x = raw_predictor_.GetTensor(name);
-  return std::unique_ptr<const lite_api::Tensor>(new lite_api::Tensor(x));
-}
-
-void CxxPaddleApiImpl::SaveOptimizedModel(const std::string &model_dir,
-                                          lite_api::LiteModelType model_type) {
-  raw_predictor_.SaveModel(model_dir, model_type);
-}
-
-}  // namespace lite
-
-namespace lite_api {
-
-template <>
-std::shared_ptr<PaddlePredictor> CreatePaddlePredictor(
-    const CxxConfig &config) {
-  auto x = std::make_shared<lite::CxxPaddleApiImpl>();
-  x->Init(config);
-  return x;
-}
-
-}  // namespace lite_api
-}  // namespace paddle
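The template specialization above is the entry point behind the public C++ API. A hedged sketch of the caller side, with MakePredictor, the directory, and the place list as placeholders:

    #include <memory>
    #include <string>
    #include "lite/api/paddle_api.h"

    std::shared_ptr<paddle::lite_api::PaddlePredictor> MakePredictor(
        const std::string& model_dir) {
      using namespace paddle::lite_api;  // NOLINT
      CxxConfig config;
      config.set_model_dir(model_dir);
      config.set_preferred_place(Place{TARGET(kARM), PRECISION(kFloat)});
      config.set_valid_places({Place{TARGET(kARM), PRECISION(kFloat)}});
      // Dispatches to CxxPaddleApiImpl::Init above, which appends a kHost
      // fallback place before building the raw Predictor.
      return CreatePaddlePredictor<CxxConfig>(config);
    }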
diff --git a/lite/api/cxx_api_test.cc b/lite/api/cxx_api_test.cc
deleted file mode 100644
index c562b9f080..0000000000
--- a/lite/api/cxx_api_test.cc
+++ /dev/null
@@ -1,157 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/api/cxx_api.h" -#include -#include -#include -#include "lite/api/lite_api_test_helper.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" - -// For training. -DEFINE_string(startup_program_path, "", ""); -DEFINE_string(main_program_path, "", ""); - -namespace paddle { -namespace lite { - -#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -TEST(CXXApi, test) { - const lite::Tensor* out = RunHvyModel(); - LOG(INFO) << out << " memory size " << out->data_size(); - for (int i = 0; i < 10; i++) { - LOG(INFO) << "out " << out->data()[i]; - } - LOG(INFO) << "dims " << out->dims(); - // LOG(INFO) << "out " << *out; -} - -TEST(CXXApi, save_model) { - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}}); - predictor.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kCUDA), PRECISION(kFloat)}, - valid_places); - - LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model; - predictor.SaveModel(FLAGS_optimized_model, - lite_api::LiteModelType::kProtobuf); - predictor.SaveModel(FLAGS_optimized_model + ".naive", - lite_api::LiteModelType::kNaiveBuffer); -} - -/*TEST(CXXTrainer, train) { - Place prefer_place({TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)}); - std::vector valid_places({prefer_place}); - auto scope = std::make_shared(); - - CXXTrainer trainer(scope, prefer_place, valid_places); - - std::string main_program_pb, startup_program_pb; - ReadBinaryFile(FLAGS_main_program_path, &main_program_pb); - ReadBinaryFile(FLAGS_startup_program_path, &startup_program_pb); - framework::proto::ProgramDesc main_program_desc, startup_program_desc; - main_program_desc.ParseFromString(main_program_pb); - startup_program_desc.ParseFromString(startup_program_pb); - - // LOG(INFO) << main_program_desc.DebugString(); - - for (const auto& op : main_program_desc.blocks(0).ops()) { - LOG(INFO) << "get op " << op.type(); - } - - return; - - trainer.RunStartupProgram(startup_program_desc); - auto& exe = trainer.BuildMainProgramExecutor(main_program_desc); - auto* tensor0 = exe.GetInput(0); - tensor0->Resize(std::vector({100, 100})); - auto* data0 = tensor0->mutable_data(); - data0[0] = 0; - - exe.Run(); -}*/ -#endif // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - -#ifdef LITE_WITH_ARM -TEST(CXXApi, save_model) { - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}}); - predictor.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kARM), PRECISION(kFloat)}, - valid_places); - - LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model; - predictor.SaveModel(FLAGS_optimized_model); - predictor.SaveModel(FLAGS_optimized_model + ".naive", - lite_api::LiteModelType::kNaiveBuffer); -} - -TEST(CXXApi, load_model_naive) { - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}}); - 
predictor.Build(FLAGS_optimized_model + ".naive", - "", - "", - Place{TARGET(kARM), PRECISION(kFloat)}, - valid_places, - {}, - lite_api::LiteModelType::kNaiveBuffer); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(std::vector({1, 100})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100; i++) { - data[i] = 1; - } - - predictor.Run(); - - std::vector result({0.4350058, - -0.6048313, - -0.29346266, - 0.40377066, - -0.13400325, - 0.37114543, - -0.3407839, - 0.14574292, - 0.4104212, - 0.8938774}); - - auto* output_tensor = predictor.GetOutput(0); - auto output_shape = output_tensor->dims().Vectorize(); - ASSERT_EQ(output_shape.size(), 2); - ASSERT_EQ(output_shape[0], 1); - ASSERT_EQ(output_shape[1], 500); - - int step = 50; - for (int i = 0; i < result.size(); i += step) { - EXPECT_NEAR(output_tensor->data()[i], result[i], 1e-6); - } -} -#endif - -} // namespace lite -} // namespace paddle diff --git a/lite/api/detection_model_test.cc b/lite/api/detection_model_test.cc deleted file mode 100644 index 2d79653baa..0000000000 --- a/lite/api/detection_model_test.cc +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include -#include -#include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -DEFINE_bool(is_run_model_optimize, - false, - "apply model_optimize_tool to model, use optimized model to test"); - -namespace paddle { -namespace lite_api { - -void OutputOptModel(const std::string& load_model_dir, - const std::string& save_optimized_model_dir) { - lite_api::CxxConfig config; - config.set_model_dir(load_model_dir); - config.set_preferred_place(Place{TARGET(kX86), PRECISION(kFloat)}); - config.set_valid_places({ - Place{TARGET(kX86), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - auto predictor = lite_api::CreatePaddlePredictor(config); - - int ret = system( - paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str()) - .c_str()); - if (ret == 0) { - LOG(INFO) << "delete old optimized model " << save_optimized_model_dir; - } - predictor->SaveOptimizedModel(save_optimized_model_dir, - LiteModelType::kNaiveBuffer); - LOG(INFO) << "Load model from " << load_model_dir; - LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; -} - -#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -void Run(const std::string& model_dir, - const int repeat, - const int warmup_times, - const int thread_num) { - // set config and create predictor - lite_api::MobileConfig config; - config.set_model_dir(model_dir); - config.set_threads(thread_num); - if (thread_num == 1) { - config.set_power_mode(LITE_POWER_HIGH); - } else { - config.set_power_mode(LITE_POWER_NO_BIND); - } - - auto predictor = lite_api::CreatePaddlePredictor(config); - - // set input - auto input_image = predictor->GetInput(0); - input_image->Resize({1, 3, 300, 300}); - auto input_image_data = input_image->mutable_data(); - std::ifstream read_file("/data/local/tmp/pjc/ssd_img.txt"); - if (!read_file.is_open()) { - LOG(INFO) << "read image file fail"; - return; - } - auto input_shape = input_image->shape(); - int64_t input_image_size = 1; - for (auto t : input_shape) { - input_image_size *= t; - } - for (int i = 0; i < input_image_size; i++) { - read_file >> input_image_data[i]; - } - - // warmup and run - for (int i = 0; i < warmup_times; ++i) { - predictor->Run(); - } - - auto start = lite::GetCurrentUS(); - for (int i = 0; i < repeat; ++i) { - predictor->Run(); - } - - // show result - auto end = lite::GetCurrentUS(); - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (end - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - auto out = predictor->GetOutput(0); - auto out_data = out->data(); - LOG(INFO) << "output shape:"; - auto out_shape = out->shape(); - for (auto t : out_shape) { - LOG(INFO) << t; - } - LOG(INFO) << "output data:"; - int output_len = 20; - for (int i = 0; i < output_len; i++) { - LOG(INFO) << out_data[i]; - } -} -#endif - -} // namespace lite_api -} // namespace paddle - -TEST(Faster_RCNN, test_arm) { - std::string save_optimized_model_dir; - if (FLAGS_is_run_model_optimize) { - save_optimized_model_dir = FLAGS_model_dir + "opt"; - paddle::lite_api::OutputOptModel(FLAGS_model_dir, save_optimized_model_dir); - } - std::string run_model_dir = - FLAGS_is_run_model_optimize ? 
save_optimized_model_dir : FLAGS_model_dir; - paddle::lite_api::Run( - run_model_dir, FLAGS_repeats, FLAGS_threads, FLAGS_warmup); -} diff --git a/lite/api/efficientnet_b0_test.cc b/lite/api/efficientnet_b0_test.cc deleted file mode 100644 index fa16a6be81..0000000000 --- a/lite/api/efficientnet_b0_test.cc +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -void TestModel(const std::vector &valid_places, - const Place &preferred_place) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - - predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places); - - auto *input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto *data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {-0.6746618, -0.7119305, -0.053502668, -0.6767762, -0.07488631, - -1.1109267, 0.63711894, 0.5979086, -0.20651843, -0.49293622, - -0.7404337, -0.25586239, 2.244521, 0.8738271, 0.7193805, - -0.21894705, -0.90460795, 0.07160086, 0.54588217, 0.020132724})); - auto *out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - results[i][j], - 2e-4); - } - } -} - -TEST(EfficientNetB0, test_arm) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - // Place{TARGET(kOpenCL), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); -} - -TEST(EfficientNetB0, test_opencl) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kOpenCL), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kOpenCL), PRECISION(kFloat)})); -} - -} // 
namespace lite -} // namespace paddle diff --git a/lite/api/inceptionv4_test.cc b/lite/api/inceptionv4_test.cc deleted file mode 100644 index ae772dbba5..0000000000 --- a/lite/api/inceptionv4_test.cc +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -#ifdef LITE_WITH_ARM -TEST(InceptionV4, test) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}}); - - predictor.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kARM), PRECISION(kFloat)}, - valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - // std::vector results({0.00078033, 0.00083865, 0.00060029, 0.00057083, - // 0.00070094, 0.00080584, 0.00044525, 0.00074907, - // 0.00059774, 0.00063654}); - // - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {0.0011684548, 0.0010390386, 0.0011301535, 0.0010133048, - 0.0010259597, 0.0010982729, 0.00093195855, 0.0009141837, - 0.00096620916, 0.00089982944, 0.0010064574, 0.0010474789, - 0.0009782845, 0.0009230255, 0.0010548076, 0.0010974824, - 0.0010612885, 0.00089107914, 0.0010112736, 0.00097655767})); - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - results[i][j], - 1e-6); - } - } -} -#endif - -} // namespace lite -} // namespace paddle diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc deleted file mode 100644 index 98b79e58aa..0000000000 --- a/lite/api/light_api.cc +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/api/light_api.h"
-
-namespace paddle {
-namespace lite {
-
-void LightPredictor::Build(const std::string& model_dir,
-                           const std::string& model_buffer,
-                           const std::string& param_buffer,
-                           lite_api::LiteModelType model_type,
-                           bool model_from_memory) {
-  cpp::ProgramDesc desc;
-  switch (model_type) {
-#ifndef LITE_ON_TINY_PUBLISH
-    case lite_api::LiteModelType::kProtobuf:
-      LoadModelPb(model_dir, "", "", scope_.get(), &desc);
-      break;
-#endif
-    case lite_api::LiteModelType::kNaiveBuffer: {
-      if (model_from_memory) {
-        LoadModelNaiveFromMemory(
-            model_buffer, param_buffer, scope_.get(), &desc);
-      } else {
-        LoadModelNaive(model_dir, scope_.get(), &desc);
-      }
-      break;
-    }
-    default:
-      LOG(FATAL) << "Unknown model type";
-  }
-  BuildRuntimeProgram(desc);
-}
-
-Tensor* LightPredictor::GetInput(size_t offset) {
-  auto* _feed_list = program_->exec_scope()->FindVar("feed");
-  CHECK(_feed_list) << "no feed variable in exec_scope";
-  auto* feed_list = _feed_list->GetMutable<std::vector<Tensor>>();
-  if (offset >= feed_list->size()) {
-    feed_list->resize(offset + 1);
-  }
-  return &feed_list->at(offset);
-}
-
-const Tensor* LightPredictor::GetOutput(size_t offset) {
-  auto* _fetch_list = program_->exec_scope()->FindVar("fetch");
-  CHECK(_fetch_list) << "no fetch variable in exec_scope";
-  auto& fetch_list = *_fetch_list->GetMutable<std::vector<Tensor>>();
-  CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
-  return &fetch_list.at(offset);
-}
-
-void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
-  std::vector<Instruction> insts;
-  // 1. Create op first
-  Program program(prog, scope_, {});
-
-  // 2. Create Instructs
-
-  // Create the kernels of the target places, and filter out the specific
-  // kernel with the target alias.
-  for (auto& op : program.ops()) {
-    auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
-    std::string op_type, alias;
-    Place place;
-    KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
-    auto kernels = op->CreateKernels({place});
-    // filter out a kernel
-    auto it = std::find_if(
-        kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& it) {
-          return it->alias() == alias;
-        });
-    CHECK(it != kernels.end());
-    (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
-    insts.emplace_back(op, std::move(*it));
-  }
-  program_.reset(new RuntimeProgram(std::move(insts)));
-  CHECK(program.exec_scope());
-  program_->set_exec_scope(program.exec_scope());
-}
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/api/light_api.h b/lite/api/light_api.h
deleted file mode 100644
index 2415401744..0000000000
--- a/lite/api/light_api.h
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/*
- * This file implements a light-weight API which can run on mobile. We limit the
- * dependencies and the runtime computation complexity.
- */
-#pragma once
-
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-#include "lite/api/paddle_api.h"
-#include "lite/core/context.h"
-#include "lite/core/program.h"
-#include "lite/core/tensor.h"
-#include "lite/core/types.h"
-#include "lite/model_parser/model_parser.h"
-
-namespace paddle {
-namespace lite {
-
-/*
- * The light weight predictor, mainly for mobile. It loads an optimized model,
- * and will not depend on the MIR or perform latter optimization.
- */
-class LITE_API LightPredictor {
- public:
-  LightPredictor(
-      const std::string& model_dir,
-      const std::string& model_buffer = "",
-      const std::string& param_buffer = "",
-      bool model_from_memory = false,
-      lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf) {
-    scope_ = std::make_shared<Scope>();
-    Build(model_dir, model_buffer, param_buffer, model_type, model_from_memory);
-  }
-
-  void Run() { program_->Run(); }
-
-  // Get offset-th col of feed inputs.
-  Tensor* GetInput(size_t offset);
-
-  // Get offset-th col of fetch outputs.
-  const Tensor* GetOutput(size_t offset);
-
-  const lite::Tensor* GetTensor(const std::string& name) const {
-    auto* var = program_->exec_scope()->FindVar(name);
-    return &var->Get<lite::Tensor>();
-  }
-
- private:
-  void Build(
-      const std::string& model_dir,
-      const std::string& model_buffer,
-      const std::string& param_buffer,
-      lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
-      bool model_from_memory = false);
-
-  void BuildRuntimeProgram(const cpp::ProgramDesc& prog);
-
- private:
-  std::shared_ptr<Scope> scope_;
-  std::unique_ptr<RuntimeProgram> program_;
-};
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/api/light_api_impl.cc b/lite/api/light_api_impl.cc
deleted file mode 100644
index 6075f1a36f..0000000000
--- a/lite/api/light_api_impl.cc
+++ /dev/null
@@ -1,79 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
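Since LightPredictor skips the optimizer entirely, it only makes sense on a model that was already optimized offline. A minimal sketch of driving it directly, assuming a naive-buffer model directory; RunLight and the shapes are placeholders:

    #include <algorithm>
    #include <string>
    #include <vector>
    #include "lite/api/light_api.h"

    void RunLight(const std::string& optimized_model_dir) {
      // No MIR passes run here; the program is executed as stored.
      paddle::lite::LightPredictor predictor(
          optimized_model_dir, "", "", false,
          paddle::lite_api::LiteModelType::kNaiveBuffer);
      auto* input = predictor.GetInput(0);
      input->Resize(paddle::lite::DDim(std::vector<int64_t>({1, 100})));
      float* data = input->mutable_data<float>();
      std::fill(data, data + 100, 1.f);
      predictor.Run();
      const auto* out = predictor.GetOutput(0);
      (void)out;
    }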
- -#include "lite/api/light_api.h" -#include "lite/api/paddle_api.h" -#include "lite/model_parser/model_parser.h" - -namespace paddle { -namespace lite_api { - -class LightPredictorImpl : public PaddlePredictor { - public: - LightPredictorImpl() = default; - - std::unique_ptr GetInput(int i) override; - - std::unique_ptr GetOutput(int i) const override; - - void Run() override; - - std::unique_ptr GetTensor( - const std::string& name) const override; - - void Init(const MobileConfig& config); - - private: - std::unique_ptr raw_predictor_; -}; - -void LightPredictorImpl::Init(const MobileConfig& config) { -// LightPredictor Only support NaiveBuffer backend in publish lib -#ifdef LITE_WITH_ARM - lite::DeviceInfo::Init(); - lite::DeviceInfo::Global().SetRunMode(config.power_mode(), config.threads()); -#endif - raw_predictor_.reset(new lite::LightPredictor(config.model_dir(), - config.model_buffer(), - config.param_buffer(), - config.model_from_memory(), - LiteModelType::kNaiveBuffer)); -} - -std::unique_ptr LightPredictorImpl::GetInput(int i) { - return std::unique_ptr(new Tensor(raw_predictor_->GetInput(i))); -} - -std::unique_ptr LightPredictorImpl::GetOutput(int i) const { - return std::unique_ptr(new Tensor(raw_predictor_->GetOutput(i))); -} - -void LightPredictorImpl::Run() { raw_predictor_->Run(); } - -std::unique_ptr LightPredictorImpl::GetTensor( - const std::string& name) const { - return std::unique_ptr( - new Tensor(raw_predictor_->GetTensor(name))); -} - -template <> -std::shared_ptr CreatePaddlePredictor( - const MobileConfig& config) { - auto x = std::make_shared(); - x->Init(config); - return x; -} - -} // namespace lite_api -} // namespace paddle diff --git a/lite/api/light_api_test.cc b/lite/api/light_api_test.cc deleted file mode 100644 index 8e2fc420bc..0000000000 --- a/lite/api/light_api_test.cc +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/api/light_api.h" -#include -#include -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" - -DEFINE_string(optimized_model, "", ""); - -namespace paddle { -namespace lite { - -TEST(LightAPI, load) { - if (FLAGS_optimized_model.empty()) { - FLAGS_optimized_model = "lite_naive_model"; - } - LightPredictor predictor(FLAGS_optimized_model, "", ""); - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({100, 100}))); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; - } - - predictor.Run(); - - const auto* output = predictor.GetOutput(0); - const float* raw_output = output->data(); - - for (int i = 0; i < 10; i++) { - LOG(INFO) << "out " << raw_output[i]; - } -} - -TEST(LightAPI, loadNaiveBuffer) { - if (FLAGS_optimized_model.empty()) { - FLAGS_optimized_model = "lite_naive_model"; - } - - auto model_path = std::string(FLAGS_optimized_model) + "/__model__.nb"; - auto params_path = std::string(FLAGS_optimized_model) + "/param.nb"; - std::string model_buffer = lite::ReadFile(model_path); - size_t size_model = model_buffer.length(); - std::string params_buffer = lite::ReadFile(params_path); - size_t size_params = params_buffer.length(); - LOG(INFO) << "sizeModel: " << size_model; - LOG(INFO) << "sizeParams: " << size_params; - - lite_api::MobileConfig config; - config.set_model_buffer( - model_buffer.c_str(), size_model, params_buffer.c_str(), size_params); - LightPredictor predictor(config.model_dir(), - config.model_buffer(), - config.param_buffer(), - config.model_from_memory(), - lite_api::LiteModelType::kNaiveBuffer); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({100, 100}))); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; - } - - predictor.Run(); - - const auto* output = predictor.GetOutput(0); - const float* raw_output = output->data(); - - for (int i = 0; i < 10; i++) { - LOG(INFO) << "out " << raw_output[i]; - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/lite_api_test_helper.cc b/lite/api/lite_api_test_helper.cc deleted file mode 100644 index cd576998d3..0000000000 --- a/lite/api/lite_api_test_helper.cc +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/api/lite_api_test_helper.h" -#include - -DEFINE_string(model_dir, "", ""); -DEFINE_string(optimized_model, "", ""); - -namespace paddle { -namespace lite { - -const lite::Tensor* RunHvyModel() { - lite::Predictor predictor; -#ifndef LITE_WITH_CUDA - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}}); -#else - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)}, - Place{TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW)}, - Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kNCHW)}, - Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kNCHW)}, - Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kAny)}, - Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)}, - }); -#endif - - predictor.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kX86), PRECISION(kFloat)}, // origin cuda - valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({100, 100}))); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; - } - - // LOG(INFO) << "input " << *input_tensor; - - predictor.Run(); - - const auto* out = predictor.GetOutput(0); - return out; -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/lite_api_test_helper.h b/lite/api/lite_api_test_helper.h deleted file mode 100644 index ac3be77b10..0000000000 --- a/lite/api/lite_api_test_helper.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "lite/api/cxx_api.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" - -DECLARE_string(model_dir); -DECLARE_string(optimized_model); - -namespace paddle { -namespace lite { - -const lite::Tensor* RunHvyModel(); - -} // namespace lite -} // namespace paddle diff --git a/lite/api/mobilenetv1_int8_test.cc b/lite/api/mobilenetv1_int8_test.cc deleted file mode 100644 index 769f195d19..0000000000 --- a/lite/api/mobilenetv1_int8_test.cc +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -void TestModel(const std::vector& valid_places, - const Place& preferred_place) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - - predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {0.000227548, 0.000262385, 0.000260347, 0.000293865, 0.00025008})); - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - results[i][j], - 1e-6); - } - } -} - -TEST(MobileNetV1, test_arm) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kInt8)}, - }); - - TestModel(valid_places, Place({TARGET(kARM), PRECISION(kInt8)})); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/mobilenetv1_ssd_test.cc b/lite/api/mobilenetv1_ssd_test.cc deleted file mode 100644 index e37e180f9b..0000000000 --- a/lite/api/mobilenetv1_ssd_test.cc +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
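All of these model tests compare only a strided sample of the output against golden values (every 50th element of each row), which keeps the expected vectors short while still covering the whole tensor. A hedged helper capturing the pattern; CheckSampledRow is an assumed name, not a helper from the tree:

    #include <gtest/gtest.h>
    #include <vector>

    // Compare every `step`-th element of output row `row` against `ref`,
    // mirroring the sampling loops used by the tests in this patch.
    void CheckSampledRow(const float* out_data, int64_t row_width, int row,
                         const std::vector<float>& ref, int step, float eps) {
      for (size_t j = 0; j < ref.size(); ++j) {
        EXPECT_NEAR(out_data[j * step + row_width * row], ref[j], eps);
      }
    }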
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -#ifdef LITE_WITH_ARM -void TestModel(const std::vector& valid_places, - const Place& preferred_place) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - - predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 300, 300}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {3, 0.042103, 0.00439525, 0.0234783, 1.01127, 0.990756})); - results.emplace_back(std::vector( - {5, 0.0145793, 0.00860882, 0.0344975, 1.01375, 1.00129})); - results.emplace_back(std::vector( - {8, 0.560059, 0.00439525, 0.0234783, 1.01127, 0.990756})); - results.emplace_back(std::vector( - {9, 0.0165109, -0.0020006, 0.0013622, 0.999179, 0.991846})); - results.emplace_back(std::vector( - {12, 0.0263337, -0.0020006, 0.0013622, 0.999179, 0.991846})); - results.emplace_back(std::vector( - {15, 0.0116742, 0.00580454, 0.0321349, 1.00545, 0.98476})); - results.emplace_back(std::vector( - {17, 0.0405541, 0.00860882, 0.0344975, 1.01375, 1.00129})); - results.emplace_back(std::vector( - {18, 0.0231487, -0.00245976, 0.00771075, 1.01654, 1.00395})); - results.emplace_back(std::vector( - {19, 0.0133921, 0.00860882, 0.0344975, 1.01375, 1.00129})); - results.emplace_back(std::vector( - {20, 0.039664, 0.00860882, 0.0344975, 1.01375, 1.00129})); - - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 10); - ASSERT_EQ(out->dims()[1], 6); - ASSERT_EQ(out->lod().size(), 1); - ASSERT_EQ(out->lod()[0].size(), 2); - ASSERT_EQ(out->lod()[0][0], 0); - ASSERT_EQ(out->lod()[0][1], 10); - - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR( - out->data()[j + (out->dims()[1] * i)], results[i][j], 5e-6); - } - } -} - -TEST(MobileNetV1_SSD, test_arm) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); -} - -#endif // LITE_WITH_ARM - -} // namespace lite -} // namespace paddle diff --git a/lite/api/mobilenetv1_test.cc b/lite/api/mobilenetv1_test.cc deleted file mode 100644 index 91d1828a94..0000000000 --- a/lite/api/mobilenetv1_test.cc +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -DEFINE_string(optimized_model, "", "optimized_model"); - -namespace paddle { -namespace lite { - -void TestModel(const std::vector& valid_places, - const Place& preferred_place, - const std::string& model_dir = FLAGS_model_dir, - bool save_model = false) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - - predictor.Build(model_dir, "", "", preferred_place, valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - if (save_model) { - LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model; - predictor.SaveModel(FLAGS_optimized_model); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> ref; - ref.emplace_back(std::vector( - {0.00019130898, 9.467885e-05, 0.00015971427, 0.0003650665, - 0.00026431272, 0.00060884043, 0.0002107942, 0.0015819625, - 0.0010323516, 0.00010079765, 0.00011006987, 0.0017364529, - 0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986, - 0.00020211363, 0.00013668182, 0.0005855956, 0.00025901722})); - auto* out = predictor.GetOutput(0); - const auto* pdata = out->data(); - int step = 50; -#ifdef LITE_WITH_NPU - ASSERT_EQ(out->dims().production(), 1000); - double eps = 0.1; - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - auto result = pdata[j * step + (out->dims()[1] * i)]; - auto diff = std::fabs((result - ref[i][j]) / ref[i][j]); - VLOG(3) << diff; - EXPECT_LT(diff, eps); - } - } -#else - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - double eps = 1e-6; - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - auto result = pdata[j * step + (out->dims()[1] * i)]; - EXPECT_NEAR(result, ref[i][j], eps); - } - } -#endif -} - -#ifdef LITE_WITH_NPU -TEST(MobileNetV1, test_npu) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kNPU), PRECISION(kFloat)}, - }); - - TestModel(valid_places, - Place({TARGET(kARM), PRECISION(kFloat)}), - FLAGS_model_dir, - true /* save_model*/); - - TestModel(valid_places, - Place({TARGET(kARM), PRECISION(kFloat)}), - 
FLAGS_optimized_model, - false /* save model */); -} -#endif // LITE_WITH_NPU - -TEST(MobileNetV1, test_arm) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); -} - -#ifdef LITE_WITH_OPENCL -TEST(MobileNetV1, test_opencl) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kOpenCL), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kOpenCL), PRECISION(kFloat)})); -} -#endif // LITE_WITH_OPENCL - -} // namespace lite -} // namespace paddle diff --git a/lite/api/mobilenetv1_yolov3_test.cc b/lite/api/mobilenetv1_yolov3_test.cc deleted file mode 100644 index 3a12203b71..0000000000 --- a/lite/api/mobilenetv1_yolov3_test.cc +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -#ifdef LITE_WITH_ARM -void TestModel(const std::vector& valid_places, - const Place& preferred_place) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - - predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 608, 608}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 50; - } - - auto* img_size = predictor.GetInput(1); - img_size->Resize(DDim(std::vector({1, 2}))); - auto* size_data = img_size->mutable_data(); - size_data[0] = 608; - size_data[1] = 608; - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {0., 0.7803235, 577.7447, 592.5643, 582.15314, 597.3399})); - results.emplace_back(std::vector( - {0., 0.7643098, 473.50653, 592.58966, 478.26117, 597.2353})); - results.emplace_back(std::vector( - {0., 0.7614112, 593.06946, 591.99646, 598.64087, 597.553})); - results.emplace_back(std::vector( - {0., 0.7579255, 161.40321, 592.61694, 166.33885, 597.28406})); - results.emplace_back(std::vector( - {0., 
0.7569634, 193.39563, 592.62164, 198.35269, 597.2968})); - results.emplace_back(std::vector( - {0., 0.7568337, 297.3981, 592.62024, 302.35202, 597.2969})); - results.emplace_back(std::vector( - {0., 0.7568283, 265.39816, 592.6203, 270.35214, 597.29694})); - results.emplace_back(std::vector( - {0., 0.74383223, 33.430492, 592.7017, 38.453976, 597.4267})); - results.emplace_back(std::vector( - {0., 0.66492873, 9.396143, 576.7084, 15.35708, 581.8059})); - results.emplace_back(std::vector( - {0., 0.6568178, 9.970305, 145.12535, 15.043035, 149.76646})); - - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 100); - ASSERT_EQ(out->dims()[1], 6); - ASSERT_EQ(out->lod().size(), 1); - ASSERT_EQ(out->lod()[0].size(), 2); - ASSERT_EQ(out->lod()[0][0], 0); - ASSERT_EQ(out->lod()[0][1], 100); - - int skip = 10; - for (int i = 0; i < results.size(); i += skip) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR( - out->data()[j + (out->dims()[1] * i)], results[i][j], 3e-6); - } - } -} - -TEST(MobileNetV1_YoloV3, test_arm) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); -} - -#endif // LITE_WITH_ARM - -} // namespace lite -} // namespace paddle diff --git a/lite/api/mobilenetv2_test.cc b/lite/api/mobilenetv2_test.cc deleted file mode 100644 index ca36943cb9..0000000000 --- a/lite/api/mobilenetv2_test.cc +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
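The YOLOv3 test above is the one multi-input case in this set: detection models of this family take the image tensor plus a second int tensor holding the original image size. A hedged fragment of that feeding pattern (the int element type for im_size is an assumption consistent with the test above):

    #include <algorithm>
    #include <vector>
    #include "lite/api/cxx_api.h"

    void FeedYoloInputs(paddle::lite::Predictor& predictor) {
      auto* image = predictor.GetInput(0);
      image->Resize(paddle::lite::DDim(std::vector<int64_t>({1, 3, 608, 608})));
      float* pixels = image->mutable_data<float>();
      std::fill(pixels, pixels + image->dims().production(), 50.f);

      auto* im_size = predictor.GetInput(1);  // second feed: original H and W
      im_size->Resize(paddle::lite::DDim(std::vector<int64_t>({1, 2})));
      int* wh = im_size->mutable_data<int>();
      wh[0] = 608;
      wh[1] = 608;
    }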
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -DEFINE_string(optimized_model, "", "optimized_model"); - -namespace paddle { -namespace lite { - -#ifdef LITE_WITH_ARM -void TestModel(const std::vector& valid_places, - const Place& preferred_place, - const std::string& model_dir = FLAGS_model_dir, - bool save_model = false) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - - predictor.Build(model_dir, "", "", preferred_place, valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - if (save_model) { - LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model; - predictor.SaveModel(FLAGS_optimized_model); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> ref; - // i = 1 - ref.emplace_back(std::vector( - {0.00017082224, 5.699624e-05, 0.000260885, 0.00016412718, - 0.00034818667, 0.00015230637, 0.00032959113, 0.0014772735, - 0.0009059976, 9.5378724e-05, 5.386537e-05, 0.0006427285, - 0.0070957416, 0.0016094646, 0.0018807327, 0.00010506048, - 6.823785e-05, 0.00012269315, 0.0007806194, 0.00022354358})); - auto* out = predictor.GetOutput(0); - const auto* pdata = out->data(); - int step = 50; -#ifdef LITE_WITH_NPU - ASSERT_EQ(out->dims().production(), 1000); - double eps = 0.1; - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - auto result = pdata[j * step + (out->dims()[1] * i)]; - auto diff = std::fabs((result - ref[i][j]) / ref[i][j]); - VLOG(3) << diff; - EXPECT_LT(diff, eps); - } - } -#else - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - EXPECT_NEAR(pdata[j * step + (out->dims()[1] * i)], ref[i][j], 1e-6); - } - } -#endif -} - -#ifdef LITE_WITH_NPU -TEST(MobileNetV2, test_npu) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kNPU), PRECISION(kFloat)}, - }); - - TestModel(valid_places, - Place({TARGET(kARM), PRECISION(kFloat)}), - FLAGS_model_dir, - true /* save_model*/); - - TestModel(valid_places, - Place({TARGET(kARM), PRECISION(kFloat)}), - FLAGS_optimized_model, - false /* save model */); -} -#endif // LITE_WITH_NPU - -TEST(MobileNetV2, test_arm) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); -} - -#ifdef LITE_WITH_OPENCL -TEST(MobileNetV2, test_opencl) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - 
Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kOpenCL), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kOpenCL), PRECISION(kFloat)})); -} -#endif // LITE_WITH_OPENCL - -#endif // LITE_WITH_ARM - -} // namespace lite -} // namespace paddle diff --git a/lite/api/model_optimize_tool.cc b/lite/api/model_optimize_tool.cc deleted file mode 100644 index 37c09b3446..0000000000 --- a/lite/api/model_optimize_tool.cc +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include <gflags/gflags.h> -#ifdef PADDLE_WITH_TESTING -#include <gtest/gtest.h> -#endif -#include "all_kernel_faked.cc" // NOLINT -#include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/core/op_registry.h" -#include "lite/utils/cp_logging.h" -#include "lite/utils/string.h" - -DEFINE_string(model_dir, - "", - "path of the model. This option will be ignored if model_file " - "and param_file exist"); -DEFINE_string(model_file, "", "model file path of the combined-param model"); -DEFINE_string(param_file, "", "param file path of the combined-param model"); -DEFINE_string( - optimize_out_type, - "protobuf", - "store type of the output optimized model. protobuf/naive_buffer"); -DEFINE_bool(display_kernels, false, "Display kernel information"); -DEFINE_string(optimize_out, "", "path of the output optimized model"); -DEFINE_string(valid_targets, - "arm", - "The targets this model is optimized for; should be one of (arm, " - "opencl, x86), separated by space"); -DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels"); - -namespace paddle { -namespace lite_api { - -//! Display the kernel information. -void DisplayKernels() { - LOG(INFO) << ::paddle::lite::KernelRegistry::Global().DebugString(); -} - -void Main() { - if (!FLAGS_model_file.empty() && !FLAGS_param_file.empty()) { - LOG(WARNING) - << "Load combined-param model. 
Option model_dir will be ignored"; - } - - if (FLAGS_display_kernels) { - DisplayKernels(); - exit(0); - } - - lite_api::CxxConfig config; - config.set_model_dir(FLAGS_model_dir); - config.set_model_file(FLAGS_model_file); - config.set_param_file(FLAGS_param_file); - - std::vector<Place> valid_places; - auto target_reprs = lite::Split(FLAGS_valid_targets, " "); - for (auto& target_repr : target_reprs) { - if (target_repr == "arm") { - valid_places.emplace_back(TARGET(kARM)); - } else if (target_repr == "opencl") { - valid_places.emplace_back(TARGET(kOpenCL)); - } else if (target_repr == "x86") { - valid_places.emplace_back(TARGET(kX86)); - } else { - LOG(FATAL) << lite::string_format( - "Wrong target '%s' found, please check the command flag " - "'valid_targets'", - target_repr.c_str()); - } - } - valid_places.emplace_back(TARGET(kHost)); - - CHECK(!valid_places.empty()) - << "At least one target should be set; please set the " - "command argument 'valid_targets'"; - - if (FLAGS_prefer_int8_kernel) { - LOG(WARNING) << "Int8 mode is only supported by the ARM target"; - valid_places.push_back(Place{TARGET(kARM), PRECISION(kInt8)}); - config.set_preferred_place(Place{TARGET(kARM), PRECISION(kInt8)}); - } - config.set_valid_places(valid_places); - - auto predictor = lite_api::CreatePaddlePredictor(config); - - LiteModelType model_type; - if (FLAGS_optimize_out_type == "protobuf") { - model_type = LiteModelType::kProtobuf; - } else if (FLAGS_optimize_out_type == "naive_buffer") { - model_type = LiteModelType::kNaiveBuffer; - } else { - LOG(FATAL) << "Unsupported model type: " << FLAGS_optimize_out_type; - } - - predictor->SaveOptimizedModel(FLAGS_optimize_out, model_type); -} - -} // namespace lite_api -} // namespace paddle - -int main(int argc, char** argv) { - google::ParseCommandLineFlags(&argc, &argv, false); - paddle::lite_api::Main(); - return 0; -} diff --git a/lite/api/model_run_test_image.cc b/lite/api/model_run_test_image.cc deleted file mode 100644 index 099a74ed7f..0000000000 --- a/lite/api/model_run_test_image.cc +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
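// A minimal sketch of the flow model_optimize_tool drives for
// --valid_targets=arm --optimize_out_type=naive_buffer; the model paths are
// placeholders and error handling is omitted:
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"

void OptimizeForArm() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("/path/to/fluid_model");  // placeholder path
  config.set_valid_places({
      paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
      paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)},
  });
  auto predictor =
      paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(
          config);
  // Persist in the naive_buffer format consumed by MobileConfig.
  predictor->SaveOptimizedModel("/path/to/opt_model",  // placeholder path
                                paddle::lite_api::LiteModelType::kNaiveBuffer);
}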
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -TEST(model, test) { -#ifdef LITE_WITH_ARM - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kInt8)}}); - - auto precision = PRECISION(kFloat); - if (FLAGS_int8) { - precision = PRECISION(kInt8); - } - predictor.Build( - FLAGS_model_dir, "", "", Place{TARGET(kARM), precision}, valid_places); - int im_width = FLAGS_im_width; - int im_height = FLAGS_im_height; - auto* input_tensor = predictor.GetInput(0); - auto in_dims = input_tensor->dims(); - input_tensor->Resize( - DDim(std::vector({1, 3, im_width, im_height}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - auto* output_tensors = predictor.GetOutputs(); - - LOG(INFO) << "======output:========"; - for (auto t : *output_tensors) { - LOG(INFO) << t; - } - LOG(INFO) - << "=====RUN_finished!!============= Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; -#endif -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc deleted file mode 100644 index 6e0a249a81..0000000000 --- a/lite/api/model_test.cc +++ /dev/null @@ -1,181 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
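// The tests in this patch all repeat the same warmup-then-time pattern. A
// small helper capturing it (BenchmarkMs is an illustrative name, not an API
// in this patch), assuming GetCurrentUS() from lite/api/test_helper.h:
#include "lite/api/test_helper.h"

template <typename RunFn>
double BenchmarkMs(RunFn run, int warmup, int repeats) {
  for (int i = 0; i < warmup; ++i) run();  // untimed warmup iterations
  auto start = paddle::lite::GetCurrentUS();
  for (int i = 0; i < repeats; ++i) run();
  return (paddle::lite::GetCurrentUS() - start) / repeats / 1000.0;  // avg ms
}
// e.g. double ms = BenchmarkMs([&] { predictor.Run(); }, FLAGS_warmup,
//                              FLAGS_repeats);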
- -#include -#include -#include -#include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/device_info.h" -#include "lite/utils/cp_logging.h" -#include "lite/utils/string.h" - -DEFINE_string(input_shape, - "1,3,224,224", - "input shapes, separated by colon and comma"); - -namespace paddle { -namespace lite_api { - -void OutputOptModel(const std::string& load_model_dir, - const std::string& save_optimized_model_dir, - const std::vector>& input_shapes) { - lite_api::CxxConfig config; - config.set_model_dir(load_model_dir); - config.set_preferred_place(Place{TARGET(kX86), PRECISION(kFloat)}); - config.set_valid_places({ - Place{TARGET(kX86), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - auto predictor = lite_api::CreatePaddlePredictor(config); - - // delete old optimized model - int ret = system( - paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str()) - .c_str()); - if (ret == 0) { - LOG(INFO) << "delete old optimized model " << save_optimized_model_dir; - } - predictor->SaveOptimizedModel(save_optimized_model_dir, - LiteModelType::kNaiveBuffer); - LOG(INFO) << "Load model from " << load_model_dir; - LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; -} - -#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -void Run(const std::vector>& input_shapes, - const std::string& model_dir, - const int repeat, - const int thread_num, - const int warmup_times = 0) { -#ifdef LITE_WITH_ARM - lite::DeviceInfo::Init(); - lite::DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, thread_num); -#endif - lite_api::MobileConfig config; - config.set_model_dir(model_dir); - - auto predictor = lite_api::CreatePaddlePredictor(config); - - for (int j = 0; j < input_shapes.size(); ++j) { - auto input_tensor = predictor->GetInput(j); - input_tensor->Resize(input_shapes[j]); - auto input_data = input_tensor->mutable_data(); - int input_num = 1; - for (int i = 0; i < input_shapes[j].size(); ++i) { - input_num *= input_shapes[j][i]; - } - for (int i = 0; i < input_num; ++i) { - input_data[i] = 1.f; - } - } - - for (int i = 0; i < warmup_times; ++i) { - predictor->Run(); - } - - auto start = lite::GetCurrentUS(); - for (int i = 0; i < repeat; ++i) { - predictor->Run(); - } - auto end = lite::GetCurrentUS(); - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << model_dir << ", threads num " << thread_num - << ", warmup: " << warmup_times << ", repeats: " << repeat - << ", spend " << (end - start) / repeat / 1000.0 - << " ms in average."; - - auto output = predictor->GetOutput(0); - auto out = output->data(); - LOG(INFO) << "out " << out[0]; - LOG(INFO) << "out " << out[1]; - auto output_shape = output->shape(); - int output_num = 1; - for (int i = 0; i < output_shape.size(); ++i) { - output_num *= output_shape[i]; - } - LOG(INFO) << "output_num: " << output_num; -} -#endif - -} // namespace lite_api -} // namespace paddle - -int main(int argc, char** argv) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - if (FLAGS_model_dir == "") { - LOG(INFO) << "usage: " - << "--model_dir /path/to/your/model"; - exit(0); - } - std::string save_optimized_model_dir = FLAGS_model_dir + "opt2"; - - auto split_string = - [](const std::string& str_in) -> std::vector { - std::vector str_out; - std::string tmp_str = str_in; - while (!tmp_str.empty()) { - size_t next_offset = 
tmp_str.find(":"); - str_out.push_back(tmp_str.substr(0, next_offset)); - if (next_offset == std::string::npos) { - break; - } else { - tmp_str = tmp_str.substr(next_offset + 1); - } - } - return str_out; - }; - - auto get_shape = [](const std::string& str_shape) -> std::vector { - std::vector shape; - std::string tmp_str = str_shape; - while (!tmp_str.empty()) { - int dim = atoi(tmp_str.data()); - shape.push_back(dim); - size_t next_offset = tmp_str.find(","); - if (next_offset == std::string::npos) { - break; - } else { - tmp_str = tmp_str.substr(next_offset + 1); - } - } - return shape; - }; - - LOG(INFO) << "input shapes: " << FLAGS_input_shape; - std::vector str_input_shapes = split_string(FLAGS_input_shape); - std::vector> input_shapes; - for (int i = 0; i < str_input_shapes.size(); ++i) { - LOG(INFO) << "input shape: " << str_input_shapes[i]; - input_shapes.push_back(get_shape(str_input_shapes[i])); - } - - // Output optimized model - paddle::lite_api::OutputOptModel( - FLAGS_model_dir, save_optimized_model_dir, input_shapes); - -#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - // Run inference using optimized model - paddle::lite_api::Run(input_shapes, - save_optimized_model_dir, - FLAGS_repeats, - FLAGS_threads, - FLAGS_warmup); -#endif - return 0; -} diff --git a/lite/api/ocr_attention_test.cc b/lite/api/ocr_attention_test.cc deleted file mode 100644 index 89cf6a3e8d..0000000000 --- a/lite/api/ocr_attention_test.cc +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -void TestModel(const std::vector& valid_places, - const Place& preferred_place, - bool use_npu = false) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - - predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 1, 48, 512}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - auto* init_scores = predictor.GetInput(2); - init_scores->Resize(DDim(std::vector({1, 1}))); - auto* data_scores = init_scores->mutable_data(); - auto scores_size = input_tensor->dims().production(); - for (int i = 0; i < scores_size; i++) { - data_scores[i] = 0; - } - auto lod_scores = init_scores->mutable_lod(); - std::vector> lod_s{{0, 1}, {0, 1}}; - *lod_scores = lod_s; - - auto* init_ids = predictor.GetInput(1); - init_ids->Resize(DDim(std::vector({1, 1}))); - auto* data_ids = init_ids->mutable_data(); - auto ids_size = init_ids->dims().production(); - for (int i = 0; i < ids_size; i++) { - data_ids[i] = 0; - } - auto lod_ids = init_ids->mutable_lod(); - std::vector> lod_i{{0, 1}, {0, 1}}; - *lod_ids = lod_i; - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - // std::vector> results; - // // i = 1 - // results.emplace_back(std::vector( - // {0.00019130898, 9.467885e-05, 0.00015971427, 0.0003650665, - // 0.00026431272, 0.00060884043, 0.0002107942, 0.0015819625, - // 0.0010323516, 0.00010079765, 0.00011006987, 0.0017364529, - // 0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986, - // 0.00020211363, 0.00013668182, 0.0005855956, 0.00025901722})); - // auto* out = predictor.GetOutput(0); - // ASSERT_EQ(out->dims().size(), 2); - // ASSERT_EQ(out->dims()[0], 1); - // ASSERT_EQ(out->dims()[1], 1000); - // - // int step = 50; - // for (int i = 0; i < results.size(); ++i) { - // for (int j = 0; j < results[i].size(); ++j) { - // EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - // results[i][j], - // 1e-6); - // } - // } -} - -TEST(OcrAttention, test_arm) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc deleted file mode 100644 index fee4ebf6dc..0000000000 --- a/lite/api/paddle_api.cc +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/api/paddle_api.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite_api { - -Tensor::Tensor(void *raw) : raw_tensor_(raw) {} - -// TODO(Superjomn) refine this by using another `const void* const_raw`; -Tensor::Tensor(const void *raw) { raw_tensor_ = const_cast<void *>(raw); } - -lite::Tensor *tensor(void *x) { return static_cast<lite::Tensor *>(x); } -const lite::Tensor *ctensor(void *x) { - return static_cast<const lite::Tensor *>(x); -} - -void Tensor::Resize(const shape_t &shape) { - tensor(raw_tensor_)->Resize(shape); -} - -template <> -const float *Tensor::data() const { - return ctensor(raw_tensor_)->data<float>(); -} -template <> -const int8_t *Tensor::data() const { - return ctensor(raw_tensor_)->data<int8_t>(); -} - -template <> -float *Tensor::mutable_data() const { - return tensor(raw_tensor_)->mutable_data<float>(); -} -template <> -int8_t *Tensor::mutable_data() const { - return tensor(raw_tensor_)->mutable_data<int8_t>(); -} - -shape_t Tensor::shape() const { - return ctensor(raw_tensor_)->dims().Vectorize(); -} - -lod_t Tensor::lod() const { return ctensor(raw_tensor_)->lod(); } - -void Tensor::SetLoD(const lod_t &lod) { tensor(raw_tensor_)->set_lod(lod); } - -void PaddlePredictor::SaveOptimizedModel(const std::string &model_dir, - LiteModelType model_type) { - LOG(FATAL) - << "The SaveOptimizedModel API is only supported by CxxConfig predictor."; -} - -template <typename ConfigT> -std::shared_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT &) { - return std::shared_ptr<PaddlePredictor>(); -} - -} // namespace lite_api -} // namespace paddle diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h deleted file mode 100644 index b1a8b21935..0000000000 --- a/lite/api/paddle_api.h +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* - * This file defines PaddlePredictor, the api for lite. It supports multiple - * hardware including ARM, X86, OpenCL, CUDA and so on. - */ - -#ifndef PADDLE_LITE_API_H_ // NOLINT -#define PADDLE_LITE_API_H_ -#include <memory> -#include <string> -#include <vector> -#include "paddle_place.h" // NOLINT - -namespace paddle { -namespace lite_api { - -using shape_t = std::vector<int64_t>; -using lod_t = std::vector<std::vector<uint64_t>>; - -enum class LiteModelType { kProtobuf = 0, kNaiveBuffer, UNK }; - -struct LITE_API Tensor { - explicit Tensor(void* raw); - explicit Tensor(const void* raw); - - void Resize(const shape_t& shape); - - /// Readonly data. - template <typename T> - const T* data() const; - - template <typename T> - T* mutable_data() const; - - /// Shape of the tensor. 
- shape_t shape() const; - - // LoD of the tensor - lod_t lod() const; - - // Set LoD of the tensor - void SetLoD(const lod_t& lod); - - private: - void* raw_tensor_; -}; - -/// The PaddlePredictor defines the basic interfaces for different kinds of -/// predictors. -class LITE_API PaddlePredictor { - public: - PaddlePredictor() = default; - - /// Get i-th input. - virtual std::unique_ptr GetInput(int i) = 0; - - /// Get i-th output. - virtual std::unique_ptr GetOutput(int i) const = 0; - - virtual void Run() = 0; - - /// Get a readonly tensor, return null if no one called `name` exists. - virtual std::unique_ptr GetTensor( - const std::string& name) const = 0; - - /// Persist the optimized model to disk. This API is only supported by - /// CxxConfig, and the persisted model can be reused for MobileConfig. - virtual void SaveOptimizedModel( - const std::string& model_dir, - LiteModelType model_type = LiteModelType::kProtobuf); - - virtual ~PaddlePredictor() = default; -}; - -/// Base class for all the configs. -class LITE_API ConfigBase { - std::string model_dir_; - - public: - void set_model_dir(const std::string& x) { model_dir_ = x; } - - const std::string& model_dir() const { return model_dir_; } -}; - -/// CxxConfig is the config for the Full feature predictor. -class LITE_API CxxConfig : public ConfigBase { - Place preferred_place_; - std::vector valid_places_; - std::string model_file_; - std::string param_file_; - bool model_from_memory_{false}; - - public: - void set_preferred_place(const Place& x) { preferred_place_ = x; } - void set_valid_places(const std::vector& x) { valid_places_ = x; } - void set_model_file(const std::string& path) { model_file_ = path; } - void set_param_file(const std::string& path) { param_file_ = path; } - void set_model_buffer(const char* model_buffer, - size_t model_buffer_size, - const char* param_buffer, - size_t param_buffer_size) { - model_file_ = std::string(model_buffer, model_buffer + model_buffer_size); - param_file_ = std::string(param_buffer, param_buffer + param_buffer_size); - model_from_memory_ = true; - } - - const Place& preferred_place() const { return preferred_place_; } - const std::vector& valid_places() const { return valid_places_; } - std::string model_file() const { return model_file_; } - std::string param_file() const { return param_file_; } - bool model_from_memory() const { return model_from_memory_; } -}; - -/// MobileConfig is the config for the light weight predictor, it will skip -/// IR optimization or other unnecessary stages. 
-class LITE_API MobileConfig : public ConfigBase { - PowerMode mode_{LITE_POWER_HIGH}; - int threads_{1}; - std::string model_buffer_; - std::string param_buffer_; - bool model_from_memory_{false}; - - public: - MobileConfig(Place preferred_place = Place(TARGET(kARM), - PRECISION(kFloat), - DATALAYOUT(kNCHW)), - PowerMode mode = LITE_POWER_HIGH, - int threads = 1) - : mode_(mode), threads_(threads) {} - void set_power_mode(PowerMode mode) { mode_ = mode; } - void set_threads(int threads) { threads_ = threads; } - void set_model_buffer(const char* model_buffer, - size_t model_buffer_size, - const char* param_buffer, - size_t param_buffer_size) { - model_buffer_ = std::string(model_buffer, model_buffer + model_buffer_size); - param_buffer_ = std::string(param_buffer, param_buffer + param_buffer_size); - model_from_memory_ = true; - } - - PowerMode power_mode() const { return mode_; } - int threads() const { return threads_; } - bool model_from_memory() const { return model_from_memory_; } - const std::string& model_buffer() const { return model_buffer_; } - const std::string& param_buffer() const { return param_buffer_; } -}; - -template -std::shared_ptr CreatePaddlePredictor(const ConfigT&); - -} // namespace lite_api -} // namespace paddle - -#endif // NOLINT diff --git a/lite/api/paddle_api_test.cc b/lite/api/paddle_api_test.cc deleted file mode 100644 index 02502ff9c8..0000000000 --- a/lite/api/paddle_api_test.cc +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
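// A minimal sketch of feeding a sequence input through the API declared above,
// using lod_t = vector<vector<uint64_t>>; the two-level {{0, 1}, {0, 1}} LoD
// mirrors the OCR attention test earlier in this patch (the helper name and
// input index are illustrative):
void FeedLodInput(paddle::lite_api::PaddlePredictor* predictor) {
  auto ids = predictor->GetInput(1);   // illustrative input index
  ids->Resize({1, 1});
  ids->mutable_data<float>()[0] = 0.f;
  ids->SetLoD({{0, 1}, {0, 1}});       // sequence offsets per LoD level
}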
- -#include "lite/api/paddle_api.h" -#include -#include -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/utils/cp_logging.h" -#include "lite/utils/io.h" -DEFINE_string(model_dir, "", ""); - -namespace paddle { -namespace lite_api { - -TEST(CxxApi, run) { - lite_api::CxxConfig config; - config.set_model_dir(FLAGS_model_dir); - config.set_preferred_place(Place{TARGET(kX86), PRECISION(kFloat)}); - config.set_valid_places({ - Place{TARGET(kX86), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - - auto predictor = lite_api::CreatePaddlePredictor(config); - - auto input_tensor = predictor->GetInput(0); - input_tensor->Resize(std::vector({100, 100})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; - } - - predictor->Run(); - - auto output = predictor->GetOutput(0); - auto* out = output->data(); - LOG(INFO) << out[0]; - LOG(INFO) << out[1]; - - EXPECT_NEAR(out[0], 50.2132, 1e-3); - EXPECT_NEAR(out[1], -28.8729, 1e-3); - - predictor->SaveOptimizedModel(FLAGS_model_dir + ".opt2"); - predictor->SaveOptimizedModel(FLAGS_model_dir + ".opt2.naive", - LiteModelType::kNaiveBuffer); -} - -// Demo1 for Mobile Devices :Load model from file and run -#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -TEST(LightApi, run) { - lite_api::MobileConfig config; - config.set_model_dir(FLAGS_model_dir + ".opt2.naive"); - - auto predictor = lite_api::CreatePaddlePredictor(config); - - auto input_tensor = predictor->GetInput(0); - input_tensor->Resize(std::vector({100, 100})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; - } - - predictor->Run(); - - auto output = predictor->GetOutput(0); - auto* out = output->data(); - LOG(INFO) << out[0]; - LOG(INFO) << out[1]; - - EXPECT_NEAR(out[0], 50.2132, 1e-3); - EXPECT_NEAR(out[1], -28.8729, 1e-3); -} - -// Demo2 for Loading model from memory -TEST(MobileConfig, LoadfromMemory) { - // Get naive buffer - auto model_path = std::string(FLAGS_model_dir) + ".opt2.naive/__model__.nb"; - auto params_path = std::string(FLAGS_model_dir) + ".opt2.naive/param.nb"; - std::string model_buffer = lite::ReadFile(model_path); - size_t size_model = model_buffer.length(); - std::string params_buffer = lite::ReadFile(params_path); - size_t size_params = params_buffer.length(); - // set model buffer and run model - lite_api::MobileConfig config; - config.set_model_buffer( - model_buffer.c_str(), size_model, params_buffer.c_str(), size_params); - - auto predictor = lite_api::CreatePaddlePredictor(config); - auto input_tensor = predictor->GetInput(0); - input_tensor->Resize(std::vector({100, 100})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; - } - - predictor->Run(); - - const auto output = predictor->GetOutput(0); - const float* raw_output = output->data(); - - for (int i = 0; i < 10; i++) { - LOG(INFO) << "out " << raw_output[i]; - } -} - -#endif - -} // namespace lite_api -} // namespace paddle diff --git a/lite/api/paddle_lite_factory_helper.h b/lite/api/paddle_lite_factory_helper.h deleted file mode 100644 index e99127e233..0000000000 --- a/lite/api/paddle_lite_factory_helper.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* - * This file defines some MACROS that explicitly determine the op, kernel, mir - * passes used in the inference lib. - */ -#pragma once - -#define USE_LITE_OP(op_type__) \ - extern int touch_op_##op_type__(); \ - int LITE_OP_REGISTER_FAKE(op_type__) __attribute__((unused)) = \ - touch_op_##op_type__(); - -#define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \ - extern int touch_##op_type__##target__##precision__##layout__##alias__(); \ - int op_type__##target__##precision__##layout__##alias__##__use_lite_kernel \ - __attribute__((unused)) = \ - touch_##op_type__##target__##precision__##layout__##alias__(); - -#define USE_MIR_PASS(name__) \ - extern bool mir_pass_registry##name__##_fake(); \ - static bool mir_pass_usage##name__ __attribute__((unused)) = \ - mir_pass_registry##name__##_fake(); - -#define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__ diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc deleted file mode 100644 index dbdf9ff269..0000000000 --- a/lite/api/paddle_place.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
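// The USE_* macros above exist to pull op/kernel/pass registrations into the
// final binary. For example, USE_LITE_OP(fc) expands roughly to:
//   extern int touch_op_fc();
//   int fc__registry__ __attribute__((unused)) = touch_op_fc();
// Referencing touch_op_fc() forces the linker to keep the translation unit
// that registers the "fc" op, even though nothing else names it directly.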
- -#include "lite/api/paddle_place.h" -#include "lite/utils/cp_logging.h" -#include "lite/utils/hash.h" -#include "lite/utils/replace_stl/stream.h" -#include "lite/utils/string.h" - -namespace paddle { -namespace lite_api { - -size_t Place::hash() const { - std::hash h; - size_t hash = h(static_cast(target)); - hash = lite::hash_combine(hash, static_cast(precision)); - hash = lite::hash_combine(hash, static_cast(layout)); - hash = lite::hash_combine(hash, static_cast(device)); - return hash; -} - -bool operator<(const Place& a, const Place& b) { - if (a.target != b.target) return a.target < b.target; - if (a.precision != b.precision) return a.precision < b.precision; - if (a.layout != b.layout) return a.layout < b.layout; - if (a.device != b.device) return a.device < b.device; - return false; -} - -std::string Place::DebugString() const { - STL::stringstream os; - os << TargetToStr(target) << "/" << PrecisionToStr(precision) << "/" - << DataLayoutToStr(layout); - return os.str(); -} - -const std::string& TargetToStr(TargetType target) { - static const std::string target2string[] = { - "unk", "host", "x86", "cuda", "arm", "opencl", "any", "fpga", "npu"}; - auto x = static_cast(target); - CHECK_LT(x, static_cast(TARGET(NUM))); - return target2string[x]; -} - -const std::string& PrecisionToStr(PrecisionType precision) { - static const std::string precision2string[] = {"unk", - "float", - "int8_t", - "int32_t", - "any", - "float16", - "bool", - "int64_t", - "int16_t"}; - auto x = static_cast(precision); - CHECK_LT(x, static_cast(PRECISION(NUM))); - return precision2string[x]; -} - -const std::string& DataLayoutToStr(DataLayoutType layout) { - static const std::string datalayout2string[] = {"unk", "NCHW", "any", "NHWC"}; - auto x = static_cast(layout); - CHECK_LT(x, static_cast(DATALAYOUT(NUM))); - return datalayout2string[x]; -} - -const std::string& TargetRepr(TargetType target) { - static const std::string target2string[] = {"kUnk", - "kHost", - "kX86", - "kCUDA", - "kARM", - "kOpenCL", - "kAny", - "kFPGA", - "kNPU"}; - auto x = static_cast(target); - CHECK_LT(x, static_cast(TARGET(NUM))); - return target2string[x]; -} - -const std::string& PrecisionRepr(PrecisionType precision) { - static const std::string precision2string[] = {"kUnk", - "kFloat", - "kInt8", - "kInt32", - "kAny", - "kFP16", - "kBool", - "kInt64", - "kInt16"}; - auto x = static_cast(precision); - CHECK_LT(x, static_cast(PRECISION(NUM))); - return precision2string[x]; -} - -const std::string& DataLayoutRepr(DataLayoutType layout) { - static const std::string datalayout2string[] = { - "kUnk", "kNCHW", "kAny", "kNHWC"}; - auto x = static_cast(layout); - CHECK_LT(x, static_cast(DATALAYOUT(NUM))); - return datalayout2string[x]; -} - -} // namespace lite_api -} // namespace paddle diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h deleted file mode 100644 index 5e4f2ed21c..0000000000 --- a/lite/api/paddle_place.h +++ /dev/null @@ -1,164 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include - -// Generic helper definitions for shared library support -#if defined _WIN32 || defined __CYGWIN__ -#define PADDLE_LITE_HELPER_DLL_IMPORT __declspec(dllimport) -#define PADDLE_LITE_HELPER_DLL_EXPORT __declspec(dllexport) -#define PADDLE_LITE_HELPER_DLL_LOCAL -#else -#if __GNUC__ >= 4 -#define PADDLE_LITE_HELPER_DLL_IMPORT __attribute__((visibility("default"))) -#define PADDLE_LITE_HELPER_DLL_EXPORT __attribute__((visibility("default"))) -#else -#define PADDLE_LITE_HELPER_DLL_IMPORT -#define PADDLE_LITE_HELPER_DLL_EXPORT -#endif -#endif - -#ifdef LITE_ON_TINY_PUBLISH -#define LITE_API PADDLE_LITE_HELPER_DLL_EXPORT -#define LITE_API_IMPORT PADDLE_LITE_HELPER_DLL_IMPORT -#else -#define LITE_API -#define LITE_API_IMPORT -#endif - -namespace paddle { -namespace lite_api { - -enum class TargetType : int { - kUnk = 0, - kHost = 1, - kX86 = 2, - kCUDA = 3, - kARM = 4, - kOpenCL = 5, - kFPGA = 7, - kNPU = 8, - kAny = 6, // any target - NUM = 9, // number of fields. -}; -enum class PrecisionType : int { - kUnk = 0, - kFloat = 1, - kInt8 = 2, - kInt32 = 3, - kAny = 4, // any precision - kFP16 = 5, - kBool = 6, - kInt64 = 7, - kInt16 = 8, - NUM = 9, // number of fields. -}; -enum class DataLayoutType : int { - kUnk = 0, - kNCHW = 1, - kNHWC = 3, - kAny = 2, // any data layout - NUM = 4, // number of fields. -}; - -typedef enum { - LITE_POWER_HIGH = 0, - LITE_POWER_LOW = 1, - LITE_POWER_FULL = 2, - LITE_POWER_NO_BIND = 3, - LITE_POWER_RAND_HIGH = 4, - LITE_POWER_RAND_LOW = 5 -} PowerMode; - -enum class ActivationType : int { - kIndentity = 0, - kRelu = 1, - kRelu6 = 2, - kPRelu = 3, - kLeakyRelu = 4, - kSigmoid = 5, - kTanh = 6, - kSwish = 7 -}; - -static size_t PrecisionTypeLength(PrecisionType type) { - switch (type) { - case PrecisionType::kFloat: - return 4; - case PrecisionType::kInt8: - return 1; - case PrecisionType::kInt32: - return 4; - case PrecisionType::kFP16: - return 2; - default: - return 4; - } -} - -#define TARGET(item__) paddle::lite_api::TargetType::item__ -#define PRECISION(item__) paddle::lite_api::PrecisionType::item__ -#define DATALAYOUT(item__) paddle::lite_api::DataLayoutType::item__ - -const std::string& TargetToStr(TargetType target); - -const std::string& PrecisionToStr(PrecisionType precision); - -const std::string& DataLayoutToStr(DataLayoutType layout); - -const std::string& TargetRepr(TargetType target); - -const std::string& PrecisionRepr(PrecisionType precision); - -const std::string& DataLayoutRepr(DataLayoutType layout); - -/* - * Place specifies the execution context of a Kernel or input/output for a - * kernel. It is used to make the analysis of the MIR more clear and accurate. 
- */ -struct LITE_API Place { - TargetType target{TARGET(kUnk)}; - PrecisionType precision{PRECISION(kUnk)}; - DataLayoutType layout{DATALAYOUT(kUnk)}; - int16_t device{0}; // device ID - - Place() = default; - Place(TargetType target, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW), - int16_t device = 0) - : target(target), precision(precision), layout(layout), device(device) {} - - bool is_valid() const { - return target != TARGET(kUnk) && precision != PRECISION(kUnk) && - layout != DATALAYOUT(kUnk); - } - - size_t hash() const; - - bool operator==(const Place& other) const { - return target == other.target && precision == other.precision && - layout == other.layout && device == other.device; - } - - bool operator!=(const Place& other) const { return !(*this == other); } - - friend bool operator<(const Place& a, const Place& b); - - std::string DebugString() const; -}; - -} // namespace lite_api -} // namespace paddle diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h deleted file mode 100644 index e43c0f2768..0000000000 --- a/lite/api/paddle_use_passes.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle_lite_factory_helper.h" // NOLINT - -USE_MIR_PASS(demo); -USE_MIR_PASS(static_kernel_pick_pass); -USE_MIR_PASS(variable_place_inference_pass); -USE_MIR_PASS(type_target_cast_pass); -USE_MIR_PASS(generate_program_pass); -USE_MIR_PASS(subgraph_program_pass); - -USE_MIR_PASS(io_copy_kernel_pick_pass); -USE_MIR_PASS(argument_type_display_pass); -USE_MIR_PASS(runtime_context_assign_pass); -USE_MIR_PASS(graph_visualze); - -USE_MIR_PASS(lite_conv_bn_fuse_pass); -USE_MIR_PASS(lite_fc_fuse_pass); -USE_MIR_PASS(lite_shuffle_channel_fuse_pass); -USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass); -USE_MIR_PASS(lite_interpolate_fuse_pass); -USE_MIR_PASS(identity_scale_eliminate_pass); -USE_MIR_PASS(lite_conv_elementwise_fuse_pass); -USE_MIR_PASS(lite_conv_activation_fuse_pass); -USE_MIR_PASS(lite_elementwise_add_activation_fuse_pass); -USE_MIR_PASS(lite_quant_dequant_fuse_pass); -USE_MIR_PASS(type_precision_cast_pass); -USE_MIR_PASS(type_layout_cast_pass); diff --git a/lite/api/resnet18_test.cc b/lite/api/resnet18_test.cc deleted file mode 100644 index c003dc1dba..0000000000 --- a/lite/api/resnet18_test.cc +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -#ifdef LITE_WITH_ARM -TEST(ResNet18, test) { - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}}); - - predictor.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kARM), PRECISION(kFloat)}, - valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {0.00020891492, 0.00012855872, 0.00019274367, 0.00031139381, - 0.0003184143, 0.00022596598, 0.00025920002, 0.0006651449, - 0.0015664422, 0.0002835265, 0.0001418782, 0.0013916927, - 0.007779476, 0.0020724828, 0.0012296075, 0.00073855236, - 0.00014572912, 0.00025809053, 0.0004427299, 0.00042198936})); - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - results[i][j], - 1e-6); - } - } -} -#endif - -} // namespace lite -} // namespace paddle diff --git a/lite/api/resnet50_test.cc b/lite/api/resnet50_test.cc deleted file mode 100644 index 6e78d12be0..0000000000 --- a/lite/api/resnet50_test.cc +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
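// The reference vectors in these classification tests hold every 50th logit of
// the 1000-class output, so the EXPECT_NEAR indexing maps row i / sample j back
// to the flat buffer as below (a restating of the tests' arithmetic, not a new
// API):
inline int RefIndex(int i, int j, int num_classes = 1000, int step = 50) {
  return j * step + num_classes * i;  // j-th sampled logit of the i-th row
}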
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -#ifdef LITE_WITH_ARM -void TestModel(const std::vector& valid_places, - const Place& preferred_place) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - - predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {0.00024139918, 0.00020566184, 0.00022418296, 0.00041731037, - 0.0005366107, 0.00016948722, 0.00028638865, 0.0009257241, - 0.00072681636, 8.531815e-05, 0.0002129998, 0.0021168243, - 0.006387163, 0.0037145028, 0.0012812682, 0.00045948103, - 0.00013535398, 0.0002483765, 0.00076759676, 0.0002773295})); - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - results[i][j], - 1e-6); - } - } -} - -TEST(ResNet50, test_arm) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); -} - -#ifdef LITE_WITH_OPENCL -TEST(ResNet50, test_opencl) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kOpenCL), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kOpenCL), PRECISION(kFloat)})); -} -#endif // LITE_WITH_OPENCL - -#endif // LITE_WITH_ARM - -} // namespace lite -} // namespace paddle diff --git a/lite/api/resnet50_test_fpga.cc b/lite/api/resnet50_test_fpga.cc deleted file mode 100644 index 7ea81cc746..0000000000 --- a/lite/api/resnet50_test_fpga.cc +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
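// In the OpenCL variants above, valid_places lists kHost, kARM and kOpenCL
// while the preferred place picks kOpenCL; as the tests suggest, ops without an
// OpenCL kernel fall back to the ARM or host implementations. A condensed
// sketch of that place list (the helper name is illustrative):
#include <vector>
#include "lite/api/paddle_place.h"

std::vector<paddle::lite_api::Place> OpenclWithArmFallback() {
  using paddle::lite_api::Place;
  return {
      Place{TARGET(kHost), PRECISION(kFloat)},    // host kernels (feed/fetch)
      Place{TARGET(kARM), PRECISION(kFloat)},     // CPU fallback
      Place{TARGET(kOpenCL), PRECISION(kFloat)},  // preferred GPU kernels
  };
}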
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -#ifdef LITE_WITH_FPGA -TEST(ResNet50, test) { - lite::Predictor predictor; - std::vector valid_places( - {Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}, - Place{TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNHWC)}}); - - predictor.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}, - valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; -} -#endif - -} // namespace lite -} // namespace paddle diff --git a/lite/api/shufflenetv2_test.cc b/lite/api/shufflenetv2_test.cc deleted file mode 100644 index f67bc8c6cf..0000000000 --- a/lite/api/shufflenetv2_test.cc +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
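// The FPGA test is the one spot in this patch that exercises a non-default
// precision and layout; constructing that Place looks like the following
// (is_valid() holds because target, precision and layout are all non-kUnk):
paddle::lite_api::Place fpga_place{TARGET(kFPGA), PRECISION(kFP16),
                                   DATALAYOUT(kNHWC)};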
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -void TestModel(const std::vector& valid_places, - const Place& preferred_place) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - - predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim((std::vector({1, 3, 224, 224})))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; ++i) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - results.emplace_back(std::vector( - {0.00020622103, 9.36264e-05, 0.0002608151, 0.0004974526, - 0.00028529152, 9.3994095e-05, 0.00028626667, 0.0011567438, - 0.00094107876, 8.8955254e-05, 4.1932417e-05, 0.00016469292, - 0.006776762, 0.0028232741, 0.00024495262, 0.00022493803, - 0.00015700555, 0.00013883937, 0.00093898486, 0.00018184447})); - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - results[i][j], - 1e-6); - } - } -} - -TEST(ShuffleNetV2, test_arm) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - // Place{TARGET(kOpenCL), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/test_googlenet_lite.cc b/lite/api/test_googlenet_lite.cc deleted file mode 100644 index 4c9ecd90c6..0000000000 --- a/lite/api/test_googlenet_lite.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/lite_api_test_helper.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" - -// for googlenet -DEFINE_string(model_dir, "", ""); - -namespace paddle { -namespace lite { -#ifdef LITE_WITH_X86 -TEST(CXXApi, test_lite_googlenet) { - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}}); - - // LOG(INFO)<<"FLAGS_eval_googlenet_dir:"<Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < input_tensor->dims().production(); i++) { - data[i] = 1; - } - predictor.Run(); - - auto* out = predictor.GetOutput(0); - std::vector results( - {0.00034298553, 0.0008200012, 0.0005046297, 0.000839279, - 0.00052616704, 0.0003447803, 0.0010877076, 0.00081762316, - 0.0003941339, 0.0011430943, 0.0008892841, 0.00080191303, - 0.0004442384, 0.000658702, 0.0026721435, 0.0013686896, - 0.0005618166, 0.0006556497, 0.0006984528, 0.0014619455}); - for (size_t i = 0; i < results.size(); ++i) { - EXPECT_NEAR(out->data()[i * 51], results[i], 1e-5); - } - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); -} -#endif -} // namespace lite -} // namespace paddle diff --git a/lite/api/test_helper.h b/lite/api/test_helper.h deleted file mode 100644 index d835c030f0..0000000000 --- a/lite/api/test_helper.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -// for eval -DEFINE_string(model_dir, "", "model dir"); -DEFINE_int32(warmup, 0, "warmup times"); -DEFINE_int32(repeats, 1, "repeats times"); -DEFINE_int32(threads, 1, "threads num"); -DEFINE_int32(im_width, 224, "image width"); -DEFINE_int32(im_height, 224, "image height"); -DEFINE_bool(int8, false, "is run int8"); - -namespace paddle { -namespace lite { - -inline double GetCurrentUS() { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/test_inceptionv4_lite_x86.cc b/lite/api/test_inceptionv4_lite_x86.cc deleted file mode 100644 index 5d1dbbe144..0000000000 --- a/lite/api/test_inceptionv4_lite_x86.cc +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/lite_api_test_helper.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { - -TEST(InceptionV4, test_inceptionv4_lite_x86) { - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}}); - - // LOG(INFO)<<"FLAGS_eval_googlenet_dir:"< passes({"static_kernel_pick_pass", - "variable_place_inference_pass", - "type_target_cast_pass", - "variable_place_inference_pass", - "io_copy_kernel_pick_pass", - "variable_place_inference_pass", - "runtime_context_assign_pass"}); - predictor.Build(model_dir, - "", - "", - Place{TARGET(kX86), PRECISION(kFloat)}, - valid_places, - passes); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < input_tensor->dims().production(); i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", warmup: " << FLAGS_warmup - << ", repeats: " << FLAGS_repeats << ", spend " - << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {0.0011684548, 0.0010390386, 0.0011301535, 0.0010133048, - 0.0010259597, 0.0010982729, 0.00093195855, 0.0009141837, - 0.00096620916, 0.00089982944, 0.0010064574, 0.0010474789, - 0.0009782845, 0.0009230255, 0.0010548076, 0.0010974824, - 0.0010612885, 0.00089107914, 0.0010112736, 0.00097655767})); - - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - 
results[i][j], - 1e-6); - } - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/test_mobilenetv1_lite_x86.cc b/lite/api/test_mobilenetv1_lite_x86.cc deleted file mode 100644 index d755410b6a..0000000000 --- a/lite/api/test_mobilenetv1_lite_x86.cc +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/lite_api_test_helper.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { - -TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) { - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}}); - - std::string model_dir = FLAGS_model_dir; - std::vector passes({"static_kernel_pick_pass", - "variable_place_inference_pass", - "type_target_cast_pass", - "variable_place_inference_pass", - "io_copy_kernel_pick_pass", - "variable_place_inference_pass", - "runtime_context_assign_pass"}); - predictor.Build(model_dir, - "", - "", - Place{TARGET(kX86), PRECISION(kFloat)}, - valid_places, - passes); - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < input_tensor->dims().production(); i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", warmup: " << FLAGS_warmup - << ", repeats: " << FLAGS_repeats << ", spend " - << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {0.00019130898, 9.467885e-05, 0.00015971427, 0.0003650665, - 0.00026431272, 0.00060884043, 0.0002107942, 0.0015819625, - 0.0010323516, 0.00010079765, 0.00011006987, 0.0017364529, - 0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986, - 0.00020211363, 
0.00013668182, 0.0005855956, 0.00025901722})); - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - results[i][j], - 1e-6); - } - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/test_mobilenetv2_lite_x86.cc b/lite/api/test_mobilenetv2_lite_x86.cc deleted file mode 100644 index b1090cc6f2..0000000000 --- a/lite/api/test_mobilenetv2_lite_x86.cc +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
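The x86 model tests in this patch all validate accuracy the same way: they sample every 50th logit of the 1x1000 output row and compare the 20 sampled values against golden results recorded from a reference run, with a 1e-6 tolerance. A minimal sketch of that check, factored into a helper (the name CheckSampledOutput is illustrative, not part of the deleted sources):

#include <gtest/gtest.h>
#include <vector>

// Compare sampled elements of a row-major [rows x row_size] output against
// golden values; step is the sampling stride (50 in the tests above).
void CheckSampledOutput(const float* out_data,
                        const std::vector<std::vector<float>>& golden,
                        int row_size,  // out->dims()[1], e.g. 1000
                        int step) {
  for (size_t i = 0; i < golden.size(); ++i) {
    for (size_t j = 0; j < golden[i].size(); ++j) {
      EXPECT_NEAR(out_data[j * step + row_size * i], golden[i][j], 1e-6);
    }
  }
}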
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/lite_api_test_helper.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" -// for googlenet - -namespace paddle { -namespace lite { - -TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) { - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}}); - - // LOG(INFO)<<"FLAGS_eval_googlenet_dir:"< passes({"static_kernel_pick_pass", - "variable_place_inference_pass", - "type_target_cast_pass", - "variable_place_inference_pass", - "io_copy_kernel_pick_pass", - "variable_place_inference_pass", - "runtime_context_assign_pass"}); - predictor.Build(model_dir, - "", - "", - Place{TARGET(kX86), PRECISION(kFloat)}, - valid_places, - passes); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < input_tensor->dims().production(); i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", warmup: " << FLAGS_warmup - << ", repeats: " << FLAGS_repeats << ", spend " - << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {0.00017082224, 5.699624e-05, 0.000260885, 0.00016412718, - 0.00034818667, 0.00015230637, 0.00032959113, 0.0014772735, - 0.0009059976, 9.5378724e-05, 5.386537e-05, 0.0006427285, - 0.0070957416, 0.0016094646, 0.0018807327, 0.00010506048, - 6.823785e-05, 0.00012269315, 0.0007806194, 0.00022354358})); - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - results[i][j], - 1e-6); - } - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/unet_test.cc b/lite/api/unet_test.cc deleted file mode 100644 index aae5f493eb..0000000000 --- a/lite/api/unet_test.cc +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
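Each of these tests measures latency with the same warmup/repeat loop, built on the GetCurrentUS() helper from lite/api/test_helper.h (deleted above), which returns wall-clock time in microseconds. A condensed sketch of the shared pattern, assuming a built lite::Predictor and the gflags defined in test_helper.h:

// Warm up, then time FLAGS_repeats runs and report the mean in milliseconds.
for (int i = 0; i < FLAGS_warmup; ++i) {
  predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
  predictor.Run();
}
double avg_ms = (GetCurrentUS() - start) / FLAGS_repeats / 1000.0;
LOG(INFO) << "Model: " << FLAGS_model_dir << ", warmup: " << FLAGS_warmup
          << ", repeats: " << FLAGS_repeats << ", spend " << avg_ms
          << " ms in average.";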
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -#ifdef LITE_WITH_ARM -TEST(unet, test) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}}); - - predictor.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kARM), PRECISION(kFloat)}, - valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 512, 512}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - // std::vector results({0.00078033, 0.00083865, 0.00060029, 0.00057083, - // 0.00070094, 0.00080584, 0.00044525, 0.00074907, - // 0.00059774, 0.00063654}); - // - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {0.9134332, 0.9652493, 0.959906, 0.96601194, 0.9704161, 0.973321, - 0.9763035, 0.9788776, 0.98090196, 0.9823532, 0.9830632, 0.98336476, - 0.9837605, 0.98430413, 0.9848935, 0.9854547, 0.9858877, 0.9862335, - 0.9865361, 0.9867324, 0.98686767, 0.9870094, 0.98710895, 0.98710257, - 0.98703253, 0.98695105, 0.98681927, 0.98661137, 0.98637575, 0.98613656, - 0.9858899, 0.98564225, 0.9853931, 0.9851323, 0.98487836, 0.9846578, - 0.9844529, 0.9842441, 0.98405427, 0.9839205, 0.98382735, 0.98373055, - 0.9836299, 0.9835474, 0.9834818, 0.9834427, 0.98343164, 0.9834163, - 0.9833809, 0.9833255, 0.9832343, 0.9831207, 0.98302484, 0.9829579, - 0.9829039, 0.98283756, 0.9827444, 0.98264474, 0.9825466, 0.98243505, - 0.982312, 0.98218083, 0.98203814, 0.981895, 0.9817609, 0.9816264, - 0.9814932, 0.9813706, 0.98124915, 0.9811211, 0.98099536, 0.9808748, - 0.98075336, 0.9806301, 0.98050594, 0.98038554, 0.980272, 0.9801562, - 0.9800356, 0.9799207, 0.9798147, 0.97971845, 0.97963905, 0.9795745, - 0.9795107, 0.97943753, 0.9793595, 0.97928876, 0.97922987, 0.9791764, - 0.97912955, 0.9790941, 0.9790663, 0.9790414, 0.9790204, 0.9790055, - 0.97899526, 0.9789867, 0.9789797, 0.9789748})); - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 4); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 21); - - int step = 1; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - results[i][j], - 1e-6); - } - } -} -#endif - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt deleted file mode 100644 index 80dc574de8..0000000000 --- a/lite/backends/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_subdirectory(arm) -add_subdirectory(x86) -add_subdirectory(cuda) -add_subdirectory(fpga) -add_subdirectory(host) 
-add_subdirectory(opencl) -add_subdirectory(npu) diff --git a/lite/backends/arm/CMakeLists.txt b/lite/backends/arm/CMakeLists.txt deleted file mode 100644 index 2767b4e7ae..0000000000 --- a/lite/backends/arm/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(math) diff --git a/lite/backends/arm/math/CMakeLists.txt b/lite/backends/arm/math/CMakeLists.txt deleted file mode 100644 index f17928cc29..0000000000 --- a/lite/backends/arm/math/CMakeLists.txt +++ /dev/null @@ -1,111 +0,0 @@ -if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)) - return() -endif() - -if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)) - return() -endif() - -set(HAS_ARM_MATH_LIB_DIR OFF) -# will search name as "libmath_arm.${os}.${abi}.${lang}.a" -if(ARM_MATH_LIB_DIR AND EXISTS "${ARM_MATH_LIB_DIR}") - set(arm_math_name "") - if(ARM_TARGET_OS STREQUAL "android") - if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") - set(arm_math_name "math_arm.android.armv8") - elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7") - set(arm_math_name "math_arm.android.armv7") - endif() - endif() - - if(ARM_TARGET_OS STREQUAL "armlinux" ) - if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") - set(arm_math_name "math_arm.armlinux.armv8") - elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7") - set(arm_math_name "math_arm.armlinux.armv7") - endif() - endif() - - if(ARM_TARGET_LANG STREQUAL "clang") - set(arm_math_name "${arm_math_name}.clang") - else() - set(arm_math_name "${arm_math_name}.gcc") - endif() - - find_library(math_arm_file ${arm_math_name} ${ARM_MATH_LIB_DIR} NO_DEFAULT_PATH) - if(math_arm_file) - add_library(math_arm STATIC IMPORTED GLOBAL) - set_property(TARGET math_arm PROPERTY IMPORTED_LOCATION ${math_arm_file}) - message(STATUS "ARM math library imported: ${math_arm_file}") - set(HAS_ARM_MATH_LIB_DIR ON) - else() - message(WARNING "Can not find arm math library ${arm_math_name} in ${ARM_MATH_LIB_DIR}") - endif() -endif() - - -if (NOT HAS_ARM_MATH_LIB_DIR) - # TODO(xxx): seperate them and do not deps proto, eigen3 - cc_library(math_arm SRCS - funcs.cc - packed_sgemm.cc - sgemm.cc - softmax.cc - scale.cc - pooling.cc - elementwise.cc - lrn.cc - decode_bboxes.cc - concat.cc - sgemv.cc - type_trans.cc - box_coder.cc - conv_impl.cc - conv_direct_3x3s1.cc - conv_direct_3x3s2.cc - conv_direct.cc - conv_depthwise_3x3_int8.cc - conv_depthwise_5x5s1_int8.cc - conv_depthwise_3x3p0.cc - conv_depthwise_3x3p1.cc - conv_depthwise_5x5s1.cc - conv_depthwise_5x5s2.cc - conv_depthwise.cc - conv_gemmlike.cc - conv_winograd_3x3.cc - conv_winograd.cc - split.cc - shuffle_channel.cc - activation.cc - yolo_box.cc - dropout.cc - gemm_prepacked_int8.cc - gemv_arm_int8.cc - conv3x3s1_direct_int8.cc - conv3x3s2_direct_int8.cc - power.cc - interpolate.cc - argmax.cc - axpy.cc - fill_bias_relu.cc - col_im_transform.cc - im2sequence.cc - prior_box.cc - sequence_softmax.cc - norm.cc - topk.cc - increment.cc - pad2d.cc - negative.cc - beam_search.cc - reduce_max.cc - sequence_pool.cc - sequence_expand.cc - slice.cc - reduce_mean.cc - stack.cc - affine_channel.cc - anchor_generator.cc - DEPS ${lite_kernel_deps}) -endif() - diff --git a/lite/backends/arm/math/activation.cc b/lite/backends/arm/math/activation.cc deleted file mode 100644 index c227077779..0000000000 --- a/lite/backends/arm/math/activation.cc +++ /dev/null @@ -1,698 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/activation.h" -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void act_relu(const float* din, float* dout, int size, int threads) { - int nums_per_thread = size / threads; - int remain = size - threads * nums_per_thread; - int neon_loop_cnt = nums_per_thread >> 4; - int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4); - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for - for (int i = 0; i < threads; ++i) { - const float* ptr_in_thread = din + i * nums_per_thread; - float* ptr_out_thread = dout + i * nums_per_thread; - int cnt = neon_loop_cnt; -#ifdef __aarch64__ - for (int num = 0; num < neon_loop_cnt; ++num) { - float32x4_t vr0 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vr1 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vr2 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vr3 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - vr0 = vmaxq_f32(vr0, vzero); - vr1 = vmaxq_f32(vr1, vzero); - vr2 = vmaxq_f32(vr2, vzero); - vr3 = vmaxq_f32(vr3, vzero); - vst1q_f32(ptr_out_thread, vr0); - ptr_out_thread += 4; - vst1q_f32(ptr_out_thread, vr1); - ptr_out_thread += 4; - vst1q_f32(ptr_out_thread, vr2); - ptr_out_thread += 4; - vst1q_f32(ptr_out_thread, vr3); - ptr_out_thread += 4; - } - -#else - if (cnt > 0) { - asm volatile( - "1: @ loop header\n" - "vld1.32 {d0-d3}, [%[din]]! @ load din 0\n" - "vld1.32 {d4-d7}, [%[din]]! @ load din 0\n" - - "vmax.f32 q8, q0, %q[vzero] @ relu\n" - "vmax.f32 q9, q1, %q[vzero] @ relu\n" - "vmax.f32 q10, q2, %q[vzero] @ relu\n" - "vmax.f32 q11, q3, %q[vzero] @ relu\n" - - "vst1.32 {d16-d19}, [%[dout]]! @ store result, add pointer\n" - "vst1.32 {d20-d23}, [%[dout]]! @ store result, add pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - "bne 1b @ jump to main loop start " - "point\n" - : [dout] "+r"(ptr_out_thread), - [din] "+r"(ptr_in_thread), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero) - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); - } -#endif - for (int j = 0; j < neon_loop_remain; ++j) { - ptr_out_thread[0] = ptr_in_thread[0] > 0.f ? ptr_in_thread[0] : 0.f; - ptr_in_thread++; - ptr_out_thread++; - } - } - float* out_ptr_remain = dout + threads * nums_per_thread; - const float* in_ptr_remain = din + threads * nums_per_thread; - for (int j = 0; j < remain; ++j) { - out_ptr_remain[0] = in_ptr_remain[0] > 0.f ? 
in_ptr_remain[0] : 0.f; - in_ptr_remain++; - out_ptr_remain++; - } -} - -template <> -void act_relu_neg(const float* din, - float* dout, - int size, - float negative_slope, - int threads) { - int nums_per_thread = size / threads; - int remain = size - threads * nums_per_thread; - int neon_loop_cnt = nums_per_thread >> 4; - int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4); - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t valpha = vdupq_n_f32(negative_slope); -#pragma omp parallel for - for (int i = 0; i < threads; ++i) { - const float* ptr_in_thread = din + i * nums_per_thread; - float* ptr_out_thread = dout + i * nums_per_thread; - int cnt = neon_loop_cnt; -#ifdef __aarch64__ - for (int num = 0; num < neon_loop_cnt; ++num) { - float32x4_t vr0 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vr1 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vr2 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vr3 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - - uint32x4_t vm0 = vcgeq_f32(vr0, vzero); - uint32x4_t vm1 = vcgeq_f32(vr1, vzero); - uint32x4_t vm2 = vcgeq_f32(vr2, vzero); - uint32x4_t vm3 = vcgeq_f32(vr3, vzero); - - float32x4_t vn0 = vmulq_f32(vr0, valpha); - float32x4_t vn1 = vmulq_f32(vr1, valpha); - float32x4_t vn2 = vmulq_f32(vr2, valpha); - float32x4_t vn3 = vmulq_f32(vr3, valpha); - - float32x4_t vo0 = vbslq_f32(vm0, vr0, vn0); - float32x4_t vo1 = vbslq_f32(vm1, vr1, vn1); - float32x4_t vo2 = vbslq_f32(vm2, vr2, vn2); - float32x4_t vo3 = vbslq_f32(vm3, vr3, vn3); - - vst1q_f32(ptr_out_thread, vo0); - ptr_out_thread += 4; - vst1q_f32(ptr_out_thread, vo1); - ptr_out_thread += 4; - vst1q_f32(ptr_out_thread, vo2); - ptr_out_thread += 4; - vst1q_f32(ptr_out_thread, vo3); - ptr_out_thread += 4; - } - -#else - if (cnt > 0) { - asm volatile( - "1: @ loop header\n" - "vld1.32 {d0-d3}, [%[din]]! @ load din 0\n" - "vld1.32 {d4-d7}, [%[din]]! @ load din 0\n" - - "vcge.f32 q8, q0, %q[vzero] @ get mask\n" - "vcge.f32 q9, q1, %q[vzero] @ get mask\n" - "vcge.f32 q10, q2, %q[vzero] @ get mask\n" - "vcge.f32 q11, q3, %q[vzero] @ get mask\n" - - "vmul.f32 q4, q0, %q[valpha] @ get neg data\n" - "vmul.f32 q5, q1, %q[valpha] @ get neg data\n" - "vmul.f32 q6, q2, %q[valpha] @ get neg data\n" - "vmul.f32 q7, q3, %q[valpha] @ get neg data\n" - - "vbit q4, q0, q8 @ bitsel, insert q0 to q4, " - "if q8 is 1\n" - "vbit q5, q1, q9 @ bitsel, insert q1 to q5, " - "if q9 is 1\n" - "vbit q6, q2, q10 @ bitsel, insert q2 to q6, " - "if q10 is 1\n" - "vbit q7, q3, q11 @ bitsel, insert q3 to q7, " - "if q11 is 1\n" - - "vst1.32 {d8-d11}, [%[dout]]! @ store result, add pointer\n" - "vst1.32 {d12-d15}, [%[dout]]! @ store result, add pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - "bne 1b @ jump to main loop start " - "point\n" - : [dout] "+r"(ptr_out_thread), - [din] "+r"(ptr_in_thread), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), [valpha] "w"(valpha) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); - } -#endif - for (int j = 0; j < neon_loop_remain; ++j) { - ptr_out_thread[0] = ptr_in_thread[0] > 0.f - ? ptr_in_thread[0] - : ptr_in_thread[0] * negative_slope; - ptr_in_thread++; - ptr_out_thread++; - } - } - float* out_ptr_remain = dout + threads * nums_per_thread; - const float* in_ptr_remain = din + threads * nums_per_thread; - for (int j = 0; j < remain; ++j) { - out_ptr_remain[0] = in_ptr_remain[0] > 0.f - ? 
in_ptr_remain[0] - : in_ptr_remain[0] * negative_slope; - in_ptr_remain++; - out_ptr_remain++; - } -} - -template <> -void act_clipped_relu( - const float* din, float* dout, int size, float coef, int threads) { - int nums_per_thread = size / threads; - int remain = size - threads * nums_per_thread; - int neon_loop_cnt = nums_per_thread >> 4; - int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4); - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t vclip = vdupq_n_f32(coef); -#pragma omp parallel for - for (int i = 0; i < threads; ++i) { - const float* ptr_in_thread = din + i * nums_per_thread; - float* ptr_out_thread = dout + i * nums_per_thread; - int cnt = neon_loop_cnt; -#ifdef __aarch64__ - for (int num = 0; num < neon_loop_cnt; ++num) { - float32x4_t vr0 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vr1 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vr2 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vr3 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vt0 = vmaxq_f32(vr0, vzero); - float32x4_t vt1 = vmaxq_f32(vr1, vzero); - float32x4_t vt2 = vmaxq_f32(vr2, vzero); - float32x4_t vt3 = vmaxq_f32(vr3, vzero); - - float32x4_t vo0 = vminq_f32(vt0, vclip); - float32x4_t vo1 = vminq_f32(vt1, vclip); - float32x4_t vo2 = vminq_f32(vt2, vclip); - float32x4_t vo3 = vminq_f32(vt3, vclip); - - vst1q_f32(ptr_out_thread, vo0); - ptr_out_thread += 4; - vst1q_f32(ptr_out_thread, vo1); - ptr_out_thread += 4; - vst1q_f32(ptr_out_thread, vo2); - ptr_out_thread += 4; - vst1q_f32(ptr_out_thread, vo3); - ptr_out_thread += 4; - } -#else - if (cnt > 0) { - asm volatile( - "1: @ loop header\n" - "vld1.32 {d0-d3}, [%[din]]! @ load din 0\n" - "vld1.32 {d4-d7}, [%[din]]! @ load din 0\n" - - "vmax.f32 q8, q0, %q[vzero] @ relu\n" - "vmax.f32 q9, q1, %q[vzero] @ relu\n" - "vmax.f32 q10, q2, %q[vzero] @ relu\n" - "vmax.f32 q11, q3, %q[vzero] @ relu\n" - - "vmin.f32 q4, q8, %q[vclip] @ clip relu\n" - "vmin.f32 q5, q9, %q[vclip] @ clip relu\n" - "vmin.f32 q6, q10, %q[vclip] @ clip relu\n" - "vmin.f32 q7, q11, %q[vclip] @ clip relu\n" - - "vst1.32 {d8-d11}, [%[dout]]! @ store result, add pointer\n" - "vst1.32 {d12-d15}, [%[dout]]! @ store result, add pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - "bne 1b @ jump to main loop start " - "point\n" - : [dout] "+r"(ptr_out_thread), - [din] "+r"(ptr_in_thread), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), [vclip] "w"(vclip) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); - } -#endif - for (int j = 0; j < neon_loop_remain; ++j) { - ptr_out_thread[0] = ptr_in_thread[0] > 0.f ? ptr_in_thread[0] : 0.f; - ptr_out_thread[0] = ptr_out_thread[0] < coef ? ptr_out_thread[0] : coef; - ptr_in_thread++; - ptr_out_thread++; - } - } - float* out_ptr_remain = dout + threads * nums_per_thread; - const float* in_ptr_remain = din + threads * nums_per_thread; - for (int j = 0; j < remain; ++j) { - out_ptr_remain[0] = in_ptr_remain[0] > 0.f ? in_ptr_remain[0] : 0.f; - out_ptr_remain[0] = out_ptr_remain[0] < coef ? 
out_ptr_remain[0] : coef; - in_ptr_remain++; - out_ptr_remain++; - } -} - -template <> -void act_prelu(const float* din, - float* dout, - int outer_size, - int channel_size, - int inner_size, - std::string mode, - const float* alpha_data, - int threads) { - if (mode == "all" || mode == "channel") { - int stride_size = inner_size * channel_size; - int cnt = inner_size >> 4; - int remain = inner_size & 15; - float32x4_t vzero = vdupq_n_f32(0.f); - for (int n = 0; n < outer_size; n++) { - const float* data_in_batch = din + n * stride_size; - float* data_out_batch = dout + n * stride_size; -#pragma omp parallel for - for (int c = 0; c < channel_size; c++) { - const float* data_in_c = data_in_batch + c * inner_size; - float* data_out_c = data_out_batch + c * inner_size; - - float slope = mode == "all" ? alpha_data[0] : alpha_data[c]; - float32x4_t vslope = vdupq_n_f32(slope); -#ifdef __aarch64__ - for (int i = 0; i < cnt; ++i) { - float32x4_t vr0 = vld1q_f32(data_in_c); - float32x4_t vr1 = vld1q_f32(data_in_c + 4); - float32x4_t vr2 = vld1q_f32(data_in_c + 8); - float32x4_t vr3 = vld1q_f32(data_in_c + 12); - uint32x4_t vm0 = vcltq_f32(vr0, vzero); // vr0 <= vzero - uint32x4_t vm1 = vcltq_f32(vr1, vzero); // vr0 <= vzero - uint32x4_t vm2 = vcltq_f32(vr2, vzero); // vr0 <= vzero - uint32x4_t vm3 = vcltq_f32(vr3, vzero); // vr0 <= vzero - float32x4_t vo0 = vmulq_f32(vr0, vslope); // vr0 * vslope - float32x4_t vo1 = vmulq_f32(vr1, vslope); // vr0 * vslope - float32x4_t vo2 = vmulq_f32(vr2, vslope); // vr0 * vslope - float32x4_t vo3 = vmulq_f32(vr3, vslope); // vr0 * vslope - float32x4_t vos0 = vbslq_f32(vm0, vo0, vr0); - float32x4_t vos1 = vbslq_f32(vm1, vo1, vr1); - float32x4_t vos2 = vbslq_f32(vm2, vo2, vr2); - float32x4_t vos3 = vbslq_f32(vm3, vo3, vr3); - vst1q_f32(data_out_c, vos0); - vst1q_f32(data_out_c + 4, vos1); - vst1q_f32(data_out_c + 8, vos2); - vst1q_f32(data_out_c + 12, vos3); - data_in_c += 16; - data_out_c += 16; - } -#else - int cnt_loop = cnt; - if (cnt_loop > 0) { - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_in]]! @ load " - "input to q0, q1\n" - "pld [%[ptr_in]] @ preload\n" - "pld [%[ptr_in], #64] @ preload\n" - "pld [%[ptr_in], #128] @ preload\n" - "pld [%[ptr_in], #192] @ preload\n" - "1: @main loop\n" - "vld1.32 {d4-d7}, [%[ptr_in]]! @ load input to " - "q2, q3\n" - "vclt.f32 q8, q0, %q[vzero] @vcle q0 <= " - "vzero\n" - "vclt.f32 q9, q1, %q[vzero] @vcle q1 <= " - "vzero\n" - "vmul.f32 q10, q0, %q[vslope] @vmul q0 * " - "vslope\n" - "vmul.f32 q11, q1, %q[vslope] @vmul q1 * " - "vslope\n" - - "vclt.f32 q12, q2, %q[vzero] @vcle q2 <= " - "vzero\n" - "vclt.f32 q13, q3, %q[vzero] @vcle q3 <= " - "vzero\n" - "vmul.f32 q14, q2, %q[vslope] @vmul q2 * " - "vslope\n" - "vmul.f32 q15, q3, %q[vslope] @vmul q3 * " - "vslope\n" - - "vbif.32 q10, q0, q8 @vbit q10, q0, " - "q8\n" - "vbif.32 q11, q1, q9 @vbit q11, q1, " - "q9\n" - "vbif.32 q14, q2, q12 @vbit q14, q2, " - "q12\n" - "vbif.32 q15, q3, q13 @vbit q15, q3, " - "q13\n" - - "subs %[cnt], #1 @subs nn, 1\n" - "vld1.32 {d0-d3}, [%[ptr_in]]! @ load input to " - "q0, q1\n" - - "vst1.f32 {d20-d23}, [%[dout]]! @store data\n" - "vst1.f32 {d28-d31}, [%[dout]]! 
@store data\n" - "bne 1b @bne nn\n" - "sub %[ptr_in], #32 @ ptr-32\n" - : [ptr_in] "+r"(data_in_c), - [cnt] "+r"(cnt_loop), - [dout] "+r"(data_out_c) - : [vzero] "w"(vzero), [vslope] "w"(vslope) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } -#endif // __aarch64__ - for (int i = remain; i > 0; i--) { - *(data_out_c++) = - data_in_c[0] > 0.f ? data_in_c[0] : data_in_c[0] * slope; - data_in_c++; - } - } - } - } else { // mode = element - int stride_size = inner_size * channel_size; - for (int n = 0; n < outer_size; n++) { - const float* data_in_batch = din + n * stride_size; - const float* data_alpha_batch = alpha_data + n * stride_size; - float* data_out_batch = dout + n * stride_size; - for (int c = 0; c < channel_size; c++) { - const float* data_in_c = data_in_batch + c * inner_size; - const float* data_alpha_c = data_alpha_batch + c * inner_size; - float* data_out_c = data_out_batch + c * inner_size; - for (int i = 0; i < inner_size; i++) { - data_out_c[0] = data_in_c[0] > 0.f ? data_in_c[0] - : data_in_c[0] * data_alpha_c[0]; - data_in_c++; - data_alpha_c++; - data_out_c++; - } - } - } - } -} - -template <> -void act_sigmoid(const float* din, float* dout, int size, int threads) { - int nums_per_thread = size / threads; - int remain = size - threads * nums_per_thread; - int neon_loop_cnt_dim4 = nums_per_thread >> 2; - int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2); - - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for - for (int i = 0; i < threads; ++i) { - float32x4_t exp_vec = vdupq_n_f32(0.0f); - float32x4_t recip = vdupq_n_f32(0.0f); - const float* ptr_in_thread = din + i * nums_per_thread; - float* ptr_out_thread = dout + i * nums_per_thread; - for (int k = 0; k < neon_loop_cnt_dim4; ++k) { - exp_vec = exp_ps(vnegq_f32(vld1q_f32(ptr_in_thread))); - exp_vec = vaddq_f32(exp_vec, vdupq_n_f32(1.0f)); - recip = vrecpeq_f32(exp_vec); - recip = vmulq_f32(vrecpsq_f32(exp_vec, recip), recip); - recip = vmulq_f32(vrecpsq_f32(exp_vec, recip), recip); - vst1q_f32(ptr_out_thread, recip); - ptr_out_thread += 4; - ptr_in_thread += 4; - } - for (int j = 0; j < neon_loop_remain_dim4; ++j) { - ptr_out_thread[0] = 1.f / (1 + expf(-ptr_in_thread[0])); - ptr_in_thread++; - ptr_out_thread++; - } - } - float* ptr_out = dout + threads * nums_per_thread; - const float* ptr_in = din + threads * nums_per_thread; - for (int j = 0; j < remain; ++j) { - ptr_out[0] = 1.f / (1 + expf(-ptr_in[0])); - ptr_in++; - ptr_out++; - } -} - -// tanh : (exp(x) - exp(-x)) / (exp(x) + exp(-x)) -template <> -void act_tanh(const float* din, float* dout, int size, int threads) { - int nums_per_thread = size / threads; - int remain = size - threads * nums_per_thread; - int neon_loop_cnt_dim4 = nums_per_thread >> 2; - int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2); -#pragma omp parallel for - for (int i = 0; i < threads; ++i) { - float32x4_t exp_plus_vec = vdupq_n_f32(0.0f); - float32x4_t exp_minus_vec = vdupq_n_f32(0.0f); - float32x4_t exp_sum_vec = vdupq_n_f32(0.0f); - float32x4_t exp_diff_vec = vdupq_n_f32(0.0f); - float32x4_t recip = vdupq_n_f32(0.0f); - const float* ptr_in_thread = din + i * nums_per_thread; - float* ptr_out_thread = dout + i * nums_per_thread; - for (int k = 0; k < neon_loop_cnt_dim4; ++k) { - exp_plus_vec = exp_ps(vld1q_f32(ptr_in_thread)); - exp_minus_vec = exp_ps(vnegq_f32(vld1q_f32(ptr_in_thread))); - exp_sum_vec = vaddq_f32(exp_plus_vec, exp_minus_vec); - 
exp_diff_vec = vsubq_f32(exp_plus_vec, exp_minus_vec); - recip = div_ps(exp_diff_vec, exp_sum_vec); - vst1q_f32(ptr_out_thread, recip); - ptr_out_thread += 4; - ptr_in_thread += 4; - } - for (int j = 0; j < neon_loop_remain_dim4; ++j) { - ptr_out_thread[0] = (expf(ptr_in_thread[0]) - expf(-ptr_in_thread[0])) / - (expf(ptr_in_thread[0]) + expf(-ptr_in_thread[0])); - ptr_in_thread++; - ptr_out_thread++; - } - } - float* ptr_out = dout + threads * nums_per_thread; - const float* ptr_in = din + threads * nums_per_thread; - for (int j = 0; j < remain; ++j) { - ptr_out[0] = (expf(ptr_in[0]) - expf(-ptr_in[0])) / - (expf(ptr_in[0]) + expf(-ptr_in[0])); - ptr_in++; - ptr_out++; - } -} - -// swish: x /(1 + exp(-(b * x))) -template <> -void act_swish( - const float* din, float* dout, int size, float coef, int threads) { - int nums_per_thread = size / threads; - int remain = size - threads * nums_per_thread; - int neon_loop_cnt_dim4 = nums_per_thread >> 2; - int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2); - const float beta = coef; - float32x4_t vbeta = vdupq_n_f32(beta); - float32x4_t vone = vdupq_n_f32(1.f); -#pragma omp parallel for - for (int i = 0; i < threads; ++i) { - const float* ptr_in_thread = din + i * nums_per_thread; - float* ptr_out_thread = dout + i * nums_per_thread; - for (int k = 0; k < neon_loop_cnt_dim4; ++k) { - float32x4_t va = vld1q_f32(ptr_in_thread); // x - float32x4_t vb = vnegq_f32(vld1q_f32(ptr_in_thread)); // -x - float32x4_t vsum = vmulq_f32(vb, vbeta); - vsum = exp_ps(vsum); - float32x4_t vc = vaddq_f32(vone, vsum); - float32x4_t vrst = div_ps(va, vc); - vst1q_f32(ptr_out_thread, vrst); - ptr_out_thread += 4; - ptr_in_thread += 4; - } - for (int j = 0; j < neon_loop_remain_dim4; ++j) { - ptr_out_thread[0] = - ptr_in_thread[0] / (1.0 + expf(-ptr_in_thread[0] * beta)); - ptr_in_thread++; - ptr_out_thread++; - } - } - float* ptr_out = dout + threads * nums_per_thread; - const float* ptr_in = din + threads * nums_per_thread; - for (int j = 0; j < remain; ++j) { - ptr_out[0] = ptr_in[0] / (1.0 + expf(-ptr_in[0] * beta)); - ptr_in++; - ptr_out++; - } -} - -template <> -void act_log(const float* din, float* dout, int size, int threads) { - int nums_per_thread = size / threads; - int remain = size - threads * nums_per_thread; - int neon_loop_cnt_dim4 = nums_per_thread >> 2; - int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2); - LOG(INFO) << "nums_per_thread" << nums_per_thread; - LOG(INFO) << "remain" << remain; - LOG(INFO) << "neon_loop_cnt_dim4" << neon_loop_cnt_dim4; - LOG(INFO) << "neon_loop_remian_dim4" << neon_loop_remain_dim4; - - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for - for (int i = 0; i < threads; ++i) { - float32x4_t exp_vec = vdupq_n_f32(0.0f); - const float* ptr_in_thread = din + i * nums_per_thread; - float* ptr_out_thread = dout + i * nums_per_thread; - for (int k = 0; k < neon_loop_cnt_dim4; ++k) { - exp_vec = log_ps(vld1q_f32(ptr_in_thread)); - vst1q_f32(ptr_out_thread, exp_vec); - ptr_out_thread += 4; - ptr_in_thread += 4; - } - for (int j = 0; j < neon_loop_remain_dim4; ++j) { - ptr_out_thread[0] = logf(ptr_in_thread[0]); - ptr_in_thread++; - ptr_out_thread++; - } - } - float* ptr_out = dout + threads * nums_per_thread; - const float* ptr_in = din + threads * nums_per_thread; - for (int j = 0; j < remain; ++j) { - ptr_out[0] = logf(ptr_in[0]); - ptr_in++; - ptr_out++; - } -} - -template <> -void act_exp(const float* din, float* dout, int size, int threads) { - int nums_per_thread = size / 
threads; - int remain = size - threads * nums_per_thread; - int neon_loop_cnt_dim4 = nums_per_thread >> 2; - int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2); - - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for - for (int i = 0; i < threads; ++i) { - float32x4_t exp_vec = vdupq_n_f32(0.0f); - const float* ptr_in_thread = din + i * nums_per_thread; - float* ptr_out_thread = dout + i * nums_per_thread; - for (int k = 0; k < neon_loop_cnt_dim4; ++k) { - exp_vec = exp_ps(vld1q_f32(ptr_in_thread)); - vst1q_f32(ptr_out_thread, exp_vec); - ptr_out_thread += 4; - ptr_in_thread += 4; - } - for (int j = 0; j < neon_loop_remain_dim4; ++j) { - ptr_out_thread[0] = expf(ptr_in_thread[0]); - ptr_in_thread++; - ptr_out_thread++; - } - } - float* ptr_out = dout + threads * nums_per_thread; - const float* ptr_in = din + threads * nums_per_thread; - for (int j = 0; j < remain; ++j) { - ptr_out[0] = expf(ptr_in[0]); - ptr_in++; - ptr_out++; - } -} - -template <> -void act_floor(const float* din, float* dout, int size, int threads) { - const float* ptr_in = din; - float* ptr_out = dout; - for (int i = 0; i < size; ++i) { - ptr_out[0] = floorf(ptr_in[0]); - ptr_in++; - ptr_out++; - } -} - -template <> -void act_hard_sigmoid(const float* din, - float* dout, - const int64_t size, - const float slope, - const float offset, - int threads) { - for (int64_t i = 0; i < size; ++i) { - dout[0] = din[0] * slope + offset; - dout[0] = dout[0] < 1.0f ? dout[0] : 1.0f; - dout[0] = dout[0] > 0.0f ? dout[0] : 0.0f; - ++din; - ++dout; - } -} -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/activation.h b/lite/backends/arm/math/activation.h deleted file mode 100644 index 794c5e0d41..0000000000 --- a/lite/backends/arm/math/activation.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
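Each NEON kernel in activation.cc above falls back to plain scalar code for its remainder elements, so those scalar expressions double as a reference for verifying the intrinsics paths. A minimal sketch collecting them (helper names are illustrative, not from the sources):

#include <cmath>

inline float ref_relu(float x) { return x > 0.f ? x : 0.f; }
inline float ref_relu_neg(float x, float slope) { return x > 0.f ? x : x * slope; }
inline float ref_clipped_relu(float x, float coef) {
  float y = x > 0.f ? x : 0.f;
  return y < coef ? y : coef;
}
inline float ref_sigmoid(float x) { return 1.f / (1.f + expf(-x)); }
inline float ref_tanh(float x) {
  return (expf(x) - expf(-x)) / (expf(x) + expf(-x));
}
inline float ref_swish(float x, float beta) { return x / (1.f + expf(-x * beta)); }
inline float ref_hard_sigmoid(float x, float slope, float offset) {
  float y = x * slope + offset;  // then clamp to [0, 1]
  y = y < 1.0f ? y : 1.0f;
  return y > 0.0f ? y : 0.0f;
}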
-
-#pragma once
-#include <string>
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename T>
-void act_relu(const T* din, T* dout, int size, int threads);
-
-template <typename T>
-void act_relu_neg(
-    const T* din, T* dout, int size, float negative_slope, int threads);
-
-template <typename T>
-void act_clipped_relu(const T* din, T* dout, int size, float coef, int threads);
-
-template <typename T>
-void act_prelu(const T* din,
-               T* dout,
-               int outer_size,
-               int channel_size,
-               int inner_size,
-               std::string mode,
-               const float* alpha_data,
-               int threads);
-
-template <typename T>
-void act_sigmoid(const T* din, T* dout, int size, int threads);
-
-template <typename T>
-void act_tanh(const T* din, T* dout, int size, int threads);
-
-template <typename T>
-void act_swish(const T* din, T* dout, int size, float coef, int threads);
-
-template <typename T>
-void act_log(const T* din, T* dout, int size, int threads);
-
-template <typename T>
-void act_exp(const T* din, T* dout, int size, int threads);
-
-template <typename T>
-void act_floor(const T* din, T* dout, int size, int threads);
-
-template <typename T>
-void act_hard_sigmoid(const T* din,
-                      T* dout,
-                      const int64_t size,
-                      const float slope,
-                      const float offset,
-                      int threads);
-} // namespace math
-} // namespace arm
-} // namespace lite
-} // namespace paddle
diff --git a/lite/backends/arm/math/affine_channel.cc b/lite/backends/arm/math/affine_channel.cc
deleted file mode 100644
index a2c735afcc..0000000000
--- a/lite/backends/arm/math/affine_channel.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
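Only <float> specializations of the templated declarations above are defined in activation.cc (each as template <>), and the threads argument is the worker count consumed by the #pragma omp parallel for inside each kernel. A hypothetical call site, assuming buffers of length n and the header included:

#include <vector>
// plus "lite/backends/arm/math/activation.h"

int n = 1024;
std::vector<float> din(n, -0.5f), dout(n);
// Dispatches to the NEON float implementation shown earlier in this patch.
paddle::lite::arm::math::act_relu<float>(din.data(), dout.data(), n, /*threads=*/4);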
- -#include "lite/backends/arm/math/affine_channel.h" -#include -#include -#include -#include "lite/backends/arm/math/axpy.h" -#include "lite/backends/arm/math/funcs.h" -#include "lite/backends/arm/math/saturate.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void affine_channel_func(const float* x, - const float* scale, - const float* bias, - const std::string data_layout, - int num, - int channel, - int height, - int width, - float* out) { - if (data_layout == "NCHW") { - int hw_size = height * width; - for (int n = 0; n < num; n++) { - for (int c = 0; c < channel; c++) { - const float* x_ptr = x + n * channel * hw_size + c * hw_size; - const float* scale_ptr = scale + c; - const float* bias_ptr = bias + c; - float* out_ptr = out + n * channel * hw_size + c * hw_size; - for (int i = 0; i < hw_size; i++) { - *out_ptr = (*x_ptr) * (*scale_ptr) + (*bias_ptr); - x_ptr++; - out_ptr++; - } - } - } - } else if (data_layout == "NHWC") { - int nhw = num * height * width; - for (int i = 0; i < nhw; i++) { - const float* x_ptr = x + i * channel; - float* out_ptr = out + i * channel; - for (int c = 0; c < channel; c++) { - *out_ptr = (*x_ptr) * scale[c] + bias[c]; - x_ptr++; - out_ptr++; - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/affine_channel.h b/lite/backends/arm/math/affine_channel.h deleted file mode 100644 index f050d0ae28..0000000000 --- a/lite/backends/arm/math/affine_channel.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include "lite/operators/op_params.h" -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void affine_channel_func(const float* x, - const float* scale, - const float* bias, - const std::string data_layout, - int num, - int channel, - int h, - int w, - float* dout); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/anchor_generator.cc b/lite/backends/arm/math/anchor_generator.cc deleted file mode 100644 index 2f8a738fbf..0000000000 --- a/lite/backends/arm/math/anchor_generator.cc +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/backends/arm/math/anchor_generator.h" -#include -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/backends/arm/math/saturate.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void anchor_generator_func(int feature_height, - int feature_width, - std::vector anchor_sizes, - std::vector aspect_ratios, - std::vector stride, - std::vector variances, - float offset, - float* anchors_ptr, - float* vars_ptr) { - float stride_width = stride[0]; - float stride_height = stride[1]; - int num_anchors = aspect_ratios.size() * anchor_sizes.size(); - for (int h_idx = 0; h_idx < feature_height; ++h_idx) { - float* anchors_ptr_h = - anchors_ptr + h_idx * feature_width * num_anchors * 4; - for (int w_idx = 0; w_idx < feature_width; ++w_idx) { - float* anchors_ptr_w = anchors_ptr_h + w_idx * num_anchors * 4; - float x_ctr = (w_idx * stride_width) + offset * (stride_width - 1); - float y_ctr = (h_idx * stride_height) + offset * (stride_height - 1); - float area, area_ratios; - float base_w, base_h; - float scale_w, scale_h; - float anchor_width, anchor_height; - int idx = 0; - for (size_t r = 0; r < aspect_ratios.size(); ++r) { - auto ar = aspect_ratios[r]; - for (size_t s = 0; s < anchor_sizes.size(); ++s) { - auto anchor_size = anchor_sizes[s]; - area = stride_width * stride_height; - area_ratios = area / ar; - base_w = round(sqrt(area_ratios)); - base_h = round(base_w * ar); - scale_w = anchor_size / stride_width; - scale_h = anchor_size / stride_height; - anchor_width = scale_w * base_w; - anchor_height = scale_h * base_h; - anchors_ptr_w[idx++] = x_ctr - 0.5 * (anchor_width - 1); - anchors_ptr_w[idx++] = y_ctr - 0.5 * (anchor_height - 1); - anchors_ptr_w[idx++] = x_ctr + 0.5 * (anchor_width - 1); - anchors_ptr_w[idx++] = y_ctr + 0.5 * (anchor_height - 1); - } - } - } - } - - int64_t hwn = feature_height * feature_width * num_anchors * 4; - for (int64_t i = 0; i < hwn; i++) { - *vars_ptr = variances[i % 4]; - vars_ptr++; - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/anchor_generator.h b/lite/backends/arm/math/anchor_generator.h deleted file mode 100644 index c6be6700d3..0000000000 --- a/lite/backends/arm/math/anchor_generator.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-
-#pragma once
-
-#include <cmath>
-#include <string>
-#include <vector>
-#include "lite/operators/op_params.h"
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-void anchor_generator_func(int feature_height,
-                           int feature_width,
-                           std::vector<float> anchor_sizes,
-                           std::vector<float> aspect_ratios,
-                           std::vector<float> stride,
-                           std::vector<float> variances,
-                           float offset,
-                           float* anchors_data,
-                           float* variances_data);
-
-} // namespace math
-} // namespace arm
-} // namespace lite
-} // namespace paddle
diff --git a/lite/backends/arm/math/argmax.cc b/lite/backends/arm/math/argmax.cc
deleted file mode 100644
index 3ca6d97c4d..0000000000
--- a/lite/backends/arm/math/argmax.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/argmax.h"
-#include <algorithm>
-#include <functional>
-#include <limits>
-#include <utility>
-#include <vector>
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-void argmax_func(const lite::Tensor *input,
-                 const int axis,
-                 lite::Tensor *output) {
-  auto input_ddim = input->dims();
-  auto output_ddim = output->dims();
-
-  const int size = input_ddim[axis];
-  const int in_channel = input_ddim.count(axis, input_ddim.size());
-  const int out_channel = output_ddim.count(axis, output_ddim.size());
-  const int in_stride = input_ddim.count(axis + 1, input_ddim.size());
-  const int out_stride = input_ddim.count(0, axis);
-
-  for (int n = 0; n < out_stride; n++) {
-    for (int k = 0; k < in_stride; k++) {
-      const float *in_ptr = input->data<float>() + n * in_channel + k;
-      std::vector<std::pair<float, int>> vec;
-      vec.resize(size);
-      for (int i = 0; i < size; i++) {
-        vec[i] = std::make_pair(in_ptr[i * in_stride], i);
-      }
-      // sort
-      std::partial_sort(vec.begin(),
-                        vec.begin() + 1,
-                        vec.end(),
-                        std::greater<std::pair<float, int>>());
-
-      // out
-      float *out_ptr = output->mutable_data<float>() + n * out_channel + k;
-      *out_ptr = vec[0].second;
-    }
-  }
-}
-
-} // namespace math
-} // namespace arm
-} // namespace lite
-} // namespace paddle
diff --git a/lite/backends/arm/math/argmax.h b/lite/backends/arm/math/argmax.h
deleted file mode 100644
index c78cf2f7a8..0000000000
--- a/lite/backends/arm/math/argmax.h
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
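Since argmax_func above only needs the single largest element of each strided slice, its length-1 std::partial_sort is equivalent to one max scan. A sketch of the same reduction without the temporary pair vector (the function name is illustrative):

// Returns the argmax over {in[0], in[stride], ..., in[(size-1)*stride]},
// cast to float because argmax_func stores indices in a float tensor.
inline float strided_argmax(const float* in, int size, int stride) {
  int best = 0;
  for (int i = 1; i < size; ++i) {
    if (in[i * stride] > in[best * stride]) best = i;
  }
  return static_cast<float>(best);
}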
- -#pragma once - -#include -#include -#include -#include "lite/operators/op_params.h" -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void argmax_func(const lite::Tensor* input, - const int axis, - lite::Tensor* output); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/axpy.cc b/lite/backends/arm/math/axpy.cc deleted file mode 100644 index 0863cc009c..0000000000 --- a/lite/backends/arm/math/axpy.cc +++ /dev/null @@ -1,203 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/axpy.h" -#include -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/backends/arm/math/saturate.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void axpy_kernel_fp32(const float* scale, - const float* din, - const float* bias, - float* dout, - int num, - int channel, - int size, - int in_channel) { - int cnt = size >> 3; - int remain = size % 8; - for (int n = 0; n < num; n++) { - const float* din_ptr = din + n * in_channel; - const float* scale_ptr = scale + n * channel; - const float* bias_ptr = bias + n * in_channel; - float* dout_ptr = dout + n * in_channel; -#pragma omp parallel for - for (int c = 0; c < channel; c++) { - const float* din_ch_ptr = din_ptr + c * size; - const float* bias_ch_ptr = bias_ptr + c * size; - float* dout_ch_ptr = dout_ptr + c * size; - float32x4_t scale_val = vdupq_n_f32(scale_ptr[c]); - int col_cnt = cnt; - if (cnt > 0) { -#ifdef __aarch64__ - asm volatile( - "ld1 {v0.4s}, [%[din_ptr]], #16 \n" - "ld1 {v1.4s}, [%[bias_ptr]], #16 \n" - "1: \n" - "ld1 {v2.4s}, [%[din_ptr]], #16 \n" - "ld1 {v3.4s}, [%[bias_ptr]], #16 \n" - "fmul v4.4s , v0.4s, %[scale].4s \n" - "fmul v5.4s , v2.4s, %[scale].4s \n" - "fadd v4.4s, v4.4s, v1.4s \n" - "fadd v5.4s, v5.4s, v3.4s \n" - "ld1 {v0.4s}, [%[din_ptr]], #16 \n" - "ld1 {v1.4s}, [%[bias_ptr]], #16 \n" - "subs %[cnt], %[cnt], #1 \n" - "st1 {v4.4s}, [%[dout_ptr]], #16 \n" - "st1 {v5.4s}, [%[dout_ptr]], #16 \n" - "bne 1b \n" - : [din_ptr] "+r"(din_ch_ptr), - [bias_ptr] "+r"(bias_ch_ptr), - [dout_ptr] "+r"(dout_ch_ptr), - [cnt] "+r"(col_cnt) - : [scale] "w"(scale_val) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"); -#else - asm volatile( - "vld1.32 {d2-d3}, [%[din_ptr]]! \n" - "vld1.32 {d4-d5}, [%[bias_ptr]]! \n" - "1: \n" - "vld1.32 {d6-d7}, [%[din_ptr]]! \n" - "vld1.32 {d8-d9}, [%[bias_ptr]]! \n" - "vmul.f32 q5, q1, %q[scale] \n" - "vmul.f32 q6, q3, %q[scale] \n" - "vadd.f32 q5, q5, q2 \n" - "vadd.f32 q6, q6, q4 \n" - "vld1.f32 {d2-d3}, [%[din_ptr]]! \n" - "vld1.f32 {d4-d5}, [%[bias_ptr]]! \n" - "subs %[cnt], #1 \n" - "vst1.32 {d10-d11}, [%[dout_ptr]]! \n" - "vst1.32 {d12-d13}, [%[dout_ptr]]! 
\n" - "bne 1b \n" - : [din_ptr] "+r"(din_ch_ptr), - [bias_ptr] "+r"(bias_ch_ptr), - [dout_ptr] "+r"(dout_ch_ptr), - [cnt] "+r"(col_cnt) - : [scale] "w"(scale_val) - : "cc", "memory", "q1", "q2", "q3", "q4", "q5", "q6"); -#endif - } - din_ch_ptr = din_ptr + c * size + cnt * 8; - bias_ch_ptr = bias_ptr + c * size + cnt * 8; - for (int i = 0; i < remain; i++) { - *dout_ch_ptr = (*din_ch_ptr) * scale_ptr[c] + (*bias_ch_ptr); - dout_ch_ptr++; - din_ch_ptr++; - bias_ch_ptr++; - } - } - } -} - -void axpy_kernel_int8(const int8_t* scale, - const int8_t* din, - const int8_t* bias, - int8_t* dout, - int num, - int channel, - int size, - int in_channel) { - int cnt = size >> 4; - int remain = size % 16; - for (int n = 0; n < num; n++) { - const int8_t* din_ptr = din + n * in_channel; - const int8_t* scale_ptr = scale + n * channel; - const int8_t* bias_ptr = bias + n * in_channel; - int8_t* dout_ptr = dout + n * in_channel; -#pragma omp parallel for - for (int c = 0; c < channel; c++) { - const int8_t* din_ch_ptr = din_ptr + c * size; - const int8_t* bias_ch_ptr = bias_ptr + c * size; - int8_t* dout_ch_ptr = dout_ptr + c * size; - int8x8_t scale_val = vdup_n_s8(scale_ptr[c]); - int col_cnt = cnt; - if (col_cnt > 0) { -#ifdef __aarch64__ - asm volatile( - "ld1 {v0.8b}, [%[din_ptr]], #8 \n" - "ld1 {v1.8b}, [%[bias_ptr]], #8 \n" - "1: \n" - "ld1 {v2.8b}, [%[din_ptr]], #8 \n" - "ld1 {v3.8b}, [%[bias_ptr]], #8 \n" - "smull v4.8h, v0.8b, %[scale].8b \n" - "smull v5.8h, v2.8b, %[scale].8b \n" - "saddw v4.8h, v4.8h, v1.8b \n" - "saddw v5.8h, v5.8h, v3.8b \n" - "ld1 {v0.8b}, [%[din_ptr]], #8 \n" - "ld1 {v1.8b}, [%[bias_ptr]], #8 \n" - "subs %[cnt], %[cnt], #1 \n" - // int16->int8 - "sqxtn v6.8b, v4.8h \n" - "sqxtn v7.8b, v5.8h \n" - "st1 {v6.8b}, [%[dout_ptr]], #8 \n" /* store c0r0*/ - "st1 {v7.8b}, [%[dout_ptr]], #8 \n" /* store c2r0*/ - "bne 1b \n" - : [din_ptr] "+r"(din_ch_ptr), - [bias_ptr] "+r"(bias_ch_ptr), - [dout_ptr] "+r"(dout_ch_ptr), - [cnt] "+r"(col_cnt) - : [scale] "w"(scale_val) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"); -#else - asm volatile( - "vdup.s8 d0, %[scale] \n" - "vld1.8 {d2}, [%[din_ptr]]! \n" - "vld1.8 {d4}, [%[bias_ptr]]! \n" - "1: \n" - "vld1.8 {d3}, [%[din_ptr]]! \n" - "vld1.8 {d5}, [%[bias_ptr]]! \n" - "vmull.s8 q4, d2, d0 \n" - "vmull.s8 q5, d3, d0 \n" - "vaddw.s16 q4, q4, d4 \n" - "vaddw.s16 q5, q5, d5 \n" - "vld1.8 {d2}, [%[din_ptr]]! \n" - "vld1.8 {d4}, [%[bias_ptr]]! \n" - "subs %[cnt], #1 \n" - // int16->int8 - "vqmovn.s16 d12, q4 @ cnt to int8\n" - "vqmovn.s16 d13, q5 @ cnt to int8\n" - "vst1.32 {d12-d13}, [%[dout_ptr]]! \n" - "bne 1b \n" - : [din_ptr] "+r"(din_ch_ptr), - [bias_ptr] "+r"(bias_ch_ptr), - [dout_ptr] "+r"(dout_ch_ptr), - [cnt] "+r"(col_cnt) - : [scale] "r"(scale_val) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); -#endif - } - din_ch_ptr = din_ptr + c * size + cnt * 16; - bias_ch_ptr = bias_ptr + c * size + cnt * 16; - for (int i = 0; i < remain; i++) { - *dout_ch_ptr = saturate_cast( - roundf((*din_ch_ptr) * scale_ptr[c] + (*bias_ch_ptr))); - dout_ch_ptr++; - din_ch_ptr++; - bias_ch_ptr++; - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/axpy.h b/lite/backends/arm/math/axpy.h deleted file mode 100644 index 8245bf1d1a..0000000000 --- a/lite/backends/arm/math/axpy.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <cstdint>
-#include "lite/operators/op_params.h"
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-void axpy_kernel_fp32(const float* scale,
-                      const float* din,
-                      const float* bias,
-                      float* dout,
-                      int num,
-                      int channel,
-                      int size,
-                      int in_channel);
-
-void axpy_kernel_int8(const int8_t* scale,
-                      const int8_t* din,
-                      const int8_t* bias,
-                      int8_t* dout,
-                      int num,
-                      int channel,
-                      int size,
-                      int in_channel);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/beam_search.cc b/lite/backends/arm/math/beam_search.cc
deleted file mode 100644
index f93fcc0d60..0000000000
--- a/lite/backends/arm/math/beam_search.cc
+++ /dev/null
@@ -1,271 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/beam_search.h"
-#include <cmath>
-#include <numeric>
-#include <sstream>
-#include <string>
-#include <vector>
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-/*
- * The basic item used for sorting beam-search candidates.
- */
-struct Item {
-  Item() {}
-  Item(size_t offset, size_t id, float score)
-      : offset(offset), id(id), score(score) {}
-  // offset in the higher lod level.
-  size_t offset;
-  // prefix id in the lower lod level.
-  // size_t prefix;
-  // the candidate id
-  size_t id;
-  // the corresponding score
-  float score;
-
-  inline bool operator<(const Item &in) const {
-    return (score < in.score) || ((score == in.score) && (offset < in.offset));
-  }
-
-  inline void operator=(const Item &in) {
-    offset = in.offset;
-    id = in.id;
-    score = in.score;
-  }
-
-  std::string ToString() {
-    std::ostringstream os;
-    os << "{";
-    os << "offset: " << offset << ", ";
-    os << "id: " << id << ", ";
-    os << "score: " << score;
-    os << "}";
-    return os.str();
-  }
-};
-
-/*
- * Prune the source sentences whose branches are all finished; this step is
- * optional. Pruning must happen one step later than finishing (thus pre_ids
- * is needed here), since the end tokens must be written out.
- */
-void PruneEndBeams(const Tensor *pre_ids,
-                   const LoD &abs_lod,
-                   std::vector<std::vector<Item>> *items,
-                   size_t lod_level,
-                   int end_id) {
-  auto *pre_ids_data = pre_ids->data<int64_t>();
-  auto &high_level = abs_lod[lod_level];
-  for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
-    size_t src_prefix_start = high_level[src_idx];
-    size_t src_prefix_end = high_level[src_idx + 1];
-    bool finish_flag = true;
-    for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) {
-      for (auto &item : items->at(offset)) {
-        if (item.id != static_cast<size_t>(end_id) ||
-            pre_ids_data[offset] != end_id) {
-          finish_flag = false;
-          break;
-        }
-      }
-      if (!finish_flag) break;
-    }
-    if (finish_flag) {  // all branches of the beam (source sentence) end;
-                        // prune this beam
-      for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++)
-        items->at(offset).clear();
-    }
-  }
-}
-
-/*
- * Transform the items into a map whose key is the offset and whose value is
- * the items at that offset. NOTE: low performance.
- */
-std::vector<std::vector<Item>> ToMap(
    const std::vector<std::vector<Item>> &items, size_t element_num) {
-  std::vector<std::vector<Item>> result;
-  result.resize(element_num);
-  for (auto &entries : items) {
-    for (const auto &item : entries) {
-      result[item.offset].push_back(item);
-    }
-  }
-  return result;
-}
-
-void Insert(std::vector<Item> *top_beam_ptr,
-            const Item &item,
-            size_t beam_size) {
-  std::vector<Item> &top_beam = *top_beam_ptr;
-
-  size_t num_beams = top_beam.size();
-  if (num_beams < beam_size) {
-    top_beam.resize(num_beams + 1);
-    num_beams++;
-  } else {
-    if (item < top_beam[beam_size - 1]) {
-      return;
-    }
-  }
-
-  for (int k = static_cast<int>(num_beams) - 2; k >= 0; --k) {
-    if (top_beam[k] < item) {
-      top_beam[k + 1] = top_beam[k];
-    } else {
-      top_beam[k + 1] = item;
-      return;
-    }
-  }
-  top_beam[0] = item;
-}
-
-/*
- * For each source, select the top beam_size records.
- */
-std::vector<std::vector<Item>> SelectTopBeamSizeItems(const Tensor *pre_ids,
-                                                      const Tensor *pre_scores,
-                                                      const Tensor *ids,
-                                                      const Tensor *scores,
-                                                      size_t lod_level,
-                                                      size_t beam_size,
-                                                      int end_id,
-                                                      bool is_accumulated) {
-  std::vector<std::vector<Item>> result;
-
-  // find the current candidates
-  // auto abs_lod = framework::ToAbsOffset(scores->lod());
-  auto abs_lod = scores->lod();
-  auto *pre_ids_data = pre_ids->data<int64_t>();
-  auto *pre_scores_data = pre_scores->data<float>();
-
-  auto *ids_data = ids ? ids->data<int64_t>() : nullptr;
-  auto *scores_data = scores->data<float>();
-
-  size_t num_seqs = abs_lod[lod_level].size() - 1;
-  size_t seq_width = 1;
-  for (int i = 1; i < scores->dims().size(); i++) {
-    seq_width *= scores->dims()[i];
-  }
-
-  for (size_t seq_id = 0; seq_id < num_seqs; ++seq_id) {
-    size_t seq_offset_start = abs_lod[lod_level][seq_id];
-    size_t seq_offset_end = abs_lod[lod_level][seq_id + 1];
-
-    std::vector<Item> top_beam;
-    top_beam.reserve(beam_size);
-
-    for (size_t offset = seq_offset_start; offset < seq_offset_end; ++offset) {
-      auto pre_id = pre_ids_data[offset];
-      auto pre_score = pre_scores_data[offset];
-      if (pre_id == end_id) {
-        // Allocate all probability mass to end_id for finished branches;
-        // the other candidate ids can be ignored.
-        Item item(offset, end_id, pre_score);
-        Insert(&top_beam, item, beam_size);
-      } else {
-        size_t index = offset * seq_width;
-        for (size_t d = 0; d < seq_width; d++, index++) {
-          int64_t id = ids_data ? ids_data[index] : static_cast<int64_t>(d);
-          float score = is_accumulated
-                            ? scores_data[index]
-                            : pre_score + std::log(scores_data[index]);
-          Item item(offset, id, score);
-          Insert(&top_beam, item, beam_size);
-        }
-      }
-    }
-
-    result.emplace_back(top_beam);
-  }
-  return result;
-}
-
-void beam_search(const Tensor *pre_ids,
-                 const Tensor *pre_scores,
-                 const Tensor *ids,
-                 const Tensor *scores,
-                 Tensor *selected_ids,
-                 Tensor *selected_scores,
-                 Tensor *parent_idx,
-                 int level,
-                 int beam_size,
-                 int end_id,
-                 bool is_accumulated,
-                 Context<TARGET(kARM)> *ctx) {
-  // auto abs_lod = framework::ToAbsOffset(scores->lod());
-  auto abs_lod = scores->lod();
-  auto &high_level = abs_lod[level];
-  auto items = SelectTopBeamSizeItems(pre_ids,
-                                      pre_scores,
-                                      ids,
-                                      scores,
-                                      level,
-                                      beam_size,
-                                      end_id,
-                                      is_accumulated);
-  auto selected_items = ToMap(items, high_level.back());
-
-  PruneEndBeams(pre_ids, abs_lod, &selected_items, level, end_id);
-  // calculate the output tensor's height
-  size_t num_instances = std::accumulate(
-      std::begin(selected_items),
-      std::end(selected_items),
-      0,
-      [](size_t a, std::vector<Item> &b) { return a + b.size(); });
-  // the output tensor shape should be [num_instances, 1]
-  auto dims = std::vector<int64_t>({static_cast<int64_t>(num_instances), 1});
-  selected_ids->Resize(dims);
-  selected_scores->Resize(dims);
-  if (parent_idx) {
-    parent_idx->Resize(dims);
-  }
-  auto *selected_ids_data = selected_ids->mutable_data<int64_t>();
-  auto *selected_scores_data = selected_scores->mutable_data<float>();
-  auto *parent_idx_data =
-      parent_idx ? parent_idx->mutable_data<int>() : nullptr;
-
-  // fill in data
-  std::vector<size_t> low_level;
-  size_t low_offset = 0;
-  for (auto &items : selected_items) {
-    low_level.push_back(low_offset);
-    for (auto &item : items) {
-      if (parent_idx) {
-        parent_idx_data[low_offset] = static_cast<int>(low_level.size() - 1);
-      }
-      selected_ids_data[low_offset] = item.id;
-      selected_scores_data[low_offset] = item.score;
-      low_offset++;
-    }
-  }
-  low_level.push_back(low_offset);
-
-  // fill lod
-  LoD lod(2);
-  lod[0].assign(high_level.begin(), high_level.end());
-  lod[1].assign(low_level.begin(), low_level.end());
-  *(selected_ids->mutable_lod()) = lod;
-  *(selected_scores->mutable_lod()) = lod;
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/beam_search.h b/lite/backends/arm/math/beam_search.h
deleted file mode 100644
index 2f07175e35..0000000000
--- a/lite/backends/arm/math/beam_search.h
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
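The selection logic of SelectTopBeamSizeItems above hinges on Insert(): a capped insertion into an array kept sorted best-first. A self-contained illustration of the same top-k insertion, simplified to a plain id/score pair instead of the Item type (illustrative only; ties are handled more simply here):

#include <cstdio>
#include <vector>

struct Cand {
  int id;
  float score;
};

// Keep `beam` sorted by descending score, capped at beam_size entries,
// mirroring Insert() above.
void insert_topk(std::vector<Cand>* beam, Cand c, size_t beam_size) {
  if (beam->size() < beam_size) {
    beam->resize(beam->size() + 1);   // grow; the last slot is free
  } else if (c.score <= beam->back().score) {
    return;                           // worse than the current worst; drop
  }
  int k = static_cast<int>(beam->size()) - 2;
  for (; k >= 0 && (*beam)[k].score < c.score; --k) {
    (*beam)[k + 1] = (*beam)[k];      // shift weaker items down one slot
  }
  (*beam)[k + 1] = c;
}

int main() {
  std::vector<Cand> beam;
  for (Cand c : {Cand{1, 0.1f}, Cand{2, 0.9f}, Cand{3, 0.5f}, Cand{4, 0.7f}}) {
    insert_topk(&beam, c, 3);
  }
  for (const Cand& c : beam) std::printf("%d:%.1f ", c.id, c.score);
  // prints: 2:0.9 4:0.7 3:0.5
  return 0;
}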
-
-#pragma once
-
-#include "lite/core/context.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-void beam_search(const Tensor* pre_ids,
-                 const Tensor* pre_scores,
-                 const Tensor* ids,
-                 const Tensor* scores,
-                 Tensor* selected_ids,
-                 Tensor* selected_scores,
-                 Tensor* parent_idx,
-                 int level,
-                 int beam_size,
-                 int end_id,
-                 bool is_accumulated,
-                 Context<TARGET(kARM)>* ctx);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/box_coder.cc b/lite/backends/arm/math/box_coder.cc
deleted file mode 100644
index 7cb904a8ee..0000000000
--- a/lite/backends/arm/math/box_coder.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/box_coder.h"
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-void box_coder(lite::Tensor* proposals,
-               const lite::Tensor* anchors,
-               const lite::Tensor* variances,
-               const lite::Tensor* bbox_deltas,
-               const std::string code_type,
-               bool box_normalized,
-               int axis) {
-  if (code_type == "decode_center_size") {
-    float normalized = !box_normalized ? 1.f : 0;
-
-    const float* anchor_data = anchors->data<float>();
-    const float* bbox_deltas_data = bbox_deltas->data<float>();
-    float* proposals_data = proposals->mutable_data<float>();
-    const float* variances_data = variances->data<float>();
-
-    int N = bbox_deltas->dims()[0];
-    int M = bbox_deltas->dims()[1];
-    int len = bbox_deltas->dims()[2];
-
-    for (int64_t row_id = 0; row_id < N; ++row_id) {
-      for (int64_t col_id = 0; col_id < M; ++col_id) {
-        size_t offset = row_id * M * len + col_id * len;
-        int prior_box_offset = axis == 0 ? col_id * len : row_id * len;
-        int var_offset = axis == 0 ?
-            col_id * len : row_id * len;
-
-        auto anchor_data_tmp = anchor_data + prior_box_offset;
-        auto bbox_deltas_data_tmp = bbox_deltas_data + offset;
-        auto proposals_data_tmp = proposals_data + offset;
-
-        auto anchor_width =
-            anchor_data_tmp[2] - anchor_data_tmp[0] + normalized;
-        auto anchor_height =
-            anchor_data_tmp[3] - anchor_data_tmp[1] + normalized;
-        auto anchor_center_x = anchor_data_tmp[0] + 0.5 * anchor_width;
-        auto anchor_center_y = anchor_data_tmp[1] + 0.5 * anchor_height;
-
-        float bbox_center_x = 0, bbox_center_y = 0;
-        float bbox_width = 0, bbox_height = 0;
-
-        auto variances_data_tmp = variances_data + var_offset;
-
-        bbox_center_x =
-            variances_data_tmp[0] * bbox_deltas_data_tmp[0] * anchor_width +
-            anchor_center_x;
-        bbox_center_y =
-            variances_data_tmp[1] * bbox_deltas_data_tmp[1] * anchor_height +
-            anchor_center_y;
-        bbox_width = std::exp(variances_data_tmp[2] * bbox_deltas_data_tmp[2]) *
-                     anchor_width;
-        bbox_height =
-            std::exp(variances_data_tmp[3] * bbox_deltas_data_tmp[3]) *
-            anchor_height;
-
-        proposals_data_tmp[0] = bbox_center_x - bbox_width / 2;
-        proposals_data_tmp[1] = bbox_center_y - bbox_height / 2;
-        proposals_data_tmp[2] = bbox_center_x + bbox_width / 2 - normalized;
-        proposals_data_tmp[3] = bbox_center_y + bbox_height / 2 - normalized;
-      }
-    }
-  } else if (code_type == "encode_center_size") {
-    LOG(FATAL) << "not implemented type: " << code_type;
-  } else {
-    LOG(FATAL) << "not supported type: " << code_type;
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/box_coder.h b/lite/backends/arm/math/box_coder.h
deleted file mode 100644
index bbeb3e0618..0000000000
--- a/lite/backends/arm/math/box_coder.h
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-void box_coder(lite::Tensor* proposals,
-               const lite::Tensor* anchors,
-               const lite::Tensor* variances,
-               const lite::Tensor* bbox_deltas,
-               const std::string code_type,
-               bool box_normalized,
-               int axis);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/col_im_transform.cc b/lite/backends/arm/math/col_im_transform.cc
deleted file mode 100644
index b5d2c6af13..0000000000
--- a/lite/backends/arm/math/col_im_transform.cc
+++ /dev/null
@@ -1,75 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
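To summarize the decode arithmetic of box_coder above: each anchor (x1, y1, x2, y2) is converted to center/size form, the variance-scaled deltas shift the center and exponentially rescale the size, and the box is converted back to corner form. A one-box scalar sketch (illustrative only; norm is 1.f when boxes are not normalized, matching `normalized` above):

#include <cmath>

// Decode one box: anchor a[4] = {x1, y1, x2, y2}, delta d[4] =
// {dx, dy, dw, dh}, per-coordinate variance var[4], result out[4].
void decode_center_size_ref(const float a[4],
                            const float d[4],
                            const float var[4],
                            float norm,
                            float out[4]) {
  const float aw = a[2] - a[0] + norm;           // anchor width
  const float ah = a[3] - a[1] + norm;           // anchor height
  const float acx = a[0] + 0.5f * aw;            // anchor center x
  const float acy = a[1] + 0.5f * ah;            // anchor center y
  const float cx = var[0] * d[0] * aw + acx;     // shifted center
  const float cy = var[1] * d[1] * ah + acy;
  const float w = std::exp(var[2] * d[2]) * aw;  // rescaled size
  const float h = std::exp(var[3] * d[3]) * ah;
  out[0] = cx - w / 2;
  out[1] = cy - h / 2;
  out[2] = cx + w / 2 - norm;
  out[3] = cy + h / 2 - norm;
}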
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/col_im_transform.h"
-#include <cstring>
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-inline bool is_a_ge_zero_and_a_lt_b(int a, int b) {
-  return static_cast<unsigned>(a) < static_cast<unsigned>(b);
-}
-
-template <>
-void col2im<float>(const float* data_col,
-                   const int channels,
-                   const int height,
-                   const int width,
-                   const int kernel_h,
-                   const int kernel_w,
-                   const int pad_h,
-                   const int pad_w,
-                   const int stride_h,
-                   const int stride_w,
-                   const int dilation_h,
-                   const int dilation_w,
-                   float* data_im) {
-  memset(data_im, 0, height * width * channels * sizeof(float));
-  const int output_h =
-      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
-  const int output_w =
-      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
-  const int channel_size = height * width;
-  for (int channel = channels; channel--; data_im += channel_size) {
-    for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
-      for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
-        int input_row = -pad_h + kernel_row * dilation_h;
-        for (int output_rows = output_h; output_rows; output_rows--) {
-          if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
-            data_col += output_w;
-          } else {
-            int input_col = -pad_w + kernel_col * dilation_w;
-            for (int output_col = output_w; output_col; output_col--) {
-              if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
-                data_im[input_row * width + input_col] += *data_col;
-              }
-              data_col++;
-              input_col += stride_w;
-            }
-          }
-          input_row += stride_h;
-        }
-      }
-    }
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/col_im_transform.h b/lite/backends/arm/math/col_im_transform.h
deleted file mode 100644
index 8560679d7f..0000000000
--- a/lite/backends/arm/math/col_im_transform.h
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
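The loop bounds in col2im above come from the standard convolution output-size formula; col2im simply re-runs the im2col indexing and accumulates each column entry back into its (possibly overlapping) image location. The formula itself, as a checked one-liner (illustrative only):

// output = (input + 2*pad - (dilation*(kernel-1) + 1)) / stride + 1,
// identical to output_h/output_w computed inside col2im above.
constexpr int conv_out_size(int in, int k, int pad, int stride, int dil) {
  return (in + 2 * pad - (dil * (k - 1) + 1)) / stride + 1;
}
static_assert(conv_out_size(5, 3, 1, 1, 1) == 5,
              "3x3 kernel, pad 1, stride 1 preserves spatial size");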
-
-#pragma once
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename Dtype>
-void col2im(const Dtype* data_col,
-            const int channels,
-            const int height,
-            const int width,
-            const int kernel_h,
-            const int kernel_w,
-            const int pad_h,
-            const int pad_w,
-            const int stride_h,
-            const int stride_w,
-            const int dilation_h,
-            const int dilation_w,
-            Dtype* data_im);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/concat.cc b/lite/backends/arm/math/concat.cc
deleted file mode 100644
index 9b94cefa16..0000000000
--- a/lite/backends/arm/math/concat.cc
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/concat.h"
-#include <cstring>
-#include <vector>
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-void concat_func(const std::vector<lite::Tensor *> &input,
-                 const int axis,
-                 lite::Tensor *output) {
-  size_t num = input.size();
-  int rows = 1;
-  auto dim_0 = input[0]->dims();
-  for (int i = 0; i < axis; ++i) {
-    rows *= dim_0[i];
-  }
-  int out_rows = rows, out_cols = 0;
-
-  std::vector<int> input_cols(input.size());
-  for (int i = 0; i < num; ++i) {
-    int t_cols = input[i]->numel() / rows;
-    out_cols += t_cols;
-    input_cols[i] = t_cols;
-  }
-
-  // computation
-  for (int k = 0; k < out_rows; ++k) {
-    float *dst_ptr = output->mutable_data<float>() + k * out_cols;
-    int col_idx = 0;
-    for (int j = 0; j < num; ++j) {
-      int col_len = input_cols[j];
-      const float *src_ptr = input[j]->data<float>() + k * col_len;
-      std::memcpy(dst_ptr + col_idx, src_ptr, sizeof(float) * col_len);
-      col_idx += col_len;
-    }
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/concat.h b/lite/backends/arm/math/concat.h
deleted file mode 100644
index 4c6159e9e0..0000000000
--- a/lite/backends/arm/math/concat.h
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
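concat_func above reduces concatenation to row-wise memcpy: the dimensions before `axis` collapse into `rows`, each input contributes `numel / rows` contiguous floats per row, and an output row is those per-input segments laid end to end. A minimal stand-in on raw buffers rather than lite::Tensor (illustrative only):

#include <cstring>
#include <vector>

// in[j] points to rows * cols[j] floats; out holds rows * sum(cols) floats.
void concat_rows_ref(const std::vector<const float*>& in,
                     const std::vector<int>& cols,
                     int rows,
                     float* out) {
  int out_cols = 0;
  for (int c : cols) out_cols += c;
  for (int r = 0; r < rows; ++r) {
    float* dst = out + r * out_cols;
    for (size_t j = 0; j < in.size(); ++j) {
      std::memcpy(dst, in[j] + r * cols[j], sizeof(float) * cols[j]);
      dst += cols[j];  // advance to the next segment of this output row
    }
  }
}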
- -#pragma once - -#include -#include -#include -#include "lite/operators/op_params.h" -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void concat_func(const std::vector &input, - const int axis, - lite::Tensor *output); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s1_direct_int8.cc b/lite/backends/arm/math/conv3x3s1_direct_int8.cc deleted file mode 100644 index d44d911131..0000000000 --- a/lite/backends/arm/math/conv3x3s1_direct_int8.cc +++ /dev/null @@ -1,806 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/operators/op_params.h" -#ifdef ARM_WITH_OMP -#include -#endif - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -#ifdef __aarch64__ -void conv_3x3s1_direct_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale) { - const int hin_r_block = 4; - const int hout_c_block = 4; // 8; - const int hout_r_block = 2; - - int stride_w = param.strides[1]; - int pad_w = param.paddings[1]; - int pad_h = param.paddings[0]; - bool flag_relu = param.fuse_relu; - bool flag_bias = (param.bias != nullptr); - - int wout_round = ((wout + 3) / 4) * 4; - int win_round = wout_round * stride_w + 4; - - int threads = ctx->threads(); - - int* tmp_work_space = ctx->workspace_data(); - int* ptr_zero = tmp_work_space; - memset(ptr_zero, 0, sizeof(int) * win_round); - int* ptr_write = ptr_zero + win_round; - - int in_len = win_round * chin; - int pre_in_size = hin_r_block * in_len; - int pre_out_size = hout_c_block * hout_r_block * wout_round; - - signed char* pre_din = reinterpret_cast(ptr_write + wout_round); - - int size_in_channel = win * hin; - int size_out_channel = wout * hout; - int w_stride = chin * 9; - - int ws = -pad_w; - int we = ws + win_round; - int w_loop = wout_round / 4; - - int size_out = wout_round * hout_c_block; - - // printf("win_round: %d, wout_round: %d, ws: %d, we: %d\n", win_round, - // wout_round, ws, we); - // here - for (int n = 0; n < num; ++n) { - const signed char* din_batch = - static_cast(din) + n * chin * size_in_channel; - signed char* dout_batch = - reinterpret_cast(dout) + - n * chout * size_out_channel * PrecisionTypeLength(out_type); - - for (int h = 0; h < hout; h += 2) { - int hs = h - pad_h; - int he = hs + 4; - // printf("hs: %d, he: %d, chin: %d, hin: %d, win: %d \n", hs, he, chin, - // hin, win); - prepack_input_nxw(din_batch, - pre_din, - 0, - chin, - hs, - he, - ws, - we, - chin, - win, - hin, - (signed char*)ptr_zero); - -#pragma omp parallel 
for num_threads(threads) - for (int c = 0; c < chout; c += hout_c_block) { -#ifdef ARM_WITH_OMP - int* pre_out = - reinterpret_cast(pre_din + (pre_in_size + 3) / 4 * 4) + - omp_get_thread_num() * pre_out_size; -#else - int* pre_out = - reinterpret_cast(pre_din + (pre_in_size + 3) / 4 * 4); -#endif - // printf("ptr_zero_int: %x, ptr_zero: %x, ptr_write: %x, pre_din: %x, - // pre_out: %x \n", ptr_zero_int, ptr_zero, ptr_write, pre_din, - // pre_out); - const signed char* inr0 = pre_din; - const signed char* inr1 = inr0 + in_len; - const signed char* inr2 = inr1 + in_len; - const signed char* inr3 = inr2 + in_len; - - const signed char* wc0 = - static_cast(weights) + c * w_stride; - - const int* bias_ptr = ptr_zero; - if (flag_bias) { - bias_ptr = static_cast(bias) + c; - } - // hout_r_block * wout_round * hout_c_block - fill_packed_bias_nxmw_int8( - bias_ptr, pre_out, hout_c_block, hout_r_block, wout_round); - - for (int i = 0; i < chin; ++i) { - const signed char* r0 = inr0; - const signed char* r1 = inr1; - const signed char* r2 = inr2; - const signed char* r3 = inr3; - - int* ptr_out0 = pre_out; - int* ptr_out1 = pre_out + size_out; - - int cnt = w_loop; - const signed char* ptr_wc0 = wc0; - - asm volatile( - "ldp q4, q5, [%[wc0]] \n" /* w4 w5 w6 w7 */ - "ldr q6, [%[wc0], #32] \n" /* w8 */ - "SXTL v11.8h, v4.8b \n" /* w to int16 */ - "SXTL2 v12.8h, v4.16b \n" /* w to int16 */ - "SXTL v13.8h, v5.8b \n" /* to int16 */ - "SXTL2 v14.8h, v5.16b \n" /* to int16 */ - "SXTL v15.8h, v6.8b \n" /* to int16 */ - "1: \n" /* main loop*/ - "ldr d0, [%[r0]] \n" /* load data din0-dinn7*/ - "SXTL v1.8h, v0.8b \n" /* to int16 */ - - /*output 1st row*/ - "smull v16.4s, v11.4h, v1.h[0] \n" /* */ - "smull v17.4s, v11.4h, v1.h[1] \n" /* */ - "smull v18.4s, v11.4h, v1.h[2] \n" /* */ - "smull v19.4s, v11.4h, v1.h[3] \n" /* */ - - "add %[r0], %[r0], #4\n" - - /*output 1st row*/ - "smlal2 v16.4s, v11.8h, v1.h[1] \n" /* */ - "smlal2 v17.4s, v11.8h, v1.h[2] \n" /* */ - "smlal2 v18.4s, v11.8h, v1.h[3] \n" /* */ - "smlal2 v19.4s, v11.8h, v1.h[4] \n" /* */ - - "ldr d0, [%[r1]] \n" /* load data */ - - /*output 1st row*/ - "smlal v16.4s, v12.4h, v1.h[2] \n" /* */ - "smlal v17.4s, v12.4h, v1.h[3] \n" /* */ - "SXTL v2.8h, v0.8b \n" /* to int16 */ - "smlal v18.4s, v12.4h, v1.h[4] \n" /* */ - "smlal v19.4s, v12.4h, v1.h[5] \n" /* */ - - "add %[r1], %[r1], #4 \n" - - /*output 1st row*/ - "smlal2 v16.4s, v12.8h, v2.h[0] \n" /* */ - "smlal2 v17.4s, v12.8h, v2.h[1] \n" /* */ - "smlal2 v18.4s, v12.8h, v2.h[2] \n" /* */ - "smlal2 v19.4s, v12.8h, v2.h[3] \n" /* */ - - /*output 1st row*/ - "smlal v16.4s, v13.4h, v2.h[1] \n" /* */ - "smlal v17.4s, v13.4h, v2.h[2] \n" /* */ - "smlal v18.4s, v13.4h, v2.h[3] \n" /* */ - "smlal v19.4s, v13.4h, v2.h[4] \n" /* */ - - /*output 1st row*/ - "smlal2 v16.4s, v13.8h, v2.h[2] \n" /* */ - "smlal2 v17.4s, v13.8h, v2.h[3] \n" /* */ - "smlal2 v18.4s, v13.8h, v2.h[4] \n" /* */ - "smlal2 v19.4s, v13.8h, v2.h[5] \n" /* */ - - /*output 2rd row*/ - "smull v24.4s, v11.4h, v2.h[0] \n" /* */ - "smull v25.4s, v11.4h, v2.h[1] \n" /* */ - "smull v26.4s, v11.4h, v2.h[2] \n" /* */ - "smull v27.4s, v11.4h, v2.h[3] \n" /* */ - - /*output 2rd row*/ - "smlal2 v24.4s, v11.8h, v2.h[1] \n" /* */ - "smlal2 v25.4s, v11.8h, v2.h[2] \n" /* */ - "smlal2 v26.4s, v11.8h, v2.h[3] \n" /* */ - "smlal2 v27.4s, v11.8h, v2.h[4] \n" /* */ - - "ldr d0, [%[r2]] \n" /* load data */ - - /*output 2rd row*/ - "smlal v24.4s, v12.4h, v2.h[2] \n" /* */ - "smlal v25.4s, v12.4h, v2.h[3] \n" /* */ - "SXTL v1.8h, v0.8b \n" /* to int16 */ - "smlal 
v26.4s, v12.4h, v2.h[4] \n" /* */ - "smlal v27.4s, v12.4h, v2.h[5] \n" /* */ - - /*output 1st row*/ - "smlal v16.4s, v14.4h, v1.h[0] \n" /* */ - "smlal v17.4s, v14.4h, v1.h[1] \n" /* */ - "smlal v18.4s, v14.4h, v1.h[2] \n" /* */ - "smlal v19.4s, v14.4h, v1.h[3] \n" /* */ - - "add %[r2], %[r2], #4 \n" - - /*output 1st row*/ - "smlal2 v16.4s, v14.8h, v1.h[1] \n" /* */ - "smlal2 v17.4s, v14.8h, v1.h[2] \n" /* */ - "smlal2 v18.4s, v14.8h, v1.h[3] \n" /* */ - "smlal2 v19.4s, v14.8h, v1.h[4] \n" /* */ - - "ldp q3, q4, [%[ptr_out0]] \n" - "ldp q5, q6, [%[ptr_out0], #32] \n" - - /*output 1st row*/ - "smlal v16.4s, v15.4h, v1.h[2] \n" /* */ - "smlal v17.4s, v15.4h, v1.h[3] \n" /* */ - "smlal v18.4s, v15.4h, v1.h[4] \n" /* */ - "smlal v19.4s, v15.4h, v1.h[5] \n" /* */ - - "ADD v3.4s, v16.4s, v3.4s \n" - "ADD v4.4s, v17.4s, v4.4s \n" - "ADD v5.4s, v18.4s, v5.4s \n" - "ADD v6.4s, v19.4s, v6.4s \n" - - "stp q3, q4, [%[ptr_out0]], #32 \n" /* save to - output*/ - "stp q5, q6, [%[ptr_out0]], #32 \n" /* save to - output*/ - - /*output 2rd row*/ - "smlal2 v24.4s, v12.8h, v1.h[0] \n" /* */ - "smlal2 v25.4s, v12.8h, v1.h[1] \n" /* */ - "smlal2 v26.4s, v12.8h, v1.h[2] \n" /* */ - "smlal2 v27.4s, v12.8h, v1.h[3] \n" /* */ - - /*output 2rd row*/ - "smlal v24.4s, v13.4h, v1.h[1] \n" /* */ - "smlal v25.4s, v13.4h, v1.h[2] \n" /* */ - "smlal v26.4s, v13.4h, v1.h[3] \n" /* */ - "smlal v27.4s, v13.4h, v1.h[4] \n" /* */ - - "ldr d0, [%[r3]] \n" /* load data */ - - /*output 2rd row*/ - "smlal2 v24.4s, v13.8h, v1.h[2] \n" /* */ - "smlal2 v25.4s, v13.8h, v1.h[3] \n" /* */ - "SXTL v2.8h, v0.8b \n" /* to int16 */ - "smlal2 v26.4s, v13.8h, v1.h[4] \n" /* */ - "smlal2 v27.4s, v13.8h, v1.h[5] \n" /* */ - - /*output 2rd row*/ - "smlal v24.4s, v14.4h, v2.h[0] \n" /* */ - "smlal v25.4s, v14.4h, v2.h[1] \n" /* */ - "smlal v26.4s, v14.4h, v2.h[2] \n" /* */ - "smlal v27.4s, v14.4h, v2.h[3] \n" /* */ - - "add %[r3], %[r3], #4 \n" - - /*output 2rd row*/ - "smlal2 v24.4s, v14.8h, v2.h[1] \n" /* */ - "smlal2 v25.4s, v14.8h, v2.h[2] \n" /* */ - "smlal2 v26.4s, v14.8h, v2.h[3] \n" /* */ - "smlal2 v27.4s, v14.8h, v2.h[4] \n" /* */ - - "ldp q3, q4, [%[ptr_out1]] \n" - "ldp q5, q6, [%[ptr_out1], #32] \n" - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1 */ - - /*output 2rd row*/ - "smlal v24.4s, v15.4h, v2.h[2] \n" /* */ - "smlal v25.4s, v15.4h, v2.h[3] \n" /* */ - "smlal v26.4s, v15.4h, v2.h[4] \n" /* */ - "smlal v27.4s, v15.4h, v2.h[5] \n" /* */ - - "ADD v3.4s, v24.4s, v3.4s \n" - "ADD v4.4s, v25.4s, v4.4s \n" - "ADD v5.4s, v26.4s, v5.4s \n" - "ADD v6.4s, v27.4s, v6.4s \n" - - "stp q3, q4, [%[ptr_out1]], #32 \n" /* save to output*/ - "stp q5, q6, [%[ptr_out1]], #32 \n" /* save to output*/ - - "bne 1b \n" /* jump to main loop*/ - - : [cnt] "+r"(cnt), - [wc0] "+r"(ptr_wc0), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v24", - "v25", - "v26", - "v27" - - ); - - wc0 += 9 * hout_c_block; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - } - if (out_type == PRECISION(kFloat)) { - write_to_output_c4_int32_1(pre_out, - reinterpret_cast(dout_batch), - hout_c_block, - hout_r_block, - c, - c + 4, - h, - h + 2, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - reinterpret_cast(ptr_write), - &scale[c], - out_type); - } else if (out_type == PRECISION(kInt8)) { - 
write_to_output_c4_int32_1(pre_out, - dout_batch, - hout_c_block, - hout_r_block, - c, - c + 4, - h, - h + 2, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - reinterpret_cast(ptr_write), - &scale[c], - out_type); - } else { // int32 - write_to_output_c4_int32(pre_out, - reinterpret_cast(dout_batch), - hout_c_block, - hout_r_block, - c, - c + 4, - h, - h + 2, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - ptr_write); - } - } - } - } -} - -#else - -void conv_3x3s1_direct_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale) { - // printf("conv2_3x3s1_direct_int8 \n"); - - const int hin_r_block = 4; - const int hout_c_block = 4; // 8 - const int hout_r_block = 2; - - int stride_w = param.strides[1]; - int pad_w = param.paddings[1]; - int pad_h = param.paddings[0]; - bool flag_relu = param.fuse_relu; - bool flag_bias = (param.bias != nullptr); - - int wout_round = ((wout + 3) / 4) * 4; - int win_round = wout_round * stride_w + 4; - - int threads = ctx->threads(); - - int* tmp_work_space = ctx->workspace_data(); - int* ptr_zero = tmp_work_space; - memset(ptr_zero, 0, sizeof(int) * win_round); - int* ptr_write = ptr_zero + win_round; - - int in_len = win_round * chin; - int pre_in_size = hin_r_block * in_len; - int pre_out_size = hout_c_block * hout_r_block * wout_round; - - signed char* pre_din = reinterpret_cast(ptr_write + wout_round); - - int size_in_channel = win * hin; - int size_out_channel = wout * hout; - int w_stride = chin * 9; - - int ws = -pad_w; - int we = ws + win_round; - int w_loop = wout_round / 4; - - int size_out = wout_round * hout_c_block; - - // printf("win_round: %d, wout_round: %d, ws: %d, we: %d\n", win_round, - // wout_round, ws, we); - - for (int n = 0; n < num; ++n) { - const signed char* din_batch = - static_cast(din) + n * chin * size_in_channel; - signed char* dout_batch = - reinterpret_cast(dout) + - n * chout * size_out_channel * PrecisionTypeLength(out_type); - - for (int h = 0; h < hout; h += 2) { - int hs = h - pad_h; - int he = hs + 4; - // printf("hs: %d, he: %d, chin: %d, hin: %d, win: %d \n", hs, he, chin, - // hin, win); - prepack_input_nxw(din_batch, - pre_din, - 0, - chin, - hs, - he, - ws, - we, - chin, - win, - hin, - (signed char*)ptr_zero); - -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < chout; c += hout_c_block) { // 4 -#ifdef ARM_WITH_OMP - int* pre_out = - reinterpret_cast(pre_din + (pre_in_size + 3) / 4 * 4) + - omp_get_thread_num() * pre_out_size; -#else - int* pre_out = - reinterpret_cast(pre_din + (pre_in_size + 3) / 4 * 4); -#endif - // printf("ptr_zero_int: %x, ptr_zero: %x, ptr_write: %x, pre_din: %x, - // pre_out: %x \n", ptr_zero_int, ptr_zero, ptr_write, pre_din, - // pre_out); - const signed char* inr0 = pre_din; - const signed char* inr1 = inr0 + in_len; - const signed char* inr2 = inr1 + in_len; - const signed char* inr3 = inr2 + in_len; - - const signed char* wc0 = - static_cast(weights) + c * w_stride; - - const int* bias_ptr = ptr_zero; - if (flag_bias) { - bias_ptr = static_cast(bias) + c; - } - // hout_r_block * wout_round * hout_c_block - fill_packed_bias_nxmw_int8( - bias_ptr, pre_out, hout_c_block, hout_r_block, wout_round); - - for (int i = 0; i < chin; ++i) { - const signed char* r0 = inr0; - const signed char* r1 = inr1; - const signed char* r2 = inr2; - const 
signed char* r3 = inr3; - - int* ptr_out0 = pre_out; - int* ptr_out1 = pre_out + size_out; - - int cnt = w_loop; - const signed char* ptr_wc = wc0; - - asm volatile( - "vld1.s8 {d0-d3}, [%[wc0]]! \n" /* wc0, wc1, wc2, wc3, wc4, - wc5, wc6, wc7*/ - "vld1.s8 {d4}, [%[wc0]]! \n" /* wc8 */ - "vmovl.s8 q3, d0 \n" /* q3 = w0, w1 */ - "vmovl.s8 q4, d1 \n" /* q4 = w2 ,w3 */ - "vmovl.s8 q5, d2 \n" /* q5 = w4, w5 */ - "vmovl.s8 q6, d3 \n" /* q6 = w6, w7 */ - "vmovl.s8 q7, d4 \n" /* q7 = w8 */ - - "1: \n" /* main loop*/ - "vld1.s32 {d0}, [%[r0]] \n" /* load data din0-dinn7*/ - "vmovl.s8 q0, d0 \n" /* movl d0 -> q0 */ - /*output 1st row*/ - "vmull.s16 q8, d6, d0[0] \n" /* q8 = w0 * r0[0] */ - "vmull.s16 q9, d6, d0[1] \n" /* q9 = w0 * r0[2] */ - "vmull.s16 q10, d6, d0[2] \n" /* q10 = w0 * r0[4] */ - "vmull.s16 q11, d6, d0[3] \n" /* q11 = w0 * r0[6] */ - - "add %[r0], #4 \n" - - /*output 1st row*/ - "vmlal.s16 q8, d7, d0[1] \n" /* q8 = w1 * r0[1] */ - "vmlal.s16 q9, d7, d0[2] \n" /* q9 = w1 * r0[2] */ - "vmlal.s16 q10, d7, d0[3] \n" /* q10 = w1 * r0[3] */ - "vmlal.s16 q11, d7, d1[0] \n" /* q11 = w1 * r0[4] */ - - "vld1.s32 {d2}, [%[r1]] \n" /* load input r1 -> d2 */ - "vmovl.s8 q1, d2 \n" /* movl d2 -> q1 */ - - /*output 1st row*/ - "vmlal.s16 q8, d8, d0[2] \n" /* q8 = w2 * r0[2] */ - "vmlal.s16 q9, d8, d0[3] \n" /* q9 = w2 * r0[3] */ - "vmlal.s16 q10, d8, d1[0] \n" /* q10 = w2 * r0[4] */ - "vmlal.s16 q11, d8, d1[1] \n" /* q11 = w2 * r0[5] */ - - /*output 1st row*/ - "vmlal.s16 q8, d9, d2[0] \n" /* */ - "vmlal.s16 q9, d9, d2[1] \n" /* */ - "vmlal.s16 q10, d9, d2[2] \n" /* */ - "vmlal.s16 q11, d9, d2[3] \n" /* */ - - "add %[r1], #4 \n" - - /*output 1st row*/ - "vmlal.s16 q8, d10, d2[1] \n" /* */ - "vmlal.s16 q9, d10, d2[2] \n" /* */ - "vmlal.s16 q10, d10, d2[3] \n" /* */ - "vmlal.s16 q11, d10, d3[0] \n" /* */ - - /*output 1st row*/ - "vmlal.s16 q8, d11, d2[2] \n" /* */ - "vmlal.s16 q9, d11, d2[3] \n" /* */ - "vmlal.s16 q10, d11, d3[0] \n" /* */ - "vmlal.s16 q11, d11, d3[1] \n" /* */ - - /*output 2rd row*/ - "vmull.s16 q12, d6, d2[0] \n" /* */ - "vmull.s16 q13, d6, d2[1] \n" /* */ - "vmull.s16 q14, d6, d2[2] \n" /* */ - "vmull.s16 q15, d6, d2[3] \n" /* */ - - "vld1.s32 {d0}, [%[r2]] \n" /* load input r2 -> d2 */ - "vmovl.s8 q0, d0 \n" /* movl d2 -> q1 */ - - /*output 2rd row*/ - "vmlal.s16 q12, d7, d2[1] \n" /* */ - "vmlal.s16 q13, d7, d2[2] \n" /* */ - "vmlal.s16 q14, d7, d2[3] \n" /* */ - "vmlal.s16 q15, d7, d3[0] \n" /* */ - - /*output 2rd row*/ - "vmlal.s16 q12, d8, d2[2] \n" /* */ - "vmlal.s16 q13, d8, d2[3] \n" /* */ - "vmlal.s16 q14, d8, d3[0] \n" /* */ - "vmlal.s16 q15, d8, d3[1] \n" /* */ - - "add %[r2], #4 \n" - - /*output 1st row*/ - "vmlal.s16 q8, d12, d0[0] \n" /* */ - "vmlal.s16 q9, d12, d0[1] \n" /* */ - "vmlal.s16 q10, d12, d0[2] \n" /* */ - "vmlal.s16 q11, d12, d0[3] \n" /* */ - - /*output 1st row*/ - "vmlal.s16 q8, d13, d0[1] \n" /* */ - "vmlal.s16 q9, d13, d0[2] \n" /* */ - "vmlal.s16 q10, d13, d0[3] \n" /* */ - "vmlal.s16 q11, d13, d1[0] \n" /* */ - - "vld1.32 {d2-d5}, [%[ptr_out0]] \n" /* load ptr_out -> q, q - */ - - /*output 1st row*/ - "vmlal.s16 q8, d14, d0[2] \n" /* */ - "vmlal.s16 q9, d14, d0[3] \n" /* */ - "vmlal.s16 q10, d14, d1[0] \n" /* */ - "vmlal.s16 q11, d14, d1[1] \n" /* */ - - /*load & store output 1st row*/ - "vadd.s32 q1, q8, q1 \n" /* out[0] += q8 */ - "vadd.s32 q2, q9, q2 \n" /* out[0] += q8 */ - "vst1.s32 {d2-d5}, [%[ptr_out0]]! 
\n" - - /*output 2rd row*/ - "vmlal.s16 q12, d9, d0[0] \n" /* */ - "vmlal.s16 q13, d9, d0[1] \n" /* */ - "vmlal.s16 q14, d9, d0[2] \n" /* */ - "vmlal.s16 q15, d9, d0[3] \n" /* */ - - "vld1.32 {d2-d5}, [%[ptr_out0]] \n" /* load ptr_out -> q2, q3 - */ - - /*output 2rd row */ - "vmlal.s16 q12, d10, d0[1] \n" /* */ - "vmlal.s16 q13, d10, d0[2] \n" /* */ - "vadd.s32 q1, q10, q1 \n" /* out[0] += q */ - "vadd.s32 q2, q11, q2 \n" /* out[1] += q */ - - "vmlal.s16 q14, d10, d0[3] \n" /* */ - "vst1.s32 {d2-d5}, [%[ptr_out0]]! \n" - "vmlal.s16 q15, d10, d1[0] \n" /* */ - - /*output 2rd row */ - "vmlal.s16 q12, d11, d0[2] \n" /* */ - "vmlal.s16 q13, d11, d0[3] \n" /* */ - - "vld1.s32 {d4}, [%[r3]] \n" /* load input r2 -> d2 - */ - "vmovl.s8 q2, d4 \n" /* movl d2 -> q2 */ - - "vmlal.s16 q14, d11, d1[0] \n" /* */ - "vmlal.s16 q15, d11, d1[1] \n" /* */ - - "add %[r3], #4 \n" - - /*output 2rd row */ - "vmlal.s16 q12, d12, d4[0] \n" /* */ - "vmlal.s16 q13, d12, d4[1] \n" /* */ - "vmlal.s16 q14, d12, d4[2] \n" /* */ - "vmlal.s16 q15, d12, d4[3] \n" /* */ - - "vld1.32 {d0-d3}, [%[ptr_out1]] \n" /* */ - - /*output 2rd row */ - "vmlal.s16 q12, d13, d4[1] \n" /* */ - "vmlal.s16 q13, d13, d4[2] \n" /* */ - "vmlal.s16 q14, d13, d4[3] \n" /* */ - "vmlal.s16 q15, d13, d5[0] \n" /* */ - - "subs %[cnt], #1 \n" - - /*output 2rd row */ - "vmlal.s16 q12, d14, d4[2] \n" /* */ - "vmlal.s16 q13, d14, d4[3] \n" /* */ - "vmlal.s16 q14, d14, d5[0] \n" /* */ - "vmlal.s16 q15, d14, d5[1] \n" /* */ - - /*output 2rd row*/ - "vadd.s32 q0, q12, q0 \n" /* */ - "vadd.s32 q1, q13, q1 \n" /* */ - "vst1.s32 {d0-d3}, [%[ptr_out1]]! \n" - - "vld1.32 {d0-d3}, [%[ptr_out1]] \n" /* */ - "vadd.s32 q0, q14, q0 \n" /* */ - "vadd.s32 q1, q15, q1 \n" /* */ - "vst1.s32 {d0-d3}, [%[ptr_out1]]! \n" - - "bne 1b \n" /* jump to main loop*/ - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1), - [wc0] "+r"(ptr_wc) - : - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - wc0 += 9 * hout_c_block; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - } - - if (out_type == PRECISION(kFloat)) { - write_to_output_c4_int32_1(pre_out, - reinterpret_cast(dout_batch), - hout_c_block, - hout_r_block, - c, - c + 4, - h, - h + 2, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - reinterpret_cast(ptr_write), - &scale[c], - out_type); - } else if (out_type == PRECISION(kInt8)) { - write_to_output_c4_int32_1(pre_out, - dout_batch, - hout_c_block, - hout_r_block, - c, - c + 4, - h, - h + 2, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - reinterpret_cast(ptr_write), - &scale[c], - out_type); - } else { // int32 - write_to_output_c4_int32(pre_out, - reinterpret_cast(dout_batch), - hout_c_block, - hout_r_block, - c, - c + 4, - h, - h + 2, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - ptr_write); - } - } - } - } -} - -#endif // __aarch64__ - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s2_direct_int8.cc b/lite/backends/arm/math/conv3x3s2_direct_int8.cc deleted file mode 100644 index 6169ad5d12..0000000000 --- a/lite/backends/arm/math/conv3x3s2_direct_int8.cc +++ /dev/null @@ -1,1081 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/operators/op_params.h" -#ifdef ARM_WITH_OMP -#include -#endif - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -#ifdef __aarch64__ -int conv_3x3s2_direct_int8_c_num() { return 8; } -void conv_3x3s2_direct_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale) { - //! 3x3s2 int8 convolution, implemented by direct algorithm - //! prepack input to tmp buffer - //! write output to tmp buffer - int threads = ctx->threads(); - int stride_w = param.strides[1]; - int pad_w = param.paddings[1]; - int pad_h = param.paddings[0]; - bool flag_relu = param.fuse_relu; - bool flag_bias = (param.bias != nullptr); - - //! set 2/3 l2 cache - int l2_size = ctx->llc_size() / 3 * 2; - const int hout_c_block = 8; - const int hout_r_kernel = 2; - const int wout_round = ((wout + 3) / 4) * 4; - const int win_round = wout_round * stride_w + 1; - - //! get h block - //! win_round * chin * hin_r_block * sizeof(int8_t) + wout_round * - //! hout_c_block * hout_r_block * threads * sizeof(int32_t)= l2_size - //! win_round = 2 * wout_round + 1 - //! hin_r_block = 2 * hout_r_block + 1 - int hout_r_block = - (l2_size - 2 * wout_round * chin - chin) / - ((4 * wout_round + 2) * chin + wout_round * hout_c_block * threads * 4); - hout_r_block = hout_r_block > hout ? hout : hout_r_block; - hout_r_block = (hout_r_block / hout_r_kernel) * hout_r_kernel; - hout_r_block = hout_r_block < hout_r_kernel ? hout_r_kernel : hout_r_block; - - const int hin_r_block = hout_r_block * 2 + 1; - - int8_t* tmp_work_space = ctx->workspace_data(); - int zero_size = chout > (win_round + 3) / 4 ? chout : (win_round + 3) / 4; - const int kZeroSize = zero_size; - int32_t ptr_zero[kZeroSize]; - memset(ptr_zero, 0, sizeof(int32_t) * zero_size); - const int kWoutRound = wout_round; - int32_t ptr_write[kWoutRound]; - - int in_len = win_round * chin; - int pre_in_size = hin_r_block * in_len; - int pre_out_size = hout_c_block * hout_r_block * wout_round; - - //! 
l2_cache start - int8_t* pre_din = tmp_work_space; - - int size_in_channel = win * hin; - int size_out_channel = wout * hout; - int w_stride = chin * 9; - - int ws = -pad_w; - int we = ws + win_round; - int w_loop = wout_round / 4; - - int out_row_stride = hout_c_block * wout_round; - - for (int n = 0; n < num; ++n) { - const int8_t* din_batch = din + n * chin * size_in_channel; - int8_t* dout_batch = - reinterpret_cast(dout) + - n * chout * size_out_channel * PrecisionTypeLength(out_type); - for (int h = 0; h < hout; h += hout_r_block) { - int h_kernel = hout_r_block; - if (h + hout_r_block > hout) { - h_kernel = hout - h; - } - int hs = h * 2 - pad_h; - int he = hs + h_kernel * 2 + 1; - prepack_input_nxw(din_batch, - pre_din, - 0, - chin, - hs, - he, - ws, - we, - chin, - win, - hin, - reinterpret_cast(ptr_zero)); - - const int8_t* cblock_inr0 = pre_din; - const int8_t* cblock_inr1 = cblock_inr0 + in_len; - const int8_t* cblock_inr2 = cblock_inr1 + in_len; - const int8_t* cblock_inr3 = cblock_inr2 + in_len; - const int8_t* cblock_inr4 = cblock_inr3 + in_len; - -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < chout; c += hout_c_block) { -#ifdef ARM_WITH_OMP - int32_t* pre_out = - reinterpret_cast(pre_din + (pre_in_size + 3) / 4 * 4) + - omp_get_thread_num() * pre_out_size; -#else - int32_t* pre_out = - reinterpret_cast(pre_din + (pre_in_size + 3) / 4 * 4); -#endif - const int8_t* block_inr0 = cblock_inr0; - const int8_t* block_inr1 = cblock_inr1; - const int8_t* block_inr2 = cblock_inr2; - const int8_t* block_inr3 = cblock_inr3; - const int8_t* block_inr4 = cblock_inr4; - - const int8_t* weight_c = weights + c * w_stride; - const int32_t* bias_ptr = ptr_zero; - if (flag_bias) { - bias_ptr = bias + c; - } - - fill_packed_bias_nxmw_int8(bias_ptr, pre_out, 8, h_kernel, wout_round); - for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) { - const int8_t* wc0 = weight_c; - - const int8_t* inr0 = block_inr0; - const int8_t* inr1 = block_inr1; - const int8_t* inr2 = block_inr2; - const int8_t* inr3 = block_inr3; - const int8_t* inr4 = block_inr4; - - int32_t* pre_out0 = pre_out + hk * out_row_stride; - int32_t* pre_out1 = pre_out0 + out_row_stride; - for (int i = 0; i < chin; ++i) { - int16x8_t v0 = vmovl_s8(vld1_s8(wc0)); // w0 - int16x8_t v1 = vmovl_s8(vld1_s8(wc0 + 8)); // w1 - int16x8_t v2 = vmovl_s8(vld1_s8(wc0 + 16)); // w2, - - int16x8_t v3 = vmovl_s8(vld1_s8(wc0 + 24)); // w3 - int16x8_t v4 = vmovl_s8(vld1_s8(wc0 + 32)); // w4 - int16x8_t v5 = vmovl_s8(vld1_s8(wc0 + 40)); // w5 - - int16x8_t v6 = vmovl_s8(vld1_s8(wc0 + 48)); // w6 - int16x8_t v7 = vmovl_s8(vld1_s8(wc0 + 56)); // w7 - int16x8_t v8 = vmovl_s8(vld1_s8(wc0 + 64)); // w8 - - const int8_t* r0 = inr0; - const int8_t* r1 = inr1; - const int8_t* r2 = inr2; - const int8_t* r3 = inr3; - const int8_t* r4 = inr4; - - int32_t* ptr_out0 = pre_out0; - int32_t* ptr_out1 = pre_out1; - int cnt = w_loop; - - asm volatile( - "ldr q0, [%[r0]], #8 \n" /* load input r0 */ - "ldr q1, [%[r2]], #8 \n" /* load input r2 */ - "sshll v0.8h, v0.8b, #0 \n" /* r0: int8 -> int16 */ - "sshll v1.8h, v1.8b, #0 \n" /* r1: int8 -> int16*/ - "1: \n" /* main loop */ - - /* r0, r2 mul w00 */ - "smull v4.4s, %[v0].4h, v0.h[0]\n" /* outr00 = v0 * r0[0] - */ - "smull2 v5.4s, %[v0].8h, v0.h[0]\n" /* outr00 = v0 * r0[0] - */ - "smull v6.4s, %[v0].4h, v0.h[2]\n" /* outr01 = v0 * r0[2] - */ - "smull2 v7.4s, %[v0].8h, v0.h[2]\n" /* outr00 = v0 * r0[0] - */ - "smull v8.4s, %[v0].4h, v0.h[4]\n" /* outr02 = v0 * r0[4] - */ - "smull2 v9.4s, %[v0].8h, 
v0.h[4]\n" /* outr00 = v0 * r0[0] - */ - "smull v10.4s, %[v0].4h, v0.h[6]\n" /* outr03 = v0 * r0[6] - */ - "smull2 v11.4s, %[v0].8h, v0.h[6]\n" /* outr00 = v0 * r0[0] - */ - - "smull v12.4s, %[v0].4h, v1.h[0]\n" /* outr10 = v0 * r2[0] - */ - "smull2 v13.4s, %[v0].8h, v1.h[0]\n" /* outr11 = v0 * r2[2] - */ - "smull v14.4s, %[v0].4h, v1.h[2]\n" /* outr12 = v0 * r2[4] - */ - "smull2 v15.4s, %[v0].8h, v1.h[2]\n" /* outr13 = v0 * r2[6] - */ - "smull v16.4s, %[v0].4h, v1.h[4]\n" /* outr10 = v0 * r2[0] - */ - "smull2 v17.4s, %[v0].8h, v1.h[4]\n" /* outr11 = v0 * r2[2] - */ - "smull v18.4s, %[v0].4h, v1.h[6]\n" /* outr12 = v0 * r2[4] - */ - "smull2 v19.4s, %[v0].8h, v1.h[6]\n" /* outr13 = v0 * r2[6] - */ - - /* r2, mul w06 */ - "smlal v4.4s, %[v6].4h, v1.h[0]\n" /* outr00 = v6 * r2[1] - */ - "smlal2 v5.4s, %[v6].8h, v1.h[0]\n" /* outr01 = v6 * r2[3] - */ - "smlal v6.4s, %[v6].4h, v1.h[2]\n" /* outr02 = v6 * r2[5] - */ - "smlal2 v7.4s, %[v6].8h, v1.h[2]\n" /* outr03 = v6 * r2[7] - */ - "smlal v8.4s, %[v6].4h, v1.h[4]\n" /* outr00 = v6 * r2[1] - */ - "smlal2 v9.4s, %[v6].8h, v1.h[4]\n" /* outr01 = v6 * r2[3] - */ - "smlal v10.4s, %[v6].4h, v1.h[6]\n" /* outr02 = v6 * r2[5] - */ - "smlal2 v11.4s, %[v6].8h, v1.h[6]\n" /* outr03 = v6 * r2[7] - */ - - "ldr q2, [%[r0]] \n" /* load r0, 9th - data,v10.s[0] */ - - /* r0, r2, mul w01 */ - "smlal v4.4s, %[v1].4h, v0.h[1]\n" /* outr00 = v0 * r0[0] - */ - "smlal2 v5.4s, %[v1].8h, v0.h[1]\n" /* outr00 = v0 * r0[0] - */ - "smlal v6.4s, %[v1].4h, v0.h[3]\n" /* outr01 = v0 * r0[2] - */ - "smlal2 v7.4s, %[v1].8h, v0.h[3]\n" /* outr00 = v0 * r0[0] - */ - "sshll v2.8h, v2.8b, #0 \n" /* r0: int8 -> int16 */ - "smlal v8.4s, %[v1].4h, v0.h[5]\n" /* outr02 = v0 * r0[4] - */ - "smlal2 v9.4s, %[v1].8h, v0.h[5]\n" /* outr00 = v0 * r0[0] - */ - "smlal v10.4s, %[v1].4h, v0.h[7]\n" /* outr03 = v0 * r0[6] - */ - "smlal2 v11.4s, %[v1].8h, v0.h[7]\n" /* outr00 = v0 * r0[0] - */ - - "smlal v12.4s, %[v1].4h, v1.h[1]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v13.4s, %[v1].8h, v1.h[1]\n" /* outr11 = v0 * r2[2] - */ - "smlal v14.4s, %[v1].4h, v1.h[3]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v15.4s, %[v1].8h, v1.h[3]\n" /* outr13 = v0 * r2[6] - */ - "smlal v16.4s, %[v1].4h, v1.h[5]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v17.4s, %[v1].8h, v1.h[5]\n" /* outr11 = v0 * r2[2] - */ - "smlal v18.4s, %[v1].4h, v1.h[7]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v19.4s, %[v1].8h, v1.h[7]\n" /* outr13 = v0 * r2[6] - */ - - /* r2, mul w07 */ - "smlal v4.4s, %[v7].4h, v1.h[1]\n" /* outr00 = v6 * r2[1] - */ - "smlal2 v5.4s, %[v7].8h, v1.h[1]\n" /* outr01 = v6 * r2[3] - */ - "smlal v6.4s, %[v7].4h, v1.h[3]\n" /* outr02 = v6 * r2[5] - */ - "smlal2 v7.4s, %[v7].8h, v1.h[3]\n" /* outr03 = v6 * r2[7] - */ - "smlal v8.4s, %[v7].4h, v1.h[5]\n" /* outr00 = v6 * r2[1] - */ - "smlal2 v9.4s, %[v7].8h, v1.h[5]\n" /* outr01 = v6 * r2[3] - */ - "smlal v10.4s, %[v7].4h, v1.h[7]\n" /* outr02 = v6 * r2[5] - */ - "smlal2 v11.4s, %[v7].8h, v1.h[7]\n" /* outr03 = v6 * r2[7] - */ - - "ldr q3, [%[r2]] \n" /* load r2, 9th - data,v11.s[0] */ - - /* r0, r2, mul w02 */ - "smlal v4.4s, %[v2].4h, v0.h[2]\n" /* outr00 = v0 * r0[0] - */ - "smlal2 v5.4s, %[v2].8h, v0.h[2]\n" /* outr00 = v0 * r0[0] - */ - "smlal v6.4s, %[v2].4h, v0.h[4]\n" /* outr01 = v0 * r0[2] - */ - "smlal2 v7.4s, %[v2].8h, v0.h[4]\n" /* outr00 = v0 * r0[0] - */ - "sshll v3.8h, v3.8b, #0 \n" /* r2: int8 -> int16*/ - "smlal v8.4s, %[v2].4h, v0.h[6]\n" /* outr02 = v0 * r0[4] - */ - "smlal2 v9.4s, %[v2].8h, v0.h[6]\n" /* outr00 = v0 * r0[0] - */ - "smlal v10.4s, 
%[v2].4h, v2.h[0]\n" /* outr03 = v0 * r0[6] - */ - "smlal2 v11.4s, %[v2].8h, v2.h[0]\n" /* outr00 = v0 * r0[0] - */ - - "ldr q0, [%[r1]], #8 \n" /* load input r1 */ - - "smlal v12.4s, %[v2].4h, v1.h[2]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v13.4s, %[v2].8h, v1.h[2]\n" /* outr11 = v0 * r2[2] - */ - "smlal v14.4s, %[v2].4h, v1.h[4]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v15.4s, %[v2].8h, v1.h[4]\n" /* outr13 = v0 * r2[6] - */ - "sshll v0.8h, v0.8b, #0 \n" /* r1 : int8 -> int16 */ - "smlal v16.4s, %[v2].4h, v1.h[6]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v17.4s, %[v2].8h, v1.h[6]\n" /* outr11 = v0 * r2[2] - */ - "smlal v18.4s, %[v2].4h, v3.h[0]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v19.4s, %[v2].8h, v3.h[0]\n" /* outr13 = v0 * r2[6] - */ - - /* r2, mul w08 */ - "smlal v4.4s, %[v8].4h, v1.h[2]\n" /* outr00 = v6 * r2[1] - */ - "smlal2 v5.4s, %[v8].8h, v1.h[2]\n" /* outr01 = v6 * r2[3] - */ - "smlal v6.4s, %[v8].4h, v1.h[4]\n" /* outr02 = v6 * r2[5] - */ - "smlal2 v7.4s, %[v8].8h, v1.h[4]\n" /* outr03 = v6 * r2[7] - */ - "smlal v8.4s, %[v8].4h, v1.h[6]\n" /* outr00 = v6 * r2[1] - */ - "smlal2 v9.4s, %[v8].8h, v1.h[6]\n" /* outr01 = v6 * r2[3] - */ - "smlal v10.4s, %[v8].4h, v3.h[0]\n" /* outr02 = v6 * r2[5] - */ - "smlal2 v11.4s, %[v8].8h, v3.h[0]\n" /* outr03 = v6 * r2[7] - */ - - "ldr q1, [%[r3]], #8 \n" /* load input r3 */ - - /* r1, r3, mul w03 */ - "smlal v4.4s, %[v3].4h, v0.h[0]\n" /* outr00 = v0 * r0[0] - */ - "smlal2 v5.4s, %[v3].8h, v0.h[0]\n" /* outr00 = v0 * r0[0] - */ - "smlal v6.4s, %[v3].4h, v0.h[2]\n" /* outr01 = v0 * r0[2] - */ - "smlal2 v7.4s, %[v3].8h, v0.h[2]\n" /* outr00 = v0 * r0[0] - */ - "sshll v1.8h, v1.8b, #0 \n" /* r3: int8 -> int16 */ - "smlal v8.4s, %[v3].4h, v0.h[4]\n" /* outr02 = v0 * r0[4] - */ - "smlal2 v9.4s, %[v3].8h, v0.h[4]\n" /* outr00 = v0 * r0[0] - */ - "smlal v10.4s, %[v3].4h, v0.h[6]\n" /* outr03 = v0 * r0[6] - */ - "smlal2 v11.4s, %[v3].8h, v0.h[6]\n" /* outr00 = v0 * r0[0] - */ - "ldr q2, [%[r1]] \n" /* load r1, 9th - data,v10.s[0] */ - - "smlal v12.4s, %[v3].4h, v1.h[0]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v13.4s, %[v3].8h, v1.h[0]\n" /* outr11 = v0 * r2[2] - */ - "smlal v14.4s, %[v3].4h, v1.h[2]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v15.4s, %[v3].8h, v1.h[2]\n" /* outr13 = v0 * r2[6] - */ - "ldr q3, [%[r3]] \n" /* load r3, 9th - data,v11.s[0] */ - "smlal v16.4s, %[v3].4h, v1.h[4]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v17.4s, %[v3].8h, v1.h[4]\n" /* outr11 = v0 * r2[2] - */ - "smlal v18.4s, %[v3].4h, v1.h[6]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v19.4s, %[v3].8h, v1.h[6]\n" /* outr13 = v0 * r2[6] - */ - "sshll v2.8h, v2.8b, #0 \n" /* r1 : int8 -> int16 */ - - /* r1, r3, mul w05 */ - "smlal v4.4s, %[v5].4h, v0.h[2]\n" /* outr00 = v0 * r0[0] - */ - "smlal2 v5.4s, %[v5].8h, v0.h[2]\n" /* outr00 = v0 * r0[0] - */ - "smlal v6.4s, %[v5].4h, v0.h[4]\n" /* outr01 = v0 * r0[2] - */ - "smlal2 v7.4s, %[v5].8h, v0.h[4]\n" /* outr00 = v0 * r0[0] - */ - "sshll v3.8h, v3.8b, #0 \n" /* r3 : int8 -> int16 */ - "smlal v8.4s, %[v5].4h, v0.h[6]\n" /* outr02 = v0 * r0[4] - */ - "smlal2 v9.4s, %[v5].8h, v0.h[6]\n" /* outr00 = v0 * r0[0] - */ - "smlal v10.4s, %[v5].4h, v2.h[0]\n" /* outr03 = v0 * r0[6] - */ - "smlal2 v11.4s, %[v5].8h, v2.h[0]\n" /* outr00 = v0 * r0[0] - */ - - "smlal v12.4s, %[v5].4h, v1.h[2]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v13.4s, %[v5].8h, v1.h[2]\n" /* outr11 = v0 * r2[2] - */ - "smlal v14.4s, %[v5].4h, v1.h[4]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v15.4s, %[v5].8h, v1.h[4]\n" /* outr13 = v0 * r2[6] - */ - "smlal v16.4s, 
%[v5].4h, v1.h[6]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v17.4s, %[v5].8h, v1.h[6]\n" /* outr11 = v0 * r2[2] - */ - "smlal v18.4s, %[v5].4h, v3.h[0]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v19.4s, %[v5].8h, v3.h[0]\n" /* outr13 = v0 * r2[6] - */ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1 */ - - /* r1, r3, mul w04 */ - "smlal v4.4s, %[v4].4h, v0.h[1]\n" /* outr00 = v0 * r0[0] - */ - "smlal2 v5.4s, %[v4].8h, v0.h[1]\n" /* outr00 = v0 * r0[0] - */ - "smlal v6.4s, %[v4].4h, v0.h[3]\n" /* outr01 = v0 * r0[2] - */ - "smlal2 v7.4s, %[v4].8h, v0.h[3]\n" /* outr00 = v0 * r0[0] - */ - "smlal v8.4s, %[v4].4h, v0.h[5]\n" /* outr02 = v0 * r0[4] - */ - "smlal2 v9.4s, %[v4].8h, v0.h[5]\n" /* outr00 = v0 * r0[0] - */ - "smlal v10.4s, %[v4].4h, v0.h[7]\n" /* outr03 = v0 * r0[6] - */ - "smlal2 v11.4s, %[v4].8h, v0.h[7]\n" /* outr00 = v0 * r0[0] - */ - - "ldr q0, [%[r4]], #8 \n" /* load input r4 */ - - "smlal v12.4s, %[v4].4h, v1.h[1]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v13.4s, %[v4].8h, v1.h[1]\n" /* outr11 = v0 * r2[2] - */ - "smlal v14.4s, %[v4].4h, v1.h[3]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v15.4s, %[v4].8h, v1.h[3]\n" /* outr13 = v0 * r2[6] - */ - "sshll v0.8h, v0.8b, #0 \n" /* r4 : int8 -> int16 */ - "smlal v16.4s, %[v4].4h, v1.h[5]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v17.4s, %[v4].8h, v1.h[5]\n" /* outr11 = v0 * r2[2] - */ - "smlal v18.4s, %[v4].4h, v1.h[7]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v19.4s, %[v4].8h, v1.h[7]\n" /* outr13 = v0 * r2[6] - */ - - "ldr q2, [%[r4]] \n" /* load r4, 9th - data,v10.s[0] */ - "sshll v2.8h, v2.8b, #0 \n" /* r4 : int8 -> int16 */ - - "ldp q1, q3, [%[ptr_out0]] \n" /* load ptr_out + 0 -> - q2, q3 */ - "ldp q20, q21, [%[ptr_out0], #32]\n" /* load ptr_out + 32 -> - q4, q5 */ - - "add v4.4s, v1.4s , v4.4s \n" /* v10 = outr00[0].low - + q2 */ - "add v5.4s, v3.4s , v5.4s \n" /* v11 = outr00[0].high - + q3 */ - "add v6.4s, v20.4s, v6.4s \n" /* v12 = outr01[0].low - + q4 */ - "add v7.4s, v21.4s, v7.4s \n" /* v13 = outr01[0].high - + q5 */ - - "ldp q1 , q3 , [%[ptr_out0], #64]\n" /* load ptr_out + 64 -> - q6, q7 */ - "ldp q20, q21, [%[ptr_out0], #96]\n" /* load ptr_out + 96 -> - q8, q9 */ - - "stp q4, q5 , [%[ptr_out0]], #32\n" /* store q10, q11 -> - ptr_out */ - "stp q6, q7 , [%[ptr_out0]], #32\n" /* store q10, q11 -> - ptr_out */ - - "add v8.4s , v1.4s , v8.4s \n" /* v10 = outr00[0].low - + q2 */ - "add v9.4s , v3.4s , v9.4s \n" /* v11 = outr00[0].high - + q3 */ - "add v10.4s, v20.4s, v10.4s \n" /* v12 = outr01[0].low - + q4 */ - "add v11.4s, v21.4s, v11.4s \n" /* v13 = outr01[0].high - + q5 */ - "stp q8, q9, [%[ptr_out0]], #32\n" /* store q14, q15 -> - ptr_out += 64 */ - "stp q10, q11, [%[ptr_out0]], #32\n" /* store q16, q17 -> - ptr_out += 96 */ - - /* r4, mul w08 */ - "smlal v12.4s, %[v8].4h, v0.h[2]\n" /* outr00 = v0 * r0[0] - */ - "smlal2 v13.4s, %[v8].8h, v0.h[2]\n" /* outr00 = v0 * r0[0] - */ - "smlal v14.4s, %[v8].4h, v0.h[4]\n" /* outr01 = v0 * r0[2] - */ - "smlal2 v15.4s, %[v8].8h, v0.h[4]\n" /* outr00 = v0 * r0[0] - */ - - "smlal v16.4s, %[v8].4h, v0.h[6]\n" /* outr02 = v0 * r0[4] - */ - "smlal2 v17.4s, %[v8].8h, v0.h[6]\n" /* outr00 = v0 * r0[0] - */ - "smlal v18.4s, %[v8].4h, v2.h[0]\n" /* outr03 = v0 * r0[6] - */ - "smlal2 v19.4s, %[v8].8h, v2.h[0]\n" /* outr00 = v0 * r0[0] - */ - - /* r4, mul w07 */ - "smlal v12.4s, %[v7].4h, v0.h[1]\n" /* outr00 = v0 * r0[0] - */ - "smlal2 v13.4s, %[v7].8h, v0.h[1]\n" /* outr00 = v0 * r0[0] - */ - "smlal v14.4s, %[v7].4h, v0.h[3]\n" /* outr01 = v0 * r0[2] - */ - "smlal2 v15.4s, %[v7].8h, 
v0.h[3]\n" /* outr00 = v0 * r0[0] - */ - - "ldr q1, [%[r2]], #8 \n" /* load input r2 */ - - "smlal v16.4s, %[v7].4h, v0.h[5]\n" /* outr02 = v0 * r0[4] - */ - "smlal2 v17.4s, %[v7].8h, v0.h[5]\n" /* outr00 = v0 * r0[0] - */ - "smlal v18.4s, %[v7].4h, v0.h[7]\n" /* outr03 = v0 * r0[6] - */ - "smlal2 v19.4s, %[v7].8h, v0.h[7]\n" /* outr00 = v0 * r0[0] - */ - - "sshll v1.8h, v1.8b, #0 \n" /* r2: int8 -> int16 - */ - - /* r4, mul w06 */ - "ldp q4, q5, [%[ptr_out1]] \n" /* load ptr_out + 0 -> - q2, q3 */ - - "smlal v12.4s, %[v6].4h, v0.h[0]\n" /* outr00 = v0 * r0[0] - */ - "smlal2 v13.4s, %[v6].8h, v0.h[0]\n" /* outr00 = v0 * r0[0] - */ - "smlal v14.4s, %[v6].4h, v0.h[2]\n" /* outr01 = v0 * r0[2] - */ - - "ldp q8, q9, [%[ptr_out1], #64]\n" /* load ptr_out + 64 -> - q6, q7 */ - - "smlal2 v15.4s, %[v6].8h, v0.h[2]\n" /* outr00 = v0 * r0[0] - */ - "smlal v16.4s, %[v6].4h, v0.h[4]\n" /* outr02 = v0 * r0[4] - */ - "smlal2 v17.4s, %[v6].8h, v0.h[4]\n" /* outr00 = v0 * r0[0] - */ - - "ldp q10, q11, [%[ptr_out1], #96]\n" /* load ptr_out + 96 -> - q8, q9 */ - - "smlal v18.4s, %[v6].4h, v0.h[6]\n" /* outr03 = v0 * r0[6] - */ - "smlal2 v19.4s, %[v6].8h, v0.h[6]\n" /* outr00 = v0 * r0[0] - */ - - "ldr q0, [%[r0]], #8 \n" /* load input r2 */ - "ldp q6, q7, [%[ptr_out1], #32]\n" /* load ptr_out + 32 -> - q4, q5 */ - - "sshll v0.8h, v0.8b, #0 \n" /* r0: int8 -> int16 */ - - /* store outr1 */ - "add v12.4s, v4.4s , v12.4s\n" /* v10 = outr10[0].low + q2 */ - "add v13.4s, v5.4s , v13.4s\n" /* v11 = outr10[0].high + q3 */ - "add v14.4s, v6.4s , v14.4s\n" /* v12 = outr11[0].low + q4 */ - "add v15.4s, v7.4s , v15.4s\n" /* v13 = outr11[0].high + q5 */ - - "stp q12, q13, [%[ptr_out1]], #32\n" /* store q10, q11 -> - ptr_out */ - - "add v16.4s, v8.4s , v16.4s\n" /* v14 = outr12[0].low + q6 */ - "add v17.4s, v9.4s , v17.4s\n" /* v15 = outr12[0].high + q7 */ - - "stp q14, q15, [%[ptr_out1]], #32\n" /* store q12, q13 -> - ptr_out += 32 */ - - "add v18.4s, v10.4s, v18.4s\n" /* v16 = outr13[0].low + q8 */ - "add v19.4s, v11.4s, v19.4s\n" /* v17 = outr13[0].high + q9 */ - - "stp q16, q17, [%[ptr_out1]], #32\n" /* store q14, q15 -> - ptr_out += 64 */ - "stp q18, q19, [%[ptr_out1]], #32\n" /* store q16, q17 -> - ptr_out += 96 */ - - "bne 1b \n" /* jump to main loop */ - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [r4] "+r"(r4), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : [v0] "w"(v0), - [v1] "w"(v1), - [v2] "w"(v2), - [v3] "w"(v3), - [v4] "w"(v4), - [v5] "w"(v5), - [v6] "w"(v6), - [v7] "w"(v7), - [v8] "w"(v8) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); - - wc0 += 9 * hout_c_block; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - inr4 += win_round; - } - block_inr0 = block_inr4; - block_inr1 = block_inr0 + in_len; - block_inr2 = block_inr1 + in_len; - block_inr3 = block_inr2 + in_len; - block_inr4 = block_inr3 + in_len; - } - if (out_type == PRECISION(kFloat)) { - write_to_output_c8_int32_1(pre_out, - reinterpret_cast(dout_batch), - hout_c_block, - 2, - c, - c + hout_c_block, - h, - h + h_kernel, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - reinterpret_cast(ptr_write), - &scale[c], - out_type); - } else if (out_type == PRECISION(kInt8)) { - write_to_output_c8_int32_1(pre_out, - dout_batch, - hout_c_block, - 2, - c, - c + hout_c_block, - h, - h 
+ h_kernel,
-                                   0,
-                                   wout_round,
-                                   chout,
-                                   hout,
-                                   wout,
-                                   flag_relu,
-                                   reinterpret_cast<int8_t*>(ptr_write),
-                                   &scale[c],
-                                   out_type);
-      } else {
-        write_to_output_c8_int32(pre_out,
-                                 reinterpret_cast<int*>(dout_batch),
-                                 hout_c_block,
-                                 2,
-                                 c,
-                                 c + hout_c_block,
-                                 h,
-                                 h + h_kernel,
-                                 0,
-                                 wout_round,
-                                 chout,
-                                 hout,
-                                 wout,
-                                 flag_relu,
-                                 ptr_write);
-        }
-      }
-    }
-  }
-}
-
-#else  // __aarch64__
-int conv_3x3s2_direct_int8_c_num() { return 4; }
-void conv_3x3s2_direct_int8(const int8_t* din,
-                            int32_t* dout,
-                            int num,
-                            int chout,
-                            int hout,
-                            int wout,
-                            int chin,
-                            int hin,
-                            int win,
-                            const int8_t* weights,
-                            const int32_t* bias,
-                            const operators::ConvParam& param,
-                            Context<TARGET(kARM)>* ctx,
-                            PrecisionType out_type,
-                            const float* scale) {
-  //! 3x3s2 int8 convolution, implemented by direct algorithm
-  //! prepack input to tmp buffer
-  //! write output to tmp buffer
-  int threads = ctx->threads();
-  int stride_w = param.strides[1];
-  int pad_w = param.paddings[1];
-  int pad_h = param.paddings[0];
-  bool flag_relu = param.fuse_relu;
-  bool flag_bias = (param.bias != nullptr);
-
-  //! set 2/3 l2 cache
-  int l2_size = ctx->llc_size() / 3 * 2;
-  const int hout_c_block = 4;
-  const int hout_r_kernel = 1;
-  const int wout_round = ((wout + 3) / 4) * 4;
-  const int win_round = wout_round * stride_w + 1;
-
-  //! get h block
-  //! win_round * chin * hin_r_block * sizeof(int8_t) + wout_round *
-  //! hout_c_block * hout_r_block * threads * sizeof(int32_t) = l2_size
-  //! win_round = 2 * wout_round + 1
-  //! hin_r_block = 2 * hout_r_block + 1
-  int hout_r_block =
-      (l2_size - 2 * wout_round * chin - chin) /
-      ((4 * wout_round + 2) * chin + wout_round * hout_c_block * threads * 4);
-  hout_r_block = hout_r_block > hout ? hout : hout_r_block;
-  hout_r_block = (hout_r_block / hout_r_kernel) * hout_r_kernel;
-  hout_r_block = hout_r_block < hout_r_kernel ? hout_r_kernel : hout_r_block;
-
-  const int hin_r_block = hout_r_block * 2 + 1;
-
-  int8_t* tmp_work_space = ctx->workspace_data<int8_t>();
-  int zero_size = chout > (win_round + 3) / 4 ? chout : (win_round + 3) / 4;
-  const int kZeroSize = zero_size;
-  int32_t ptr_zero[kZeroSize];
-  memset(ptr_zero, 0, sizeof(int32_t) * zero_size);
-  const int kWoutRound = wout_round;
-  int32_t ptr_write[kWoutRound];
-
-  int in_len = win_round * chin;
-  int pre_in_size = hin_r_block * in_len;
-  int pre_out_size = hout_c_block * hout_r_block * wout_round;
-
-  //!
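The hout_r_block expression above is just the cache-budget equation from the comment solved for the block height. A standalone sketch of the same arithmetic (editor's illustration; the names mirror the locals above):

    // Solve for h in:
    //   win_round * chin * hin_r_block * sizeof(int8_t)
    //     + wout_round * hout_c_block * h * threads * sizeof(int32_t) = l2_size
    // with win_round = 2 * wr + 1 and hin_r_block = 2 * h + 1:
    //   (2*wr + 1) * chin * (2*h + 1) + 4 * wr * c_blk * threads * h = l2_size
    //   h * ((4*wr + 2) * chin + 4 * wr * c_blk * threads)
    //     = l2_size - 2 * wr * chin - chin
    static int h_block(int l2_size, int wr, int chin, int c_blk, int threads) {
      int h = (l2_size - 2 * wr * chin - chin) /
              ((4 * wr + 2) * chin + wr * c_blk * threads * 4);
      return h < 1 ? 1 : h;  // the code above also clamps to [1, hout]
    }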
l2_cache start - int8_t* pre_din = tmp_work_space; - - int size_in_channel = win * hin; - int size_out_channel = wout * hout; - int w_stride = chin * 9; - - int ws = -pad_w; - int we = ws + win_round; - int w_loop = wout_round / 4; - - int out_row_stride = hout_c_block * wout_round; - - for (int n = 0; n < num; ++n) { - const int8_t* din_batch = din + n * chin * size_in_channel; - int8_t* dout_batch = - reinterpret_cast(dout) + - n * chout * size_out_channel * PrecisionTypeLength(out_type); - for (int h = 0; h < hout; h += hout_r_block) { - int h_kernel = hout_r_block; - if (h + hout_r_block > hout) { - h_kernel = hout - h; - } - int hs = h * 2 - pad_h; - int he = hs + h_kernel * 2 + 1; - prepack_input_nxw(din_batch, - pre_din, - 0, - chin, - hs, - he, - ws, - we, - chin, - win, - hin, - reinterpret_cast(ptr_zero)); - - const int8_t* cblock_inr0 = pre_din; - const int8_t* cblock_inr1 = cblock_inr0 + in_len; - const int8_t* cblock_inr2 = cblock_inr1 + in_len; -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < chout; c += hout_c_block) { -#ifdef ARM_WITH_OMP - int32_t* pre_out = - reinterpret_cast(pre_din + (pre_in_size + 3) / 4 * 4) + - omp_get_thread_num() * pre_out_size; -#else - int32_t* pre_out = - reinterpret_cast(pre_din + (pre_in_size + 3) / 4 * 4); -#endif - const int8_t* block_inr0 = cblock_inr0; - const int8_t* block_inr1 = cblock_inr1; - const int8_t* block_inr2 = cblock_inr2; - - const int8_t* weight_c = weights + c * w_stride; - const int32_t* bias_ptr = ptr_zero; - if (flag_bias) { - bias_ptr = bias + c; - } - - fill_packed_bias_nxmw_int8(bias_ptr, pre_out, 4, h_kernel, wout_round); - for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) { - const int8_t* wc0 = weight_c; - - const int8_t* inr0 = block_inr0; - const int8_t* inr1 = block_inr1; - const int8_t* inr2 = block_inr2; - - int32_t* pre_out0 = pre_out + hk * out_row_stride; - for (int i = 0; i < chin; ++i) { - const int8_t* r0 = inr0; - const int8_t* r1 = inr1; - const int8_t* r2 = inr2; - - int32_t* ptr_out0 = pre_out0; - const signed char* ptr_wc0 = wc0; - int cnt = w_loop; - asm volatile( - "vld1.s32 {d0-d3}, [%[wc0]]! \n" /* w0-w7 */ - "vld1.s32 {d4}, [%[wc0]]! \n" /* w8 */ - "vmovl.s8 q3, d0 \n" /* q3 = w0, w1 */ - "vmovl.s8 q4, d1 \n" /* q4 = w2 ,w3 */ - "vmovl.s8 q5, d2 \n" /* q5 = w4, w5 */ - "vmovl.s8 q6, d3 \n" /* q6 = w6, w7 */ - "vmovl.s8 q7, d4 \n" /* q7 = w8 */ - "vld1.s32 {d0}, [%[r0]]! \n" /* load input r0 -> d0 */ - "vmovl.s8 q0, d0 \n" /* movl d0 -> q0 */ - "1: \n" /* main loop */ - - /* r0 mul w0 */ - "vmull.s16 q8, d6, d0[0] \n" /* q8 = w0 * r0[0] */ - "vmull.s16 q9, d6, d0[2] \n" /* q9 = w0 * r0[2] */ - "vmull.s16 q10, d6, d1[0] \n" /* q10 = w0 * r0[4] */ - "vmull.s16 q11, d6, d1[2] \n" /* q11 = w0 * r0[6] */ - - "vld1.s32 {d2}, [%[r1]]! 
\n" /* load input r1 -> d2 */ - "vmovl.s8 q1, d2 \n" /* movl d2 -> q1 */ - - /* r0 mul w1 */ - "vmlal.s16 q8, d7, d0[1] \n" /* q8 = w1 * r0[1] */ - "vmlal.s16 q9, d7, d0[3] \n" /* q9 = w1 * r0[3] */ - "vmlal.s16 q10, d7, d1[1] \n" /* q10 = w1 * r0[5] */ - "vmlal.s16 q11, d7, d1[3] \n" /* q11 = w1 * r0[7] */ - - "vld1.s32 {d4}, [%[r0]] \n" /* load r0[8] -> d4 */ - "vmovl.s8 q2 , d4 \n" /* movl d4 -> q2 */ - - /* r0 mul w2 */ - "vmlal.s16 q8, d8, d0[2] \n" /* q8 = w2 * r0[2] */ - "vmlal.s16 q9, d8, d1[0] \n" /* q9 = w2 * r0[4] */ - "vmlal.s16 q10, d8, d1[2] \n" /* q10 = w2 * r0[6] */ - "vmlal.s16 q11, d8, d4[0] \n" /* q11 = w2 * r0[8] */ - - "subs %[cnt], #1 \n" /* loop count -1 */ - - /* r1 mul w3 */ - "vmlal.s16 q8, d9, d2[0] \n" /* q8 = w3 * r1[0] */ - "vmlal.s16 q9, d9, d2[2] \n" /* q9 = w3 * r1[2] */ - "vmlal.s16 q10, d9, d3[0] \n" /* q10 = w3 * r1[4] */ - "vmlal.s16 q11, d9, d3[2] \n" /* q11 = w3 * r1[6] */ - - "vld1.s32 {d4}, [%[r2]]! \n" /* load input r2 -> d4*/ - "vmovl.s8 q2, d4 \n" /* movl d4 -> q2 */ - - /* r1 mul w4 */ - "vmlal.s16 q8, d10, d2[1] \n" /* q8 = w4 * r1[1] */ - "vmlal.s16 q9, d10, d2[3] \n" /* q9 = w4 * r1[3] */ - "vmlal.s16 q10, d10, d3[1] \n" /* q10 = w4 * r1[5] */ - "vmlal.s16 q11, d10, d3[3] \n" /* q11 = w4 * r1[7] */ - - "vld1.s32 {d0}, [%[r1]] \n" /* load r1[8] -> d0 */ - "vmovl.s8 q0, d0 \n" /* movl d0 -> q0 */ - - /* r1 mul w5 */ - "vmlal.s16 q8, d11, d2[2] \n" /* q8 = w5 * r1[2] */ - "vmlal.s16 q9, d11, d3[0] \n" /* q9 = w5 * r1[4] */ - "vmlal.s16 q10, d11, d3[2] \n" /* q10 = w5 * r1[6] */ - "vmlal.s16 q11, d11, d0[0] \n" /* q11 = w5 * r1[8] */ - - /* r2 mul w6 */ - "vmlal.s16 q8, d12, d4[0] \n" /* q8 = w6 * r2[0] */ - "vmlal.s16 q9, d12, d4[2] \n" /* q9 = w6 * r2[2] */ - "vmlal.s16 q10, d12, d5[0] \n" /* q10 = w6 * r2[4] */ - "vmlal.s16 q11, d12, d5[2] \n" /* q11 = w6 * r2[6] */ - - "vld1.s32 {d24-d27}, [%[ptr_out0]] \n" /* load output -> q12, - q13 */ - - /* r2 mul w7 */ - "vmlal.s16 q8, d13, d4[1] \n" /* q8 = w7 * r2[1] */ - "vmlal.s16 q9, d13, d4[3] \n" /* q9 = w7 * r2[3] */ - "vmlal.s16 q10, d13, d5[1] \n" /* q10 = w7 * r2[5] */ - "vmlal.s16 q11, d13, d5[3] \n" /* q11 = w7 * r2[7] */ - - "vld1.s32 {d0}, [%[r2]] \n" /* load r2[8] -> d0 */ - "vmovl.s8 q0, d0 \n" /* movl d0 -> q0 */ - - /* r2 mul w8 */ - "vmlal.s16 q8, d14, d4[2] \n" /* q8 = w8 * r2[2] */ - "vmlal.s16 q9, d14, d5[0] \n" /* q9 = w8 * r2[4] */ - "vmlal.s16 q10, d14, d5[2] \n" /* q10 = w8 * r2[6] */ - "vmlal.s16 q11, d14, d0[0] \n" /* q11 = w8 * r2[8] */ - - "vadd.s32 q12, q8, q12 \n" /* out[0] += q8 */ - "vadd.s32 q13, q9, q13 \n" /* out[1] += q9 */ - "vst1.s32 {d24-d27}, [%[ptr_out0]]! \n" /* store q12, q13 -> - output[0,1] */ - - "vld1.s32 {d0}, [%[r0]]! \n" /* load next input r0 -> d0*/ - "vmovl.s8 q0, d0 \n" /* movl d0 -> q0 */ - - "vld1.s32 {d28-d31}, [%[ptr_out0]] \n" /* load output[0,1] -> - q14, q15 */ - "vadd.s32 q14, q10, q14 \n" /* out[2] += q10 */ - "vadd.s32 q15, q11, q15 \n" /* out[3] += q11 */ - "vst1.s32 {d28-d31}, [%[ptr_out0]]! 
\n" /* store q14, q15 -> - output[2,3] */ - - "bne 1b \n" /* jump to main loop */ - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [ptr_out0] "+r"(ptr_out0), - [wc0] "+r"(ptr_wc0) - : - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - wc0 += 9 * hout_c_block; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - } - block_inr0 = block_inr2; - block_inr1 = block_inr0 + in_len; - block_inr2 = block_inr1 + in_len; - } - if (out_type == PRECISION(kFloat)) { - write_to_output_c4_int32_1(pre_out, - reinterpret_cast(dout_batch), - hout_c_block, - 1, - c, - c + hout_c_block, - h, - h + h_kernel, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - reinterpret_cast(ptr_write), - &scale[c], - out_type); - } else if (out_type == PRECISION(kInt8)) { - write_to_output_c4_int32_1(pre_out, - dout_batch, - hout_c_block, - 1, - c, - c + hout_c_block, - h, - h + h_kernel, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - reinterpret_cast(ptr_write), - &scale[c], - out_type); - } else { - write_to_output_c4_int32(pre_out, - reinterpret_cast(dout_batch), - hout_c_block, - 1, - c, - c + hout_c_block, - h, - h + h_kernel, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - ptr_write); - } - } - } - } -} -#endif // __aarch64__ - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_block_utils.h b/lite/backends/arm/math/conv_block_utils.h deleted file mode 100644 index 3deb6bcb5f..0000000000 --- a/lite/backends/arm/math/conv_block_utils.h +++ /dev/null @@ -1,4292 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include "lite/backends/arm/math/saturate.h" -#include "lite/backends/arm/math/type_trans.h" -#include "lite/core/target_wrapper.h" -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -#define LITEMAX(a, b) ((a) > (b) ? 
(a) : (b))
-
-inline void fill_packed_biasc4(float* dout, const float* bias, int size) {
-  float32x4_t vb = vld1q_f32(bias);
-  int cnt = size / 4;
-  for (int i = 0; i < cnt; ++i) {
-    vst1q_f32(dout, vb);
-    dout += 4;
-  }
-}
-
-/* preprocessing weights
-* input weights: [chout, chin / group, kh, kw] --> output weights:
-* [chout / n, chin / group, kh, kw, n]
-*/
-template <typename dtype>
-static bool conv_trans_weights_numc(const dtype* din,
-                                    dtype* dout,
-                                    int chout,
-                                    int chin,
-                                    int n,
-                                    int kernel_size) {
-  if (n <= 0) {
-    LOG(ERROR) << "ch_n and hei_n must be greater than zero";
-    return false;
-  }
-  int c_loop = chout / n;
-  int chout_round = (chout + n - 1) / n;
-  int win_stride = chin * kernel_size;
-  int wout_stride = n * win_stride;
-  int co = 0;
-  for (; co < c_loop; ++co) {
-    dtype* dout_c = dout + co * wout_stride;
-    const dtype* din_array[n];
-    din_array[0] = din + co * wout_stride;
-    for (int i = 1; i < n; i++) {
-      din_array[i] = din_array[i - 1] + win_stride;
-    }
-    for (int ci = 0; ci < chin; ++ci) {
-      for (int k = 0; k < kernel_size; ++k) {
-        for (int i = 0; i < n; i++) {
-          *(dout_c++) = *(din_array[i]++);
-        }
-      }
-    }
-  }
-  // pad final chout
-  if (chout_round > c_loop) {
-    dtype* dout_c = dout + c_loop * wout_stride;
-    const dtype* din_array[n];
-    din_array[0] = din + c_loop * wout_stride;
-    for (int i = 1; i < n; i++) {
-      din_array[i] = din_array[i - 1] + win_stride;
-    }
-    // remainder: point the tail slots back at the first channel
-    // (their outputs fall beyond chout and are discarded)
-    int cremain = chout_round * n - chout;
-    for (int i = 1; i <= cremain; i++) {
-      din_array[n - i] = din_array[0];
-    }
-    for (int ci = 0; ci < chin; ++ci) {
-      for (int k = 0; k < kernel_size; ++k) {
-        for (int i = 0; i < n; i++) {
-          *(dout_c++) = *(din_array[i]++);
-        }
-      }
-    }
-  }
-  return true;
-}
-/* preprocessing inputs
-* input din: [1, chin, he - hs, we - ws] --> output dout: [n, chin, 1, we - ws]
-* n = he - hs
-*/
-template <typename dtype>
-static bool prepack_input_nxw(const dtype* din,
-                              dtype* dout,
-                              int cs,
-                              int ce,
-                              int hs,
-                              int he,
-                              int ws,
-                              int we,
-                              int channel,
-                              int width,
-                              int height,
-                              dtype* zero_ptr) {
-  int n = he - hs;
-  if (n <= 0) {
-    LOG(ERROR) << "hei_n must be greater than zero";
-    return false;
-  }
-  int w0 = ws < 0 ? 0 : ws;
-  int w1 = we > width ? width : we;
-
-  int size_w = we - ws;
-  int size_wc_len = size_w * channel;
-  int size_c = width * height;
-
-  int valid_w = w1 - w0;
-  size_t valid_w_byte = valid_w * sizeof(dtype);
-
-  dtype* out_array[n];
-  out_array[0] = dout;
-  for (int i = 1; i < n; i++) {
-    out_array[i] = out_array[i - 1] + size_wc_len;
-  }
-
-  for (int c = 0; c < channel; ++c) {
-    int j = 0;
-    // valid height
-    for (int i = hs; i < he; i++) {
-      // get address: rows outside the image read from the zero buffer
-      const dtype* in_array;
-      if (i < 0 || i >= height) {
-        in_array = zero_ptr;
-      } else {
-        in_array = din + i * width;
-      }
-
-      for (int w = ws; w < w0; ++w) {
-        *(out_array[j]++) = 0.f;
-      }
-      memcpy(out_array[j], in_array, valid_w_byte);
-      out_array[j] += valid_w;
-      for (int w = w1; w < we; ++w) {
-        *(out_array[j]++) = 0.f;
-      }
-      j++;
-    }
-    din += size_c;
-  }
-  return true;
-}
-
-/* write result to output
-* input din: [n, c, h, w], output dout: [n, c, h, w]
-*/
-inline bool write_to_output_c1_fp32(const float* din,
-                                    float* dout,
-                                    int cs,
-                                    int ce,
-                                    int hs,
-                                    int he,
-                                    int ws,
-                                    int we,
-                                    int channel,
-                                    int height,
-                                    int width,
-                                    bool flag_relu,
-                                    float* trash_ptr) {
-  if (cs > channel) {
-    return true;
-  }
-
-  const int c1 = 1;
-  const int w4 = 4;
-
-  int size_c_out = width * height;
-
-  float* doutc0r0 = dout + cs * size_c_out + hs * width + ws;
-
-  const float* ptr_din = din;
-
-  int size_h = (he > height ?
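conv_trans_weights_numc above interleaves blocks of n output channels so that a single contiguous weight vector feeds n accumulators in the kernels. Its pointer walk is equivalent to this index form (editor's sketch, illustrative only):

    // [chout][chin][kernel_size] --> [chout/n][chin][kernel_size][n]
    for (int co = 0; co < chout / n; ++co)
      for (int ci = 0; ci < chin; ++ci)
        for (int k = 0; k < kernel_size; ++k)
          for (int i = 0; i < n; ++i)
            dout[((co * chin + ci) * kernel_size + k) * n + i] =
                din[((co * n + i) * chin + ci) * kernel_size + k];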
height : he) - hs; // size_h == hei_n - - int w_round = we - ws; - int cnt = (width - ws) / w4; - - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - const float* din_hei_ptr = ptr_din + i * w_round * c1; - if (cnt > 0) { - int cnt_loop = cnt; - if (flag_relu) { -#ifdef __aarch64__ - asm volatile( - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "fmax v1.4s, v0.4s, v20.4s \n" /*relu*/ - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q1, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "bne 1b \n" /* jump to main loop*/ - : [doutc0r0] "+r"(doutc0_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v20"); -#else - asm volatile( - "vld1.32 {d0-d1}, [%[ptr_din]]! @ load data, c0r0, " - "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - - "vmax.f32 q1, q0, q15 @ relu\n" - "vld1.32 {d0-d1}, [%[ptr_din]]! @ load data \n" - - "vst1.32 {d2-d3}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q15"); -#endif - } else { -#ifdef __aarch64__ - asm volatile( - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "1: \n" /* main loop*/ - "str q0, [%[doutc0r0]], #16 \n" /* store c2r0*/ - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0"); -#else - asm volatile( - "vld1.32 {d0-d1}, [%[ptr_din]]! @ load data, c0r0, " - "c0r1, c0r2, c0r3\n" - "1: @ main loop\n" - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - "vld1.32 {d0-d1}, [%[ptr_din]]! @ load data \n" - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0"); -#endif - } - } - if (we > width) { - int offset = i * w_round * c1 + c1 * w4 * cnt; - din_hei_ptr = ptr_din + offset; - int j = we - w4; - if (flag_relu) { - for (; j < width; ++j) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); - din_hei_ptr++; - } - } else { - for (; j < width; ++j) { - *(doutc0_ptr++) = *(din_hei_ptr++); - } - } - } - } - return true; -} - -/*wirte result in outputs -* input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] -*/ -inline bool write_to_output_c2_fp32(const float* din, - float* dout, - int cs, - int ce, - int hs, - int he, - int ws, - int we, - int channel, - int height, - int width, - bool flag_relu, - float* trash_ptr) { - if (cs > channel) { - return true; - } - - const int c2 = 2; - const int w4 = 4; - - // float trash_ptr[width]; - - int size_c_out = width * height; - - float* doutc0r0 = dout + cs * size_c_out + hs * width + ws; - float* doutc1r0 = doutc0r0 + size_c_out; - - const float* ptr_din = din; - - int size_h = (he > height ? 
height : he) - hs; // size_h == hei_n - - int w_round = we - ws; - int cnt = (width - ws) / w4; - - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - float* doutc1_ptr = doutc1r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 1: - doutc1_ptr = trash_ptr; - default: - break; - } - } - const float* din_hei_ptr = ptr_din + i * w_round * c2; - if (cnt > 0) { - int cnt_loop = cnt; - if (flag_relu) { -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v2.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v3.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "trn1 v4.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - "trn2 v5.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - - "fmax v2.4s, v4.4s, v20.4s \n" /*relu*/ - "fmax v3.4s, v5.4s, v20.4s \n" /*relu*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - - "str q2, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q3, [%[doutc1r0]], #16 \n" /* store c2r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, " - "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 d0, d1 @ trans data:c0r0, c0r1, " - "c1r0, c1r1 \n" - "vtrn.32 d2, d3 @ trans data:c0r2, c0r3, " - "c1r2, c1r3 \n" - - "vswp d1, d2 @ swap data\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); -#endif - } else { -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "1: \n" /* main loop*/ - "trn1 v2.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v3.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "trn1 v4.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - "trn2 v5.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - - "str q4, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q5, [%[doutc1r0]], #16 \n" /* store c2r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, " - "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" - "1: @ main loop\n" - "vtrn.32 d0, d1 @ trans data:c0r0, c0r1, " - "c1r0, c1r1 \n" - "vtrn.32 d2, d3 @ trans data:c0r2, c0r3, " - "c1r2, c1r3 \n" - - "vswp d1, d2 @ swap data\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! 
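The vtrn/vswp (and trn1/trn2) sequences in write_to_output_c2_fp32 de-interleave the two channels of each quad. With intrinsics the same de-interleave falls out of a single structure load, on both armv7 and aarch64 (editor's sketch, not the file's code):

    #include <arm_neon.h>

    // in holds c0r0, c1r0, c0r1, c1r1, c0r2, c1r2, c0r3, c1r3.
    static inline void deinterleave_c2(const float* in, float* c0, float* c1) {
      float32x4x2_t v = vld2q_f32(in);  // de-interleaving load
      vst1q_f32(c0, v.val[0]);          // channel 0: r0..r3
      vst1q_f32(c1, v.val[1]);          // channel 1: r0..r3
    }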
@ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); -#endif - } - } - if (we > width) { - int offset = i * w_round * c2 + c2 * w4 * cnt; - din_hei_ptr = ptr_din + offset; - int j = we - w4; - if (flag_relu) { - for (; j < width; ++j) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); - din_hei_ptr += 2; - } - } else { - for (; j < width; ++j) { - *(doutc0_ptr++) = *(din_hei_ptr++); - *(doutc1_ptr++) = *(din_hei_ptr++); - } - } - } - } - return true; -} - -/*wirte result in outputs -* input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] -*/ -inline bool write_to_output_c4_fp32(const float* din, - float* dout, - int cs, - int ce, - int hs, - int he, - int ws, - int we, - int channel, - int height, - int width, - bool flag_relu, - float* trash_ptr) { - const int c4 = 4; - const int w4 = 4; - const int w_round = we - ws; - const int ch_n = ce - cs; - if (ch_n != 4) { - LOG(ERROR) << "write_to_output_c4_fp32 ch_n must be equal 4 and hei_n is " - "more than zero"; - return false; - } - int size_c_out = width * height; - - float* doutc0r0 = dout + cs * size_c_out + hs * width + ws; - float* doutc1r0 = doutc0r0 + size_c_out; - float* doutc2r0 = doutc1r0 + size_c_out; - float* doutc3r0 = doutc2r0 + size_c_out; - - const float* ptr_din = din; - - int size_h = (he > height ? height : he) - hs; // size_h == hei_n - - int cnt = (width - ws) / w4; - - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - float* doutc1_ptr = doutc1r0 + size_w; - float* doutc2_ptr = doutc2r0 + size_w; - float* doutc3_ptr = doutc3r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 3: - doutc1_ptr = trash_ptr; - case 2: - doutc2_ptr = trash_ptr; - case 1: - doutc3_ptr = trash_ptr; - default: - break; - } - } - const float* din_hei_ptr = ptr_din + i * w_round * ch_n; - if (cnt > 0) { - int cnt_loop = cnt; - if (flag_relu) { -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ - "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ - "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ - "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to 
main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " - "\n" - "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " - "\n" - - "vswp d1, d4 @ swap data\n" - "vswp d3, d6 @ swap data\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - "vmax.f32 q2, q2, q15 @ relu\n" - "vmax.f32 q3, q3, q15 @ relu\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); -#endif - } else { -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v8", - "v9", - "v10", - "v11", - "v16", - "v17", - "v18", - "v19"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "1: @ main loop\n" - "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " - "\n" - "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " - "\n" - - "vswp d1, d4 @ swap data\n" - "vswp d3, d6 @ swap data\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! 
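The trn1/trn2 ladder in write_to_output_c4_fp32 above is a full 4x4 transpose: four channel-interleaved quads become four per-channel rows (note the stores land in the order c0, c2, c1, c3). The same dance with aarch64 intrinsics (editor's sketch; transpose4x4 is a hypothetical helper):

    #include <arm_neon.h>

    // Transpose a 4x4 float tile held in r[0..3] (rows a, b, c, d).
    static inline void transpose4x4(float32x4_t r[4]) {
      float32x4_t t0 = vtrn1q_f32(r[0], r[1]);  // a0 b0 a2 b2
      float32x4_t t1 = vtrn2q_f32(r[0], r[1]);  // a1 b1 a3 b3
      float32x4_t t2 = vtrn1q_f32(r[2], r[3]);  // c0 d0 c2 d2
      float32x4_t t3 = vtrn2q_f32(r[2], r[3]);  // c1 d1 c3 d3
      float64x2_t d0 = vreinterpretq_f64_f32(t0);
      float64x2_t d1 = vreinterpretq_f64_f32(t1);
      float64x2_t d2 = vreinterpretq_f64_f32(t2);
      float64x2_t d3 = vreinterpretq_f64_f32(t3);
      r[0] = vreinterpretq_f32_f64(vtrn1q_f64(d0, d2));  // a0 b0 c0 d0
      r[1] = vreinterpretq_f32_f64(vtrn1q_f64(d1, d3));  // a1 b1 c1 d1
      r[2] = vreinterpretq_f32_f64(vtrn2q_f64(d0, d2));  // a2 b2 c2 d2
      r[3] = vreinterpretq_f32_f64(vtrn2q_f64(d1, d3));  // a3 b3 c3 d3
    }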
@load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3"); -#endif - } - } - if (we > width) { - int offset = i * w_round * c4 + c4 * w4 * cnt; - din_hei_ptr = ptr_din + offset; - int j = we - w4; - if (flag_relu) { - for (; j < width; ++j) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); - *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0.f); - *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3], 0.f); - din_hei_ptr += w4; - } - } else { - for (; j < width; ++j) { - *(doutc0_ptr++) = din_hei_ptr[0]; - *(doutc1_ptr++) = din_hei_ptr[1]; - *(doutc2_ptr++) = din_hei_ptr[2]; - *(doutc3_ptr++) = din_hei_ptr[3]; - din_hei_ptr += w4; - } - } - } - } - return true; -} - -/*wirte result in outputs -* input din: [n, c / 8, h, w * 8], output dout: [n, c, h, w] -*/ -inline bool write_to_output_c8_fp32(const float* din, - float* dout, - int ch_n, - int hei_n, - int cs, - int ce, - int hs, - int he, - int ws, - int we, - int channel, - int height, - int width, - bool flag_relu, - float* trash_ptr) { - if (ch_n != 8 || hei_n <= 0) { - LOG(ERROR) << "ch_n must be equal 8 and hei_n is more than zero"; - return false; - } - int size_c_out = width * height; - - float* doutc0r0 = dout + cs * size_c_out + hs * width + ws; - float* doutc1r0 = doutc0r0 + size_c_out; - float* doutc2r0 = doutc1r0 + size_c_out; - float* doutc3r0 = doutc2r0 + size_c_out; - float* doutc4r0 = doutc3r0 + size_c_out; - float* doutc5r0 = doutc4r0 + size_c_out; - float* doutc6r0 = doutc5r0 + size_c_out; - float* doutc7r0 = doutc6r0 + size_c_out; - - const float* ptr_din = din; - - int size_h = (he > height ? 
height : he) - hs; // size_h == hei_n - - int valid_w = we - ws; - int cnt = valid_w / 4; - - if (we > width) { - cnt--; - } - if (flag_relu) { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - float* doutc1_ptr = doutc1r0 + size_w; - float* doutc2_ptr = doutc2r0 + size_w; - float* doutc3_ptr = doutc3r0 + size_w; - float* doutc4_ptr = doutc4r0 + size_w; - float* doutc5_ptr = doutc5r0 + size_w; - float* doutc6_ptr = doutc6r0 + size_w; - float* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = trash_ptr; - case 4: - doutc4_ptr = trash_ptr; - case 3: - doutc5_ptr = trash_ptr; - case 2: - doutc6_ptr = trash_ptr; - case 1: - doutc7_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const float* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ - "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ - "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ - "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ - - "fmax v8.4s, v8.4s, v20.4s \n" /*relu*/ - "fmax v9.4s, v9.4s, v20.4s \n" /*relu*/ - "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ - "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - 
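The fall-through switch on `ce - channel` (used by every write_to_output_* variant here) redirects the channel rows that fall past `channel` to `trash_ptr`, so the vectorized stores never need a tail mask. The deliberate fall-through is equivalent to this loop (editor's sketch; `ptrs` is a hypothetical array of the doutc*_ptr row pointers for the current block):

    for (int ch = channel; ch < ce; ++ch) {
      ptrs[ch - cs] = trash_ptr;  // rows past `channel` become dummy writes
    }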
[doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - "vmax.f32 q2, q2, q15 @ relu\n" - "vmax.f32 q3, q3, q15 @ relu\n" - - "vmax.f32 q4, q4, q15 @ relu\n" - "vmax.f32 q5, q5, q15 @ relu\n" - "vmax.f32 q6, q6, q15 @ relu\n" - "vmax.f32 q7, q7, q15 @ relu\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4", "q15"); -#endif - } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); - *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0.f); - *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3], 0.f); - *(doutc4_ptr++) = LITEMAX(din_hei_ptr[4], 0.f); - *(doutc5_ptr++) = LITEMAX(din_hei_ptr[5], 0.f); - *(doutc6_ptr++) = LITEMAX(din_hei_ptr[6], 0.f); - *(doutc7_ptr++) = LITEMAX(din_hei_ptr[7], 0.f); - din_hei_ptr += 8; - } - } - } - } else { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - float* doutc1_ptr = doutc1r0 + size_w; - float* doutc2_ptr = doutc2r0 + size_w; - float* doutc3_ptr = doutc3r0 + size_w; - float* doutc4_ptr = doutc4r0 + size_w; - float* doutc5_ptr = doutc5r0 + size_w; - float* doutc6_ptr = doutc6r0 + size_w; - float* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = trash_ptr; - case 4: - doutc4_ptr = trash_ptr; - case 3: - doutc5_ptr = trash_ptr; - case 2: - doutc6_ptr = trash_ptr; - case 1: - doutc7_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const float* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, 
[%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "1: @ main loop\n" - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4"); -#endif - } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = din_hei_ptr[0]; - *(doutc1_ptr++) = din_hei_ptr[1]; - *(doutc2_ptr++) = din_hei_ptr[2]; - *(doutc3_ptr++) = din_hei_ptr[3]; - *(doutc4_ptr++) = din_hei_ptr[4]; - *(doutc5_ptr++) = din_hei_ptr[5]; - *(doutc6_ptr++) = din_hei_ptr[6]; - *(doutc7_ptr++) = din_hei_ptr[7]; - din_hei_ptr += 8; - } - } - } - } - return true; -} - -/*wirte result in outputs -* input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] -*/ -inline bool write_to_output_c4_int32(const int* din, - int* dout, - int ch_n, - int hei_n, - int cs, - int ce, - int hs, - int he, - int ws, - int we, - int channel, - int height, - int width, - bool flag_relu, - int* trash_ptr) { - if (ch_n != 4 || hei_n <= 0) { - LOG(ERROR) << "ch_n must be equal 4 and hei_n is more than zero"; - return false; - } - int size_c_out = width * height; - - int* doutc0r0 = dout + cs * size_c_out + hs * width + ws; - int* doutc1r0 = doutc0r0 + size_c_out; - int* doutc2r0 = doutc1r0 + size_c_out; - int* doutc3r0 = doutc2r0 + size_c_out; - - const int* ptr_din = din; - - int size_h = (he > height ? height : he) - hs; // size_h == hei_n - - int valid_w = we - ws; - int cnt = valid_w / 4; - - if (we > width) { - cnt--; - } - if (flag_relu) { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - int* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - int* doutc1_ptr = doutc1r0 + size_w; - int* doutc2_ptr = doutc2r0 + size_w; - int* doutc3_ptr = doutc3r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 3: - doutc1_ptr = trash_ptr; - case 2: - doutc2_ptr = trash_ptr; - case 1: - doutc3_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "smax v16.4s, v16.4s, v20.4s \n" /* relu */ - "smax v17.4s, v17.4s, v20.4s \n" /* relu */ - "smax v18.4s, v18.4s, v20.4s \n" /* relu */ - "smax v19.4s, v19.4s, v20.4s \n" /* relu */ - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main 
loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q0, q1 @ trans q0, q1 \n" - "vtrn.32 q2, q3 @ trans q2, q3 \n" - "vswp.32 d1, d4 @ swap d1, d4 \n" - "vswp.32 d3, d6 @ swap d3, d6 \n" - - "vmax.s32 q0, q0, q15 @ relu\n" - "vmax.s32 q1, q1, q15 @ relu\n" - "vmax.s32 q2, q2, q15 @ relu\n" - "vmax.s32 q3, q3, q15 @ relu\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4", "q15"); -#endif - } - if (we > width) { - int offset = 16 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0); - *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0); - *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3], 0); - din_hei_ptr += 4; - } - } - } - } else { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - int* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - int* doutc1_ptr = doutc1r0 + size_w; - int* doutc2_ptr = doutc2r0 + size_w; - int* doutc3_ptr = doutc3r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 3: - doutc1_ptr = trash_ptr; - case 2: - doutc2_ptr = trash_ptr; - case 1: - doutc3_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] 
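In the int32 write-back above, ReLU is applied with "smax"/"vmax.s32" directly on the accumulators, since max(x, 0) commutes with the later dequantization. As a one-line intrinsics sketch (editor's illustration):

    #include <arm_neon.h>

    static inline int32x4_t relu_s32(int32x4_t v) {
      return vmaxq_s32(v, vdupq_n_s32(0));  // per-lane max(v, 0)
    }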
"+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "1: @ main loop\n" - "vtrn.32 q0, q1 @ trans q0, q1\n" - "vtrn.32 q2, q3 @ trans q2, q3\n" - "vswp.32 d1, d4 @ swap d1, d4 \n" - "vswp.32 d3, d6 @ swap d3, d6 \n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4", "q15"); -#endif - } - if (we > width) { - int offset = 16 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = din_hei_ptr[0]; - *(doutc1_ptr++) = din_hei_ptr[1]; - *(doutc2_ptr++) = din_hei_ptr[2]; - *(doutc3_ptr++) = din_hei_ptr[3]; - din_hei_ptr += 4; - } - } - } - } - return true; -} - -/*wirte result in outputs --int8, fp32 -* input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] -*/ -template -inline bool write_to_output_c4_int32_1(const int* din, - dtype* dout, - int ch_n, - int hei_n, - int cs, - int ce, - int hs, - int he, - int ws, - int we, - int channel, - int height, - int width, - bool flag_relu, - dtype* trash_ptr, - const float* scale, - PrecisionType out_dtype) { - if (ch_n != 4 || hei_n <= 0) { - LOG(ERROR) << "ch_n must be equal 4 and hei_n is more than zero"; - return false; - } - int size_c_out = width * height; - - dtype* doutc0r0 = dout + cs * size_c_out + hs * width + ws; - dtype* doutc1r0 = doutc0r0 + size_c_out; - dtype* doutc2r0 = doutc1r0 + size_c_out; - dtype* doutc3r0 = doutc2r0 + size_c_out; - - const int* ptr_din = din; - - int size_h = (he > height ? 
height : he) - hs; // size_h == hei_n - - int valid_w = we - ws; - int cnt = valid_w / 4; - - float32x4_t w_scale = vld1q_f32(scale); - // float32x4_t vzero = vdupq_n_f32(0.f); - - if (we > width) { - cnt--; - } - if (out_dtype == PRECISION(kFloat)) { - // int32_to_fp32 - if (flag_relu) { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - dtype* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - dtype* doutc1_ptr = doutc1r0 + size_w; - dtype* doutc2_ptr = doutc2r0 + size_w; - dtype* doutc3_ptr = doutc3r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 3: - doutc1_ptr = trash_ptr; - case 2: - doutc2_ptr = trash_ptr; - case 1: - doutc3_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "smax v16.4s, v16.4s, v20.4s \n" /* relu */ - "smax v17.4s, v17.4s, v20.4s \n" /* relu */ - "smax v18.4s, v18.4s, v20.4s \n" /* relu */ - "smax v19.4s, v19.4s, v20.4s \n" /* relu */ - // int32 --> fp32 - "scvtf v4.4s, v16.4s \n" - "scvtf v5.4s, v17.4s \n" - "scvtf v6.4s, v18.4s \n" - "scvtf v7.4s, v19.4s \n" - // mul - "fmul v16.4s, v4.4s, %[scale].s[0] \n" - "fmul v17.4s, v5.4s, %[scale].s[2] \n" - "fmul v18.4s, v6.4s, %[scale].s[1] \n" - "fmul v19.4s, v7.4s, %[scale].s[3] \n" - // res - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : [scale] "w"(w_scale) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! 
@load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q2, q3 @ trans q0, q1 \n" - "vtrn.32 q4, q5 @ trans q2, q3 \n" - "vswp.32 d5, d8 @ swap d1, d4 \n" - "vswp.32 d7, d10 @ swap d3, d6 \n" - - "vmax.s32 q2, q2, q15 @ relu\n" - "vmax.s32 q3, q3, q15 @ relu\n" - "vmax.s32 q4, q4, q15 @ relu\n" - "vmax.s32 q5, q5, q15 @ relu\n" - - // int32-> fp32 - "vcvt.f32.s32 q6, q2 \n" - "vcvt.f32.s32 q7, q3 \n" - "vcvt.f32.s32 q8, q4 \n" - "vcvt.f32.s32 q9, q5 \n" - - // mul - "vmul.f32 q2, q6, %e[scale][0] \n" - "vmul.f32 q3, q7, %e[scale][1] \n" - "vmul.f32 q4, q8, %f[scale][0] \n" - "vmul.f32 q5, q9, %f[scale][1] \n" - - "vst1.32 {d4-d5}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d10-d11}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : [scale] "w"(w_scale) - : "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - } - if (we > width) { - int offset = 16 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int j = we - 4; - for (; j < width; ++j) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0] * scale[0], 0); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1] * scale[1], 0); - *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2] * scale[2], 0); - *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3] * scale[3], 0); - din_hei_ptr += 4; - } - } - } - } else { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - dtype* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - dtype* doutc1_ptr = doutc1r0 + size_w; - dtype* doutc2_ptr = doutc2r0 + size_w; - dtype* doutc3_ptr = doutc3r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 3: - doutc1_ptr = trash_ptr; - case 2: - doutc2_ptr = trash_ptr; - case 1: - doutc3_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - // int32 --> fp32 - "scvtf v4.4s, v16.4s \n" - "scvtf v5.4s, v17.4s \n" - "scvtf v6.4s, v18.4s \n" - "scvtf v7.4s, v19.4s \n" - // mul - "fmul v16.4s, v4.4s, %[scale].s[0] \n" - "fmul v17.4s, v5.4s, %[scale].s[2] \n" - "fmul v18.4s, v6.4s, %[scale].s[1] \n" - "fmul v19.4s, v7.4s, %[scale].s[3] \n" - // res - "str q16, [%[doutc0r0]], #16 \n" /* store 
c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : [scale] "w"(w_scale) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q2, q3 @ trans q0, q1 \n" - "vtrn.32 q4, q5 @ trans q2, q3 \n" - "vswp.32 d5, d8 @ swap d1, d4 \n" - "vswp.32 d7, d10 @ swap d3, d6 \n" - - // int32-> fp32 - "vcvt.f32.s32 q6, q2 \n" - "vcvt.f32.s32 q7, q3 \n" - "vcvt.f32.s32 q8, q4 \n" - "vcvt.f32.s32 q9, q5 \n" - - // mul - "vmul.f32 q2, q6, %e[scale][0] \n" - "vmul.f32 q3, q7, %e[scale][1] \n" - "vmul.f32 q4, q8, %f[scale][0] \n" - "vmul.f32 q5, q9, %f[scale][1] \n" - - "vst1.32 {d4-d5}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d10-d11}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : [scale] "w"(w_scale) - : "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - } - if (we > width) { - int offset = 16 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int j = we - 4; - for (; j < width; ++j) { - *(doutc0_ptr++) = din_hei_ptr[0] * scale[0]; - *(doutc1_ptr++) = din_hei_ptr[1] * scale[1]; - *(doutc2_ptr++) = din_hei_ptr[2] * scale[2]; - *(doutc3_ptr++) = din_hei_ptr[3] * scale[3]; - din_hei_ptr += 4; - } - } - } - } - - } else if (out_dtype == PRECISION(kInt8)) { - // int32_to_int8 - if (flag_relu) { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - dtype* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - dtype* doutc1_ptr = doutc1r0 + size_w; - dtype* doutc2_ptr = doutc2r0 + size_w; - dtype* doutc3_ptr = doutc3r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 3: - doutc1_ptr = trash_ptr; - case 2: - doutc2_ptr = trash_ptr; - case 1: - doutc3_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, 
q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "smax v16.4s, v16.4s, v20.4s \n" /* relu */ - "smax v17.4s, v17.4s, v20.4s \n" /* relu */ - "smax v18.4s, v18.4s, v20.4s \n" /* relu */ - "smax v19.4s, v19.4s, v20.4s \n" /* relu */ - // int32 --> fp32 - "scvtf v4.4s, v16.4s \n" - "scvtf v5.4s, v17.4s \n" - "scvtf v6.4s, v18.4s \n" - "scvtf v7.4s, v19.4s \n" - - // mul - "fmul v16.4s, v4.4s, %[scale].s[0] \n" - "fmul v17.4s, v5.4s, %[scale].s[2] \n" - "fmul v18.4s, v6.4s, %[scale].s[1] \n" - "fmul v19.4s, v7.4s, %[scale].s[3] \n" - - // fp32-int32 - "fcvtas v4.4s, v16.4s \n" - "fcvtas v5.4s, v17.4s \n" - "fcvtas v6.4s, v18.4s \n" - "fcvtas v7.4s, v19.4s \n" - - // int32-int16 - "sqxtn v8.4h, v4.4s \n" - "sqxtn v9.4h, v5.4s \n" - "sqxtn v10.4h, v6.4s \n" - "sqxtn v11.4h, v7.4s \n" - - "sqxtn v16.8b, v8.8h \n" - "sqxtn v17.8b, v9.8h \n" - "sqxtn v18.8b, v10.8h \n" - "sqxtn v19.8b, v11.8h \n" - // res - "str s16, [%[doutc0r0]], #4 \n" - "str s17, [%[doutc2r0]], #4 \n" - "str s18, [%[doutc1r0]], #4 \n" - "str s19, [%[doutc3r0]], #4 \n" - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : [scale] "w"(w_scale) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! 
@load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q2, q3 @ trans q0, q1 \n" - "vtrn.32 q4, q5 @ trans q2, q3 \n" - "vswp.32 d5, d8 @ swap d1, d4 \n" - "vswp.32 d7, d10 @ swap d3, d6 \n" - - "vmax.s32 q2, q2, q15 @ relu\n" - "vmax.s32 q3, q3, q15 @ relu\n" - "vmax.s32 q4, q4, q15 @ relu\n" - "vmax.s32 q5, q5, q15 @ relu\n" - - // int32-> fp32 - "vcvt.f32.s32 q6, q2 \n" - "vcvt.f32.s32 q7, q3 \n" - "vcvt.f32.s32 q8, q4 \n" - "vcvt.f32.s32 q9, q5 \n" - - "vmov.f32 q2, #0.5 \n" - - // "vand.i32 q0, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" - "vand.i32 q3, q2, q2 @ set offset, 0.5\n" - "vand.i32 q4, q2, q2 @ set offset, 0.5\n" - "vand.i32 q5, q2, q2 @ set offset, 0.5\n" - - "vcgt.f32 q10, q6, q15 @ get mask > 0, in0\n" - "vcgt.f32 q11, q7, q15 @ get mask > 0, in1\n" - "vcgt.f32 q12, q8, q15 @ get mask > 0, in2\n" - "vcgt.f32 q13, q9, q15 @ get mask > 0, in3\n" - - "vmov.f32 q15, #-0.5 \n" - - "vbif.f32 q2, q15, q10 @ get right offset\n" - "vbif.f32 q3, q15, q11 @ get right offset\n" - "vbif.f32 q4, q15, q12 @ get right offset\n" - "vbif.f32 q5, q15, q13 @ get right offset\n" - - "vmla.f32 q2, q6, %e[scale][0] @ mul scale\n" - "vmla.f32 q3, q7, %e[scale][1] @ mul scale\n" - "vmla.f32 q4, q8, %f[scale][0] @ mul scale\n" - "vmla.f32 q5, q9, %f[scale][1] @ mul scale\n" - - "vcvt.s32.f32 q6, q2 @ cvt to int32\n" - "vcvt.s32.f32 q7, q3 @ cvt to int32\n" - "vcvt.s32.f32 q8, q4 @ cvt to int32\n" - "vcvt.s32.f32 q9, q5 @ cvt to int32\n" - - "vqmovn.s32 d20, q6 @ cnt to int16\n" - "vqmovn.s32 d22, q7 @ cnt to int16\n" - "vqmovn.s32 d24, q8 @ cnt to int16\n" - "vqmovn.s32 d26, q9 @ cnt to int16\n" - - "vqmovn.s16 d8, q10 @ cnt to int8\n" - "vqmovn.s16 d9, q11 @ cnt to int8\n" - "vqmovn.s16 d10, q12 @ cnt to int8\n" - "vqmovn.s16 d11, q13 @ cnt to int8\n" - - "vst1.32 {d8[0]}, [%[doutc0r0]] @ write to output\n" - "vst1.32 {d9[0]}, [%[doutc1r0]] @ write to output\n" - "vst1.32 {d10[0]}, [%[doutc2r0]] @ write to output\n" - "vst1.32 {d11[0]}, [%[doutc3r0]] @ write to output\n" - - "add %[doutc0r0], #4 \n" - "add %[doutc1r0], #4 \n" - "add %[doutc2r0], #4 \n" - "add %[doutc3r0], #4 \n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - "vmov.u32 q15, #0 @ dump zero\n" - - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! 
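// AArch64 rounds with a single fcvtas (to nearest, ties away from zero);
// ARMv7 NEON has no such convert, so the block above emulates it: select
// +0.5 or -0.5 by sign (vcgt + vbif), fold in the scale with vmla, then
// truncate (vcvt.s32.f32). A scalar sketch of that rounding, assuming a
// positive scale and the usual int8 saturation bounds:
static inline signed char quantize_round_away_ref(float acc, float scale) {
  float v = acc * scale + (acc > 0.f ? 0.5f : -0.5f);  // signed half offset
  int q = static_cast<int>(v);  // truncation now rounds half away from zero
  if (q > 127) q = 127;         // saturate like sqxtn
  if (q < -128) q = -128;
  return static_cast<signed char>(q);
}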
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : [scale] "w"(w_scale) - : "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - } - if (we > width) { - int offset = 16 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int j = we - 4; - for (; j < width; ++j) { - *(doutc0_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[0], 0) * scale[0])); - *(doutc1_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[1], 0) * scale[1])); - *(doutc2_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[2], 0) * scale[2])); - *(doutc3_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[3], 0) * scale[3])); - din_hei_ptr += 4; - } - } - } - } else { - for (int i = 0; i < size_h; i++) { // size_h - int size_w = i * width; - dtype* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - dtype* doutc1_ptr = doutc1r0 + size_w; - dtype* doutc2_ptr = doutc2r0 + size_w; - dtype* doutc3_ptr = doutc3r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 3: - doutc1_ptr = trash_ptr; - case 2: - doutc2_ptr = trash_ptr; - case 1: - doutc3_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - // int32 --> fp32 - "scvtf v4.4s, v16.4s \n" - "scvtf v5.4s, v17.4s \n" - "scvtf v6.4s, v18.4s \n" - "scvtf v7.4s, v19.4s \n" - - // mul - "fmul v16.4s, v4.4s, %[scale].s[0] \n" - "fmul v17.4s, v5.4s, %[scale].s[2] \n" - "fmul v18.4s, v6.4s, %[scale].s[1] \n" - "fmul v19.4s, v7.4s, %[scale].s[3] \n" - - // fp32-int32 - "fcvtas v4.4s, v16.4s \n" - "fcvtas v5.4s, v17.4s \n" - "fcvtas v6.4s, v18.4s \n" - "fcvtas v7.4s, v19.4s \n" - - // int32-int16 - "sqxtn v8.4h, v4.4s \n" - "sqxtn v9.4h, v5.4s \n" - "sqxtn v10.4h, v6.4s \n" - "sqxtn v11.4h, v7.4s \n" - - "sqxtn v16.8b, v8.8h \n" - "sqxtn v17.8b, v9.8h \n" - "sqxtn v18.8b, v10.8h \n" - "sqxtn v19.8b, v11.8h \n" - // res - "str s16, [%[doutc0r0]], #4 \n" - "str s17, [%[doutc2r0]], #4 \n" - "str s18, [%[doutc1r0]], #4 \n" - "str s19, [%[doutc3r0]], #4 \n" - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : [scale] "w"(w_scale) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm 
volatile( - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q2, q3 @ trans q0, q1 \n" - "vtrn.32 q4, q5 @ trans q2, q3 \n" - "vswp.32 d5, d8 @ swap d1, d4 \n" - "vswp.32 d7, d10 @ swap d3, d6 \n" - - // int32-> fp32 - "vcvt.f32.s32 q6, q2 \n" - "vcvt.f32.s32 q7, q3 \n" - "vcvt.f32.s32 q8, q4 \n" - "vcvt.f32.s32 q9, q5 \n" - - "vmov.f32 q2, #0.5 \n" - - // "vand.i32 q0, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" - "vand.i32 q3, q2, q2 @ set offset, 0.5\n" - "vand.i32 q4, q2, q2 @ set offset, 0.5\n" - "vand.i32 q5, q2, q2 @ set offset, 0.5\n" - - "vcgt.f32 q10, q6, q15 @ get mask > 0, in0\n" - "vcgt.f32 q11, q7, q15 @ get mask > 0, in1\n" - "vcgt.f32 q12, q8, q15 @ get mask > 0, in2\n" - "vcgt.f32 q13, q9, q15 @ get mask > 0, in3\n" - - "vmov.f32 q15, #-0.5 \n" - - "vbif.f32 q2, q15, q10 @ get right offset\n" - "vbif.f32 q3, q15, q11 @ get right offset\n" - "vbif.f32 q4, q15, q12 @ get right offset\n" - "vbif.f32 q5, q15, q13 @ get right offset\n" - - "vmla.f32 q2, q6, %e[scale][0] @ mul scale\n" - "vmla.f32 q3, q7, %e[scale][1] @ mul scale\n" - "vmla.f32 q4, q8, %f[scale][0] @ mul scale\n" - "vmla.f32 q5, q9, %f[scale][1] @ mul scale\n" - - "vcvt.s32.f32 q6, q2 @ cvt to int32\n" - "vcvt.s32.f32 q7, q3 @ cvt to int32\n" - "vcvt.s32.f32 q8, q4 @ cvt to int32\n" - "vcvt.s32.f32 q9, q5 @ cvt to int32\n" - - "vqmovn.s32 d20, q6 @ cnt to int16\n" - "vqmovn.s32 d22, q7 @ cnt to int16\n" - "vqmovn.s32 d24, q8 @ cnt to int16\n" - "vqmovn.s32 d26, q9 @ cnt to int16\n" - - "vqmovn.s16 d8, q10 @ cnt to int8\n" - "vqmovn.s16 d9, q11 @ cnt to int8\n" - "vqmovn.s16 d10, q12 @ cnt to int8\n" - "vqmovn.s16 d11, q13 @ cnt to int8\n" - - "vst1.32 {d8[0]}, [%[doutc0r0]] @ write to output\n" - "vst1.32 {d9[0]}, [%[doutc1r0]] @ write to output\n" - "vst1.32 {d10[0]}, [%[doutc2r0]] @ write to output\n" - "vst1.32 {d11[0]}, [%[doutc3r0]] @ write to output\n" - - "add %[doutc0r0], #4 \n" - "add %[doutc1r0], #4 \n" - "add %[doutc2r0], #4 \n" - "add %[doutc3r0], #4 \n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! 
@load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : [scale] "w"(w_scale) - : "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - } - if (we > width) { - int offset = 16 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int j = we - 4; - for (; j < width; ++j) { - *(doutc0_ptr++) = - saturate_cast(roundf(din_hei_ptr[0] * scale[0])); - *(doutc1_ptr++) = - saturate_cast(roundf(din_hei_ptr[1] * scale[1])); - *(doutc2_ptr++) = - saturate_cast(roundf(din_hei_ptr[2] * scale[2])); - *(doutc3_ptr++) = - saturate_cast(roundf(din_hei_ptr[3] * scale[3])); - din_hei_ptr += 4; - } - } - } - } - } else { - LOG(ERROR) << "ERROR: unsupported input data type!!"; - return false; - } - return true; -} - -/*wirte result in outputs -* input din: [n, c / 8, h, w * 8], output dout: [n, c, h, w] -*/ -inline bool write_to_output_c8_int32(const int* din, - int* dout, - int ch_n, - int hei_n, - int cs, - int ce, - int hs, - int he, - int ws, - int we, - int channel, - int height, - int width, - bool flag_relu, - int* trash_ptr) { - if (ch_n != 8 || hei_n <= 0) { - LOG(ERROR) << "ch_n must be equal 8 and hei_n is more than zero"; - return false; - } - int size_c_out = width * height; - - int* doutc0r0 = dout + cs * size_c_out + hs * width + ws; - int* doutc1r0 = doutc0r0 + size_c_out; - int* doutc2r0 = doutc1r0 + size_c_out; - int* doutc3r0 = doutc2r0 + size_c_out; - int* doutc4r0 = doutc3r0 + size_c_out; - int* doutc5r0 = doutc4r0 + size_c_out; - int* doutc6r0 = doutc5r0 + size_c_out; - int* doutc7r0 = doutc6r0 + size_c_out; - - const int* ptr_din = din; - - int size_h = (he > height ? 
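// The switch statements in these writers intentionally fall through: when
// the last channel group overruns the real channel count (ce > channel),
// every surplus output row is pointed at a scratch buffer so the vector
// stores stay unconditional. The same idea with an explicit loop
// (illustrative only, not the shipped code):
static inline void redirect_surplus_channels(float* dout_ptr[],
                                             float* trash_ptr,
                                             int ch_n,
                                             int surplus /* ce - channel */) {
  // e.g. ch_n == 4, surplus == 3 -> rows 1..3 become dummies;
  //      surplus == 1 -> only row 3 does.
  for (int c = ch_n - surplus; c < ch_n; ++c) {
    dout_ptr[c] = trash_ptr;  // results are written, then simply ignored
  }
}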
height : he) - hs; // size_h == hei_n - - int valid_w = we - ws; - int cnt = valid_w / 4; - - if (we > width) { - cnt--; - } - if (flag_relu) { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - int* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - int* doutc1_ptr = doutc1r0 + size_w; - int* doutc2_ptr = doutc2r0 + size_w; - int* doutc3_ptr = doutc3r0 + size_w; - int* doutc4_ptr = doutc4r0 + size_w; - int* doutc5_ptr = doutc5r0 + size_w; - int* doutc6_ptr = doutc6r0 + size_w; - int* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = trash_ptr; - case 4: - doutc4_ptr = trash_ptr; - case 3: - doutc5_ptr = trash_ptr; - case 2: - doutc6_ptr = trash_ptr; - case 1: - doutc7_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "smax v16.4s, v16.4s, v20.4s \n" /*relu*/ - "smax v17.4s, v17.4s, v20.4s \n" /*relu*/ - "smax v18.4s, v18.4s, v20.4s \n" /*relu*/ - "smax v19.4s, v19.4s, v20.4s \n" /*relu*/ - - "smax v8.4s, v8.4s, v20.4s \n" /*relu*/ - "smax v9.4s, v9.4s, v20.4s \n" /*relu*/ - "smax v12.4s, v12.4s, v20.4s \n" /*relu*/ - "smax v13.4s, v13.4s, v20.4s \n" /*relu*/ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] 
"+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "vmov.s32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "vmax.s32 q0, q0, q15 @ relu\n" - "vmax.s32 q1, q1, q15 @ relu\n" - "vmax.s32 q2, q2, q15 @ relu\n" - "vmax.s32 q3, q3, q15 @ relu\n" - - "vmax.s32 q4, q4, q15 @ relu\n" - "vmax.s32 q5, q5, q15 @ relu\n" - "vmax.s32 q6, q6, q15 @ relu\n" - "vmax.s32 q7, q7, q15 @ relu\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add pointer\n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4", "q15"); -#endif - } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0); - *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0); - *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3], 0); - *(doutc4_ptr++) = LITEMAX(din_hei_ptr[4], 0); - *(doutc5_ptr++) = LITEMAX(din_hei_ptr[5], 0); - *(doutc6_ptr++) = LITEMAX(din_hei_ptr[6], 0); - *(doutc7_ptr++) = LITEMAX(din_hei_ptr[7], 0); - din_hei_ptr += 8; - } - } - } - } else { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - int* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - int* doutc1_ptr = doutc1r0 + size_w; - int* doutc2_ptr = doutc2r0 + size_w; - int* doutc3_ptr = doutc3r0 + size_w; - int* doutc4_ptr = doutc4r0 + size_w; - int* doutc5_ptr = doutc5r0 + size_w; - int* doutc6_ptr = doutc6r0 + size_w; - int* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = trash_ptr; - case 4: - doutc4_ptr = trash_ptr; - case 3: - doutc5_ptr = trash_ptr; - case 2: - doutc6_ptr = trash_ptr; - case 1: - doutc7_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - 
- "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "1: @ main loop\n" - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add pointer\n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4", "q15"); -#endif - } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = din_hei_ptr[0]; - *(doutc1_ptr++) = din_hei_ptr[1]; - *(doutc2_ptr++) = din_hei_ptr[2]; - *(doutc3_ptr++) = din_hei_ptr[3]; - *(doutc4_ptr++) = din_hei_ptr[4]; - *(doutc5_ptr++) = din_hei_ptr[5]; - *(doutc6_ptr++) = din_hei_ptr[6]; - *(doutc7_ptr++) = din_hei_ptr[7]; - din_hei_ptr += 8; - } - } - } - } - return true; -} - -/*wirte result in outputs--int8, fp32 -* input din: [n, c / 8, h, w * 8], output dout: [n, c, h, w] -*/ -template -static bool write_to_output_c8_int32_1(const int* din, - dtype* dout, - int ch_n, - int hei_n, - int cs, - int ce, - int hs, - int he, - int ws, - int we, - int channel, - int height, - int width, - bool flag_relu, - dtype* trash_ptr, - const float* scale, - PrecisionType out_dtype) { - if (ch_n != 8 || hei_n <= 0) { - LOG(ERROR) << "ch_n must be equal 8 and hei_n is more than zero"; - return false; - } - int size_c_out = width * height; - - dtype* doutc0r0 = dout + cs * size_c_out + hs * width + ws; - dtype* doutc1r0 = doutc0r0 + size_c_out; - dtype* doutc2r0 = doutc1r0 + size_c_out; - dtype* doutc3r0 = doutc2r0 + size_c_out; - dtype* doutc4r0 = doutc3r0 + size_c_out; - dtype* doutc5r0 = doutc4r0 + size_c_out; - dtype* doutc6r0 = doutc5r0 + size_c_out; - dtype* doutc7r0 = doutc6r0 + size_c_out; - - const int* ptr_din = din; - - int size_h = (he > height ? 
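// The trn1/trn2 (AArch64) and vtrn/vswp (ARMv7) pairs in these kernels
// implement a 4x4 transpose: four registers holding {c0,c1,c2,c3} per
// pixel become four registers each holding one channel row. A sketch of
// the same shuffle with NEON intrinsics (assumes <arm_neon.h>):
#include <arm_neon.h>
static inline void transpose4x4_s32_ref(int32x4_t v[4]) {
  int32x4x2_t t01 = vtrnq_s32(v[0], v[1]);  // pairwise 32-bit interleave
  int32x4x2_t t23 = vtrnq_s32(v[2], v[3]);
  v[0] = vcombine_s32(vget_low_s32(t01.val[0]), vget_low_s32(t23.val[0]));
  v[1] = vcombine_s32(vget_low_s32(t01.val[1]), vget_low_s32(t23.val[1]));
  v[2] = vcombine_s32(vget_high_s32(t01.val[0]), vget_high_s32(t23.val[0]));
  v[3] = vcombine_s32(vget_high_s32(t01.val[1]), vget_high_s32(t23.val[1]));
}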
height : he) - hs; // size_h == hei_n - - int valid_w = we - ws; - int cnt = valid_w / 4; - - float32x4_t w_scale0 = vld1q_f32(scale); - float32x4_t w_scale1 = vld1q_f32(scale + 4); - - float32x4_t vzero = vdupq_n_f32(0.f); - - if (we > width) { - cnt--; - } - if (out_dtype == PRECISION(kFloat)) { - if (flag_relu) { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - dtype* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - dtype* doutc1_ptr = doutc1r0 + size_w; - dtype* doutc2_ptr = doutc2r0 + size_w; - dtype* doutc3_ptr = doutc3r0 + size_w; - dtype* doutc4_ptr = doutc4r0 + size_w; - dtype* doutc5_ptr = doutc5r0 + size_w; - dtype* doutc6_ptr = doutc6r0 + size_w; - dtype* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = trash_ptr; - case 4: - doutc4_ptr = trash_ptr; - case 3: - doutc5_ptr = trash_ptr; - case 2: - doutc6_ptr = trash_ptr; - case 1: - doutc7_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "smax v16.4s, v16.4s, v20.4s \n" /*relu*/ - "smax v17.4s, v17.4s, v20.4s \n" /*relu*/ - "smax v18.4s, v18.4s, v20.4s \n" /*relu*/ - "smax v19.4s, v19.4s, v20.4s \n" /*relu*/ - - "smax v8.4s, v8.4s, v20.4s \n" /*relu*/ - "smax v9.4s, v9.4s, v20.4s \n" /*relu*/ - "smax v12.4s, v12.4s, v20.4s \n" /*relu*/ - "smax v13.4s, v13.4s, v20.4s \n" /*relu*/ - - // int32->fp32 - "scvtf v10.4s, v16.4s \n" - "scvtf v11.4s, v17.4s \n" - "scvtf v14.4s, v18.4s \n" - "scvtf v15.4s, v19.4s \n" - // mul - "fmul v16.4s, v10.4s, %[scale0].s[0] \n" - "fmul v17.4s, v11.4s, %[scale0].s[2] \n" - "fmul v18.4s, v14.4s, %[scale0].s[1] \n" - "fmul v19.4s, v15.4s, %[scale0].s[3] \n" - - "scvtf v10.4s, v8.4s \n" - "scvtf v11.4s, v9.4s \n" - "scvtf v14.4s, v12.4s \n" - "scvtf v15.4s, v13.4s \n" - - "str q16, [%[doutc0r0]], #16 \n" 
/* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - // mul - "fmul v8.4s, v10.4s, %[scale1].s[0] \n" - "fmul v9.4s, v11.4s, %[scale1].s[2] \n" - "fmul v12.4s, v14.4s, %[scale1].s[1] \n" - "fmul v13.4s, v15.4s, %[scale1].s[3] \n" - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : [scale0] "w"(w_scale0), [scale1] "w"(w_scale1) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "vmov.s32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vmax.s32 q0, q0, q15 @ relu\n" - "vmax.s32 q1, q1, q15 @ relu\n" - "vmax.s32 q2, q2, q15 @ relu\n" - "vmax.s32 q3, q3, q15 @ relu\n" - - "vmax.s32 q4, q4, q15 @ relu\n" - "vmax.s32 q5, q5, q15 @ relu\n" - "vmax.s32 q6, q6, q15 @ relu\n" - "vmax.s32 q7, q7, q15 @ relu\n" - - // int32-> fp32 - "vcvt.f32.s32 q8, q0 \n" - "vcvt.f32.s32 q9, q1 \n" - "vcvt.f32.s32 q10, q2 \n" - "vcvt.f32.s32 q11, q3 \n" - - // mul - "vmul.f32 q0, q8, %q[scale0] \n" - "vmul.f32 q1, q9, %q[scale1] \n" - "vmul.f32 q2, q10, %q[scale0] \n" - "vmul.f32 q3, q11, %q[scale1] \n" - - // int32-> fp32 - "vcvt.f32.s32 q8, q4 \n" - "vcvt.f32.s32 q9, q5 \n" - "vcvt.f32.s32 q10, q6 \n" - "vcvt.f32.s32 q11, q7 \n" - - // mul - "vmul.f32 q4, q8, %q[scale0] \n" - "vmul.f32 q5, q9, %q[scale1] \n" - "vmul.f32 q6, q10, %q[scale0] \n" - "vmul.f32 q7, q11, %q[scale1] \n" - - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : [scale0] "w"(w_scale0), [scale1] "w"(w_scale1) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q15"); -#endif - } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0] * scale[0], 0); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1] * scale[1], 0); - *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2] * scale[2], 0); - *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3] * scale[3], 0); - *(doutc4_ptr++) = LITEMAX(din_hei_ptr[4] * scale[4], 0); - *(doutc5_ptr++) = LITEMAX(din_hei_ptr[5] * scale[5], 0); - *(doutc6_ptr++) = LITEMAX(din_hei_ptr[6] * scale[6], 0); - *(doutc7_ptr++) = LITEMAX(din_hei_ptr[7] * scale[7], 0); - din_hei_ptr += 8; - } - } - } - } else { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - dtype* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - dtype* doutc1_ptr = doutc1r0 + size_w; - dtype* doutc2_ptr = doutc2r0 + size_w; - dtype* doutc3_ptr = doutc3r0 + size_w; - dtype* doutc4_ptr = doutc4r0 + size_w; - dtype* doutc5_ptr = doutc5r0 + size_w; - dtype* doutc6_ptr = doutc6r0 + size_w; - dtype* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = trash_ptr; - case 4: - doutc4_ptr = trash_ptr; - case 3: - doutc5_ptr = trash_ptr; - case 2: - doutc6_ptr = trash_ptr; - case 1: - doutc7_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, 
[%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - // int32->fp32 - "scvtf v10.4s, v16.4s \n" - "scvtf v11.4s, v17.4s \n" - "scvtf v14.4s, v18.4s \n" - "scvtf v15.4s, v19.4s \n" - // mul - "fmul v16.4s, v10.4s, %[scale0].s[0] \n" - "fmul v17.4s, v11.4s, %[scale0].s[2] \n" - "fmul v18.4s, v14.4s, %[scale0].s[1] \n" - "fmul v19.4s, v15.4s, %[scale0].s[3] \n" - - "scvtf v10.4s, v8.4s \n" - "scvtf v11.4s, v9.4s \n" - "scvtf v14.4s, v12.4s \n" - "scvtf v15.4s, v13.4s \n" - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - // mul - "fmul v8.4s, v10.4s, %[scale1].s[0] \n" - "fmul v9.4s, v11.4s, %[scale1].s[2] \n" - "fmul v12.4s, v14.4s, %[scale1].s[1] \n" - "fmul v13.4s, v15.4s, %[scale1].s[3] \n" - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : [scale0] "w"(w_scale0), [scale1] "w"(w_scale1) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "vmov.s32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - // int32-> fp32 - "vcvt.f32.s32 q8, q0 \n" - "vcvt.f32.s32 q9, q1 \n" - "vcvt.f32.s32 q10, q2 \n" - "vcvt.f32.s32 q11, q3 \n" - - // mul - "vmul.f32 q0, q8, %q[scale0] \n" - "vmul.f32 q1, q9, %q[scale1] \n" - "vmul.f32 q2, q10, %q[scale0] \n" - "vmul.f32 q3, q11, %q[scale1] \n" - - // int32-> fp32 - "vcvt.f32.s32 q8, q4 \n" - "vcvt.f32.s32 q9, q5 \n" - "vcvt.f32.s32 q10, q6 \n" - "vcvt.f32.s32 q11, q7 \n" - - // mul - "vmul.f32 q4, q8, %q[scale0] \n" - "vmul.f32 q5, q9, %q[scale1] \n" - "vmul.f32 q6, q10, %q[scale0] \n" - "vmul.f32 q7, q11, %q[scale1] \n" - - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! 
@load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : [scale0] "w"(w_scale0), [scale1] "w"(w_scale1) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q15"); -#endif - } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = din_hei_ptr[0] * scale[0]; - *(doutc1_ptr++) = din_hei_ptr[1] * scale[1]; - *(doutc2_ptr++) = din_hei_ptr[2] * scale[2]; - *(doutc3_ptr++) = din_hei_ptr[3] * scale[3]; - *(doutc4_ptr++) = din_hei_ptr[4] * scale[4]; - *(doutc5_ptr++) = din_hei_ptr[5] * scale[5]; - *(doutc6_ptr++) = din_hei_ptr[6] * scale[6]; - *(doutc7_ptr++) = din_hei_ptr[7] * scale[7]; - din_hei_ptr += 8; - } - } - } - } - } else if (out_dtype == PRECISION(kInt8)) { - // int32_to_int8 - float32x4_t vpoff = vdupq_n_f32(0.5f); - float32x4_t vnoff = vdupq_n_f32(-0.5f); - if (flag_relu) { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - dtype* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - dtype* doutc1_ptr = doutc1r0 + size_w; - dtype* doutc2_ptr = doutc2r0 + size_w; - dtype* doutc3_ptr = doutc3r0 + size_w; - dtype* doutc4_ptr = doutc4r0 + size_w; - dtype* doutc5_ptr = doutc5r0 + size_w; - dtype* doutc6_ptr = doutc6r0 + size_w; - dtype* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = trash_ptr; - case 4: - doutc4_ptr = trash_ptr; - case 3: - doutc5_ptr = trash_ptr; - case 2: - doutc6_ptr = trash_ptr; - case 1: - doutc7_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - // "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, 
q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "smax v16.4s, v16.4s, %[vzero].4s \n" /*relu*/ - "smax v17.4s, v17.4s, %[vzero].4s \n" /*relu*/ - "smax v18.4s, v18.4s, %[vzero].4s \n" /*relu*/ - "smax v19.4s, v19.4s, %[vzero].4s \n" /*relu*/ - - "smax v8.4s, v8.4s, %[vzero].4s \n" /*relu*/ - "smax v9.4s, v9.4s, %[vzero].4s \n" /*relu*/ - "smax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - "smax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - // int32 --> fp32 - "scvtf v10.4s, v16.4s \n" - "scvtf v11.4s, v17.4s \n" - "scvtf v14.4s, v18.4s \n" - "scvtf v15.4s, v19.4s \n" - - "scvtf v20.4s, v8.4s \n" - "scvtf v21.4s, v9.4s \n" - "scvtf v22.4s, v12.4s \n" - "scvtf v23.4s, v13.4s \n" - - // mul - "fmul v16.4s, v10.4s, %[scale0].s[0] \n" - "fmul v17.4s, v11.4s, %[scale0].s[2] \n" - "fmul v18.4s, v14.4s, %[scale0].s[1] \n" - "fmul v19.4s, v15.4s, %[scale0].s[3] \n" - - "fmul v8.4s, v20.4s, %[scale1].s[0] \n" - "fmul v9.4s, v21.4s, %[scale1].s[2] \n" - "fmul v12.4s, v22.4s, %[scale1].s[1] \n" - "fmul v13.4s, v23.4s, %[scale1].s[3] \n" - - // fp32-int32 - "fcvtas v10.4s, v16.4s \n" - "fcvtas v11.4s, v17.4s \n" - "fcvtas v14.4s, v18.4s \n" - "fcvtas v15.4s, v19.4s \n" - - "fcvtas v20.4s, v8.4s \n" - "fcvtas v21.4s, v9.4s \n" - "fcvtas v22.4s, v12.4s \n" - "fcvtas v23.4s, v13.4s \n" - - // int32-int16 - "sqxtn v16.4h, v10.4s \n" - "sqxtn v17.4h, v11.4s \n" - "sqxtn v18.4h, v14.4s \n" - "sqxtn v19.4h, v15.4s \n" - - "sqxtn v8.4h, v20.4s \n" - "sqxtn v9.4h, v21.4s \n" - "sqxtn v12.4h, v22.4s \n" - "sqxtn v13.4h, v23.4s \n" - - // int16-int8 - "sqxtn v10.8b, v16.8h \n" - "sqxtn v11.8b, v17.8h \n" - "sqxtn v14.8b, v18.8h \n" - "sqxtn v15.8b, v19.8h \n" - - "sqxtn v20.8b, v8.8h \n" - "sqxtn v21.8b, v9.8h \n" - "sqxtn v22.8b, v12.8h \n" - "sqxtn v23.8b, v13.8h \n" - - "str s10, [%[doutc0r0]], #4 \n" /* store c0r0*/ - "str s11, [%[doutc2r0]], #4 \n" /* store c2r0*/ - "str s14, [%[doutc1r0]], #4 \n" /* store c1r0*/ - "str s15, [%[doutc3r0]], #4 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str s20, [%[doutc4r0]], #4 \n" /* store c0r0*/ - "str s21, [%[doutc6r0]], #4 \n" /* store c2r0*/ - "str s22, [%[doutc5r0]], #4 \n" /* store c1r0*/ - "str s23, [%[doutc7r0]], #4 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - [scale0] "w"(w_scale0), [scale1] "w"(w_scale1), [vzero] "w"(vzero) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23"); -#else - asm volatile( - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
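// The sqxtn chains above narrow in two saturating steps,
// int32 -> int16 -> int8, so out-of-range accumulators clamp instead of
// wrapping. A scalar sketch of the same clamp chain:
static inline signed char narrow_s32_to_s8_ref(int v) {
  short s = v > 32767 ? 32767  // sqxtn .4h: s32 -> s16
                      : (v < -32768 ? -32768 : static_cast<short>(v));
  return s > 127 ? 127         // sqxtn .8b: s16 -> s8
                 : (s < -128 ? -128 : static_cast<signed char>(s));
}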
@load data \n" - - "1: @ main loop\n" - "vmax.s32 q4, q4, %q[vzero] @ relu\n" - "vmax.s32 q5, q5, %q[vzero] @ relu\n" - "vmax.s32 q6, q6, %q[vzero] @ relu\n" - "vmax.s32 q7, q7, %q[vzero] @ relu\n" - - // int32-> fp32 - "vmov.f32 q15, #0.5 \n" - "vcvt.f32.s32 q8, q4 \n" - "vcvt.f32.s32 q9, q5 \n" - "vcvt.f32.s32 q10, q6 \n" - "vcvt.f32.s32 q11, q7 \n" - - "vand.i32 q4, q15, q15 @ set offset, 0.5\n" - "vand.i32 q5, q15, q15 @ set offset, 0.5\n" - "vand.i32 q6, q15, q15 @ set offset, 0.5\n" - "vand.i32 q7, q15, q15 @ set offset, 0.5\n" - - "vmov.f32 q15, #-0.5 \n" - - "vcgt.f32 q12, q8, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q13, q9, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q14, q10, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q3, q11, %q[vzero] @ get mask > 0, in0\n" - - "vbif.f32 q4, q15, q12 @ get right offset\n" - "vbif.f32 q5, q15, q13 @ get right offset\n" - "vbif.f32 q6, q15, q14 @ get right offset\n" - "vbif.f32 q7, q15, q3 @ get right offset\n" - - "vld1.32 {d24-d27}, [%[ptr_din]]! @load data \n" - "vld1.32 {d28-d29}, [%[ptr_din]]! @load data \n" - "vld1.32 {d6-d7}, [%[ptr_din]]! @load data \n" - - "vmla.f32 q4, q8, %q[scale0] @ mul scale\n" - "vmla.f32 q5, q9, %q[scale1] @ mul scale\n" - "vmla.f32 q6, q10, %q[scale0] @ mul scale\n" - "vmla.f32 q7, q11, %q[scale1] @ mul scale\n" - - "vmax.s32 q12, q12, %q[vzero] @ relu\n" - "vmax.s32 q13, q13, %q[vzero] @ relu\n" - "vmax.s32 q14, q14, %q[vzero] @ relu\n" - "vmax.s32 q3, q3, %q[vzero] @ relu\n" - - "vcvt.s32.f32 q8, q4 @ cvt to int32\n" - "vcvt.s32.f32 q9, q5 @ cvt to int32\n" - "vcvt.s32.f32 q10, q6 @ cvt to int32\n" - "vcvt.s32.f32 q11, q7 @ cvt to int32\n" - - "vqmovn.s32 d8, q8 @ cnt to int16\n" - "vqmovn.s32 d10, q9 @ cnt to int16\n" - "vqmovn.s32 d12, q10 @ cnt to int16\n" - "vqmovn.s32 d14, q11 @ cnt to int16\n" - - "vqmovn.s16 d16, q4 @ cnt to int8\n" - "vqmovn.s16 d17, q5 @ cnt to int8\n" - "vqmovn.s16 d18, q6 @ cnt to int8\n" - "vqmovn.s16 d19, q7 @ cnt to int8\n" - - "vmov.f32 q15, #0.5 \n" - - "vcvt.f32.s32 q4, q12 \n" - "vcvt.f32.s32 q5, q13 \n" - "vcvt.f32.s32 q6, q14 \n" - "vcvt.f32.s32 q7, q3 \n" - - "vand.i32 q12, q15, q15 @ set offset, 0.5\n" - "vand.i32 q13, q15, q15 @ set offset, 0.5\n" - "vand.i32 q14, q15, q15 @ set offset, 0.5\n" - "vand.i32 q3, q15, q15 @ set offset, 0.5\n" - - "vmov.f32 q15, #-0.5 \n" - - "vcgt.f32 q10, q4, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q11, q5, %q[vzero] @ get mask > 0, in0\n" - - "vbif.f32 q12, q15, q10 @ get right offset\n" - "vbif.f32 q13, q15, q11 @ get right offset\n" - - "vcgt.f32 q10, q6, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q11, q7, %q[vzero] @ get mask > 0, in0\n" - - "vbif.f32 q14, q15, q10 @ get right offset\n" - "vbif.f32 q3, q15, q11 @ get right offset\n" - - "vmla.f32 q12, q4, %q[scale0] @ mul scale\n" - "vmla.f32 q13, q5, %q[scale1] @ mul scale\n" - "vmla.f32 q14, q6, %q[scale0] @ mul scale\n" - "vmla.f32 q3, q7, %q[scale1] @ mul scale\n" - - "vcvt.s32.f32 q4, q12 @ cvt to int32\n" - "vcvt.s32.f32 q5, q13 @ cvt to int32\n" - "vcvt.s32.f32 q6, q14 @ cvt to int32\n" - "vcvt.s32.f32 q7, q3 @ cvt to int32\n" - - "vqmovn.s32 d24, q4 @ cnt to int16\n" - "vqmovn.s32 d26, q5 @ cnt to int16\n" - "vqmovn.s32 d28, q6 @ cnt to int16\n" - "vqmovn.s32 d6, q7 @ cnt to int16\n" - - "vqmovn.s16 d20, q12 @ cnt to int8\n" - "vqmovn.s16 d21, q13 @ cnt to int8\n" - "vqmovn.s16 d22, q14 @ cnt to int8\n" - "vqmovn.s16 d23, q3 @ cnt to int8\n" - - "vtrn.8 d16, d18 @ trans q0, q2 \n" - "vtrn.8 d20, d22 @ trans q4, q6 \n" - "vtrn.16 d16, d20 @ trans q0, q2 \n" - 
"vtrn.16 d18, d22 @ trans q4, q6 \n" - - "vtrn.8 d17, d19 @ trans q0, q2 \n" - "vtrn.8 d21, d23 @ trans q4, q6 \n" - "vtrn.16 d17, d21 @ trans q0, q2 \n" - "vtrn.16 d19, d23 @ trans q4, q6 \n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d16[0]}, [%[doutc0r0]] @ store result, add " - "pointer\n" - "vst1.32 {d18[0]}, [%[doutc1r0]] @ store result, add " - "pointer\n" - "vst1.32 {d20[0]}, [%[doutc2r0]] @ store result, add " - "pointer\n" - "vst1.32 {d22[0]}, [%[doutc3r0]] @ store result, add " - "pointer\n" - - "vst1.32 {d17[0]}, [%[doutc4r0]] @ store result, add " - "pointer\n" - "vst1.32 {d19[0]}, [%[doutc5r0]] @ store result, add " - "pointer\n" - "vst1.32 {d21[0]}, [%[doutc6r0]] @ store result, add " - "pointer\n" - "vst1.32 {d23[0]}, [%[doutc7r0]] @ store result, add " - "pointer\n" - - "add %[doutc0r0], #4 @ add \n" - "add %[doutc1r0], #4 @ add \n" - "add %[doutc2r0], #4 @ add \n" - "add %[doutc3r0], #4 @ add \n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "add %[doutc4r0], #4 @ add \n" - "add %[doutc5r0], #4 @ add \n" - "add %[doutc6r0], #4 @ add \n" - "add %[doutc7r0], #4 @ add \n" - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - [scale0] "w"(w_scale0), [scale1] "w"(w_scale1), [vzero] "w"(vzero) - : "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[0] * scale[0], 0))); - *(doutc1_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[1] * scale[1], 0))); - *(doutc2_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[2] * scale[2], 0))); - *(doutc3_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[3] * scale[3], 0))); - *(doutc4_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[4] * scale[4], 0))); - *(doutc5_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[5] * scale[5], 0))); - *(doutc6_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[6] * scale[6], 0))); - *(doutc7_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[7] * scale[7], 0))); - din_hei_ptr += 8; - } - } - } - } else { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - dtype* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - dtype* doutc1_ptr = doutc1r0 + size_w; - dtype* doutc2_ptr = doutc2r0 + size_w; - dtype* doutc3_ptr = doutc3r0 + size_w; - dtype* doutc4_ptr = doutc4r0 + size_w; - dtype* doutc5_ptr = doutc5r0 + size_w; - dtype* doutc6_ptr = doutc6r0 + size_w; - dtype* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = trash_ptr; - case 4: - doutc4_ptr = trash_ptr; - case 3: - doutc5_ptr = trash_ptr; - case 2: - doutc6_ptr = trash_ptr; - case 1: - doutc7_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, 
[%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - // "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - // int32 --> fp32 - "scvtf v10.4s, v16.4s \n" - "scvtf v11.4s, v17.4s \n" - "scvtf v14.4s, v18.4s \n" - "scvtf v15.4s, v19.4s \n" - - "scvtf v20.4s, v8.4s \n" - "scvtf v21.4s, v9.4s \n" - "scvtf v22.4s, v12.4s \n" - "scvtf v23.4s, v13.4s \n" - - // mul - "fmul v16.4s, v10.4s, %[scale0].s[0] \n" - "fmul v17.4s, v11.4s, %[scale0].s[2] \n" - "fmul v18.4s, v14.4s, %[scale0].s[1] \n" - "fmul v19.4s, v15.4s, %[scale0].s[3] \n" - - "fmul v8.4s, v20.4s, %[scale1].s[0] \n" - "fmul v9.4s, v21.4s, %[scale1].s[2] \n" - "fmul v12.4s, v22.4s, %[scale1].s[1] \n" - "fmul v13.4s, v23.4s, %[scale1].s[3] \n" - - // fp32-int32 - "fcvtas v10.4s, v16.4s \n" - "fcvtas v11.4s, v17.4s \n" - "fcvtas v14.4s, v18.4s \n" - "fcvtas v15.4s, v19.4s \n" - - "fcvtas v20.4s, v8.4s \n" - "fcvtas v21.4s, v9.4s \n" - "fcvtas v22.4s, v12.4s \n" - "fcvtas v23.4s, v13.4s \n" - - // int32-int16 - "sqxtn v16.4h, v10.4s \n" - "sqxtn v17.4h, v11.4s \n" - "sqxtn v18.4h, v14.4s \n" - "sqxtn v19.4h, v15.4s \n" - - "sqxtn v8.4h, v20.4s \n" - "sqxtn v9.4h, v21.4s \n" - "sqxtn v12.4h, v22.4s \n" - "sqxtn v13.4h, v23.4s \n" - - // int16-int8 - "sqxtn v10.8b, v16.8h \n" - "sqxtn v11.8b, v17.8h \n" - "sqxtn v14.8b, v18.8h \n" - "sqxtn v15.8b, v19.8h \n" - - "sqxtn v20.8b, v8.8h \n" - "sqxtn v21.8b, v9.8h \n" - "sqxtn v22.8b, v12.8h \n" - "sqxtn v23.8b, v13.8h \n" - - "str s10, [%[doutc0r0]], #4 \n" /* store c0r0*/ - "str s11, [%[doutc2r0]], #4 \n" /* store c2r0*/ - "str s14, [%[doutc1r0]], #4 \n" /* store c1r0*/ - "str s15, [%[doutc3r0]], #4 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str s20, [%[doutc4r0]], #4 \n" /* store c0r0*/ - "str s21, [%[doutc6r0]], #4 \n" /* store c2r0*/ - "str s22, [%[doutc5r0]], #4 \n" /* store c1r0*/ - "str s23, [%[doutc7r0]], #4 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] 
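/* armv8 write-out pipeline above: trn1/trn2 transpose the 8x8 channel
 * tile, scvtf widens int32 -> fp32, fmul applies one per-channel scale
 * lane, fcvtas converts back with round-to-nearest-ties-away, and the
 * two sqxtn stages narrow int32 -> int16 -> int8 with saturation before
 * each channel row is stored four bytes at a time. */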
"+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : [scale0] "w"(w_scale0), [scale1] "w"(w_scale1) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23"); -#else - asm volatile( - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - - "1: @ main loop\n" - // int32-> fp32 - "vmov.f32 q15, #0.5 \n" - "vcvt.f32.s32 q8, q4 \n" - "vcvt.f32.s32 q9, q5 \n" - "vcvt.f32.s32 q10, q6 \n" - "vcvt.f32.s32 q11, q7 \n" - - "vand.i32 q4, q15, q15 @ set offset, 0.5\n" - "vand.i32 q5, q4, q4 @ set offset, 0.5\n" - "vand.i32 q6, q4, q4 @ set offset, 0.5\n" - "vand.i32 q7, q4, q4 @ set offset, 0.5\n" - - "vmov.f32 q15, #-0.5 \n" - - "vcgt.f32 q12, q8, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q13, q9, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q14, q10, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q3, q11, %q[vzero] @ get mask > 0, in0\n" - - "vbif.f32 q4, q15, q12 @ get right offset\n" - "vbif.f32 q5, q15, q13 @ get right offset\n" - "vbif.f32 q6, q15, q14 @ get right offset\n" - "vbif.f32 q7, q15, q3 @ get right offset\n" - - "vld1.32 {d24-d27}, [%[ptr_din]]! @load data \n" - "vld1.32 {d28-d29}, [%[ptr_din]]! @load data \n" - "vld1.32 {d6-d7}, [%[ptr_din]]! @load data \n" - - "vmla.f32 q4, q8, %q[scale0] @ mul scale\n" - "vmla.f32 q5, q9, %q[scale1] @ mul scale\n" - "vmla.f32 q6, q10, %q[scale0] @ mul scale\n" - "vmla.f32 q7, q11, %q[scale1] @ mul scale\n" - - "vcvt.s32.f32 q8, q4 @ cvt to int32\n" - "vcvt.s32.f32 q9, q5 @ cvt to int32\n" - "vcvt.s32.f32 q10, q6 @ cvt to int32\n" - "vcvt.s32.f32 q11, q7 @ cvt to int32\n" - - "vqmovn.s32 d8, q8 @ cnt to int16\n" - "vqmovn.s32 d10, q9 @ cnt to int16\n" - "vqmovn.s32 d12, q10 @ cnt to int16\n" - "vqmovn.s32 d14, q11 @ cnt to int16\n" - - "vqmovn.s16 d16, q4 @ cnt to int8\n" - "vqmovn.s16 d17, q5 @ cnt to int8\n" - "vqmovn.s16 d18, q6 @ cnt to int8\n" - "vqmovn.s16 d19, q7 @ cnt to int8\n" - - "vmov.f32 q15, #0.5 \n" - - "vcvt.f32.s32 q4, q12 \n" - "vcvt.f32.s32 q5, q13 \n" - "vcvt.f32.s32 q6, q14 \n" - "vcvt.f32.s32 q7, q3 \n" - - "vand.i32 q12, q15, q15 @ set offset, 0.5\n" - "vand.i32 q13, q12, q12 @ set offset, 0.5\n" - "vand.i32 q14, q12, q12 @ set offset, 0.5\n" - "vand.i32 q3, q12, q12 @ set offset, 0.5\n" - - "vmov.f32 q15, #-0.5 \n" - - "vcgt.f32 q10, q4, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q11, q5, %q[vzero] @ get mask > 0, in0\n" - - "vbif.f32 q12, q15, q10 @ get right offset\n" - "vbif.f32 q13, q15, q11 @ get right offset\n" - - "vcgt.f32 q10, q6, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q11, q7, %q[vzero] @ get mask > 0, in0\n" - - "vbif.f32 q14, q15, q10 @ get right offset\n" - "vbif.f32 q3, q15, q11 @ get right offset\n" - - "vmla.f32 q12, q4, %q[scale0] @ mul scale\n" - "vmla.f32 q13, q5, %q[scale1] @ mul scale\n" - "vmla.f32 q14, q6, %q[scale0] @ mul scale\n" - "vmla.f32 q3, q7, %q[scale1] @ mul scale\n" - - "vcvt.s32.f32 q4, q12 @ cvt to int32\n" - "vcvt.s32.f32 q5, q13 @ cvt to int32\n" - "vcvt.s32.f32 q6, q14 @ cvt to int32\n" - "vcvt.s32.f32 q7, q3 @ cvt to int32\n" - - "vqmovn.s32 d24, q4 @ cnt to int16\n" - "vqmovn.s32 d26, q5 @ cnt to int16\n" - "vqmovn.s32 d28, q6 @ cnt to int16\n" - "vqmovn.s32 d6, q7 @ cnt to int16\n" - - "vqmovn.s16 d20, q12 @ cnt to int8\n" - "vqmovn.s16 d21, q13 @ cnt to int8\n" - "vqmovn.s16 d22, q14 @ cnt to int8\n" - "vqmovn.s16 d23, q3 @ cnt to int8\n" - - "vtrn.8 d16, d18 @ trans q0, q2 \n" - 
"vtrn.8 d20, d22 @ trans q4, q6 \n" - "vtrn.16 d16, d20 @ trans q0, q2 \n" - "vtrn.16 d18, d22 @ trans q4, q6 \n" - - "vtrn.8 d17, d19 @ trans q0, q2 \n" - "vtrn.8 d21, d23 @ trans q4, q6 \n" - "vtrn.16 d17, d21 @ trans q0, q2 \n" - "vtrn.16 d19, d23 @ trans q4, q6 \n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d16[0]}, [%[doutc0r0]] @ store result, add " - "pointer\n" - "vst1.32 {d18[0]}, [%[doutc1r0]] @ store result, add " - "pointer\n" - "vst1.32 {d20[0]}, [%[doutc2r0]] @ store result, add " - "pointer\n" - "vst1.32 {d22[0]}, [%[doutc3r0]] @ store result, add " - "pointer\n" - - "vst1.32 {d17[0]}, [%[doutc4r0]] @ store result, add " - "pointer\n" - "vst1.32 {d19[0]}, [%[doutc5r0]] @ store result, add " - "pointer\n" - "vst1.32 {d21[0]}, [%[doutc6r0]] @ store result, add " - "pointer\n" - "vst1.32 {d23[0]}, [%[doutc7r0]] @ store result, add " - "pointer\n" - - "add %[doutc0r0], #4 @ add \n" - "add %[doutc1r0], #4 @ add \n" - "add %[doutc2r0], #4 @ add \n" - "add %[doutc3r0], #4 @ add \n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "add %[doutc4r0], #4 @ add \n" - "add %[doutc5r0], #4 @ add \n" - "add %[doutc6r0], #4 @ add \n" - "add %[doutc7r0], #4 @ add \n" - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - [scale0] "w"(w_scale0), [scale1] "w"(w_scale1), [vzero] "w"(vzero) - : "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = - saturate_cast(roundf(din_hei_ptr[0] * scale[0])); - *(doutc1_ptr++) = - saturate_cast(roundf(din_hei_ptr[1] * scale[1])); - *(doutc2_ptr++) = - saturate_cast(roundf(din_hei_ptr[2] * scale[2])); - *(doutc3_ptr++) = - saturate_cast(roundf(din_hei_ptr[3] * scale[3])); - *(doutc4_ptr++) = - saturate_cast(roundf(din_hei_ptr[4] * scale[4])); - *(doutc5_ptr++) = - saturate_cast(roundf(din_hei_ptr[5] * scale[5])); - *(doutc6_ptr++) = - saturate_cast(roundf(din_hei_ptr[6] * scale[6])); - *(doutc7_ptr++) = - saturate_cast(roundf(din_hei_ptr[7] * scale[7])); - din_hei_ptr += 8; - } - } - } - } - } else { - LOG(ERROR) << "ERROR: unsupported input data type!!"; - return false; - } - return true; -} - -/* -* din [n, hei_n, ch_n, w] -* dout [n, ch_n, hei_n, w] -*/ -template -static bool write_to_output_numc(const dtype* din, - dtype* dout, - int ch_n, - int hei_n, - int cs, - int ce, - int hs, - int he, - int ws, - int we, - int channel, - int height, - int width, - bool flag_relu, - dtype* trash_ptr) { - if (ch_n <= 0 || hei_n <= 0) { - LOG(ERROR) << "ch_n and hei_n are more than zero"; - return false; - } - int size_c_out = width * height; - - dtype* out_array[ch_n]; - out_array[0] = dout + cs * size_c_out + hs * width + ws; - - for (int i = 1; i < ch_n; i++) { - out_array[i] = out_array[i - 1] + size_c_out; - } - - const dtype* ptr_din = din; - - int cremain = ce - channel; - for (int i = 1; i <= cremain; i++) { - out_array[ch_n - i] = trash_ptr; - } - - int size_h = (he > height ? 
height : he) - hs; // size_h == hei_n - - int size_w = we - ws; - - int size_c_in = ch_n * size_w; - - size_t valid_w_byte = width * sizeof(dtype); - - if (flag_relu) { - for (int h = 0; h < size_h; h++) { - const dtype* din_ptr = din + h * size_c_in; - for (int i = 0; i < ch_n; i++) { - dtype* dout_ptr = out_array[i] + h * width; - for (int k = 0; k < width; k++) { - *(dout_ptr++) = LITEMAX(din_ptr[k], 0); - } - din_ptr += size_w; - } - } - } else { - for (int h = 0; h < size_h; h++) { - const dtype* din_ptr = din + h * size_c_in; - for (int i = 0; i < ch_n; i++) { - dtype* dout_ptr = out_array[i] + h * width; - memcpy(dout_ptr, din_ptr, valid_w_byte); - din_ptr += size_w; - } - } - } - return true; -} - -/// ch_n == ce - cs ?? -/// hei_n == he - hs ?? -/// channel height width ? -> output -template -static bool write2_to_output_numc(const ditype* din, - dotype* dout, - int ch_n, - int hei_n, - int cs, - int ce, - int hs, - int he, - int ws, - int we, - int channel, - int height, - int width, - bool flag_relu, - dotype* trash_ptr, - float const* scales) { - // static_assert(std::is_same::value, "just support float"); - - if (ch_n <= 0 || hei_n <= 0) { - LOG(ERROR) << "ch_n and hei_n are more than zero"; - return false; - } - - int size_c_out = width * height; - - dotype* out_array[ch_n]; - out_array[0] = dout + cs * size_c_out + hs * width + ws; - - for (int i = 1; i < ch_n; i++) { - out_array[i] = out_array[i - 1] + size_c_out; - } - - const ditype* ptr_din = din; - - int cremain = ce - channel; - for (int i = 1; i <= cremain; i++) { - out_array[ch_n - i] = trash_ptr; - } - - int size_h = (he > height ? height : he) - hs; // size_h == hei_n - - int size_w = we - ws; - - int size_c_in = ch_n * size_w; - - size_t valid_w_byte = width * sizeof(ditype); - - if (flag_relu) { - for (int h = 0; h < size_h; h++) { - ditype const* din_ptr = din + h * size_c_in; - for (int i = 0; i < ch_n; i++) { - float const ws = scales[(i + cs) % ch_n]; - dotype* dout_ptr = out_array[i] + h * width; - for (int k = 0; k < width; k++) { - *(dout_ptr++) = LITEMAX(din_ptr[k] * ws, 0); - } - din_ptr += size_w; - } - } - } else { - for (int h = 0; h < size_h; h++) { - ditype const* din_ptr = din + h * size_c_in; - for (int i = 0; i < ch_n; i++) { - dotype* dout_ptr = out_array[i] + h * width; - - float const* ws = &scales[(i + cs) % ch_n]; - int32_to_dtype(din_ptr, dout_ptr, ws, 1, 1, width); - - din_ptr += size_w; - } - } - } - return true; -} -/** -* innput din: nchwc(num) -*/ -inline bool fill_packed_bias_nxmw_fp32( - const float* bias, float* dout, int ch_n, int hei_n, int wround) { - if (ch_n <= 0 || hei_n <= 0) { - LOG(ERROR) << "ch_n and hei_n are more than zero"; - return false; - } - int cnt_ch = ch_n / 4; - int size = wround * ch_n; - for (int h = 0; h < hei_n; h++) { - float* dout_ptr = dout + h * size; - for (int i = 0; i < wround; i++) { - const float* bias_ptr = bias; - int j = 0; - for (; j < cnt_ch; j++) { - float32x4_t vb = vld1q_f32(bias_ptr); - bias_ptr += 4; - - vst1q_f32(dout_ptr, vb); - dout_ptr += 4; - } - j = j * 4; - for (; j < ch_n; j++) { - *dout_ptr = *bias_ptr; - dout_ptr++; - bias_ptr++; - } - } - } -} - -inline bool fill_packed_bias_nxmw_int8( - const int* bias, int* dout, int ch_n, int hei_n, int wround) { - if (ch_n <= 0 || hei_n <= 0) { - LOG(ERROR) << "ch_n and hei_n are more than zero"; - return false; - } - int cnt_ch = ch_n / 4; - int size = wround * ch_n; - for (int h = 0; h < hei_n; h++) { - int* dout_ptr = dout + h * size; - for (int i = 0; i < wround; i++) { - const int* 
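/* write2_to_output_numc above is the scaled variant of the same remap:
 * each channel row is converted through int32_to_dtype (or LITEMAX for
 * the fused-relu path) using scales[(i + cs) % ch_n]. The
 * fill_packed_bias_nxmw_{fp32,int8} helpers around this point broadcast
 * the per-channel bias into a packed [hei_n, wround, ch_n] buffer,
 *   dout[h][w][c] = bias[c]   for every (h, w),
 * four channels per vld1q/vst1q with a scalar tail for ch_n % 4; note the
 * fp32 variant is declared bool but reaches its closing brace without a
 * return statement. */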
bias_ptr = bias; - int j = 0; - for (; j < cnt_ch; j++) { - int32x4_t vb = vld1q_s32(bias_ptr); - bias_ptr += 4; - - vst1q_s32(dout_ptr, vb); - dout_ptr += 4; - } - j = j * 4; - for (; j < ch_n; j++) { - *dout_ptr = *bias_ptr; - dout_ptr++; - bias_ptr++; - } - } - } - return true; -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise.cc b/lite/backends/arm/math/conv_depthwise.cc deleted file mode 100644 index 79b8cec571..0000000000 --- a/lite/backends/arm/math/conv_depthwise.cc +++ /dev/null @@ -1,239 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/conv_depthwise.h" -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -bool DepthwiseConv::create(const operators::ConvParam& param, - ARMContext* ctx) { - this->ctx_ = ctx; - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ic = x_dims[1]; - int ow = o_dims[3]; - int oc = o_dims[1]; - int kw = w_dims[3]; - int sw = param.strides[1]; - // select dw conv kernel - if (kw == 3) { - VLOG(5) << "invoke 3x3 dw conv"; - impl_ = conv_depthwise_3x3; - } else if (kw == 5) { - VLOG(5) << "invoke 5x5 dw conv"; - this->ctx_->ExtendWorkspace((iw + ow) * sizeof(float)); - impl_ = conv_depthwise_5x5; - } else { - LOG(ERROR) << "this type dw conv not impl"; - return false; - } - return true; -} - -template <> -bool DepthwiseConv::init(const operators::ConvParam& param, - Context* ctx) { - this->ctx_ = ctx; - return create(param, ctx); -} - -template <> -bool DepthwiseConv::run(const operators::ConvParam& param) { - // start timer - const auto* i_data = param.x->data(); - const auto* w_data = param.filter->data(); - const auto* b_data = param.bias ? param.bias->data() : nullptr; - auto* o_data = param.output->mutable_data(); - - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ih = x_dims[2]; - int ic = x_dims[1]; - int bs = x_dims[0]; - int oh = o_dims[2]; - int ow = o_dims[3]; - int oc = o_dims[1]; - - impl_(i_data, - o_data, - bs, - oc, - oh, - ow, - ic, - ih, - iw, - w_data, - b_data, - param, - this->ctx_); - - // timer end - return true; -} - -template -bool DepthwiseConvInt8::create(const operators::ConvParam& param, - ARMContext* ctx) { - this->ctx_ = ctx; - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int ic = x_dims[1]; - int ih = x_dims[2]; - int iw = x_dims[3]; // nchw - int oc = o_dims[1]; - int oh = o_dims[2]; - int ow = o_dims[3]; - int kw = w_dims[3]; - int sw = param.strides[1]; - w_scale_ = param.weight_scale; - - //! 
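/* Typical call sequence for these strategy classes (template arguments
 * were elided in this patch text; they are assumed to carry the
 * precision type):
 *   DepthwiseConv<...> conv;
 *   conv.init(param, ctx);   // forwards to create(): picks impl_ by kw
 *   conv.run(param);         // one indirect call through impl_
 * create() also pre-extends the ARM workspace for the 5x5 kernel, so
 * run() itself never allocates. */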
select dw conv kernel - if (kw == 3) { - tmp_int32_out_.Resize(o_dims); - VLOG(5) << "invoke 3x3 depthwise int8 conv"; - impl_ = conv_depthwise_3x3_int8; - } else if (kw == 5) { - // update w_data scale - if (Ptype_out == PRECISION(kFloat) || Ptype_out == PRECISION(kInt8)) { - CHECK_EQ(w_scale_.size(), oc) << "w_data scale size must be oc"; - float input_scale = param.input_scale; - float output_scale = param.output_scale; - for (auto& ws : w_scale_) { - ws *= input_scale; - if (Ptype_out == PRECISION(kInt8)) { - ws /= output_scale; - } - } - } - - const int wout_round = ((ow + 7) / 8) * 8; - const int win_round = wout_round * sw + 5 - 1; - const int hout_round = ((oh + 2) / 3) * 3; - const int hin_round = hout_round * sw + 5 - 1; - const int tmp_size_out = wout_round * hout_round; - const int tmp_size_in = win_round * hin_round; - const int tmp_size_io_bytes = tmp_size_in + tmp_size_out * sizeof(int); - const int tmp_row_io_bytes = win_round + wout_round * sizeof(int); - const int tmp_size_io_float = - (tmp_size_io_bytes + sizeof(float) - 1) / sizeof(float); - const int tmp_row_io_float = - (tmp_row_io_bytes + sizeof(float) - 1) / sizeof(float); - ctx_->ExtendWorkspace( - (ctx_->threads() * tmp_size_io_float + tmp_row_io_float) * - sizeof(float)); - impl_ = conv_depthwise_5x5_int8; - VLOG(5) << "invoke conv_depthwise_5x5 int8 conv"; - } else { - LOG(ERROR) << "this type depthwise int8 conv not impl"; - return false; - } - return true; -} - -template -bool DepthwiseConvInt8::init(const operators::ConvParam& param, - Context* ctx) { - this->ctx_ = ctx; - return create(param, ctx); -} - -template -bool DepthwiseConvInt8::run(const operators::ConvParam& param) { - const int8_t* i_data = param.x->data(); - int32_t* o_data = nullptr; - const int8_t* w_data = param.filter->data(); - const int32_t* b_data = param.bias ? 
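/* Scale folding above: for kFloat output w_scale_[c] becomes
 * s_w[c] * s_in, and for kInt8 output s_w[c] * s_in / s_out, so the
 * int32 accumulator needs exactly one multiply at write-out:
 *   float_out = acc * s_w[c] * s_in
 *   int8_out  = sat8(round(acc * s_w[c] * s_in / s_out))
 * The workspace arithmetic that follows rounds the 5x5 tile up to
 * multiples of 8 (width) and 3 (height), then converts the mixed
 * int8-input / int32-output byte budget into a float-element count for
 * ExtendWorkspace(). */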
param.bias->data() : nullptr; - - // LOG(INFO) << "input size: " << param.x->memory_size() << " " - // << param.input_scale << " " << w_scale_.size(); - - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - int bs = x_dims[0]; - int ic = x_dims[1]; - int ih = x_dims[2]; - int iw = x_dims[3]; // nchw - int oc = o_dims[1]; - int oh = o_dims[2]; - int ow = o_dims[3]; - int kw = w_dims[3]; - int sw = param.strides[1]; - - if (kw == 3 && Ptype_out != PRECISION(kInt32)) { - o_data = tmp_int32_out_.mutable_data(); - } else if (kw == 5 || (kw == 3 && Ptype_out == PRECISION(kInt32))) { - o_data = param.output->mutable_data(); - } else { - LOG(ERROR) << "this type dw int8 conv not impl"; - return false; - } - - impl_(i_data, - o_data, - bs, - oc, - oh, - ow, - ic, - ih, - iw, - w_data, - b_data, - param, - this->ctx_, - Ptype_out, - w_scale_.data()); - - auto i_scale = param.input_scale; - auto o_scale = param.output_scale; - if (kw == 3) { - if (Ptype_out == PRECISION(kInt8)) { - trans_tensor_dtype( - &tmp_int32_out_, param.output, i_scale, o_scale, w_scale_); - } else if (Ptype_out == PRECISION(kFloat)) { - trans_tensor_dtype( - &tmp_int32_out_, param.output, i_scale, 1.f, w_scale_); - } else if (Ptype_out != PRECISION(kInt32)) { - LOG(ERROR) << "unsupported precision type!!"; - return false; - } - } - - return true; -} - -template class DepthwiseConvInt8; -template class DepthwiseConvInt8; -template class DepthwiseConvInt8; - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise.h b/lite/backends/arm/math/conv_depthwise.h deleted file mode 100644 index cdddda79d1..0000000000 --- a/lite/backends/arm/math/conv_depthwise.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/core/target_wrapper.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -class DepthwiseConv - : public ImplBase { - public: - typedef void (*conv_dw_impl)(const float* i_data, - float* o_data, - int bs, - int oc, - int oh, - int ow, - int ic, - int ih, - int kw, - const float* w_data, - const float* b_data, - const operators::ConvParam& param, - Context* ctx); - DepthwiseConv() = default; - ~DepthwiseConv() {} - - virtual bool init(const operators::ConvParam& param, - Context* ctx); - - virtual bool create(const operators::ConvParam& param, - Context* ctx); - - virtual bool run(const operators::ConvParam& param); - - private: - conv_dw_impl impl_{nullptr}; -}; - -template -class DepthwiseConvInt8 - : public ImplBase { - public: - typedef void (*conv_dw_int8_impl)(const int8_t* i_data, - int32_t* o_data, - int bs, - int oc, - int oh, - int ow, - int ic, - int ih, - int kw, - const int8_t* w_data, - const int32_t* b_data, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale); - - DepthwiseConvInt8() = default; - ~DepthwiseConvInt8() {} - - virtual bool init(const operators::ConvParam& param, - Context* ctx); - - virtual bool create(const operators::ConvParam& param, - Context* ctx); - - virtual bool run(const operators::ConvParam& param); - - private: - conv_dw_int8_impl impl_{nullptr}; - std::vector w_scale_; - Tensor tmp_int32_out_; -}; - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_3x3_int8.cc b/lite/backends/arm/math/conv_depthwise_3x3_int8.cc deleted file mode 100644 index d1eedd9557..0000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3_int8.cc +++ /dev/null @@ -1,5832 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/operators/op_params.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_3x3s1p1_bias_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! 
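/* Kernel naming used throughout this file: 3x3 window, s1/s2 stride,
 * p1 padding, `_s` for the narrow-input fallback, `_relu` for fused
 * relu, always with bias handling; e.g.
 *   conv_depthwise_3x3s2p1_bias_s_relu_int8
 * is stride 2, pad 1, narrow input, fused relu. The dispatcher
 * conv_depthwise_3x3_int8 below selects among the eight variants. */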
for input width <= 8 -void conv_depthwise_3x3s1p1_bias_s_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 8 -void conv_depthwise_3x3s2p1_bias_s_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s1p1_bias_relu_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p1_bias_s_relu_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias_relu_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p1_bias_s_relu_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - ARMContext* ctx, - PrecisionType out_type, - const float* scale) { - int w_in = win; - int h_in = hin; - int ch_in = chin; - - int w_out = wout; - int h_out = hout; - int ch_out = chout; - int stride_h = param.strides[0]; - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - // if (param.activation_param.has_active){ - // if (param.activation_param.active == Active_relu || - // fabs(param.activation_param.negative_slope) > 1e-6f){ - // flag_relu = true; - // } - // } - //! only support stride = 1 or 2 - if (stride_h == 1) { - if (flag_relu) { - if (w_in > 8) { - conv_depthwise_3x3s1p1_bias_relu_int8(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s_relu_int8(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 8) { - conv_depthwise_3x3s1p1_bias_int8(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s_int8(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } else { //! 
stride = 2 - if (flag_relu) { - if (w_in > 16) { - conv_depthwise_3x3s2p1_bias_relu_int8(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p1_bias_s_relu_int8(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 16) { - conv_depthwise_3x3s2p1_bias_int8(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p1_bias_s_int8(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ - -// 4line w_in > 8 -void conv_depthwise_3x3s1p1_bias_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - // printf("3x3s1 mult height \n"); - //! pad is done implicit - const char zero[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - const unsigned char right_pad_idx[16] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - const unsigned int right_pad_rst[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - - // printf("conv3x3_dw start \n"); - signed char* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(signed char)); - int* write_ptr = - reinterpret_cast(ctx->workspace_data()) + w_in; - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 7) >> 3; - int tile_h = (h_out + 1) >> 1; - int cnt_col = tile_w - 2; - - unsigned int size_pad_right = (unsigned int)(w_in - 7 - (cnt_col << 3)); - - int size_pad_bottom = h_out % 2; - - uint8x8_t vmask_rp1 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx)); - uint8x8_t vmask_rp2 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx + 8)); - - uint8x16_t vmask_rp = - vcgtq_u8(vdupq_n_u8(size_pad_right), vld1q_u8(right_pad_idx)); - // uint8x8_t vmask_rp2 = vcgt_u8(vdup_n_u8(size_pad_right), - // vld1_u8(right_pad_idx + 8)); - unsigned char vmask[16]; - vst1q_u8(vmask, vmask_rp); - - unsigned int rst_remain = (unsigned int)(w_out - ((cnt_col + 1) << 3)); - uint32x4_t vmask_result1 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst)); - uint32x4_t vmask_result2 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst + 4)); - - unsigned int rmask[8]; - vst1q_u32(rmask, vmask_result1); - vst1q_u32(rmask + 4, vmask_result2); - - int8x8_t vzero = vdup_n_s8(0); - int32x4_t vzero_32 = vdupq_n_s32(0); - - for (int n = 0; n < num; ++n) { - const signed char* din_batch = din + n * ch_in * size_in_channel; - int* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - int* dout_ptr = dout_batch + c * size_out_channel; - - const signed char* din_ch_ptr = din_batch + c * size_in_channel; - - int bias_val = flag_bias ? 
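/* Mask setup above, in scalar terms:
 *   vmask[k] = (k < size_pad_right) ? 0xFF : 0x00   // input right tail
 *   rmask[k] = (k < rst_remain)     ? ~0u  : 0u     // output right tail
 * vbif later uses vmask to zero input lanes past the row end, and rmask
 * to keep the previously stored output values past w_out untouched when
 * the final partial results are merged back. */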
bias[c] : 0; - - const signed char* wei_ptr = weights + c * w_stride; - -#ifdef __aarch64__ - int vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - int8x8_t wr00 = vdup_n_s8(wei_ptr[0]); - int8x8_t wr10 = vdup_n_s8(wei_ptr[3]); - int8x8_t wr20 = vdup_n_s8(wei_ptr[6]); - - int8x8_t wr01 = vdup_n_s8(wei_ptr[1]); - int8x8_t wr11 = vdup_n_s8(wei_ptr[4]); - int8x8_t wr21 = vdup_n_s8(wei_ptr[7]); - - int8x8_t wr02 = vdup_n_s8(wei_ptr[2]); - int8x8_t wr12 = vdup_n_s8(wei_ptr[5]); - int8x8_t wr22 = vdup_n_s8(wei_ptr[8]); -#endif - int* doutr0 = nullptr; - int* doutr1 = nullptr; - - const signed char* dr0 = din_ch_ptr; - const signed char* dr1 = dr0 + w_in; - const signed char* dr2 = dr1 + w_in; - const signed char* dr3 = dr2 + w_in; - - const signed char* din_ptr0 = nullptr; - const signed char* din_ptr1 = nullptr; - const signed char* din_ptr2 = nullptr; - const signed char* din_ptr3 = nullptr; - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - unsigned int* rst_mask = rmask; - unsigned char* val_mask = vmask; - - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din_ptr1 = zero_ptr; - case 2: - din_ptr2 = zero_ptr; - case 1: - din_ptr3 = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; -#ifdef __aarch64__ - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - // left - "ld1 {v0.8b}, [%[din_ptr0]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v2.8b}, [%[din_ptr1]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v1.8b}, [%[din_ptr0]] \n" /* load - a00-a015 to - q0*/ - "ld1 {v3.8b}, [%[din_ptr1]] \n" /* load - a00-a015 to - q0*/ - - "ld1 {v10.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v11.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - // r0 - "smull v18.8h, %[v1].8b, v0.8b \n" /* outr00 = 01234567 * w01 - */ - - "ext v4.8b, v21.8b, v0.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v5.8b, v0.8b, v1.8B, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "ld1 {v6.8b}, [%[din_ptr2]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v8.8b}, [%[din_ptr3]], #8 \n" /* load - a00-a015 - to - q0*/ - - "smlal v18.8h, %[v0].8b, v4.8b\n" /* outr00 += 00123456 * w00 */ - - "ld1 {v7.8b}, [%[din_ptr2]] \n" /* load - a00-a015 - to q0*/ - "ld1 {v9.8b}, [%[din_ptr3]] \n" /* load - a00-a015 - to q0*/ - - "sub %[din_ptr0], %[din_ptr0], #1 \n" - "sub %[din_ptr1], %[din_ptr1], #1 \n" - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v2].8b, v5.8b\n" /* outr00 += 12345678 * w02 */ - - "ext v4.8b, v21.8b, v2.8b, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v5.8b, v2.8b, v3.8b, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - // r1 - "sub %[din_ptr2], %[din_ptr2], #1 \n" - "sub %[din_ptr3], %[din_ptr3], #1 \n" - - "smull v19.8h, %[v1].8b, 
v2.8b \n" /* outr10 += 01234567 * w11 - */ - "smlal v18.8h, %[v4].8b, v2.8b \n" /* outr00 += 01234567 * w11 - */ - - "ext v14.8b, v21.8b, v6.8b, #7 \n" /* vext_s8(vzero, vinr0, - 7); 00123456 */ - "ext v15.8b, v6.8b, v7.8b, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "smlal v19.8h, %[v0].8b, v4.8b \n" /* outr00 += 01234567 * w11 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v3].8b, v4.8b \n" /* outr00 += 001234567 * w10 - */ - - "ld1 {v0.8b}, [%[din_ptr0]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v2.8b}, [%[din_ptr1]], #8 \n" /* load - a00-a015 - to - q0*/ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v2].8b, v5.8b \n" /* outr00 += 01234567 * w11 - */ - "smlal v18.8h, %[v5].8b, v5.8b \n" /* outr00 += 12345678 * w12 - */ - - // r2 - "ld1 {v1.8b}, [%[din_ptr0]] \n" /* load - a00-a015 to - q0*/ - "ld1 {v3.8b}, [%[din_ptr1]] \n" /* load - a00-a015 to - q0*/ - - "smlal v19.8h, %[v4].8b, v6.8b \n" /* outr10 += 01234567 * w11 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v7].8b, v6.8b \n" /* outr00 += 01234567 * w11 - */ - - "ext v4.8b, v21.8b, v8.8b, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v5.8b, v8.8b, v9.8b, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v3].8b, v14.8b \n" /* outr10 += 01234567 * w11 - */ - "smlal v18.8h, %[v6].8b, v14.8b \n" /* outr00 += 01234567 * w11 - */ - - "ld1 {v6.8b}, [%[din_ptr2]], #8 \n" /* load - a00-a015 - to - q0*/ - - "smlal v19.8h, %[v5].8b, v15.8b \n" /* outr10 += 01234567 * w11 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v8].8b, v15.8b \n" /* outr00 += 01234567 * w11 - */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r3 - "smull v19.8h, %[v7].8b, v8.8b \n" /* outr00 += 01234567 * w11 - */ - - "ld1 {v8.8b}, [%[din_ptr3]], #8 \n" /* load - a00-a015 - to - q0*/ - - "ld1 {v7.8b}, [%[din_ptr2]] \n" /* load - a00-a015 to - q0*/ - "ld1 {v9.8b}, [%[din_ptr3]] \n" /* load - a00-a015 to - q0*/ - - "smlal v19.8h, %[v6].8b, v4.8b \n" /* outr00 += 01234567 * - w11 */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "stp q10, q11, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v8].8b, v5.8b \n" /* outr00 += 01234567 * - w11 */ - - "ld1 {v10.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v11.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "stp q12, q13, [%[ptr_out1]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "cmp %[cnt], #1 \n" - "blt 3f \n" - // mid - "1: \n" - "ext v4.8b, v0.8B, v1.8b, #1 \n" /*12345678 */ - "ext v5.8b, v0.8b, v1.8B, #2 \n" /*23456789 
*/ - - // r0 - "smull v18.8h, %[v0].8b, v0.8b \n" /* outr00 = 01234567 * w00 - */ - - "ext v14.8b, v2.8B, v3.8b, #1 \n" /*12345678 */ - "ext v15.8b, v2.8b, v3.8B, #2 \n" /*23456789 */ - - "smlal v18.8h, %[v1].8b, v4.8b\n" /* outr00 += 12345678 * w01 */ - - "ext v16.8b, v6.8B, v7.8b, #1 \n" /*12345678 */ - "ext v17.8b, v6.8b, v7.8B, #2 \n" /*23456789 */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v2].8b, v5.8b\n" /* outr00 += 23456789 * w02 */ - - // r1 - "ext v4.8b, v8.8B, v9.8b, #1 \n" /*12345678 */ - "ext v5.8b, v8.8b, v9.8B, #2 \n" /*23456789 */ - - "smull v19.8h, %[v0].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v3].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v0.8b}, [%[din_ptr0]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v2.8b}, [%[din_ptr1]], #8 \n" /* load - a00-a015 - to - q0*/ - - "smlal v19.8h, %[v1].8b, v14.8b\n" /* outr00 += 12345678 * w01 */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v4].8b, v14.8b\n" /* outr00 += 12345678 * w01 */ - - "ld1 {v1.8b}, [%[din_ptr0]] \n" /* load - a00-a015 - to q0*/ - "ld1 {v3.8b}, [%[din_ptr1]] \n" /* load - a00-a015 - to q0*/ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v2].8b, v15.8b\n" /* outr00 += 23456789 * w02 */ - "smlal v18.8h, %[v5].8b, v15.8b\n" /* outr00 += 12345678 * w01 */ - - // r2 - "smlal v19.8h, %[v3].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v6].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v4].8b, v16.8b\n" /* outr00 += 12345678 * w01 */ - "smlal v18.8h, %[v7].8b, v16.8b\n" /* outr00 += 12345678 * w01 */ - - "smlal v19.8h, %[v5].8b, v17.8b\n" /* outr00 += 23456789 * w02 */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v8].8b, v17.8b\n" /* outr00 += 12345678 * w01 */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r3 - "smull v19.8h, %[v6].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v6.8b}, [%[din_ptr2]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v8.8b}, [%[din_ptr3]], #8 \n" /* load - a00-a015 - to - q0*/ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smlal v19.8h, %[v7].8b, v4.8b\n" /* outr00 += 12345678 * w01 */ - - "ld1 {v7.8b}, [%[din_ptr2]] \n" /* load - a00-a015 - to q0*/ - "ld1 {v9.8b}, [%[din_ptr3]] \n" /* load - a00-a015 - to q0*/ - - "stp q10, q11, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v8].8b, v5.8b\n" /* outr00 += 23456789 * w02 */ - - "ld1 {v10.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v11.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "subs 
%[cnt], %[cnt], #1 \n" - - "stp q12, q13, [%[ptr_out1]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "bne 1b \n" - // right - "3: \n" - "ld1 {v14.8b}, [%[vmask]], #8 \n" - "ld1 {v15.8b}, [%[vmask]] \n" - - "bif v0.8b, v21.8b, v14.8b \n" - "bif v1.8b, v21.8b, v15.8b \n" - "bif v2.8b, v21.8b, v14.8b \n" - "bif v3.8b, v21.8b, v15.8b \n" - - "ext v4.8b, v0.8b, v1.8b, #1 \n" - "ext v5.8b, v0.8b, v1.8b, #2 \n" - - // r0 - "smull v18.8h, %[v0].8b, v0.8b \n" /* outr00 = 01234567 * w00 - */ - - "ext v16.8b, v2.8b, v3.8b, #1 \n" - "ext v17.8b, v2.8b, v3.8b, #2 \n" - - "bif v6.8b, v21.8b, v14.8b \n" - "bif v7.8b, v21.8b, v15.8b \n" - - "smlal v18.8h, %[v1].8b, v4.8b \n" /* outr00 = 01234567 * w00 - */ - - "bif v8.8b, v21.8b, v14.8b \n" - "bif v9.8b, v21.8b, v15.8b \n" - - "ext v20.8b, v6.8b, v7.8b, #1 \n" - "ext v22.8b, v6.8b, v7.8b, #2 \n" - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v2].8b, v5.8b \n" /* outr00 = 01234567 * w00 - */ - - // r1 - "ext v4.8b, v8.8b, v9.8b, #1 \n" - "ext v5.8b, v8.8b, v9.8b, #2 \n" - - "smull v19.8h, %[v0].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v3].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v14.4s}, [%[rmask]], #16 \n" - "ld1 {v15.4s}, [%[rmask]] \n" - - "smlal v19.8h, %[v1].8b, v16.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v4].8b, v16.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v0.4s}, [%[ptr_out0]], #16 \n" - "ld1 {v2.4s}, [%[ptr_out1]], #16 \n" - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v2].8b, v17.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v5].8b, v17.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v1.4s}, [%[ptr_out0]] \n" - "ld1 {v3.4s}, [%[ptr_out1]] \n" - - // r2 - "smlal v19.8h, %[v3].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v6].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "sub %[ptr_out0], %[ptr_out0], #16 \n" - "sub %[ptr_out1], %[ptr_out1], #16 \n" - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v4].8b, v20.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v7].8b, v20.8b \n" /* outr00 = 01234567 * w00 - */ - - "smlal v19.8h, %[v5].8b, v22.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v8].8b, v22.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r3 - "smull v19.8h, %[v6].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smlal v19.8h, %[v7].8b, v4.8b \n" /* outr00 = 01234567 * w00 - */ - - "bif v10.16b, v0.16b, v14.16b \n" - "bif v11.16b, v1.16b, v15.16b \n" - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, 
v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v8].8b, v5.8b \n" /* outr00 = 01234567 * w00 - */ - - "stp q10, q11, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "bif v12.16b, v2.16b, v14.16b \n" - "bif v13.16b, v3.16b, v15.16b \n" - - "stp q12, q13, [%[ptr_out1]], #32 \n" /* store q10, q11 -> - ptr_out */ - - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [ptr_out0] "+r"(doutr0), - [ptr_out1] "+r"(doutr1), - [vmask] "+r"(val_mask), - [rmask] "+r"(rst_mask) - : [v0] "w"(wr00), - [v1] "w"(wr01), - [v2] "w"(wr02), - [v3] "w"(wr10), - [bias_val] "r"(vbias), - [v4] "w"(wr11), - [v5] "w"(wr12), - [v6] "w"(wr20), - [v7] "w"(wr21), - [v8] "w"(wr22) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); -#else - // store weights - asm volatile("vld1.8 {d0-d1}, [%[wei_ptr]] \n" - : - : [wei_ptr] "r"(wei_ptr) - : "memory"); - asm volatile( - // left - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - "pld [%[din_ptr3]] @ preload data\n" - "vdup.s8 d2, d0[0] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d3, d0[1] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d4, d0[2] @ d4 = w02, w02, w02, w02\n" - "vld1.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vmov.u32 d11, #0 @ zero\n" - // out0 - "vdup.32 q8, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q9, %[bias] @ and \n" // q9 = - // vbias - // out1 - "vdup.32 q10, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q11, %[bias] @ and \n" // q9 = - // vbias - - // r0 - "vmull.s8 q12, d12, d3 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vext.8 d30, d11, d12, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #1 @ ext \n" // d11 = 12345678 - - "vld1.8 {d12-d13}, [%[din_ptr1]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.8 {d14-d15}, [%[din_ptr2]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vdup.s8 d5, d0[3] @ d5 = w10, w10, w00, w00\n" - "vdup.s8 d6, d0[4] @ d6 = w11, w11, w01, w01\n" - - "vmlal.s8 q12, d30, d2 @ out0 += din0 * w00 \n" // q12 += d10 * w00 - - "vdup.s8 d7, d0[5] @ d7 = w12, w12\n" - "add %[din_ptr0], #7 @add \n" - "add %[din_ptr1], #7 @add \n" - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d4 @ out0 += din0 * w02 \n" // q12 += d11 * w02 - - // r1 - "vext.8 d30, d11, d12, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #1 @ ext \n" // d11 = 12345678 - "vmull.s8 q13, d12, d3 @ out1 = din1 * w01 \n" // q13 = d12 * w01 - - "vmlal.s8 q12, d12, d6 @ out0 = din1 * w11 \n" // q12 = d12 * w11 - - "vld1.8 {d12-d13}, [%[din_ptr3]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vdup.s8 d8, d0[6] @ d8 = w20, w00, w00, w00\n" - "vdup.s8 d9, d0[7] @ d9 = w21, w01, w01, w01\n" - "vdup.s8 d10, d1[0] @ d10 = w22, w02, w02, w02\n" - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d30, d2 @ out1 += din1 * w00 \n" // q12 += d10 * w00 - "vmull.s8 q12, d30, d5 @ out0 += din1 * w10 \n" // q12 += d10 * w00 - - "add %[din_ptr2], #7 
@add \n" - "add %[din_ptr3], #7 @add \n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d4 @ out1 += din1 * w02 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d31, d7 @ out0 += din1 * w12 \n" // q12 += d10 * w00 - - // r2 - "vext.8 d30, d11, d14, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d15, #1 @ ext \n" // d11 = 12345678 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d14, d6 @ out1 = din2 * w11 \n" // q13 = d12 * w01 - "vmull.s8 q12, d14, d9 @ out1 = din2 * w21 \n" // q13 = d12 * w01 - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d30, d5 @ out1 += din2 * w10 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d30, d8 @ out0 += din2 * w20 \n" // q12 += d10 * w00 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d31, d7 @ out1 += din2 * w12 \n" // q12 += d10 * w00 - "vmull.s8 q12, d31, d10 @ out0 += din2 * w22 \n" // q12 += d10 * w00 - - // r3 - "vext.8 d30, d11, d12, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #1 @ ext \n" // d11 = 12345678 - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q13, d12, d9 @ out1 = din3 * w21 \n" // q13 = d12 * w01 - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - - "vmlal.s8 q13, d30, d8 @ out1 += din3 * w20 \n" // q13 += d10 * w00 - "pld [%[din_ptr2]] @ preload data\n" - "pld [%[din_ptr3]] @ preload data\n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store\n" - - "vmull.s8 q13, d31, d10 @ out1 += din3 * w22 \n" // q12 += d10 * w00 - - "vst1.32 {d18-d19}, [%[dout_ptr1]]! @ store\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vst1.32 {d20-d21}, [%[dout_ptr2]]! @ store\n" - "cmp %[cnt], #1 \n" - "vst1.32 {d22-d23}, [%[dout_ptr2]]! 
@ store\n" - "blt 1f \n" - - // mid - "2: \n" - "vld1.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - // out0 - "vdup.32 q8, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q9, %[bias] @ and \n" // q9 = - // vbias - // out1 - "vdup.32 q10, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q11, %[bias] @ and \n" // q9 = - // vbias - - // r0 - "vmull.s8 q12, d12, d2 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 12345678 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 23456789 - - "vld1.8 {d12-d13}, [%[din_ptr1]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.8 {d14-d15}, [%[din_ptr2]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - - "vmlal.s8 q12, d30, d3 @ out0 += din0 * w00 \n" // q12 += d10 * w00 - - "add %[din_ptr0], #8 @add \n" - "add %[din_ptr1], #8 @add \n" - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d4 @ out0 += din0 * w02 \n" // q12 += d11 * w02 - - // r1 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 12345678 - "vmull.s8 q13, d12, d2 @ out1 = din1 * w01 \n" // q13 = d12 * w01 - - "vmlal.s8 q12, d12, d5 @ out0 = din1 * w11 \n" // q12 = d12 * w11 - - "vld1.8 {d12-d13}, [%[din_ptr3]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - - "vmlal.s8 q13, d30, d3 @ out1 += din1 * w00 \n" // q12 += d10 * w00 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d30, d6 @ out0 += din1 * w10 \n" // q12 += d10 * w00 - - "add %[din_ptr2], #8 @add \n" - "add %[din_ptr3], #8 @add \n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d4 @ out1 += din1 * w02 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d31, d7 @ out0 += din1 * w12 \n" // q12 += d10 * w00 - - // r2 - "vext.8 d30, d14, d15, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d15, #2 @ ext \n" // d11 = 12345678 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d14, d5 @ out1 = din2 * w11 \n" // q13 = d12 * w01 - "vmull.s8 q12, d14, d8 @ out1 = din2 * w21 \n" // q13 = d12 * w01 - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d30, d6 @ out1 += din2 * w10 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d30, d9 @ out0 += din2 * w20 \n" // q12 += d10 * w00 - - "vmlal.s8 q13, d31, d7 @ out1 += din2 * w12 \n" // q12 += d10 * w00 - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d10 @ out0 += din2 * w22 \n" // q12 += d10 * w00 - - // r3 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 12345678 - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q13, d12, d8 @ out1 = din3 * w21 \n" // q13 = d12 * w01 - "pld [%[din_ptr0]] @ preload 
data\n" - "pld [%[din_ptr1]] @ preload data\n" - - "vmlal.s8 q13, d30, d9 @ out1 += din3 * w20 \n" // q13 += d10 * w00 - "pld [%[din_ptr2]] @ preload data\n" - "pld [%[din_ptr3]] @ preload data\n" - - "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d10 @ out1 += din3 * w22 \n" // q12 += d10 * w00 - - "vst1.32 {d18-d19}, [%[dout_ptr1]]! @ store\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vst1.32 {d20-d21}, [%[dout_ptr2]]! @ store\n" - "subs %[cnt], #1 \n" - "vst1.32 {d22-d23}, [%[dout_ptr2]]! @ store\n" - "bne 2b \n" - // right - "1: \n" - "vld1.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.8 {d28-d29}, [%[mask]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - // out0 - "vdup.32 q8, %[bias] @ and \n" // q8 = vbias - "vdup.32 q9, %[bias] @ and \n" // q9 = vbias - // out1 - "vdup.32 q10, %[bias] @ and \n" // q8 = vbias - "vdup.32 q11, %[bias] @ and \n" // q9 = vbias - - "vbif.8 d12, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d13, d11, d29 @ bit select, deal with right pad\n" - "vld1.8 {d14-d15}, [%[din_ptr1]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - - // r0 - "vmull.s8 q12, d12, d2 @ out0 = din0 * w00 \n" // q12 = d12 * w01 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 12345678 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 23456789 - - "vld1.8 {d12-d13}, [%[din_ptr2]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vbif.8 d14, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d15, d11, d29 @ bit select, deal with right pad\n" - - "vmlal.s8 q12, d30, d3 @ out0 += din0 * w01 \n" // q12 += d10 * w00 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d4 @ out0 += din0 * w02 \n" // q12 += d11 * w02 - - // r1 - "vext.8 d30, d14, d15, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d15, #2 @ ext \n" // d11 = 12345678 - - "vmull.s8 q13, d14, d2 @ out1 = din1 * w00 \n" // q13 = d12 * w01 - - "vmlal.s8 q12, d14, d5 @ out0 = din1 * w10 \n" // q12 = d12 * w11 - - "vld1.8 {d14-d15}, [%[din_ptr3]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vbif.8 d12, d11, d28 @ bit select, deal with " - "right pad\n" - "vbif.8 d13, d11, d29 @ bit select, deal with " - "right pad\n" - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d30, d3 @ out1 += din1 * w01 \n" // q12 += d10 * w00 - "vmull.s8 q12, d30, d6 @ out0 += din1 * w11 \n" // q12 += d10 * w00 - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d4 @ out1 += din1 * w02 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d31, d7 @ out0 += din1 * w12 \n" // q12 += d10 * w00 - - // r2 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 12345678 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d12, d5 @ out1 = din2 * w10 \n" // q13 = d12 * w01 - "vmull.s8 q12, d12, d8 @ out1 = din2 * w20 \n" // q13 = d12 * w01 - - "vbif.8 d14, d11, 
d28 @ bit select, deal with " - "right pad\n" - "vbif.8 d15, d11, d29 @ bit select, deal with " - "right pad\n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d30, d6 @ out1 += din2 * w10 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d30, d9 @ out0 += din2 * w20 \n" // q12 += d10 * w00 - - "vld1.32 {d28-d29}, [%[dout_ptr1]]! @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - "vld1.32 {d12-d13}, [%[dout_ptr1]] @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - "vld1.32 {d2-d3}, [%[rs_mask]]! @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.32 {d4-d5}, [%[rs_mask]] @ load din00= 0 1 2 3 4 5 6 7 8 " - "9\n" - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d31, d7 @ out1 += din2 * w12 \n" // q12 += d10 * w00 - "vmull.s8 q12, d31, d10 @ out0 += din2 * w22 \n" // q12 += d10 * w00 - - // r3 - "vext.8 d30, d14, d15, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d15, #2 @ ext \n" // d11 = 12345678 - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q13, d14, d8 @ out1 = din3 * w20 \n" // q13 = d12 * w01 - "sub %[dout_ptr1], #16 @ sub \n" - "vld1.32 {d14-d15}, [%[dout_ptr2]]! @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - "vld1.32 {d24-d25}, [%[dout_ptr2]] @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - - "vmlal.s8 q13, d30, d9 @ out1 += din3 * w21 \n" // q13 += d10 * w00 - "vbif q8, q14, q1 @ bit select, deal with right " - "pad\n" - "vbif q9, q6, q2 @ bit select, deal with right " - "pad\n" - "sub %[dout_ptr2], #16 @ sub \n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d10 @ out1 += din3 * w22 \n" // q12 += d10 * w00 - - "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store\n" - "vst1.32 {d18-d19}, [%[dout_ptr1]]! @ store\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vbif q10, q7, q1 @ bit select, deal with right pad\n" - "vbif q11, q12, q2 @ bit select, deal with right pad\n" - - "vst1.32 {d20-d21}, [%[dout_ptr2]]! @ store\n" - "vst1.32 {d22-d23}, [%[dout_ptr2]]! @ store\n" - - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [cnt] "+r"(cnt), - [bias] "+r"(bias_val), - [rs_mask] "+r"(rst_mask) - : [mask] "r"(vmask) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - dout_ptr += 2 * w_out; - } - } - } -} - -// w_in <= 8 -void conv_depthwise_3x3s1p1_bias_s_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - // printf("3x3s1 mult height \n"); - const char zero[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - //! 
for 4x6 convolution window - const unsigned char right_pad_idx[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - const unsigned int right_pad_rst[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - - // printf("conv3x3_dw start \n"); - signed char* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(signed char)); - int* write_ptr = - reinterpret_cast(ctx->workspace_data()) + w_in; - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_h = (h_out + 1) >> 1; - - unsigned int size_pad_right = (unsigned int)(w_in); - - uint8x8_t vmask_rp = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx)); - // uint8x8_t vmask_rp2 = vcgt_u8(vdup_n_u8(size_pad_right), - // vld1_u8(right_pad_idx + 8)); - unsigned char vmask[8]; - vst1_u8(vmask, vmask_rp); - - unsigned int rst_remain = (unsigned int)w_out; - uint32x4_t vmask_result1 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst)); - uint32x4_t vmask_result2 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst + 4)); - - unsigned int rmask[8]; - vst1q_u32(rmask, vmask_result1); - vst1q_u32(rmask + 4, vmask_result2); - - int8x8_t vzero = vdup_n_s8(0); - int32x4_t vzero_32 = vdupq_n_s32(0); - - for (int n = 0; n < num; ++n) { - const signed char* din_batch = din + n * ch_in * size_in_channel; - int* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - int* dout_ptr = dout_batch + c * size_out_channel; - - const signed char* din_ch_ptr = din_batch + c * size_in_channel; - - int bias_val = flag_bias ? bias[c] : 0; - - const signed char* wei_ptr = weights + c * w_stride; -#ifdef __aarch64__ - int vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - int8x8_t wr00 = vdup_n_s8(wei_ptr[0]); - int8x8_t wr10 = vdup_n_s8(wei_ptr[3]); - int8x8_t wr20 = vdup_n_s8(wei_ptr[6]); - - int8x8_t wr01 = vdup_n_s8(wei_ptr[1]); - int8x8_t wr11 = vdup_n_s8(wei_ptr[4]); - int8x8_t wr21 = vdup_n_s8(wei_ptr[7]); - - int8x8_t wr02 = vdup_n_s8(wei_ptr[2]); - int8x8_t wr12 = vdup_n_s8(wei_ptr[5]); - int8x8_t wr22 = vdup_n_s8(wei_ptr[8]); -#endif - int* doutr0 = nullptr; - int* doutr1 = nullptr; - - const signed char* dr0 = din_ch_ptr; - const signed char* dr1 = dr0 + w_in; - const signed char* dr2 = dr1 + w_in; - const signed char* dr3 = dr2 + w_in; - - const signed char* din_ptr0 = nullptr; - const signed char* din_ptr1 = nullptr; - const signed char* din_ptr2 = nullptr; - const signed char* din_ptr3 = nullptr; - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - unsigned int* rst_mask = rmask; - - int out_buf1[8]; - int out_buf2[8]; - int trash_buf[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din_ptr1 = zero_ptr; - case 2: - din_ptr2 = zero_ptr; - case 1: - din_ptr3 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 2 > h_out) { - doutr1 = trash_buf; - } -#ifdef __aarch64__ - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - // left - "ld1 {v4.8b}, [%[vmask]] \n" - "ld1 {v0.8b}, [%[din_ptr0]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v1.8b}, [%[din_ptr1]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v2.8b}, [%[din_ptr2]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v3.8b}, [%[din_ptr3]], #8 \n" /* load - a00-a015 - to - q0*/ - - "bif v0.8b, v21.8b, v4.8b \n" - "bif v1.8b, v21.8b, v4.8b \n" - "bif v2.8b, v21.8b, v4.8b \n" - "bif v3.8b, v21.8b, v4.8b \n" - - "ext v6.8b, v21.8b, v0.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v7.8b, v0.8b, v21.8B, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "ld1 {v10.4s}, [%[vbias]] \n" - "ld1 {v11.4s}, [%[vbias]] \n" - - // r0 - "smull v18.8h, %[v1].8b, v0.8b \n" /* outr00 = 01234567 * w01 - */ - - "ext v8.8b, v21.8b, v1.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v9.8b, v1.8b, v21.8B, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "smlal v18.8h, %[v0].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v12.4s}, [%[vbias]] \n" - "ld1 {v13.4s}, [%[vbias]] \n" - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v2].8b, v7.8b \n" /* outr00 = 01234567 * w00 - */ - - "ext v6.8b, v21.8b, v2.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v7.8b, v2.8b, v21.8B, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - // r1 - "smull v19.8h, %[v1].8b, v1.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v4].8b, v1.8b \n" /* outr00 = 01234567 * w00 - */ - - // "ld1 {v14.4s}, [%[rmask]], #16 \n" - // "ld1 {v15.4s}, [%[rmask]] \n" - - "smlal v19.8h, %[v0].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v3].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - // "ld1 {v16.4s}, [%[ptr_out0]], #16 \n" - // "ld1 {v17.4s}, [%[ptr_out1]], #16 \n" - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v2].8b, v9.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v5].8b, v9.8b \n" /* outr00 = 01234567 * w00 - */ - - "ext v8.8b, v21.8b, v3.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v9.8b, v3.8b, v21.8B, #1 \n" // vext_s8(vinr0, vinr0_1, - // 1); 12345678 - - // "ld1 {v0.4s}, [%[ptr_out0]] \n" - // "ld1 {v1.4s}, [%[ptr_out1]] \n" - - // r2 - "smlal v19.8h, %[v4].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v7].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - - // "sub %[ptr_out0], %[ptr_out0], #16 \n" - // "sub %[ptr_out1], %[ptr_out1], #16 \n" - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v3].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v6].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "smlal v19.8h, %[v5].8b, v7.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, 
v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v8].8b, v7.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r3 - "smull v19.8h, %[v7].8b, v3.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smlal v19.8h, %[v6].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - // "bif v10.16b, v16.16b, v14.16b \n" - // "bif v11.16b, v0.16b, v15.16b \n" - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v8].8b, v9.8b \n" /* outr00 = 01234567 * w00 - */ - - "stp q10, q11, [%[ptr_out0]] \n" /* store q10, q11 -> ptr_out */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // "bif v12.16b, v17.16b, v14.16b \n" - // "bif v13.16b, v1.16b, v15.16b \n" - - "stp q12, q13, [%[ptr_out1]] \n" /* store q10, q11 -> ptr_out */ - - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [rmask] "+r"(rst_mask) - : [v0] "w"(wr00), - [v1] "w"(wr01), - [v2] "w"(wr02), - [v3] "w"(wr10), - [vbias] "r"(vbias), - [v4] "w"(wr11), - [v5] "w"(wr12), - [v6] "w"(wr20), - [v7] "w"(wr21), - [v8] "w"(wr22), - [vmask] "r"(vmask), - [ptr_out0] "r"(out_buf1), - [ptr_out1] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); -#else - // store weights - asm volatile("vld1.8 {d0-d1}, [%[wei_ptr]] \n" - : - : [wei_ptr] "r"(wei_ptr) - : "memory"); - asm volatile( - // left - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - "pld [%[din_ptr3]] @ preload data\n" - "vld1.8 {d28}, [%[mask]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - "vld1.8 {d12}, [%[din_ptr0]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - "vld1.8 {d13}, [%[din_ptr1]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - "vdup.s8 d2, d0[0] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d3, d0[1] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d4, d0[2] @ d4 = w02, w02, w02, w02\n" - - "vmov.u32 d11, #0 @ zero\n" - // out0 - "vdup.32 q8, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q9, %[bias] @ and \n" // q9 = - // vbias - - "vbif.8 d12, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d13, d11, d28 @ bit select, deal with right pad\n" - "vld1.8 {d14}, [%[din_ptr2]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - "vld1.8 {d15}, [%[din_ptr3]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - // out1 - "vdup.32 q10, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q11, %[bias] @ and \n" // q9 = - // vbias - - // r0 - "vmull.s8 q12, d12, d3 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vext.8 d30, d11, d12, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d11, #1 @ ext \n" // d11 = 12345678 - - "vdup.s8 d5, d0[3] @ d5 = w10, w10, w00, w00\n" - "vdup.s8 d6, d0[4] @ d6 = w11, w11, w01, w01\n" - - "vmlal.s8 q12, d30, d2 @ out0 += din0 * w00 \n" // q12 += d10 * w00 - - "vdup.s8 d7, d0[5] @ d7 = w12, w12\n" - "vbif.8 d14, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d15, d11, d28 @ bit select, deal with right pad\n" - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 
@addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d4 @ out0 += din0 * w02 \n" // q12 += d11 * w02 - - // r1 - "vext.8 d30, d11, d13, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d13, d11, #1 @ ext \n" // d11 = 12345678 - "vmull.s8 q13, d13, d3 @ out1 = din1 * w01 \n" // q13 = d12 * w01 - - "vmlal.s8 q12, d13, d6 @ out0 = din1 * w11 \n" // q12 = d12 * w11 - - "vdup.s8 d8, d0[6] @ d8 = w20, w00, w00, w00\n" - "vdup.s8 d9, d0[7] @ d9 = w21, w01, w01, w01\n" - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d30, d2 @ out1 += din1 * w00 \n" // q12 += d10 * w00 - "vmull.s8 q12, d30, d5 @ out0 += din1 * w10 \n" // q12 += d10 * w00 - - "vdup.s8 d10, d1[0] @ d10 = w22, w02, w02, w02\n" - // "vld1.32 {d28-d29}, [%[dout_ptr1]]! @ load din00= 0 1 2 3 4 5 - // 6 7 8 9\n" "vld1.32 {d12-d13}, [%[dout_ptr1]] @ load din00= 0 - // 1 2 3 4 5 6 7 8 9\n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d4 @ out1 += din1 * w02 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d31, d7 @ out0 += din1 * w12 \n" // q12 += d10 * w00 - - // r2 - "vext.8 d30, d11, d14, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d11, #1 @ ext \n" // d11 = 12345678 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d14, d6 @ out1 = din2 * w11 \n" // q13 = d12 * w01 - "vmull.s8 q12, d14, d9 @ out1 = din2 * w21 \n" // q13 = d12 * w01 - - // "sub %[dout_ptr1], #16 @ sub \n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d30, d5 @ out1 += din2 * w10 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d30, d8 @ out0 += din2 * w20 \n" // q12 += d10 * w00 - - // "vld1.32 {d2-d3}, [%[rs_mask]]! @ load din00= 0 1 2 3 4 5 6 7 - // 8 9\n" "vld1.32 {d4-d5}, [%[rs_mask]] @ load din00= 0 1 2 3 4 - // 5 6 7 8 9\n" - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d31, d7 @ out1 += din2 * w12 \n" // q12 += d10 * w00 - "vmull.s8 q12, d31, d10 @ out0 += din2 * w22 \n" // q12 += d10 * w00 - - // r3 - "vext.8 d30, d11, d15, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d15, d11, #1 @ ext \n" // d11 = 12345678 - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q13, d15, d9 @ out1 = din3 * w21 \n" // q13 = d12 * w01 - - // "vld1.32 {d6-d7}, [%[dout_ptr2]]! 
@ load din00= 0 1 2 3 4 5 6 - // 7 8 9\n" "vld1.32 {d14-d15}, [%[dout_ptr2]] @ load din00= 0 1 - // 2 3 4 5 6 7 8 9\n" - - "vmlal.s8 q13, d30, d8 @ out1 += din3 * w20 \n" // q13 += d10 * w00 - - // "vbif q8, q14, q1 @ bit select, deal with right - // pad\n" "vbif q9, q6, q2 @ bit select, deal - // with right pad\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d10 @ out1 += din3 * w22 \n" // q12 += d10 * w00 - - // "sub %[dout_ptr2], #16 @ sub \n" - - "vst1.32 {d16-d19}, [%[dout_ptr1]] @ store\n" - // "vst1.32 {d18-d19}, [%[dout_ptr1]]! @ store\n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // "vbif q10, q3, q1 @ bit select, deal with right - // pad\n" "vbif q11, q7, q2 @ bit select, deal - // with right pad\n" - - "vst1.32 {d20-d23}, [%[dout_ptr2]] @ store\n" - // "vst1.32 {d22-d23}, [%[dout_ptr2]]! @ store\n" - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [bias] "+r"(bias_val), - [rs_mask] "+r"(rst_mask) - : [mask] "r"(vmask), - [dout_ptr1] "r"(out_buf1), - [dout_ptr2] "r"(out_buf2) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - dout_ptr += 2 * w_out; - } - } - } -} - -// 4line w_in > 16 -void conv_depthwise_3x3s2p1_bias_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - // printf("3x3s2 mult height \n"); - //! pad is done implicit - const char zero[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - //! 
for 4x6 convolution window - const unsigned char right_pad_idx[16] = { - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; - const unsigned int right_pad_rst[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - - // printf("conv3x3_dw start \n"); - signed char* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(signed char)); - int* write_ptr = - reinterpret_cast(ctx->workspace_data()) + w_out; - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 15) >> 4; - int cnt_col = tile_w - 2; - - unsigned int size_pad_right = (unsigned int)(w_in - 15 - (cnt_col << 4)); - if (size_pad_right == 17) { - size_pad_right = 0; - cnt_col++; - } - - uint8x8_t vmask_rp1 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx)); - uint8x8_t vmask_rp2 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx + 8)); - unsigned int rst_remain = (unsigned int)(w_out - ((cnt_col + 1) << 3)); - uint32x4_t vmask_result1 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst)); - uint32x4_t vmask_result2 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst + 4)); - - uint8x16_t vmask_rp = - vcgtq_u8(vdupq_n_u8(size_pad_right), vld1q_u8(right_pad_idx)); - unsigned char vmask[16]; - vst1q_u8(vmask, vmask_rp); - - unsigned int rmask[8]; - vst1q_u32(rmask, vmask_result1); - vst1q_u32(rmask + 4, vmask_result2); - - int8x8_t vzero = vdup_n_s8(0); - // printf("cnt_col: %d, rst_remain: %d, size_pad_right: %d\n", cnt_col, - // rst_remain, size_pad_right); - for (int n = 0; n < num; ++n) { - const signed char* din_batch = din + n * ch_in * size_in_channel; - int* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - int* dout_ptr = dout_batch + c * size_out_channel; - - const signed char* din_ch_ptr = din_batch + c * size_in_channel; - - int bias_val = flag_bias ? bias[c] : 0; - - const signed char* wei_ptr = weights + c * w_stride; -#ifdef __aarch64__ - int vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - int8x8_t wr00 = vdup_n_s8(wei_ptr[0]); - int8x8_t wr10 = vdup_n_s8(wei_ptr[3]); - int8x8_t wr20 = vdup_n_s8(wei_ptr[6]); - - int8x8_t wr01 = vdup_n_s8(wei_ptr[1]); - int8x8_t wr11 = vdup_n_s8(wei_ptr[4]); - int8x8_t wr21 = vdup_n_s8(wei_ptr[7]); - - int8x8_t wr02 = vdup_n_s8(wei_ptr[2]); - int8x8_t wr12 = vdup_n_s8(wei_ptr[5]); - int8x8_t wr22 = vdup_n_s8(wei_ptr[8]); -#endif - - int* doutr0 = nullptr; - - const signed char* dr0 = din_ch_ptr; - const signed char* dr1 = dr0 + w_in; - const signed char* dr2 = dr1 + w_in; - - const signed char* din_ptr0 = nullptr; - const signed char* din_ptr1 = nullptr; - const signed char* din_ptr2 = nullptr; - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - - doutr0 = dout_ptr; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - //! 
process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din_ptr1 = zero_ptr; - case 1: - din_ptr2 = zero_ptr; - default: - break; - } - } -#ifdef __aarch64__ - int cnt = cnt_col; - unsigned char* val_mask = vmask; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "movi v10.4s, #0x0\n" - // left - "ld2 {v0.8b - v1.8b}, [%[din_ptr0]] \n" /*load a00-a015 - to q0*/ - "ld2 {v2.8b - v3.8b}, [%[din_ptr1]] \n" /* load a00-a015 - to q0*/ - "ld2 {v4.8b - v5.8b}, [%[din_ptr2]] \n" /*load a00-a015 - to q0*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "ext v6.8b, v10.8b, v1.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - "ext v7.8b, v10.8b, v3.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - "ext v8.8b, v10.8b, v5.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - - // r0 - "smull v14.8h, %[v1].8b, v0.8b \n" /* outr00 = 02468 * w01 */ - "smull v15.8h, %[v2].8b, v1.8b\n" /* outr00 += 13579 * w02 */ - "smull v16.8h, %[v0].8b, v6.8b\n" /* outr00 += 013579 * w00 */ - - "add %[din_ptr0], %[din_ptr0], #15 \n" - "add %[din_ptr1], %[din_ptr1], #15 \n" - "add %[din_ptr2], %[din_ptr2], #15 \n" - - // r1 - "smlal v14.8h, %[v4].8b, v2.8b \n" /* outr00 = 02468 * w01 */ - "smlal v15.8h, %[v5].8b, v3.8b\n" /* outr00 += 13579 * w02 */ - "smlal v16.8h, %[v3].8b, v7.8b\n" /* outr00 += 013579 * w00 */ - - "saddw v12.4s, v12.4s, v14.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v14.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v15.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v15.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v16.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v16.8h \n" /* v11 += outr00.high*/ - - // r2 - "smull v14.8h, %[v7].8b, v4.8b \n" /* outr00 = 02468 * w01 */ - "smull v15.8h, %[v8].8b, v5.8b\n" /* outr00 += 13579 * w02 */ - "smull v16.8h, %[v6].8b, v8.8b\n" /* outr00 += 013579 * w00 */ - - "ld2 {v0.8b - v1.8b}, [%[din_ptr0]], #16 \n" /*load - a00-a015 - to q0*/ - "ld2 {v2.8b - v3.8b}, [%[din_ptr1]], #16 \n" /* load - a00-a015 - to q0*/ - "ld2 {v4.8b - v5.8b}, [%[din_ptr2]], #16 \n" /*load - a00-a015 - to q0*/ - - "saddw v12.4s, v12.4s, v14.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v14.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v15.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v15.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v16.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v16.8h \n" /* v11 += outr00.high*/ - - "stp q12, q13, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "cmp %[cnt], #1 \n" - "blt 3f \n" - // mid - "1: \n" - "ld1 {v6.8b}, [%[din_ptr0]] \n" /*load a00-a015 to q0*/ - "ld1 {v7.8b}, [%[din_ptr1]] \n" /*load a00-a015 to q0*/ - "ld1 {v8.8b}, [%[din_ptr2]] \n" /*load a00-a015 to q0*/ - - "ext v9.8b, v0.8b, v6.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 246810 */ - "ext v11.8b, v2.8b, v7.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 246810 */ - "ext v14.8b, v4.8b, v8.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 246810 */ - - // r0 - "smull v6.8h, %[v0].8b, v0.8b \n" /* outr00 = 02468 * w00 */ - "smull v7.8h, %[v1].8b, v1.8b\n" /* outr00 += 13579 * w01 */ - "smull v8.8h, %[v2].8b, v9.8b\n" /* outr00 += 246810 * w02 */ - - // r1 - "smlal v6.8h, %[v3].8b, v2.8b \n" /* outr00 = 
02468 * w00 */ - "smlal v7.8h, %[v4].8b, v3.8b\n" /* outr00 += 13579 * w01 */ - "smlal v8.8h, %[v5].8b, v11.8b\n" /* outr00 += 246810 * w02 */ - - "saddw v12.4s, v12.4s, v6.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v6.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v7.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v7.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v8.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v8.8h \n" /* v11 += outr00.high*/ - - // r2 - "smull v6.8h, %[v6].8b, v4.8b \n" /* outr00 = 02468 * w00 */ - "smull v7.8h, %[v7].8b, v5.8b\n" /* outr00 += 13579 * w01 */ - "smull v8.8h, %[v8].8b, v14.8b\n" /* outr00 += 246810 * w02 */ - - "ld2 {v0.8b - v1.8b}, [%[din_ptr0]], #16 \n" /*load - a00-a015 - to q0*/ - "ld2 {v2.8b - v3.8b}, [%[din_ptr1]], #16 \n" /* load - a00-a015 - to q0*/ - "ld2 {v4.8b - v5.8b}, [%[din_ptr2]], #16 \n" /*load - a00-a015 - to q0*/ - - "saddw v12.4s, v12.4s, v6.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v6.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v7.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v7.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v8.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v8.8h \n" /* v11 += outr00.high*/ - - "subs %[cnt], %[cnt], #1 \n" - - "stp q12, q13, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "bne 1b \n" - // right - "3: \n" - "ld1 {v14.8b}, [%[vmask]], #8 \n" - "ld1 {v15.8b}, [%[vmask]] \n" - - "bif v0.8b, v10.8b, v14.8b \n" - "bif v1.8b, v10.8b, v15.8b \n" - "bif v2.8b, v10.8b, v14.8b \n" - "bif v3.8b, v10.8b, v15.8b \n" - "bif v4.8b, v10.8b, v14.8b \n" - "bif v5.8b, v10.8b, v15.8b \n" - - "ext v6.8b, v0.8b, v10.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 2468.. */ - "ext v7.8b, v2.8b, v10.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 2468..*/ - "ext v8.8b, v4.8b, v10.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 2468.. 
*/ - - // r0 - "smull v14.8h, %[v0].8b, v0.8b \n" /* outr00 = 02468 * w00 */ - "smull v15.8h, %[v1].8b, v1.8b\n" /* outr00 += 13579 * w01 */ - "smull v16.8h, %[v2].8b, v6.8b\n" /* outr00 += 246810 * w02 */ - - // r1 - "smlal v14.8h, %[v3].8b, v2.8b \n" /* outr00 = 02468 * w00 */ - "smlal v15.8h, %[v4].8b, v3.8b\n" /* outr00 += 13579 * w01 */ - "smlal v16.8h, %[v5].8b, v7.8b\n" /* outr00 += 246810 * w02 */ - - "saddw v12.4s, v12.4s, v14.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v14.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v15.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v15.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v16.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v16.8h \n" /* v11 += outr00.high*/ - - // r2 - "smull v14.8h, %[v6].8b, v4.8b \n" /* outr00 = 02468 * w00 */ - "smull v15.8h, %[v7].8b, v5.8b\n" /* outr00 += 13579 * w01 */ - "smull v16.8h, %[v8].8b, v8.8b\n" /* outr00 += 246810 * w02 */ - - "ldp q0, q1, [%[ptr_out0]] \n" /* dup v10, bias */ - "ldp q9, q11, [%[rst_mask]] \n" /* dup v10, bias */ - - "saddw v12.4s, v12.4s, v14.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v14.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v15.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v15.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v16.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v16.8h \n" /* v11 += outr00.high*/ - - "bif v12.16b, v0.16b, v9.16b \n" - "bif v13.16b, v1.16b, v11.16b \n" - - "stp q12, q13, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [ptr_out0] "+r"(doutr0), - [vmask] "+r"(val_mask) - : [v0] "w"(wr00), - [v1] "w"(wr01), - [v2] "w"(wr02), - [v3] "w"(wr10), - [bias_val] "r"(vbias), - [v4] "w"(wr11), - [v5] "w"(wr12), - [v6] "w"(wr20), - [v7] "w"(wr21), - [v8] "w"(wr22), - [rst_mask] "r"(rmask) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); -#else - unsigned int* rst_mask = rmask; - int cnt = cnt_col; - // prefetch input - // store weights - asm volatile("vld1.8 {d0-d1}, [%[wei_ptr]] \n" - : - : [wei_ptr] "r"(wei_ptr) - : "memory"); - asm volatile( - // left - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - "vdup.s8 d2, d0[0] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d3, d0[1] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d4, d0[2] @ d4 = w02, w02, w02, w02\n" - "vld2.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 2 4 6 8\n" // d10 = 0 2 4 6 - "vld2.8 {d14-d15}, [%[din_ptr1]] @ load din00= 0 2 4 6 8\n" // d12 = 0 2 4 6 - "vld2.8 {d16-d17}, [%[din_ptr2]] @ load din00= 0 2 4 6 8\n" // d14 = 0 2 4 6 - "vmov.u32 d11, #0 @ zero\n" - - "vdup.s8 d5, d0[3] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d6, d0[4] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d7, d0[5] @ d4 = w02, w02, w02, w02\n" - - "vext.8 d18, d11, d13, #7 @ ext \n" // d16 = -1 1 3 5 - "vext.8 d19, d11, d15, #7 @ ext \n" // d17 = -1 1 3 5 - "vext.8 d20, d11, d17, #7 @ ext \n" // d18 = -1 1 3 5 - - // r0 - "vmull.s8 q13, d12, d3 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vmull.s8 q14, d13, d4 @ out1 = din0 * w02 \n" // q12 = d12 * w02 - "vmull.s8 q15, d18, d2 @ out2 = din0 * w00 \n" // q12 = d12 * w02 - - "vdup.s8 d8, d0[6] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d9, d0[7] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d10, 
d1[0] @ d4 = w02, w02, w02, w02\n" - - // out0 - "vdup.32 q11, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q12, %[bias] @ and \n" // q9 = - // vbias - - // r1 - "vmlal.s8 q13, d14, d6 @ out0 += din1 * w11 \n" // q12 = d12 * w11 - "vmlal.s8 q14, d15, d7 @ out1 += din1 * w12 \n" // q12 = d12 * w11 - "vmlal.s8 q15, d19, d5 @ out2 += din1 * w10 \n" // q12 = d12 * w11 - - "add %[din_ptr0], #15 @add \n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "add %[din_ptr1], #15 @add \n" - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - "add %[din_ptr2], #15 @add \n" - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // r2 - "vmull.s8 q13, d16, d9 @ out0 += din1 * w21 \n" // q12 = d12 * w11 - "vmull.s8 q14, d17, d10 @ out1 += din1 * w22 \n" // q12 = d12 * w11 - "vmull.s8 q15, d20, d8 @ out2 += din1 * w20 \n" // q12 = d12 * w11 - - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vst1.32 {d22-d23}, [%[dout_ptr1]]! @ store\n" - "cmp %[cnt], #1 \n" - "vst1.32 {d24-d25}, [%[dout_ptr1]]! @ store\n" - "blt 1f \n" - - // mid - "2: \n" - "vld2.8 {d12-d13}, [%[din_ptr0]]! @ load din00= 0 2 4 6 8\n" // d10 = 0 2 4 6 - "vld2.8 {d14-d15}, [%[din_ptr1]]! @ load din00= 0 2 4 6 8\n" // d12 = 0 2 4 6 - "vld2.8 {d16-d17}, [%[din_ptr2]]! 
@ load din00= 0 2 4 6 8\n" // d14 = 0 2 4 6 - - "vld1.8 {d21}, [%[din_ptr0]] @ load din00= 16 17\n" // d10 = 0 2 - // 4 6 - "vld1.8 {d22}, [%[din_ptr1]] @ load din00= 16 17\n" // d12 = 0 2 - // 4 6 - "vld1.8 {d23}, [%[din_ptr2]] @ load din00= 16 17\n" // d14 = 0 2 - // 4 6 - - "vext.8 d18, d12, d21, #1 @ ext din00 = 2 4 6 8\n" // d16 = 2 - // 4 6 8 - "vext.8 d19, d14, d22, #1 @ ext \n" // d17 = 2 4 6 8 - "vext.8 d20, d16, d23, #1 @ ext \n" // d18 = 2 4 6 8 - - // r0 - "vmull.s8 q13, d12, d2 @ out0 = din0 * w00 \n" // q12 = 0 2 4 6 - "vmull.s8 q14, d13, d3 @ out1 = din0 * w01 \n" // q12 = 1 3 5 7 - "vmull.s8 q15, d18, d4 @ out2 = din0 * w02 \n" // q12 = 2 4 6 8 - - // out0 - "vdup.32 q11, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q12, %[bias] @ and \n" // q9 = - // vbias - - // r1 - "vmlal.s8 q13, d14, d5 @ out0 += din1 * w10 \n" // q12 = 0 2 4 6 - "vmlal.s8 q14, d15, d6 @ out1 += din1 * w11 \n" // q12 = 1 3 5 7 - "vmlal.s8 q15, d19, d7 @ out2 += din1 * w12 \n" // q12 = 2 4 6 8 - - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // r2 - "vmull.s8 q13, d16, d8 @ out0 += din1 * w20 \n" // q12 = 0 2 4 6 - "vmull.s8 q14, d17, d9 @ out1 += din1 * w21 \n" // q12 = 1 3 5 7 - "vmull.s8 q15, d20, d10 @ out2 += din1 * w22 \n" // q12 = 2 4 6 8 - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vst1.32 {d22-d23}, [%[dout_ptr1]]! @ store\n" - - "subs %[cnt], #1 \n" - "vst1.32 {d24-d25}, [%[dout_ptr1]]! @ store\n" - "bne 2b \n" - // right - "1: \n" - "cmp %[size_pad_right], #1 \n" - "blt 3f \n" - "vld2.8 {d12-d13}, [%[din_ptr0]]! @ load din00= 0 2 4 6 8\n" // d10 = 0 2 4 6 - "vld2.8 {d14-d15}, [%[din_ptr1]]! @ load din00= 0 2 4 6 8\n" // d12 = 0 2 4 6 - "vld2.8 {d16-d17}, [%[din_ptr2]]! 
@ load din00= 0 2 4 6 8\n" // d14 = 0 2 4 6 - "vld1.8 {d28-d29}, [%[mask]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - - // out0 - "vdup.32 q11, %[bias] @ and \n" // q8 = vbias - "vdup.32 q12, %[bias] @ and \n" // q9 = vbias - - "vbif.8 d12, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d13, d11, d29 @ bit select, deal with right pad\n" - - "vbif.8 d14, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d15, d11, d29 @ bit select, deal with right pad\n" - - "vbif.8 d16, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d17, d11, d29 @ bit select, deal with right pad\n" - - "vext.8 d18, d12, d11, #1 @ ext din00 = 2 4 6 8\n" // d16 = -1 - // 1 3 5 - "vext.8 d19, d14, d11, #1 @ ext \n" // d17 = -1 1 3 5 - "vext.8 d20, d16, d11, #1 @ ext \n" // d18 = -1 1 3 5 - - // r0 - "vmull.s8 q13, d12, d2 @ out0 = din0 * w00 \n" // q12 = 0 2 4 6 - "vmull.s8 q14, d13, d3 @ out1 = din0 * w01 \n" // q12 = 1 3 5 7 - "vmull.s8 q15, d18, d4 @ out2 = din0 * w02 \n" // q12 = 2 4 6 8 - - // r1 - "vmlal.s8 q13, d14, d5 @ out0 += din1 * w11 \n" // q12 = 0 2 4 6 - "vmlal.s8 q14, d15, d6 @ out1 += din1 * w12 \n" // q12 = 1 3 5 7 - "vmlal.s8 q15, d19, d7 @ out2 += din1 * w10 \n" // q12 = 2 4 6 8 - - "vld1.32 {d12-d13}, [%[dout_ptr1]]! @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - "vld1.32 {d14-d15}, [%[dout_ptr1]] @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "sub %[dout_ptr1], #16 @ sub \n" - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // r2 - "vmull.s8 q13, d16, d8 @ out0 += din1 * w11 \n" // q12 = 0 2 4 6 - "vmull.s8 q14, d17, d9 @ out1 += din1 * w12 \n" // q12 = 1 3 5 7 - "vmull.s8 q15, d20, d10 @ out2 += din1 * w10 \n" // q12 = 2 4 6 8 - - "vld1.32 {d2-d3}, [%[rs_mask]]! @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.32 {d4-d5}, [%[rs_mask]] @ load din00= 0 1 2 3 4 5 6 7 8 " - "9\n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vbif q11, q6, q1 @ bit select, deal with right pad\n" - "vbif q12, q7, q2 @ bit select, deal with right pad\n" - - "vst1.32 {d22-d23}, [%[dout_ptr1]]! @ store\n" - "vst1.32 {d24-d25}, [%[dout_ptr1]]! 
@ store\n" - "3: \n" - - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [dout_ptr1] "+r"(doutr0), - [cnt] "+r"(cnt), - [bias] "+r"(bias_val), - [rs_mask] "+r"(rst_mask) - : [mask] "r"(vmask), [size_pad_right] "r"(size_pad_right) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - dout_ptr += w_out; - } - } - } -} -// w_in <= 16 -void conv_depthwise_3x3s2p1_bias_s_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - // printf("3x3s2 mult height \n"); - //! pad is done implicit - // const char zero[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - //! for 4x6 convolution window - const unsigned char right_pad_idx[16] = { - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; - const unsigned int right_pad_rst[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - - // printf("conv3x3_dw start \n"); - signed char* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(signed char)); - int* write_ptr = - reinterpret_cast(ctx->workspace_data()) + w_out; - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - unsigned int size_pad_right = (unsigned int)(w_in); - - uint8x8_t vmask_rp1 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx)); - uint8x8_t vmask_rp2 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx + 8)); - unsigned int rst_remain = (unsigned int)w_out; - uint32x4_t vmask_result1 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst)); - uint32x4_t vmask_result2 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst + 4)); - - uint8x16_t vmask_rp = - vcgtq_u8(vdupq_n_u8(size_pad_right), vld1q_u8(right_pad_idx)); - unsigned char vmask[16]; - vst1q_u8(vmask, vmask_rp); - - unsigned int rmask[8]; - vst1q_u32(rmask, vmask_result1); - vst1q_u32(rmask + 4, vmask_result2); - - int8x8_t vzero = vdup_n_s8(0); - for (int n = 0; n < num; ++n) { - const signed char* din_batch = din + n * ch_in * size_in_channel; - int* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - int* dout_ptr = dout_batch + c * size_out_channel; - - const signed char* din_ch_ptr = din_batch + c * size_in_channel; - - int bias_val = flag_bias ? bias[c] : 0; - - const signed char* wei_ptr = weights + c * w_stride; -#ifdef __aarch64__ - int vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - int8x8_t wr00 = vdup_n_s8(wei_ptr[0]); - int8x8_t wr10 = vdup_n_s8(wei_ptr[3]); - int8x8_t wr20 = vdup_n_s8(wei_ptr[6]); - - int8x8_t wr01 = vdup_n_s8(wei_ptr[1]); - int8x8_t wr11 = vdup_n_s8(wei_ptr[4]); - int8x8_t wr21 = vdup_n_s8(wei_ptr[7]); - - int8x8_t wr02 = vdup_n_s8(wei_ptr[2]); - int8x8_t wr12 = vdup_n_s8(wei_ptr[5]); - int8x8_t wr22 = vdup_n_s8(wei_ptr[8]); -#endif - int* doutr0 = nullptr; - - const signed char* dr0 = din_ch_ptr; - const signed char* dr1 = dr0 + w_in; - const signed char* dr2 = dr1 + w_in; - - const signed char* din_ptr0 = nullptr; - const signed char* din_ptr1 = nullptr; - const signed char* din_ptr2 = nullptr; - - for (int i = 0; i < h_in; i += 2) { - //! 
process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - - doutr0 = dout_ptr; - int out_buf1[8]; - - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr2 + w_in; - dr2 = dr1 + w_in; - } - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din_ptr1 = zero_ptr; - case 1: - din_ptr2 = zero_ptr; - default: - break; - } - } -#ifdef __aarch64__ - unsigned int* rst_mask = rmask; - unsigned char* val_mask = vmask; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "movi v16.4s, #0x0\n" - // left - "ld1 {v10.8b}, [%[vmask]], #8 \n" - "ld1 {v11.8b}, [%[vmask]] \n" - "ld2 {v0.8b - v1.8b}, [%[din_ptr0]] \n" /*load a00-a015 - to q0*/ - "ld2 {v2.8b - v3.8b}, [%[din_ptr1]] \n" /* load a00-a015 - to q0*/ - "ld2 {v4.8b - v5.8b}, [%[din_ptr2]] \n" /*load a00-a015 - to q0*/ - - "bif v0.8b, v16.8b, v10.8b \n" - "bif v1.8b, v16.8b, v11.8b \n" - "bif v2.8b, v16.8b, v10.8b \n" - "bif v3.8b, v16.8b, v11.8b \n" - "bif v4.8b, v16.8b, v10.8b \n" - "bif v5.8b, v16.8b, v11.8b \n" - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "ext v6.8b, v16.8b, v1.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - "ext v7.8b, v16.8b, v3.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - "ext v8.8b, v16.8b, v5.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - - // r0 - "smull v17.8h, %[v1].8b, v0.8b \n" /* outr00 = 02468 * w01 */ - "smull v18.8h, %[v2].8b, v1.8b\n" /* outr00 += 13579 * w02 */ - "smull v19.8h, %[v0].8b, v6.8b\n" /* outr00 += 013579 * w00 */ - - // "ldp q0, q1, [%[ptr_out0]] \n" /* dup v10, - // bias */ "ldp q10, q11, [%[rst_mask]] \n" /* - // dup v10, bias */ - - // r1 - "smlal v17.8h, %[v4].8b, v2.8b \n" /* outr00 = 02468 * w01 */ - "smlal v18.8h, %[v5].8b, v3.8b\n" /* outr00 += 13579 * w02 */ - "smlal v19.8h, %[v3].8b, v7.8b\n" /* outr00 += 013579 * w00 */ - - "saddw v12.4s, v12.4s, v17.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v17.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v18.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r2 - "smull v17.8h, %[v7].8b, v4.8b \n" /* outr00 = 02468 * w01 */ - "smull v18.8h, %[v8].8b, v5.8b\n" /* outr00 += 13579 * w02 */ - "smull v19.8h, %[v6].8b, v8.8b\n" /* outr00 += 013579 * w00 */ - - "saddw v12.4s, v12.4s, v17.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v17.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v18.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // "bif v12.16b, v0.16b, v10.16b \n" - // "bif v13.16b, v1.16b, v11.16b \n" - - "stp q12, q13, [%[ptr_out0]] \n" /* store q10, q11 -> ptr_out - */ - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [vmask] "+r"(val_mask) - : [v0] "w"(wr00), - [v1] "w"(wr01), - [v2] "w"(wr02), - [v3] "w"(wr10), - [bias_val] "r"(vbias), - [v4] "w"(wr11), - [v5] "w"(wr12), - [v6] "w"(wr20), - [v7] "w"(wr21), - [v8] "w"(wr22), - [rst_mask] "r"(rmask), - [ptr_out0] "r"(out_buf1) - : "cc", - "memory", - "v0", - "v1", - 
"v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - unsigned int* rst_mask = rmask; - // prefetch input - // store weights - asm volatile("vld1.8 {d0-d1}, [%[wei_ptr]] \n" - : - : [wei_ptr] "r"(wei_ptr) - : "memory"); - asm volatile( - // left - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - "vdup.s8 d2, d0[0] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d3, d0[1] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d4, d0[2] @ d4 = w02, w02, w02, w02\n" - "vld2.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 2 4 6 8\n" // d10 = 0 2 4 6 - "vld2.8 {d14-d15}, [%[din_ptr1]] @ load din00= 0 2 4 6 8\n" // d12 = 0 2 4 6 - "vld2.8 {d16-d17}, [%[din_ptr2]] @ load din00= 0 2 4 6 8\n" // d14 = 0 2 4 6 - "vld1.8 {d28-d29}, [%[mask]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vmov.u32 d11, #0 @ zero\n" - - "vdup.s8 d5, d0[3] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d6, d0[4] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d7, d0[5] @ d4 = w02, w02, w02, w02\n" - - "vbif.8 d12, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d13, d11, d29 @ bit select, deal with right pad\n" - - "vbif.8 d14, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d15, d11, d29 @ bit select, deal with right pad\n" - - "vbif.8 d16, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d17, d11, d29 @ bit select, deal with right pad\n" - - "vext.8 d18, d11, d13, #7 @ ext \n" // d16 = -1 1 3 5 - "vext.8 d19, d11, d15, #7 @ ext \n" // d17 = -1 1 3 5 - "vext.8 d20, d11, d17, #7 @ ext \n" // d18 = -1 1 3 5 - - // "pld [%[dout_ptr1]] @ preload data\n" - - // r0 - "vmull.s8 q13, d12, d3 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vmull.s8 q14, d13, d4 @ out1 = din0 * w02 \n" // q12 = d12 * w02 - "vmull.s8 q15, d18, d2 @ out2 = din0 * w00 \n" // q12 = d12 * w02 - - "vdup.s8 d8, d0[6] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d9, d0[7] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d10, d1[0] @ d4 = w02, w02, w02, w02\n" - - // out0 - "vdup.32 q11, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q12, %[bias] @ and \n" // q9 = - // vbias - - // r1 - "vmlal.s8 q13, d14, d6 @ out0 += din1 * w11 \n" // q12 = d12 * w11 - "vmlal.s8 q14, d15, d7 @ out1 += din1 * w12 \n" // q12 = d12 * w11 - "vmlal.s8 q15, d19, d5 @ out2 += din1 * w10 \n" // q12 = d12 * w11 - - // "vld1.32 {d12-d13}, [%[dout_ptr1]]! @ load din00= 0 1 2 3 4 5 - // 6 7 8 9\n" "vld1.32 {d14-d15}, [%[dout_ptr1]] @ load din00= 0 - // 1 2 3 4 5 6 7 8 9\n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // r2 - "vmull.s8 q13, d16, d9 @ out0 += din1 * w21 \n" // q12 = d12 * w11 - "vmull.s8 q14, d17, d10 @ out1 += din1 * w22 \n" // q12 = d12 * w11 - "vmull.s8 q15, d20, d8 @ out2 += din1 * w20 \n" // q12 = d12 * w11 - - // "vld1.32 {d2-d3}, [%[rs_mask]]! 
@ load din00= 0 1 2 3 4 5 6 7 - // 8 9\n" "vld1.32 {d4-d5}, [%[rs_mask]] @ load din00= 0 1 2 3 4 - // 5 6 7 8 9\n" - - // "sub %[dout_ptr1], #16 @ sub \n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // "vbif q11, q6, q1 @ bit select, deal with right pad\n" - // "vbif q12, q7, q2 @ bit select, deal with right pad\n" - - "vst1.32 {d22-d25}, [%[dout_ptr1]] @ store\n" - // "vst1.32 {d24-d25}, [%[dout_ptr1]]! @ store\n" - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [bias] "+r"(bias_val), - [rs_mask] "+r"(rst_mask) - : [mask] "r"(vmask), - [size_pad_right] "r"(size_pad_right), - [dout_ptr1] "r"(out_buf1) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - } - dout_ptr += w_out; - } - } - } -} - -// relu -void conv_depthwise_3x3s1p1_bias_relu_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - // printf("3x3s1 mult height \n"); - //! pad is done implicit - const char zero[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - //! for 4x6 convolution window - const unsigned char right_pad_idx[16] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - const unsigned int right_pad_rst[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - - // printf("conv3x3_dw start \n"); - signed char* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(signed char)); - int* write_ptr = - reinterpret_cast(ctx->workspace_data()) + w_in; - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 7) >> 3; - int tile_h = (h_out + 1) >> 1; - int cnt_col = tile_w - 2; - - unsigned int size_pad_right = (unsigned int)(w_in - 7 - (cnt_col << 3)); - - int size_pad_bottom = h_out % 2; - - uint8x8_t vmask_rp1 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx)); - uint8x8_t vmask_rp2 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx + 8)); - unsigned int rst_remain = (unsigned int)(w_out - ((cnt_col + 1) << 3)); - uint32x4_t vmask_result1 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst)); - uint32x4_t vmask_result2 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst + 4)); - - int8x8_t vzero = vdup_n_s8(0); - int32x4_t vzero_32 = vdupq_n_s32(0); - - uint8x16_t vmask_rp = - vcgtq_u8(vdupq_n_u8(size_pad_right), vld1q_u8(right_pad_idx)); - // uint8x8_t vmask_rp2 = vcgt_u8(vdup_n_u8(size_pad_right), - // vld1_u8(right_pad_idx + 8)); - unsigned char vmask[16]; - vst1q_u8(vmask, vmask_rp); - - unsigned int rmask[8]; - vst1q_u32(rmask, vmask_result1); - vst1q_u32(rmask + 4, vmask_result2); - - for (int n = 0; n < num; ++n) { - const signed char* din_batch = din + n * ch_in * size_in_channel; - int* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - int* dout_ptr = 
dout_batch + c * size_out_channel; - - const signed char* din_ch_ptr = din_batch + c * size_in_channel; - - int bias_val = flag_bias ? bias[c] : 0; - - const signed char* wei_ptr = weights + c * w_stride; -#ifdef __aarch64__ - int vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - int8x8_t wr00 = vdup_n_s8(wei_ptr[0]); - int8x8_t wr10 = vdup_n_s8(wei_ptr[3]); - int8x8_t wr20 = vdup_n_s8(wei_ptr[6]); - - int8x8_t wr01 = vdup_n_s8(wei_ptr[1]); - int8x8_t wr11 = vdup_n_s8(wei_ptr[4]); - int8x8_t wr21 = vdup_n_s8(wei_ptr[7]); - - int8x8_t wr02 = vdup_n_s8(wei_ptr[2]); - int8x8_t wr12 = vdup_n_s8(wei_ptr[5]); - int8x8_t wr22 = vdup_n_s8(wei_ptr[8]); -#endif - - int* doutr0 = nullptr; - int* doutr1 = nullptr; - - const signed char* dr0 = din_ch_ptr; - const signed char* dr1 = dr0 + w_in; - const signed char* dr2 = dr1 + w_in; - const signed char* dr3 = dr2 + w_in; - - const signed char* din_ptr0 = nullptr; - const signed char* din_ptr1 = nullptr; - const signed char* din_ptr2 = nullptr; - const signed char* din_ptr3 = nullptr; - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - unsigned int* rst_mask = rmask; - unsigned char* val_mask = vmask; - - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din_ptr1 = zero_ptr; - case 2: - din_ptr2 = zero_ptr; - case 1: - din_ptr3 = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; -#ifdef __aarch64__ - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - // left - "ld1 {v0.8b}, [%[din_ptr0]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v2.8b}, [%[din_ptr1]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v1.8b}, [%[din_ptr0]] \n" /* load - a00-a015 to - q0*/ - "ld1 {v3.8b}, [%[din_ptr1]] \n" /* load - a00-a015 to - q0*/ - - "ld1 {v10.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v11.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - // r0 - "smull v18.8h, %[v1].8b, v0.8b \n" /* outr00 = 01234567 * w01 - */ - - "ext v4.8b, v21.8b, v0.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v5.8b, v0.8b, v1.8B, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "ld1 {v6.8b}, [%[din_ptr2]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v8.8b}, [%[din_ptr3]], #8 \n" /* load - a00-a015 - to - q0*/ - - "smlal v18.8h, %[v0].8b, v4.8b\n" /* outr00 += 00123456 * w00 */ - - "ld1 {v7.8b}, [%[din_ptr2]] \n" /* load - a00-a015 - to q0*/ - "ld1 {v9.8b}, [%[din_ptr3]] \n" /* load - a00-a015 - to q0*/ - - "sub %[din_ptr0], %[din_ptr0], #1 \n" - "sub %[din_ptr1], %[din_ptr1], #1 \n" - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v2].8b, v5.8b\n" /* outr00 += 12345678 * w02 */ - - "ext v4.8b, v21.8b, v2.8b, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v5.8b, v2.8b, v3.8b, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 
12345678 */ - - // r1 - "sub %[din_ptr2], %[din_ptr2], #1 \n" - "sub %[din_ptr3], %[din_ptr3], #1 \n" - - "smull v19.8h, %[v1].8b, v2.8b \n" /* outr10 += 01234567 * w11 - */ - "smlal v18.8h, %[v4].8b, v2.8b \n" /* outr00 += 01234567 * w11 - */ - - "ext v14.8b, v21.8b, v6.8b, #7 \n" /* vext_s8(vzero, vinr0, - 7); 00123456 */ - "ext v15.8b, v6.8b, v7.8b, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "smlal v19.8h, %[v0].8b, v4.8b \n" /* outr00 += 01234567 * w11 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - "smull v18.8h, %[v3].8b, v4.8b \n" /* outr00 += 001234567 * w10 - */ - - "ld1 {v0.8b}, [%[din_ptr0]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v2.8b}, [%[din_ptr1]], #8 \n" /* load - a00-a015 - to - q0*/ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v2].8b, v5.8b \n" /* outr00 += 01234567 * w11 - */ - "smlal v18.8h, %[v5].8b, v5.8b \n" /* outr00 += 12345678 * w12 - */ - - // r2 - "ld1 {v1.8b}, [%[din_ptr0]] \n" /* load - a00-a015 to - q0*/ - "ld1 {v3.8b}, [%[din_ptr1]] \n" /* load - a00-a015 to - q0*/ - - "smlal v19.8h, %[v4].8b, v6.8b \n" /* outr10 += 01234567 * w11 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - "smull v18.8h, %[v7].8b, v6.8b \n" /* outr00 += 01234567 * w11 - */ - - "ext v4.8b, v21.8b, v8.8b, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v5.8b, v8.8b, v9.8b, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v3].8b, v14.8b \n" /* outr10 += 01234567 * w11 - */ - "smlal v18.8h, %[v6].8b, v14.8b \n" /* outr00 += 01234567 * w11 - */ - - "ld1 {v6.8b}, [%[din_ptr2]], #8 \n" /* load - a00-a015 - to - q0*/ - - "smlal v19.8h, %[v5].8b, v15.8b \n" /* outr10 += 01234567 * w11 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v8].8b, v15.8b \n" /* outr00 += 01234567 * w11 - */ - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r3 - "smull v19.8h, %[v7].8b, v8.8b \n" /* outr00 += 01234567 * w11 - */ - - "ld1 {v8.8b}, [%[din_ptr3]], #8 \n" /* load - a00-a015 - to - q0*/ - - "ld1 {v7.8b}, [%[din_ptr2]] \n" /* load - a00-a015 to - q0*/ - "ld1 {v9.8b}, [%[din_ptr3]] \n" /* load - a00-a015 to - q0*/ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smlal v19.8h, %[v6].8b, v4.8b \n" /* outr00 += 01234567 * - w11 */ - - "smax v10.4s, v10.4s, v21.4s \n" /* relu*/ - "smax v11.4s, v11.4s, v21.4s \n" /* relu*/ - - "stp q10, q11, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v8].8b, v5.8b \n" /* outr00 += 01234567 * - w11 */ - - "ld1 {v10.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v11.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smax v12.4s, v12.4s, v21.4s \n" /* relu*/ - "smax v13.4s, v13.4s, v21.4s \n" /* relu*/ - - "stp q12, q13, 
[%[ptr_out1]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "cmp %[cnt], #1 \n" - "blt 3f \n" - // mid - "1: \n" - "ext v4.8b, v0.8B, v1.8b, #1 \n" /*12345678 */ - "ext v5.8b, v0.8b, v1.8B, #2 \n" /*23456789 */ - - // r0 - "smull v18.8h, %[v0].8b, v0.8b \n" /* outr00 = 01234567 * w00 - */ - - "ext v14.8b, v2.8B, v3.8b, #1 \n" /*12345678 */ - "ext v15.8b, v2.8b, v3.8B, #2 \n" /*23456789 */ - - "smlal v18.8h, %[v1].8b, v4.8b\n" /* outr00 += 12345678 * w01 */ - - "ext v16.8b, v6.8B, v7.8b, #1 \n" /*12345678 */ - "ext v17.8b, v6.8b, v7.8B, #2 \n" /*23456789 */ - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v2].8b, v5.8b\n" /* outr00 += 23456789 * w02 */ - - // r1 - "ext v4.8b, v8.8B, v9.8b, #1 \n" /*12345678 */ - "ext v5.8b, v8.8b, v9.8B, #2 \n" /*23456789 */ - - "smull v19.8h, %[v0].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v3].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v0.8b}, [%[din_ptr0]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v2.8b}, [%[din_ptr1]], #8 \n" /* load - a00-a015 - to - q0*/ - - "smlal v19.8h, %[v1].8b, v14.8b\n" /* outr00 += 12345678 * w01 */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v4].8b, v14.8b\n" /* outr00 += 12345678 * w01 */ - - "ld1 {v1.8b}, [%[din_ptr0]] \n" /* load - a00-a015 - to q0*/ - "ld1 {v3.8b}, [%[din_ptr1]] \n" /* load - a00-a015 - to q0*/ - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v2].8b, v15.8b\n" /* outr00 += 23456789 * w02 */ - "smlal v18.8h, %[v5].8b, v15.8b\n" /* outr00 += 12345678 * w01 */ - - // r2 - "smlal v19.8h, %[v3].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v6].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v4].8b, v16.8b\n" /* outr00 += 12345678 * w01 */ - "smlal v18.8h, %[v7].8b, v16.8b\n" /* outr00 += 12345678 * w01 */ - - "smlal v19.8h, %[v5].8b, v17.8b\n" /* outr00 += 23456789 * w02 */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v8].8b, v17.8b\n" /* outr00 += 12345678 * w01 */ - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r3 - "smull v19.8h, %[v6].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v6.8b}, [%[din_ptr2]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v8.8b}, [%[din_ptr3]], #8 \n" /* load - a00-a015 - to - q0*/ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smlal v19.8h, %[v7].8b, v4.8b\n" /* outr00 += 12345678 * w01 */ - - "ld1 {v7.8b}, [%[din_ptr2]] \n" /* load - a00-a015 - to q0*/ - "ld1 {v9.8b}, [%[din_ptr3]] \n" /* load - a00-a015 - to q0*/ - - "smax v10.4s, v10.4s, v21.4s \n" /* relu*/ - "smax v11.4s, v11.4s, v21.4s \n" /* relu*/ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += 
outr00.high*/ - - "smull v19.8h, %[v8].8b, v5.8b\n" /* outr00 += 23456789 * w02 */ - - "stp q10, q11, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "ld1 {v10.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v11.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "subs %[cnt], %[cnt], #1 \n" - - "smax v12.4s, v12.4s, v21.4s \n" /* relu*/ - "smax v13.4s, v13.4s, v21.4s \n" /* relu*/ - - "stp q12, q13, [%[ptr_out1]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "bne 1b \n" - // right - "3: \n" - "ld1 {v14.8b}, [%[vmask]], #8 \n" - "ld1 {v15.8b}, [%[vmask]] \n" - - "bif v0.8b, v21.8b, v14.8b \n" - "bif v1.8b, v21.8b, v15.8b \n" - "bif v2.8b, v21.8b, v14.8b \n" - "bif v3.8b, v21.8b, v15.8b \n" - - "ext v4.8b, v0.8b, v1.8b, #1 \n" - "ext v5.8b, v0.8b, v1.8b, #2 \n" - - // r0 - "smull v18.8h, %[v0].8b, v0.8b \n" /* outr00 = 01234567 * w00 - */ - - "ext v16.8b, v2.8b, v3.8b, #1 \n" - "ext v17.8b, v2.8b, v3.8b, #2 \n" - - "bif v6.8b, v21.8b, v14.8b \n" - "bif v7.8b, v21.8b, v15.8b \n" - - "smlal v18.8h, %[v1].8b, v4.8b \n" /* outr00 = 01234567 * w00 - */ - - "bif v8.8b, v21.8b, v14.8b \n" - "bif v9.8b, v21.8b, v15.8b \n" - - "ext v20.8b, v6.8b, v7.8b, #1 \n" - "ext v22.8b, v6.8b, v7.8b, #2 \n" - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v2].8b, v5.8b \n" /* outr00 = 01234567 * w00 - */ - - // r1 - "ext v4.8b, v8.8b, v9.8b, #1 \n" - "ext v5.8b, v8.8b, v9.8b, #2 \n" - - "smull v19.8h, %[v0].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v3].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v14.4s}, [%[rmask]], #16 \n" - "ld1 {v15.4s}, [%[rmask]] \n" - - "smlal v19.8h, %[v1].8b, v16.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - "smull v18.8h, %[v4].8b, v16.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v0.4s}, [%[ptr_out0]], #16 \n" - "ld1 {v2.4s}, [%[ptr_out1]], #16 \n" - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v2].8b, v17.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v5].8b, v17.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v1.4s}, [%[ptr_out0]] \n" - "ld1 {v3.4s}, [%[ptr_out1]] \n" - - // r2 - "smlal v19.8h, %[v3].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - "smull v18.8h, %[v6].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "sub %[ptr_out0], %[ptr_out0], #16 \n" - "sub %[ptr_out1], %[ptr_out1], #16 \n" - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v4].8b, v20.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v7].8b, v20.8b \n" /* outr00 = 01234567 * w00 - */ - - "smlal v19.8h, %[v5].8b, v22.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v8].8b, v22.8b \n" /* outr00 = 01234567 * w00 - */ - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - 
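-          // Accumulation pattern used throughout this kernel: smull/smlal
-          // form int16 products of the int8 inputs and weights, then
-          // saddw/saddw2 fold the low/high int16 halves into the int32
-          // accumulators v10-v13, which were seeded from [bias_val]. A rough
-          // intrinsics sketch of one such step (illustrative only; din, wei,
-          // acc_lo and acc_hi are stand-in names, not variables in this file):
-          //   int16x8_t p = vmull_s8(din, wei);
-          //   acc_lo = vaddw_s16(acc_lo, vget_low_s16(p));
-          //   acc_hi = vaddw_s16(acc_hi, vget_high_s16(p));
-          // In this right-pad block, the bif ops on the 8b inputs apply vmask
-          // to zero input lanes past w_in, while the bif ops on the 16b
-          // results apply rmask to keep the previously stored output values
-          // for lanes past w_out.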
"saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r3 - "smull v19.8h, %[v6].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smlal v19.8h, %[v7].8b, v4.8b \n" /* outr00 = 01234567 * w00 - */ - - "smax v10.4s, v10.4s, v21.4s \n" /* relu*/ - "smax v11.4s, v11.4s, v21.4s \n" /* relu*/ - - "bif v10.16b, v0.16b, v14.16b \n" - "bif v11.16b, v1.16b, v15.16b \n" - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v8].8b, v5.8b \n" /* outr00 = 01234567 * w00 - */ - - "stp q10, q11, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smax v12.4s, v12.4s, v21.4s \n" /* relu*/ - "smax v13.4s, v13.4s, v21.4s \n" /* relu*/ - - "bif v12.16b, v2.16b, v14.16b \n" - "bif v13.16b, v3.16b, v15.16b \n" - - "stp q12, q13, [%[ptr_out1]], #32 \n" /* store q10, q11 -> - ptr_out */ - - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [ptr_out0] "+r"(doutr0), - [ptr_out1] "+r"(doutr1), - [vmask] "+r"(val_mask), - [rmask] "+r"(rst_mask) - : [v0] "w"(wr00), - [v1] "w"(wr01), - [v2] "w"(wr02), - [v3] "w"(wr10), - [bias_val] "r"(vbias), - [v4] "w"(wr11), - [v5] "w"(wr12), - [v6] "w"(wr20), - [v7] "w"(wr21), - [v8] "w"(wr22) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); -#else - // store weights - asm volatile("vld1.8 {d0-d1}, [%[wei_ptr]] \n" - : - : [wei_ptr] "r"(wei_ptr) - : "memory"); - asm volatile( - // left - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - "pld [%[din_ptr3]] @ preload data\n" - "vdup.s8 d2, d0[0] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d3, d0[1] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d4, d0[2] @ d4 = w02, w02, w02, w02\n" - "vld1.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vmov.u32 d11, #0 @ zero\n" - // out0 - "vdup.32 q8, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q9, %[bias] @ and \n" // q9 = - // vbias - // out1 - "vdup.32 q10, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q11, %[bias] @ and \n" // q9 = - // vbias - - // r0 - "vmull.s8 q12, d12, d3 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vext.8 d30, d11, d12, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #1 @ ext \n" // d11 = 12345678 - - "vld1.8 {d12-d13}, [%[din_ptr1]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.8 {d14-d15}, [%[din_ptr2]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vdup.s8 d5, d0[3] @ d5 = w10, w10, w00, w00\n" - "vdup.s8 d6, d0[4] @ d6 = w11, w11, w01, w01\n" - - "vmlal.s8 q12, d30, d2 @ out0 += din0 * w00 \n" // q12 += d10 * w00 - - "vdup.s8 d7, d0[5] @ d7 = w12, w12\n" - "add %[din_ptr0], #7 @add \n" - "add %[din_ptr1], #7 @add \n" - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d4 @ out0 += din0 * w02 \n" // q12 += d11 * w02 - - // r1 - "vext.8 d30, d11, d12, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #1 @ ext \n" // d11 = 12345678 - "vmull.s8 q13, 
d12, d3 @ out1 = din1 * w01 \n" // q13 = d12 * w01 - - "vmlal.s8 q12, d12, d6 @ out0 = din1 * w11 \n" // q12 = d12 * w11 - - "vld1.8 {d12-d13}, [%[din_ptr3]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vdup.s8 d8, d0[6] @ d8 = w20, w00, w00, w00\n" - "vdup.s8 d9, d0[7] @ d9 = w21, w01, w01, w01\n" - "vdup.s8 d10, d1[0] @ d10 = w22, w02, w02, w02\n" - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d30, d2 @ out1 += din1 * w00 \n" // q12 += d10 * w00 - "vmull.s8 q12, d30, d5 @ out0 += din1 * w10 \n" // q12 += d10 * w00 - - "add %[din_ptr2], #7 @add \n" - "add %[din_ptr3], #7 @add \n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d4 @ out1 += din1 * w02 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d31, d7 @ out0 += din1 * w12 \n" // q12 += d10 * w00 - - // r2 - "vext.8 d30, d11, d14, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d15, #1 @ ext \n" // d11 = 12345678 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d14, d6 @ out1 = din2 * w11 \n" // q13 = d12 * w01 - "vmull.s8 q12, d14, d9 @ out1 = din2 * w21 \n" // q13 = d12 * w01 - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d30, d5 @ out1 += din2 * w10 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d30, d8 @ out0 += din2 * w20 \n" // q12 += d10 * w00 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d31, d7 @ out1 += din2 * w12 \n" // q12 += d10 * w00 - "vmull.s8 q12, d31, d10 @ out0 += din2 * w22 \n" // q12 += d10 * w00 - - // r3 - "vext.8 d30, d11, d12, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #1 @ ext \n" // d11 = 12345678 - "vmov.u32 q0, #0 @ mov \n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q13, d12, d9 @ out1 = din3 * w21 \n" // q13 = d12 * w01 - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "vmax.s32 q8, q8, q0 @ max \n" - "vmax.s32 q9, q9, q0 @ max \n" - - "vmlal.s8 q13, d30, d8 @ out1 += din3 * w20 \n" // q13 += d10 * w00 - "pld [%[din_ptr2]] @ preload data\n" - "pld [%[din_ptr3]] @ preload data\n" - - "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d10 @ out1 += din3 * w22 \n" // q12 += d10 * w00 - - "vst1.32 {d18-d19}, [%[dout_ptr1]]! @ store\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmax.s32 q10, q10, q0 @ max \n" - "vmax.s32 q11, q11, q0 @ max \n" - - "vst1.32 {d20-d21}, [%[dout_ptr2]]! @ store\n" - "cmp %[cnt], #1 \n" - "vst1.32 {d22-d23}, [%[dout_ptr2]]! 
@ store\n" - "blt 1f \n" - - // mid - "2: \n" - "vld1.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - // out0 - "vdup.32 q8, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q9, %[bias] @ and \n" // q9 = - // vbias - // out1 - "vdup.32 q10, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q11, %[bias] @ and \n" // q9 = - // vbias - - // r0 - "vmull.s8 q12, d12, d2 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 12345678 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 23456789 - - "vld1.8 {d12-d13}, [%[din_ptr1]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.8 {d14-d15}, [%[din_ptr2]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - - "vmlal.s8 q12, d30, d3 @ out0 += din0 * w00 \n" // q12 += d10 * w00 - - "add %[din_ptr0], #8 @add \n" - "add %[din_ptr1], #8 @add \n" - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d4 @ out0 += din0 * w02 \n" // q12 += d11 * w02 - - // r1 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 12345678 - "vmull.s8 q13, d12, d2 @ out1 = din1 * w01 \n" // q13 = d12 * w01 - - "vmlal.s8 q12, d12, d5 @ out0 = din1 * w11 \n" // q12 = d12 * w11 - - "vld1.8 {d12-d13}, [%[din_ptr3]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - - "vmlal.s8 q13, d30, d3 @ out1 += din1 * w00 \n" // q12 += d10 * w00 - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d30, d6 @ out0 += din1 * w10 \n" // q12 += d10 * w00 - - "add %[din_ptr2], #8 @add \n" - "add %[din_ptr3], #8 @add \n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d4 @ out1 += din1 * w02 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d31, d7 @ out0 += din1 * w12 \n" // q12 += d10 * w00 - - // r2 - "vext.8 d30, d14, d15, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d15, #2 @ ext \n" // d11 = 12345678 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d14, d5 @ out1 = din2 * w11 \n" // q13 = d12 * w01 - "vmull.s8 q12, d14, d8 @ out1 = din2 * w21 \n" // q13 = d12 * w01 - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d30, d6 @ out1 += din2 * w10 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d30, d9 @ out0 += din2 * w20 \n" // q12 += d10 * w00 - - "vmlal.s8 q13, d31, d7 @ out1 += din2 * w12 \n" // q12 += d10 * w00 - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d10 @ out0 += din2 * w22 \n" // q12 += d10 * w00 - - // r3 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 12345678 - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q13, d12, d8 @ out1 = din3 * w21 \n" // q13 = d12 * w01 - "pld [%[din_ptr0]] @ preload data\n" 
- "pld [%[din_ptr1]] @ preload data\n" - "vmax.s32 q8, q8, q0 @ max \n" - "vmax.s32 q9, q9, q0 @ max \n" - - "vmlal.s8 q13, d30, d9 @ out1 += din3 * w20 \n" // q13 += d10 * w00 - "pld [%[din_ptr2]] @ preload data\n" - "pld [%[din_ptr3]] @ preload data\n" - - "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d10 @ out1 += din3 * w22 \n" // q12 += d10 * w00 - - "vst1.32 {d18-d19}, [%[dout_ptr1]]! @ store\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmax.s32 q10, q10, q0 @ max \n" - "vmax.s32 q11, q11, q0 @ max \n" - - "vst1.32 {d20-d21}, [%[dout_ptr2]]! @ store\n" - "subs %[cnt], #1 \n" - "vst1.32 {d22-d23}, [%[dout_ptr2]]! @ store\n" - "bne 2b \n" - // right - "1: \n" - "vld1.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.8 {d28-d29}, [%[mask]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - // out0 - "vdup.32 q8, %[bias] @ and \n" // q8 = vbias - "vdup.32 q9, %[bias] @ and \n" // q9 = vbias - // out1 - "vdup.32 q10, %[bias] @ and \n" // q8 = vbias - "vdup.32 q11, %[bias] @ and \n" // q9 = vbias - - "vbif.8 d12, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d13, d11, d29 @ bit select, deal with right pad\n" - "vld1.8 {d14-d15}, [%[din_ptr1]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - - // r0 - "vmull.s8 q12, d12, d2 @ out0 = din0 * w00 \n" // q12 = d12 * w01 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 12345678 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 23456789 - - "vld1.8 {d12-d13}, [%[din_ptr2]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vbif.8 d14, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d15, d11, d29 @ bit select, deal with right pad\n" - - "vmlal.s8 q12, d30, d3 @ out0 += din0 * w01 \n" // q12 += d10 * w00 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d4 @ out0 += din0 * w02 \n" // q12 += d11 * w02 - - // r1 - "vext.8 d30, d14, d15, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d15, #2 @ ext \n" // d11 = 12345678 - - "vmull.s8 q13, d14, d2 @ out1 = din1 * w00 \n" // q13 = d12 * w01 - - "vmlal.s8 q12, d14, d5 @ out0 = din1 * w10 \n" // q12 = d12 * w11 - - "vld1.8 {d14-d15}, [%[din_ptr3]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vbif.8 d12, d11, d28 @ bit select, deal with " - "right pad\n" - "vbif.8 d13, d11, d29 @ bit select, deal with " - "right pad\n" - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d30, d3 @ out1 += din1 * w01 \n" // q12 += d10 * w00 - "vmull.s8 q12, d30, d6 @ out0 += din1 * w11 \n" // q12 += d10 * w00 - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d4 @ out1 += din1 * w02 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d31, d7 @ out0 += din1 * w12 \n" // q12 += d10 * w00 - - // r2 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 12345678 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d12, d5 @ 
out1 = din2 * w10 \n" // q13 = d12 * w01 - "vmull.s8 q12, d12, d8 @ out1 = din2 * w20 \n" // q13 = d12 * w01 - - "vbif.8 d14, d11, d28 @ bit select, deal with " - "right pad\n" - "vbif.8 d15, d11, d29 @ bit select, deal with " - "right pad\n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d30, d6 @ out1 += din2 * w10 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d30, d9 @ out0 += din2 * w20 \n" // q12 += d10 * w00 - - "vld1.32 {d28-d29}, [%[dout_ptr1]]! @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - "vld1.32 {d12-d13}, [%[dout_ptr1]] @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - "vld1.32 {d2-d3}, [%[rs_mask]]! @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.32 {d4-d5}, [%[rs_mask]] @ load din00= 0 1 2 3 4 5 6 7 8 " - "9\n" - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d31, d7 @ out1 += din2 * w12 \n" // q12 += d10 * w00 - "vmull.s8 q12, d31, d10 @ out0 += din2 * w22 \n" // q12 += d10 * w00 - - // r3 - "vext.8 d30, d14, d15, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d15, #2 @ ext \n" // d11 = 12345678 - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q13, d14, d8 @ out1 = din3 * w20 \n" // q13 = d12 * w01 - "vld1.32 {d14-d15}, [%[dout_ptr2]]! @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - "vld1.32 {d24-d25}, [%[dout_ptr2]] @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - "vmax.s32 q8, q8, q0 @ max \n" - "vmax.s32 q9, q9, q0 @ max \n" - - "vmlal.s8 q13, d30, d9 @ out1 += din3 * w21 \n" // q13 += d10 * w00 - "vbif q8, q14, q1 @ bit select, deal with right " - "pad\n" - "vbif q9, q6, q2 @ bit select, deal with right " - "pad\n" - "sub %[dout_ptr1], #16 @ sub \n" - "sub %[dout_ptr2], #16 @ sub \n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d10 @ out1 += din3 * w22 \n" // q12 += d10 * w00 - - "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store\n" - "vst1.32 {d18-d19}, [%[dout_ptr1]]! @ store\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmax.s32 q10, q10, q0 @ max \n" - "vmax.s32 q11, q11, q0 @ max \n" - - "vbif q10, q7, q1 @ bit select, deal with right pad\n" - "vbif q11, q12, q2 @ bit select, deal with right pad\n" - - "vst1.32 {d20-d21}, [%[dout_ptr2]]! @ store\n" - "vst1.32 {d22-d23}, [%[dout_ptr2]]! 
@ store\n" - - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [cnt] "+r"(cnt), - [bias] "+r"(bias_val), - [rs_mask] "+r"(rst_mask) - : [mask] "r"(vmask) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - dout_ptr += 2 * w_out; - } - } - } -} -// w_in <= 8 -void conv_depthwise_3x3s1p1_bias_s_relu_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const char zero[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - //! for 4x6 convolution window - const unsigned char right_pad_idx[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - const unsigned int right_pad_rst[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - - // printf("conv3x3_dw start \n"); - signed char* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(signed char)); - int* write_ptr = - reinterpret_cast(ctx->workspace_data()) + w_in; - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_h = (h_out + 3) >> 2; - - unsigned int size_pad_right = (unsigned int)(w_in); - - int size_pad_bottom = h_out % 4; - - uint8x8_t vmask_rp = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx)); - unsigned int rst_remain = (unsigned int)w_out; - uint32x4_t vmask_result1 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst)); - uint32x4_t vmask_result2 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst + 4)); - - unsigned char vmask[8]; - vst1_u8(vmask, vmask_rp); - - unsigned int rmask[8]; - vst1q_u32(rmask, vmask_result1); - vst1q_u32(rmask + 4, vmask_result2); - - int8x8_t vzero = vdup_n_s8(0); - int32x4_t vzero_32 = vdupq_n_s32(0); - - for (int n = 0; n < num; ++n) { - const signed char* din_batch = din + n * ch_in * size_in_channel; - int* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - int* dout_ptr = dout_batch + c * size_out_channel; - - const signed char* din_ch_ptr = din_batch + c * size_in_channel; - - int bias_val = flag_bias ? bias[c] : 0; - - const signed char* wei_ptr = weights + c * w_stride; -#ifdef __aarch64__ - int vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - int8x8_t wr00 = vdup_n_s8(wei_ptr[0]); - int8x8_t wr10 = vdup_n_s8(wei_ptr[3]); - int8x8_t wr20 = vdup_n_s8(wei_ptr[6]); - - int8x8_t wr01 = vdup_n_s8(wei_ptr[1]); - int8x8_t wr11 = vdup_n_s8(wei_ptr[4]); - int8x8_t wr21 = vdup_n_s8(wei_ptr[7]); - - int8x8_t wr02 = vdup_n_s8(wei_ptr[2]); - int8x8_t wr12 = vdup_n_s8(wei_ptr[5]); - int8x8_t wr22 = vdup_n_s8(wei_ptr[8]); -#endif - - int* doutr0 = nullptr; - int* doutr1 = nullptr; - - const signed char* dr0 = din_ch_ptr; - const signed char* dr1 = dr0 + w_in; - const signed char* dr2 = dr1 + w_in; - const signed char* dr3 = dr2 + w_in; - - const signed char* din_ptr0 = nullptr; - const signed char* din_ptr1 = nullptr; - const signed char* din_ptr2 = nullptr; - const signed char* din_ptr3 = nullptr; - - for (int i = 0; i < h_in; i += 2) { - //! 
process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - int out_buf1[8]; - int out_buf2[8]; - int trash_buf[8]; - - unsigned int* rst_mask = rmask; - unsigned char* val_mask = vmask; - - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din_ptr1 = zero_ptr; - case 2: - din_ptr2 = zero_ptr; - case 1: - din_ptr3 = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = trash_buf; - } -#ifdef __aarch64__ - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - // left - "ld1 {v4.8b}, [%[vmask]] \n" - "ld1 {v0.8b}, [%[din_ptr0]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v1.8b}, [%[din_ptr1]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v2.8b}, [%[din_ptr2]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v3.8b}, [%[din_ptr3]], #8 \n" /* load - a00-a015 - to - q0*/ - - "bif v0.8b, v21.8b, v4.8b \n" - "bif v1.8b, v21.8b, v4.8b \n" - "bif v2.8b, v21.8b, v4.8b \n" - "bif v3.8b, v21.8b, v4.8b \n" - - "ext v6.8b, v21.8b, v0.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v7.8b, v0.8b, v21.8B, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "ld1 {v10.4s}, [%[vbias]] \n" - "ld1 {v11.4s}, [%[vbias]] \n" - - // r0 - "smull v18.8h, %[v1].8b, v0.8b \n" /* outr00 = 01234567 * w01 - */ - - "ext v8.8b, v21.8b, v1.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v9.8b, v1.8b, v21.8B, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "smlal v18.8h, %[v0].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v12.4s}, [%[vbias]] \n" - "ld1 {v13.4s}, [%[vbias]] \n" - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v2].8b, v7.8b \n" /* outr00 = 01234567 * w00 - */ - - "ext v6.8b, v21.8b, v2.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v7.8b, v2.8b, v21.8B, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - // r1 - "smull v19.8h, %[v1].8b, v1.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v4].8b, v1.8b \n" /* outr00 = 01234567 * w00 - */ - - // "ld1 {v14.4s}, [%[rmask]], #16 \n" - // "ld1 {v15.4s}, [%[rmask]] \n" - - "smlal v19.8h, %[v0].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v3].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - // "ld1 {v16.4s}, [%[ptr_out0]], #16 \n" - // "ld1 {v17.4s}, [%[ptr_out1]], #16 \n" - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v2].8b, v9.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v5].8b, v9.8b \n" /* outr00 = 01234567 * w00 - */ - - "ext v8.8b, v21.8b, v3.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v9.8b, v3.8b, v21.8B, #1 \n" // vext_s8(vinr0, vinr0_1, - // 1); 12345678 - - // "ld1 {v0.4s}, [%[ptr_out0]] \n" - // "ld1 {v1.4s}, [%[ptr_out1]] \n" - - // r2 - "smlal v19.8h, %[v4].8b, v2.8b \n" /* outr00 = 
01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v7].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - - // "sub %[ptr_out0], %[ptr_out0], #16 \n" - // "sub %[ptr_out1], %[ptr_out1], #16 \n" - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v3].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v6].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "smlal v19.8h, %[v5].8b, v7.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v8].8b, v7.8b \n" /* outr00 = 01234567 * w00 - */ - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r3 - "smull v19.8h, %[v7].8b, v3.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smlal v19.8h, %[v6].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - "smax v10.4s, v10.4s, v21.4s \n" /* relu */ - "smax v11.4s, v11.4s, v21.4s \n" /* relu */ - - // "bif v10.16b, v16.16b, v14.16b \n" - // "bif v11.16b, v0.16b, v15.16b \n" - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v8].8b, v9.8b \n" /* outr00 = 01234567 * w00 - */ - - "stp q10, q11, [%[ptr_out0]] \n" /* store q10, q11 -> ptr_out */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smax v12.4s, v12.4s, v21.4s \n" /* relu */ - "smax v13.4s, v13.4s, v21.4s \n" /* relu */ - - // "bif v12.16b, v17.16b, v14.16b \n" - // "bif v13.16b, v1.16b, v15.16b \n" - - "stp q12, q13, [%[ptr_out1]] \n" /* store q10, q11 -> ptr_out */ - - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [rmask] "+r"(rst_mask) - : [v0] "w"(wr00), - [v1] "w"(wr01), - [v2] "w"(wr02), - [v3] "w"(wr10), - [vbias] "r"(vbias), - [v4] "w"(wr11), - [v5] "w"(wr12), - [v6] "w"(wr20), - [v7] "w"(wr21), - [v8] "w"(wr22), - [vmask] "r"(vmask), - [ptr_out0] "r"(out_buf1), - [ptr_out1] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); -#else - // store weights - asm volatile("vld1.8 {d0-d1}, [%[wei_ptr]] \n" - : - : [wei_ptr] "r"(wei_ptr) - : "memory"); - asm volatile( - // left - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - "pld [%[din_ptr3]] @ preload data\n" - "vld1.8 {d28}, [%[mask]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - "vld1.8 {d12}, [%[din_ptr0]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - "vld1.8 {d13}, [%[din_ptr1]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - "vdup.s8 d2, d0[0] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d3, d0[1] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d4, d0[2] @ d4 = w02, w02, w02, w02\n" - - "vmov.u32 d11, #0 @ zero\n" - // out0 - "vdup.32 q8, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q9, %[bias] @ and \n" // q9 = - // vbias - - "vbif.8 d12, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d13, d11, d28 @ bit select, deal 
with right pad\n" - "vld1.8 {d14}, [%[din_ptr2]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - "vld1.8 {d15}, [%[din_ptr3]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - // out1 - "vdup.32 q10, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q11, %[bias] @ and \n" // q9 = - // vbias - - // r0 - "vmull.s8 q12, d12, d3 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vext.8 d30, d11, d12, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d11, #1 @ ext \n" // d11 = 12345678 - - "vdup.s8 d5, d0[3] @ d5 = w10, w10, w00, w00\n" - "vdup.s8 d6, d0[4] @ d6 = w11, w11, w01, w01\n" - - "vmlal.s8 q12, d30, d2 @ out0 += din0 * w00 \n" // q12 += d10 * w00 - - "vdup.s8 d7, d0[5] @ d7 = w12, w12\n" - "vbif.8 d14, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d15, d11, d28 @ bit select, deal with right pad\n" - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d4 @ out0 += din0 * w02 \n" // q12 += d11 * w02 - - // r1 - "vext.8 d30, d11, d13, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d13, d11, #1 @ ext \n" // d11 = 12345678 - "vmull.s8 q13, d13, d3 @ out1 = din1 * w01 \n" // q13 = d12 * w01 - - "vmlal.s8 q12, d13, d6 @ out0 = din1 * w11 \n" // q12 = d12 * w11 - - "vdup.s8 d8, d0[6] @ d8 = w20, w00, w00, w00\n" - "vdup.s8 d9, d0[7] @ d9 = w21, w01, w01, w01\n" - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d30, d2 @ out1 += din1 * w00 \n" // q12 += d10 * w00 - "vmull.s8 q12, d30, d5 @ out0 += din1 * w10 \n" // q12 += d10 * w00 - - "vdup.s8 d10, d1[0] @ d10 = w22, w02, w02, w02\n" - // "vld1.32 {d28-d29}, [%[dout_ptr1]]! @ load din00= 0 1 2 3 4 5 - // 6 7 8 9\n" "vld1.32 {d12-d13}, [%[dout_ptr1]] @ load din00= 0 - // 1 2 3 4 5 6 7 8 9\n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d4 @ out1 += din1 * w02 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d31, d7 @ out0 += din1 * w12 \n" // q12 += d10 * w00 - - // r2 - "vext.8 d30, d11, d14, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d11, #1 @ ext \n" // d11 = 12345678 - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d14, d6 @ out1 = din2 * w11 \n" // q13 = d12 * w01 - "vmull.s8 q12, d14, d9 @ out1 = din2 * w21 \n" // q13 = d12 * w01 - - // "sub %[dout_ptr1], #16 @ sub \n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d30, d5 @ out1 += din2 * w10 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d30, d8 @ out0 += din2 * w20 \n" // q12 += d10 * w00 - - // "vld1.32 {d2-d3}, [%[rs_mask]]! 
@ load din00= 0 1 2 3 4 5 6 7 - // 8 9\n" "vld1.32 {d4-d5}, [%[rs_mask]] @ load din00= 0 1 2 3 4 - // 5 6 7 8 9\n" - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d31, d7 @ out1 += din2 * w12 \n" // q12 += d10 * w00 - "vmull.s8 q12, d31, d10 @ out0 += din2 * w22 \n" // q12 += d10 * w00 - - // r3 - "vext.8 d30, d11, d15, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d15, d11, #1 @ ext \n" // d11 = 12345678 - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q13, d15, d9 @ out1 = din3 * w21 \n" // q13 = d12 * w01 - - "vmov.u32 q0, #0 @ zero\n" - - // "vld1.32 {d6-d7}, [%[dout_ptr2]]! @ load din00= 0 1 2 3 4 5 6 - // 7 8 9\n" "vld1.32 {d14-d15}, [%[dout_ptr2]] @ load din00= 0 1 - // 2 3 4 5 6 7 8 9\n" - - "vmlal.s8 q13, d30, d8 @ out1 += din3 * w20 \n" // q13 += d10 * w00 - - "vmax.s32 q8, q8, q0 @ max \n" - "vmax.s32 q9, q9, q0 @ max \n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d10 @ out1 += din3 * w22 \n" // q12 += d10 * w00 - - // "sub %[dout_ptr2], #16 @ sub \n" - // "vbif q8, q14, q1 @ bit select, deal with right - // pad\n" "vbif q9, q6, q2 @ bit select, deal - // with right pad\n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vst1.32 {d16-d19}, [%[dout_ptr1]] @ store\n" - // "vst1.32 {d18-d19}, [%[dout_ptr1]]! @ store\n" - - "vmax.s32 q10, q10, q0 @ max \n" - "vmax.s32 q11, q11, q0 @ max \n" - - // "vbif q10, q3, q1 @ bit select, deal with right - // pad\n" "vbif q11, q7, q2 @ bit select, deal - // with right pad\n" - - "vst1.32 {d20-d23}, [%[dout_ptr2]] @ store\n" - // "vst1.32 {d22-d23}, [%[dout_ptr2]]! @ store\n" - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [bias] "+r"(bias_val), - [rs_mask] "+r"(rst_mask) - : [mask] "r"(vmask), - [dout_ptr1] "r"(out_buf1), - [dout_ptr2] "r"(out_buf2) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - dout_ptr += 2 * w_out; - } - } - } -} - -// 1 line w_in > 16 -void conv_depthwise_3x3s2p1_bias_relu_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - // printf("3x3s2 mult height \n"); - //! pad is done implicit - //! 
for 4x6 convolution window - const unsigned char right_pad_idx[16] = { - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; - const unsigned int right_pad_rst[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - - // printf("conv3x3_dw start \n"); - signed char* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(signed char)); - int* write_ptr = - reinterpret_cast(ctx->workspace_data()) + w_out; - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 15) >> 4; - int cnt_col = tile_w - 2; - - unsigned int size_pad_right = (unsigned int)(w_in - 15 - (cnt_col << 4)); - if (size_pad_right == 17) { - size_pad_right = 0; - cnt_col++; - } - - uint8x8_t vmask_rp1 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx)); - uint8x8_t vmask_rp2 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx + 8)); - unsigned int rst_remain = (unsigned int)(w_out - ((cnt_col + 1) << 3)); - uint32x4_t vmask_result1 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst)); - uint32x4_t vmask_result2 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst + 4)); - - int8x8_t vzero = vdup_n_s8(0); - int32x4_t vzero_32 = vdupq_n_s32(0); - - uint8x16_t vmask_rp = - vcgtq_u8(vdupq_n_u8(size_pad_right), vld1q_u8(right_pad_idx)); - unsigned char vmask[16]; - vst1q_u8(vmask, vmask_rp); - - unsigned int rmask[8]; - vst1q_u32(rmask, vmask_result1); - vst1q_u32(rmask + 4, vmask_result2); - - for (int n = 0; n < num; ++n) { - const signed char* din_batch = din + n * ch_in * size_in_channel; - int* dout_batch = dout + n * ch_in * size_out_channel; - -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - int* dout_ptr = dout_batch + c * size_out_channel; - - const signed char* din_ch_ptr = din_batch + c * size_in_channel; - - int bias_val = flag_bias ? bias[c] : 0; - - const signed char* wei_ptr = weights + c * w_stride; -#ifdef __aarch64__ - int vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - int8x8_t wr00 = vdup_n_s8(wei_ptr[0]); - int8x8_t wr10 = vdup_n_s8(wei_ptr[3]); - int8x8_t wr20 = vdup_n_s8(wei_ptr[6]); - - int8x8_t wr01 = vdup_n_s8(wei_ptr[1]); - int8x8_t wr11 = vdup_n_s8(wei_ptr[4]); - int8x8_t wr21 = vdup_n_s8(wei_ptr[7]); - - int8x8_t wr02 = vdup_n_s8(wei_ptr[2]); - int8x8_t wr12 = vdup_n_s8(wei_ptr[5]); - int8x8_t wr22 = vdup_n_s8(wei_ptr[8]); -#endif - - int* doutr0 = nullptr; - - const signed char* dr0 = din_ch_ptr; - const signed char* dr1 = dr0 + w_in; - const signed char* dr2 = dr1 + w_in; - - const signed char* din_ptr0 = nullptr; - const signed char* din_ptr1 = nullptr; - const signed char* din_ptr2 = nullptr; - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - - doutr0 = dout_ptr; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - //! 
process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din_ptr1 = zero_ptr; - case 1: - din_ptr2 = zero_ptr; - default: - break; - } - } - int cnt = cnt_col; -#ifdef __aarch64__ - unsigned char* val_mask = vmask; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "movi v10.4s, #0x0\n" - // left - "ld2 {v0.8b - v1.8b}, [%[din_ptr0]] \n" /*load a00-a015 - to q0*/ - "ld2 {v2.8b - v3.8b}, [%[din_ptr1]] \n" /* load a00-a015 - to q0*/ - "ld2 {v4.8b - v5.8b}, [%[din_ptr2]] \n" /*load a00-a015 - to q0*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "ext v6.8b, v10.8b, v1.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - "ext v7.8b, v10.8b, v3.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - "ext v8.8b, v10.8b, v5.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - - // r0 - "smull v14.8h, %[v1].8b, v0.8b \n" /* outr00 = 02468 * w01 */ - "smull v15.8h, %[v2].8b, v1.8b\n" /* outr00 += 13579 * w02 */ - "smull v16.8h, %[v0].8b, v6.8b\n" /* outr00 += 013579 * w00 */ - - "add %[din_ptr0], %[din_ptr0], #15 \n" - "add %[din_ptr1], %[din_ptr1], #15 \n" - "add %[din_ptr2], %[din_ptr2], #15 \n" - - // r1 - "smlal v14.8h, %[v4].8b, v2.8b \n" /* outr00 = 02468 * w01 */ - "smlal v15.8h, %[v5].8b, v3.8b\n" /* outr00 += 13579 * w02 */ - "smlal v16.8h, %[v3].8b, v7.8b\n" /* outr00 += 013579 * w00 */ - - "saddw v12.4s, v12.4s, v14.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v14.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v15.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v15.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v16.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v16.8h \n" /* v11 += outr00.high*/ - - // r2 - "smull v14.8h, %[v7].8b, v4.8b \n" /* outr00 = 02468 * w01 */ - "smull v15.8h, %[v8].8b, v5.8b\n" /* outr00 += 13579 * w02 */ - "smull v16.8h, %[v6].8b, v8.8b\n" /* outr00 += 013579 * w00 */ - - "ld2 {v0.8b - v1.8b}, [%[din_ptr0]], #16 \n" /*load - a00-a015 - to q0*/ - "ld2 {v2.8b - v3.8b}, [%[din_ptr1]], #16 \n" /* load - a00-a015 - to q0*/ - "ld2 {v4.8b - v5.8b}, [%[din_ptr2]], #16 \n" /*load - a00-a015 - to q0*/ - - "saddw v12.4s, v12.4s, v14.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v14.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v15.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v15.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v16.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v16.8h \n" /* v11 += outr00.high*/ - - "smax v12.4s, v12.4s, v10.4s \n" /*relu*/ - "smax v13.4s, v13.4s, v10.4s \n" /*relu*/ - - "stp q12, q13, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "cmp %[cnt], #1 \n" - "blt 3f \n" - // mid - "1: \n" - "ld1 {v6.8b}, [%[din_ptr0]] \n" /*load a00-a015 to q0*/ - "ld1 {v7.8b}, [%[din_ptr1]] \n" /*load a00-a015 to q0*/ - "ld1 {v8.8b}, [%[din_ptr2]] \n" /*load a00-a015 to q0*/ - - "ext v9.8b, v0.8b, v6.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 246810 */ - "ext v11.8b, v2.8b, v7.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 246810 */ - "ext v14.8b, v4.8b, v8.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 246810 */ - - // r0 - "smull v6.8h, %[v0].8b, v0.8b \n" /* outr00 = 02468 * w00 */ - "smull v7.8h, %[v1].8b, v1.8b\n" /* outr00 += 13579 * w01 */ - "smull v8.8h, %[v2].8b, v9.8b\n" 
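-          // Stride-2 layout: the ld2 loads above de-interleave 16 input
-          // bytes into even columns (v0/v2/v4: 0,2,4,...) and odd columns
-          // (v1/v3/v5: 1,3,5,...). Each stride-2 output needs taps at
-          // columns 2k, 2k+1 and 2k+2, i.e. the even vector, the odd vector,
-          // and the even vector shifted one lane (the ext results in
-          // v9/v11/v14), so one 8-lane multiply covers eight outputs per row.
-          // A rough intrinsics sketch (illustrative only; w0/w1/w2 stand for
-          // the broadcast taps of one kernel row):
-          //   int8x8x2_t d  = vld2_s8(din);   // d.val[0]=even, d.val[1]=odd
-          //   int8x8_t  ev2 = vext_s8(d.val[0], vdup_n_s8(din[16]), 1);
-          //   int16x8_t acc = vmull_s8(d.val[0], w0);
-          //   acc = vmlal_s8(acc, d.val[1], w1);
-          //   acc = vmlal_s8(acc, ev2, w2);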
/* outr00 += 246810 * w02 */ - - // r1 - "smlal v6.8h, %[v3].8b, v2.8b \n" /* outr00 = 02468 * w00 */ - "smlal v7.8h, %[v4].8b, v3.8b\n" /* outr00 += 13579 * w01 */ - "smlal v8.8h, %[v5].8b, v11.8b\n" /* outr00 += 246810 * w02 */ - - "saddw v12.4s, v12.4s, v6.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v6.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v7.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v7.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v8.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v8.8h \n" /* v11 += outr00.high*/ - - // r2 - "smull v6.8h, %[v6].8b, v4.8b \n" /* outr00 = 02468 * w00 */ - "smull v7.8h, %[v7].8b, v5.8b\n" /* outr00 += 13579 * w01 */ - "smull v8.8h, %[v8].8b, v14.8b\n" /* outr00 += 246810 * w02 */ - - "ld2 {v0.8b - v1.8b}, [%[din_ptr0]], #16 \n" /*load - a00-a015 - to q0*/ - "ld2 {v2.8b - v3.8b}, [%[din_ptr1]], #16 \n" /* load - a00-a015 - to q0*/ - "ld2 {v4.8b - v5.8b}, [%[din_ptr2]], #16 \n" /*load - a00-a015 - to q0*/ - - "saddw v12.4s, v12.4s, v6.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v6.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v7.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v7.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v8.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v8.8h \n" /* v11 += outr00.high*/ - - "smax v12.4s, v12.4s, v10.4s \n" /*relu*/ - "smax v13.4s, v13.4s, v10.4s \n" /*relu*/ - - "subs %[cnt], %[cnt], #1 \n" - - "stp q12, q13, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "bne 1b \n" - // right - "3: \n" - "ld1 {v14.8b}, [%[vmask]], #8 \n" - "ld1 {v15.8b}, [%[vmask]] \n" - - "bif v0.8b, v10.8b, v14.8b \n" - "bif v1.8b, v10.8b, v15.8b \n" - "bif v2.8b, v10.8b, v14.8b \n" - "bif v3.8b, v10.8b, v15.8b \n" - "bif v4.8b, v10.8b, v14.8b \n" - "bif v5.8b, v10.8b, v15.8b \n" - - "ext v6.8b, v0.8b, v10.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 2468.. */ - "ext v7.8b, v2.8b, v10.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 2468..*/ - "ext v8.8b, v4.8b, v10.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 2468.. 
*/ - - // r0 - "smull v14.8h, %[v0].8b, v0.8b \n" /* outr00 = 02468 * w00 */ - "smull v15.8h, %[v1].8b, v1.8b\n" /* outr00 += 13579 * w01 */ - "smull v16.8h, %[v2].8b, v6.8b\n" /* outr00 += 246810 * w02 */ - - // r1 - "smlal v14.8h, %[v3].8b, v2.8b \n" /* outr00 = 02468 * w00 */ - "smlal v15.8h, %[v4].8b, v3.8b\n" /* outr00 += 13579 * w01 */ - "smlal v16.8h, %[v5].8b, v7.8b\n" /* outr00 += 246810 * w02 */ - - "saddw v12.4s, v12.4s, v14.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v14.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v15.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v15.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v16.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v16.8h \n" /* v11 += outr00.high*/ - - // r2 - "smull v14.8h, %[v6].8b, v4.8b \n" /* outr00 = 02468 * w00 */ - "smull v15.8h, %[v7].8b, v5.8b\n" /* outr00 += 13579 * w01 */ - "smull v16.8h, %[v8].8b, v8.8b\n" /* outr00 += 246810 * w02 */ - - "ldp q0, q1, [%[ptr_out0]] \n" /* dup v10, bias */ - "ldp q9, q11, [%[rst_mask]] \n" /* dup v10, bias */ - - "saddw v12.4s, v12.4s, v14.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v14.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v15.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v15.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v16.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v16.8h \n" /* v11 += outr00.high*/ - - "smax v12.4s, v12.4s, v10.4s \n" /*relu*/ - "smax v13.4s, v13.4s, v10.4s \n" /*relu*/ - - "bif v12.16b, v0.16b, v9.16b \n" - "bif v13.16b, v1.16b, v11.16b \n" - - "stp q12, q13, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [ptr_out0] "+r"(doutr0), - [vmask] "+r"(val_mask) - : [v0] "w"(wr00), - [v1] "w"(wr01), - [v2] "w"(wr02), - [v3] "w"(wr10), - [bias_val] "r"(vbias), - [v4] "w"(wr11), - [v5] "w"(wr12), - [v6] "w"(wr20), - [v7] "w"(wr21), - [v8] "w"(wr22), - [rst_mask] "r"(rmask) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); -#else - unsigned int* rst_mask = rmask; - // prefetch input - // store weights - asm volatile("vld1.8 {d0-d1}, [%[wei_ptr]] \n" - : - : [wei_ptr] "r"(wei_ptr) - : "memory"); - asm volatile( - // left - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - "vdup.s8 d2, d0[0] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d3, d0[1] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d4, d0[2] @ d4 = w02, w02, w02, w02\n" - "vld2.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 2 4 6 8\n" // d10 = 0 2 4 6 - "vld2.8 {d14-d15}, [%[din_ptr1]] @ load din00= 0 2 4 6 8\n" // d12 = 0 2 4 6 - "vld2.8 {d16-d17}, [%[din_ptr2]] @ load din00= 0 2 4 6 8\n" // d14 = 0 2 4 6 - "vmov.u32 d11, #0 @ zero\n" - - "vdup.s8 d5, d0[3] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d6, d0[4] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d7, d0[5] @ d4 = w02, w02, w02, w02\n" - - "vext.8 d18, d11, d13, #7 @ ext \n" // d16 = -1 1 3 5 - "vext.8 d19, d11, d15, #7 @ ext \n" // d17 = -1 1 3 5 - "vext.8 d20, d11, d17, #7 @ ext \n" // d18 = -1 1 3 5 - - // r0 - "vmull.s8 q13, d12, d3 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vmull.s8 q14, d13, d4 @ out1 = din0 * w02 \n" // q12 = d12 * w02 - "vmull.s8 q15, d18, d2 @ out2 = din0 * w00 \n" // q12 = d12 * w02 - - "vdup.s8 d8, d0[6] @ d2 = w00, w00, w00, 
w00\n" - "vdup.s8 d9, d0[7] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d10, d1[0] @ d4 = w02, w02, w02, w02\n" - - // out0 - "vdup.32 q11, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q12, %[bias] @ and \n" // q9 = - // vbias - - // r1 - "vmlal.s8 q13, d14, d6 @ out0 += din1 * w11 \n" // q12 = d12 * w11 - "vmlal.s8 q14, d15, d7 @ out1 += din1 * w12 \n" // q12 = d12 * w11 - "vmlal.s8 q15, d19, d5 @ out2 += din1 * w10 \n" // q12 = d12 * w11 - - "add %[din_ptr0], #15 @add \n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "add %[din_ptr1], #15 @add \n" - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "add %[din_ptr2], #15 @add \n" - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // r2 - "vmull.s8 q13, d16, d9 @ out0 += din1 * w21 \n" // q12 = d12 * w11 - "vmull.s8 q14, d17, d10 @ out1 += din1 * w22 \n" // q12 = d12 * w11 - "vmull.s8 q15, d20, d8 @ out2 += din1 * w20 \n" // q12 = d12 * w11 - - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmov.u32 q8, #0 @ max \n" // max - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmax.s32 q11, q11, q8 @ max\n" - "vmax.s32 q12, q12, q8 @ max\n" - - "vst1.32 {d22-d23}, [%[dout_ptr1]]! @ store\n" - "cmp %[cnt], #1 \n" - "vst1.32 {d24-d25}, [%[dout_ptr1]]! @ store\n" - "blt 1f \n" - - // mid - "2: \n" - "vld2.8 {d12-d13}, [%[din_ptr0]]! @ load din00= 0 2 4 6 8\n" // d10 = 0 2 4 6 - "vld2.8 {d14-d15}, [%[din_ptr1]]! @ load din00= 0 2 4 6 8\n" // d12 = 0 2 4 6 - "vld2.8 {d16-d17}, [%[din_ptr2]]! 
@ load din00= 0 2 4 6 8\n" // d14 = 0 2 4 6 - - "vld1.8 {d21}, [%[din_ptr0]] @ load din00= 16 17\n" // d10 = 0 2 - // 4 6 - "vld1.8 {d22}, [%[din_ptr1]] @ load din00= 16 17\n" // d12 = 0 2 - // 4 6 - "vld1.8 {d23}, [%[din_ptr2]] @ load din00= 16 17\n" // d14 = 0 2 - // 4 6 - - "vext.8 d18, d12, d21, #1 @ ext din00 = 2 4 6 8\n" // d16 = 2 - // 4 6 8 - "vext.8 d19, d14, d22, #1 @ ext \n" // d17 = 2 4 6 8 - "vext.8 d20, d16, d23, #1 @ ext \n" // d18 = 2 4 6 8 - - // r0 - "vmull.s8 q13, d12, d2 @ out0 = din0 * w00 \n" // q12 = 0 2 4 6 - "vmull.s8 q14, d13, d3 @ out1 = din0 * w01 \n" // q12 = 1 3 5 7 - "vmull.s8 q15, d18, d4 @ out2 = din0 * w02 \n" // q12 = 2 4 6 8 - - // out0 - "vdup.32 q11, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q12, %[bias] @ and \n" // q9 = - // vbias - - // r1 - "vmlal.s8 q13, d14, d5 @ out0 += din1 * w10 \n" // q12 = 0 2 4 6 - "vmlal.s8 q14, d15, d6 @ out1 += din1 * w11 \n" // q12 = 1 3 5 7 - "vmlal.s8 q15, d19, d7 @ out2 += din1 * w12 \n" // q12 = 2 4 6 8 - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // r2 - "vmull.s8 q13, d16, d8 @ out0 += din1 * w20 \n" // q12 = 0 2 4 6 - "vmull.s8 q14, d17, d9 @ out1 += din1 * w21 \n" // q12 = 1 3 5 7 - "vmull.s8 q15, d20, d10 @ out2 += din1 * w22 \n" // q12 = 2 4 6 8 - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vmov.u32 q8, #0 @ mov \n" - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - - "vmax.s32 q11, q11, q8 @ max\n" - "vmax.s32 q12, q12, q8 @ max\n" - - "vst1.32 {d22-d23}, [%[dout_ptr1]]! @ store\n" - - "subs %[cnt], #1 \n" - "vst1.32 {d24-d25}, [%[dout_ptr1]]! @ store\n" - "bne 2b \n" - // right - "1: \n" - "cmp %[size_pad_right], #1 \n" - "blt 3f \n" - "vld2.8 {d12-d13}, [%[din_ptr0]]! @ load din00= 0 2 4 6 8\n" // d10 = 0 2 4 6 - "vld2.8 {d14-d15}, [%[din_ptr1]]! @ load din00= 0 2 4 6 8\n" // d12 = 0 2 4 6 - "vld2.8 {d16-d17}, [%[din_ptr2]]! 
@ load din00= 0 2 4 6 8\n" // d14 = 0 2 4 6 - "vld1.8 {d28-d29}, [%[mask]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - - // out0 - "vdup.32 q11, %[bias] @ and \n" // q8 = vbias - "vdup.32 q12, %[bias] @ and \n" // q9 = vbias - - "vbif.8 d12, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d13, d11, d29 @ bit select, deal with right pad\n" - - "vbif.8 d14, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d15, d11, d29 @ bit select, deal with right pad\n" - - "vbif.8 d16, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d17, d11, d29 @ bit select, deal with right pad\n" - - "vext.8 d18, d12, d11, #1 @ ext din00 = 2 4 6 8\n" // d16 = -1 - // 1 3 5 - "vext.8 d19, d14, d11, #1 @ ext \n" // d17 = -1 1 3 5 - "vext.8 d20, d16, d11, #1 @ ext \n" // d18 = -1 1 3 5 - - // r0 - "vmull.s8 q13, d12, d2 @ out0 = din0 * w00 \n" // q12 = 0 2 4 6 - "vmull.s8 q14, d13, d3 @ out1 = din0 * w01 \n" // q12 = 1 3 5 7 - "vmull.s8 q15, d18, d4 @ out2 = din0 * w02 \n" // q12 = 2 4 6 8 - - // r1 - "vmlal.s8 q13, d14, d5 @ out0 += din1 * w11 \n" // q12 = 0 2 4 6 - "vmlal.s8 q14, d15, d6 @ out1 += din1 * w12 \n" // q12 = 1 3 5 7 - "vmlal.s8 q15, d19, d7 @ out2 += din1 * w10 \n" // q12 = 2 4 6 8 - - "vld1.32 {d12-d13}, [%[dout_ptr1]]! @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - "vld1.32 {d14-d15}, [%[dout_ptr1]] @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // r2 - "vmull.s8 q13, d16, d8 @ out0 += din1 * w11 \n" // q12 = 0 2 4 6 - "vmull.s8 q14, d17, d9 @ out1 += din1 * w12 \n" // q12 = 1 3 5 7 - "vmull.s8 q15, d20, d10 @ out2 += din1 * w10 \n" // q12 = 2 4 6 8 - - "vld1.32 {d2-d3}, [%[rs_mask]]! @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.32 {d4-d5}, [%[rs_mask]] @ load din00= 0 1 2 3 4 5 6 7 8 " - "9\n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "sub %[dout_ptr1], #16 @ sub \n" - "vmov.u32 q8, #0 @mov \n" - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmax.s32 q11, q11, q8 @ max\n" - "vmax.s32 q12, q12, q8 @ max\n" - - "vbif q11, q6, q1 @ bit select, deal with right pad\n" - "vbif q12, q7, q2 @ bit select, deal with right pad\n" - - "vst1.32 {d22-d23}, [%[dout_ptr1]]! @ store\n" - "vst1.32 {d24-d25}, [%[dout_ptr1]]! 
@ store\n" - "3: \n" - - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [dout_ptr1] "+r"(doutr0), - [cnt] "+r"(cnt), - [bias] "+r"(bias_val), - [rs_mask] "+r"(rst_mask) - : [mask] "r"(vmask), [size_pad_right] "r"(size_pad_right) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - dout_ptr += w_out; - } - } - } -} -// w_in <= 16 -void conv_depthwise_3x3s2p1_bias_s_relu_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - // printf("3x3s2 mult height \n"); - //! pad is done implicit - // const char zero[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - //! for 4x6 convolution window - const unsigned char right_pad_idx[16] = { - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; - const unsigned int right_pad_rst[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - - // printf("conv3x3_dw start \n"); - signed char* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(signed char)); - int* write_ptr = - reinterpret_cast(ctx->workspace_data()) + w_out; - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - unsigned int size_pad_right = (unsigned int)(w_in); - - uint8x8_t vmask_rp1 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx)); - uint8x8_t vmask_rp2 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx + 8)); - unsigned int rst_remain = (unsigned int)w_out; - uint32x4_t vmask_result1 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst)); - uint32x4_t vmask_result2 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst + 4)); - - uint8x16_t vmask_rp = - vcgtq_u8(vdupq_n_u8(size_pad_right), vld1q_u8(right_pad_idx)); - unsigned char vmask[16]; - vst1q_u8(vmask, vmask_rp); - - unsigned int rmask[8]; - vst1q_u32(rmask, vmask_result1); - vst1q_u32(rmask + 4, vmask_result2); - int8x8_t vzero = vdup_n_s8(0); - int32x4_t vzero_32 = vdupq_n_s32(0); - - for (int n = 0; n < num; ++n) { - const signed char* din_batch = din + n * ch_in * size_in_channel; - int* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - int* dout_ptr = dout_batch + c * size_out_channel; - - const signed char* din_ch_ptr = din_batch + c * size_in_channel; - - int bias_val = flag_bias ? bias[c] : 0; - - const signed char* wei_ptr = weights + c * w_stride; - -#ifdef __aarch64__ - int vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - int8x8_t wr00 = vdup_n_s8(wei_ptr[0]); - int8x8_t wr10 = vdup_n_s8(wei_ptr[3]); - int8x8_t wr20 = vdup_n_s8(wei_ptr[6]); - - int8x8_t wr01 = vdup_n_s8(wei_ptr[1]); - int8x8_t wr11 = vdup_n_s8(wei_ptr[4]); - int8x8_t wr21 = vdup_n_s8(wei_ptr[7]); - - int8x8_t wr02 = vdup_n_s8(wei_ptr[2]); - int8x8_t wr12 = vdup_n_s8(wei_ptr[5]); - int8x8_t wr22 = vdup_n_s8(wei_ptr[8]); -#endif - - int* doutr0 = nullptr; - - const signed char* dr0 = din_ch_ptr; - const signed char* dr1 = dr0 + w_in; - const signed char* dr2 = dr1 + w_in; - - const signed char* din_ptr0 = nullptr; - const signed char* din_ptr1 = nullptr; - const signed char* din_ptr2 = nullptr; - - for (int i = 0; i < h_in; i += 2) { - //! 
process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - - doutr0 = dout_ptr; - - int out_buf1[8]; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr2 + w_in; - dr2 = dr1 + w_in; - } - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din_ptr1 = zero_ptr; - case 1: - din_ptr2 = zero_ptr; - default: - break; - } - } -#ifdef __aarch64__ - unsigned int* rst_mask = rmask; - unsigned char* val_mask = vmask; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "movi v16.4s, #0x0\n" - // left - "ld1 {v10.8b}, [%[vmask]], #8 \n" - "ld1 {v11.8b}, [%[vmask]] \n" - "ld2 {v0.8b - v1.8b}, [%[din_ptr0]] \n" /*load a00-a015 - to q0*/ - "ld2 {v2.8b - v3.8b}, [%[din_ptr1]] \n" /* load a00-a015 - to q0*/ - "ld2 {v4.8b - v5.8b}, [%[din_ptr2]] \n" /*load a00-a015 - to q0*/ - - "bif v0.8b, v16.8b, v10.8b \n" - "bif v1.8b, v16.8b, v11.8b \n" - "bif v2.8b, v16.8b, v10.8b \n" - "bif v3.8b, v16.8b, v11.8b \n" - "bif v4.8b, v16.8b, v10.8b \n" - "bif v5.8b, v16.8b, v11.8b \n" - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "ext v6.8b, v16.8b, v1.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - "ext v7.8b, v16.8b, v3.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - "ext v8.8b, v16.8b, v5.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - - // r0 - "smull v17.8h, %[v1].8b, v0.8b \n" /* outr00 = 02468 * w01 */ - "smull v18.8h, %[v2].8b, v1.8b\n" /* outr00 += 13579 * w02 */ - "smull v19.8h, %[v0].8b, v6.8b\n" /* outr00 += 013579 * w00 */ - - // "ldp q0, q1, [%[ptr_out0]] \n" /* dup v10, - // bias */ "ldp q10, q11, [%[rst_mask]] \n" /* - // dup v10, bias */ - - // r1 - "smlal v17.8h, %[v4].8b, v2.8b \n" /* outr00 = 02468 * w01 */ - "smlal v18.8h, %[v5].8b, v3.8b\n" /* outr00 += 13579 * w02 */ - "smlal v19.8h, %[v3].8b, v7.8b\n" /* outr00 += 013579 * w00 */ - - "saddw v12.4s, v12.4s, v17.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v17.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v18.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r2 - "smull v17.8h, %[v7].8b, v4.8b \n" /* outr00 = 02468 * w01 */ - "smull v18.8h, %[v8].8b, v5.8b\n" /* outr00 += 13579 * w02 */ - "smull v19.8h, %[v6].8b, v8.8b\n" /* outr00 += 013579 * w00 */ - - "saddw v12.4s, v12.4s, v17.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v17.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v18.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smax v12.4s, v12.4s, v16.4s \n" /*relu*/ - "smax v13.4s, v13.4s, v16.4s \n" /*relu*/ - - // "bif v12.16b, v0.16b, v10.16b \n" - // "bif v13.16b, v1.16b, v11.16b \n" - - "stp q12, q13, [%[ptr_out0]] \n" /* store q10, q11 -> ptr_out - */ - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [vmask] "+r"(val_mask) - : [v0] "w"(wr00), - [v1] "w"(wr01), - [v2] "w"(wr02), - [v3] "w"(wr10), - [bias_val] "r"(vbias), - [v4] "w"(wr11), - [v5] "w"(wr12), - [v6] "w"(wr20), - [v7] "w"(wr21), - [v8] "w"(wr22), - 
[rst_mask] "r"(rmask), - [ptr_out0] "r"(out_buf1) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); - -#else - unsigned int* rst_mask = rmask; - // prefetch input - // store weights - asm volatile("vld1.8 {d0-d1}, [%[wei_ptr]] \n" - : - : [wei_ptr] "r"(wei_ptr) - : "memory"); - asm volatile( - // left - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - "vdup.s8 d2, d0[0] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d3, d0[1] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d4, d0[2] @ d4 = w02, w02, w02, w02\n" - "vld2.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 2 4 6 8\n" // d10 = 0 2 4 6 - "vld2.8 {d14-d15}, [%[din_ptr1]] @ load din00= 0 2 4 6 8\n" // d12 = 0 2 4 6 - "vld2.8 {d16-d17}, [%[din_ptr2]] @ load din00= 0 2 4 6 8\n" // d14 = 0 2 4 6 - "vld1.8 {d28-d29}, [%[mask]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vmov.u32 d11, #0 @ zero\n" - - "vdup.s8 d5, d0[3] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d6, d0[4] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d7, d0[5] @ d4 = w02, w02, w02, w02\n" - - "vbif.8 d12, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d13, d11, d29 @ bit select, deal with right pad\n" - - "vbif.8 d14, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d15, d11, d29 @ bit select, deal with right pad\n" - - "vbif.8 d16, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d17, d11, d29 @ bit select, deal with right pad\n" - - "vext.8 d18, d11, d13, #7 @ ext \n" // d16 = -1 1 3 5 - "vext.8 d19, d11, d15, #7 @ ext \n" // d17 = -1 1 3 5 - "vext.8 d20, d11, d17, #7 @ ext \n" // d18 = -1 1 3 5 - - // "pld [%[dout_ptr1]] @ preload data\n" - - // r0 - "vmull.s8 q13, d12, d3 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vmull.s8 q14, d13, d4 @ out1 = din0 * w02 \n" // q12 = d12 * w02 - "vmull.s8 q15, d18, d2 @ out2 = din0 * w00 \n" // q12 = d12 * w02 - - "vdup.s8 d8, d0[6] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d9, d0[7] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d10, d1[0] @ d4 = w02, w02, w02, w02\n" - - // out0 - "vdup.32 q11, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q12, %[bias] @ and \n" // q9 = - // vbias - - // r1 - "vmlal.s8 q13, d14, d6 @ out0 += din1 * w11 \n" // q12 = d12 * w11 - "vmlal.s8 q14, d15, d7 @ out1 += din1 * w12 \n" // q12 = d12 * w11 - "vmlal.s8 q15, d19, d5 @ out2 += din1 * w10 \n" // q12 = d12 * w11 - - // "vld1.32 {d12-d13}, [%[dout_ptr1]]! @ load din00= 0 1 2 3 4 5 - // 6 7 8 9\n" "vld1.32 {d14-d15}, [%[dout_ptr1]] @ load din00= 0 - // 1 2 3 4 5 6 7 8 9\n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // r2 - "vmull.s8 q13, d16, d9 @ out0 += din1 * w21 \n" // q12 = d12 * w11 - "vmull.s8 q14, d17, d10 @ out1 += din1 * w22 \n" // q12 = d12 * w11 - "vmull.s8 q15, d20, d8 @ out2 += din1 * w20 \n" // q12 = d12 * w11 - - // "vld1.32 {d2-d3}, [%[rs_mask]]! 
@ load din00= 0 1 2 3 4 5 6 7 - // 8 9\n" "vld1.32 {d4-d5}, [%[rs_mask]] @ load din00= 0 1 2 3 4 - // 5 6 7 8 9\n" - - // "sub %[dout_ptr1], #16 @ sub \n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vmov.u32 q8, #0 @ mov \n" - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmax.s32 q11, q11, q8 @ max\n" - "vmax.s32 q12, q12, q8 @ max\n" - - // "vbif q11, q6, q1 @ bit select, deal with right pad\n" - // "vbif q12, q7, q2 @ bit select, deal with right pad\n" - - "vst1.32 {d22-d25}, [%[dout_ptr1]] @ store\n" - // "vst1.32 {d24-d25}, [%[dout_ptr1]]! @ store\n" - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [bias] "+r"(bias_val), - [rs_mask] "+r"(rst_mask) - : [mask] "r"(vmask), - [size_pad_right] "r"(size_pad_right), - [dout_ptr1] "r"(out_buf1) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - } - dout_ptr += w_out; - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_3x3p0.cc b/lite/backends/arm/math/conv_depthwise_3x3p0.cc deleted file mode 100644 index ec7f3cfb84..0000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3p0.cc +++ /dev/null @@ -1,4178 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/conv_depthwise.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_3x3s1p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! 
for input width <= 4 -void conv_depthwise_3x3s2p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s1p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3p0(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - if (stride == 1) { - if (flag_relu) { - if (w_in > 5) { - conv_depthwise_3x3s1p0_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p0_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 5) { - conv_depthwise_3x3s1p0_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p0_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } else { //! stride = 2 - if (flag_relu) { - if (w_in > 8) { - conv_depthwise_3x3s2p0_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p0_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 8) { - conv_depthwise_3x3s2p0_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p0_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ -// 4line -void conv_depthwise_3x3s1p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! 
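-  // Recap of the conv_depthwise_3x3p0 dispatch above (an illustrative
-  // restatement, not code from the patch): each of the four stride x relu
-  // combinations comes in a wide and a small-width variant, selected by a
-  // width threshold (w_in > 5 for stride 1, w_in > 8 for stride 2):
-  //
-  //   bool wide = (stride == 1) ? (w_in > 5) : (w_in > 8);
-  //   auto kernel =
-  //       (stride == 1)
-  //           ? (flag_relu ? (wide ? conv_depthwise_3x3s1p0_bias_relu
-  //                                : conv_depthwise_3x3s1p0_bias_s_relu)
-  //                        : (wide ? conv_depthwise_3x3s1p0_bias
-  //                                : conv_depthwise_3x3s1p0_bias_s))
-  //           : (flag_relu ? (wide ? conv_depthwise_3x3s2p0_bias_relu
-  //                                : conv_depthwise_3x3s2p0_bias_s_relu)
-  //                        : (wide ? conv_depthwise_3x3s2p0_bias
-  //                                : conv_depthwise_3x3s2p0_bias_s));
-  //   kernel(dout, din, weights, bias, flag_bias,
-  //          num, ch_in, h_in, w_in, h_out, w_out, ctx);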
for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = w_out >> 2; - int remain = w_out % 4; - - unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); - const int remian_idx[4] = {0, 1, 2, 3}; - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - // wr0 = vsetq_lane_f32(0.f, wr0, 3); - // wr1 = vsetq_lane_f32(0.f, wr1, 3); - // wr2 = vsetq_lane_f32(0.f, wr2, 3); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_out; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 >= h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - case 0: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
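-        // Worked example of the tiling arithmetic above (numbers illustrative):
-        // with w_in = 11 and pad 0, w_out = w_in - 2 = 9, so
-        //   tile_w = 9 >> 2 = 2 full 4-output tiles and remain = 9 % 4 = 1.
-        // Four stride-1 outputs of a 3-wide window read 4 + 3 - 1 = 6 input
-        // columns (hence the "4x6 convolution window"). The last window starts
-        // at input column tile_w * 4 = 8 and wants columns 8..13, but only
-        // 8..10 exist, so
-        //   size_pad_right = 6 + (tile_w << 2) - w_in = 6 + 8 - 11 = 3.
-        // The vcgeq compare against the descending table {5,4,3,2,1,0,0,0}
-        // then leaves exactly the first three lanes enabled, i.e. the three
-        // in-range input columns; everything else is zeroed by the bif selects.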
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = tile_w; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - - // mid - // "cmp %[cnt], #1 \n" - // "blt 5f \n" - "4: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla 
v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r5 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 4b \n" - - // right - "5: \n" - "cmp %[remain], #1 \n" - "blt 0f \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - 
"bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v18.4s}, [%[rmask]] \n" - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 
+= din0_2345 * - w0[2]*/ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - // end - "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_out; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - //! process bottom pad - if (i + 3 >= h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - case 0: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = tile_w; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r3\n" - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! 
@ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "cmp %[remain], #1 @ check whether has " - "mid cols\n" - "blt 0f @ jump to main loop start " - "point\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - "0: \n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! 
end of processing mid rows - } -#endif - } -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2 - */ -// w_in > 7 -void conv_depthwise_3x3s2p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - - int tile_w = w_out >> 2; - int cnt_remain = w_out % 4; - - unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3)); - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - - float32x4_t wbias; - float bias_c = 0.f; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - bias_c = bias[i]; - } else { - wbias = vdupq_n_f32(0.f); - } - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_out; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - dr0 = dr4; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 >= h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - case 0: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = tile_w; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. 
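-            // Equivalent intrinsics view of the ld2 de-interleaving used
-            // throughout this loop (an illustrative sketch; r and the tap
-            // weights are assumed locals, and the r + 8 load overreads on the
-            // final tile, which the real code avoids via the masks above):
-            //   float32x4x2_t v   = vld2q_f32(r);      // v.val[0]={0,2,4,6}, v.val[1]={1,3,5,7}
-            //   float32x4_t  nxt  = vld1q_f32(r + 8);  // columns 8..11
-            //   float32x4_t  evn2 = vextq_f32(v.val[0], nxt, 1);  // {2,4,6,8}
-            // A stride-2 3-wide window then needs just three vector MACs per
-            // row, on {0,2,4,6}, {1,3,5,7} and {2,4,6,8}, with a single "ext"
-            // and no other in-register shuffling.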
- "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // 
pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v13.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_out; i++) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = tile_w; - unsigned int* mask_ptr = dmask; - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r1\n" - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - // mid - "2: \n" - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "subs %[cnt], #1 \n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! 
\n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} - -// 4line -void conv_depthwise_3x3s1p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = w_out >> 2; - int remain = w_out % 4; - - unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); - const int remian_idx[4] = {0, 1, 2, 3}; - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - // wr0 = vsetq_lane_f32(0.f, wr0, 3); - // wr1 = vsetq_lane_f32(0.f, wr1, 3); - // wr2 = vsetq_lane_f32(0.f, wr2, 3); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_out; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! 
process bottom pad - if (i + 5 >= h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - case 0: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = tile_w; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - - // mid - "4: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, 
%[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /* relu */ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v13.4s, v13.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - // r5 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "fmax v15.4s, v15.4s, %[vzero].4s \n" /* relu */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 4b \n" - - // right - "5: \n" - "cmp %[remain], #1 \n" - "blt 0f \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 
\n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v18.4s}, [%[rmask]] \n" - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v13.4s, v13.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , 
v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /* relu */ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - // end - "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_out; i += 2) { - //! process top pad pad_h = 1 - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - //! process bottom pad - if (i + 3 >= h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - case 0: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = tile_w; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d29}, [%[din3_ptr]]! 
@ load din r3\n" - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "cmp %[remain], #1 @ check whether has " - "mid cols\n" - "blt 0f @ jump to main loop start " - "point\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - "0: \n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! 
end of processing mid rows - } -#endif - } -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, with reulu - */ -// w_in > 7 -void conv_depthwise_3x3s2p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - - int tile_w = w_out >> 2; - int cnt_remain = w_out % 4; - - unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3)); - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - - float32x4_t wbias; - float bias_c = 0.f; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - bias_c = bias[i]; - } else { - wbias = vdupq_n_f32(0.f); - } - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_out; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - dr0 = dr4; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 >= h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - case 0: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = tile_w; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. 
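// The stride-2 main loop below: "ld2" de-interleaves each input row into
// even lanes {x0,x2,x4,x6} and odd lanes {x1,x3,x5,x7}, and an "ext ..., #4"
// against the lookahead vector (v15, v18..v21) forms {x2,x4,x6,x8}. Since
// out[i] = w0*x[2i] + w1*x[2i+1] + w2*x[2i+2] per row, each row costs one
// multiply per weight. Five input rows yield two output rows per pass:
// v16 accumulates rows 0-2 (taps w0,w1,w2) and v17 rows 2-4, with the bias
// copied into the accumulators via "and" and ReLU fused as "fmax" vs. vzero.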
- "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext 
v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v14.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_out; i++) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = tile_w; - unsigned int* mask_ptr = dmask; - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r1\n" - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - // mid - "2: \n" - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "subs %[cnt], #1 \n" - "vmax.f32 q3, q3, q9 @ relu \n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! 
\n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! for 4x6 convolution window - const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp1 = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); - uint32x4_t vmask_rp2 = - vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_out; j += 2) { - const float* dr0 = din_channel + j * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - doutr0 = dout_channel + j * w_out; - doutr1 = doutr0 + w_out; - - if (j + 3 >= h_in) { - switch (j + 3 - h_in) { - case 3: - dr1 = zero_ptr; - case 2: - dr2 = zero_ptr; - case 1: - dr3 = zero_ptr; - doutr1 = trash_buf; - case 0: - dr3 = zero_ptr; - doutr1 = trash_buf; - default: - break; - } - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s, v1.4s}, [%[din0]]\n" - "ld1 {v2.4s, v3.4s}, [%[din1]]\n" - "ld1 {v4.4s, v5.4s}, [%[din2]]\n" - "ld1 {v6.4s, v7.4s}, [%[din3]]\n" - - "bif v0.16b, %[zero].16b, %[mask1].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask2].16b\n" // d0_1234 - - "bif v2.16b, %[zero].16b, %[mask1].16b\n" // d1_1234 - "bif v3.16b, %[zero].16b, %[mask2].16b\n" // d1_1234 - - "bif v4.16b, %[zero].16b, %[mask1].16b\n" // d2_1234 - "bif v5.16b, %[zero].16b, %[mask2].16b\n" // d2_1234 - - "bif v6.16b, %[zero].16b, %[mask1].16b\n" // d3_1234 - "bif v7.16b, %[zero].16b, %[mask2].16b\n" // d3_1234 - - "ext v8.16b, v0.16b, v1.16b, #4\n" // d1_2345 - "ext v9.16b, v0.16b, v1.16b, 
#8\n" // d1_3450 - - "and v12.16b, %[vbias].16b, %[vbias].16b \n" // v12 = vbias - "and v13.16b, %[vbias].16b, %[vbias].16b \n" // v13 = vbias - - // r0 - "fmul v10.4s, v0.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmul v11.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v12.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v2.16b, v3.16b, #4\n" // d1_2345 - "ext v9.16b, v2.16b, v3.16b, #8\n" // d1_3450 - - // r1 - "fmul v14.4s, v2.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmla v10.4s, v2.4s, %[wr1].s[0]\n" // d0_1234 * w0[0] - - "fmul v15.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v4.16b, v5.16b, #4\n" // d1_2345 - "ext v9.16b, v4.16b, v5.16b, #8\n" // d1_3450 - - // r2 - "fmla v14.4s, v4.4s, %[wr1].s[0]\n" // d0_1234 * w0[0] - "fmla v10.4s, v4.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v6.16b, v7.16b, #4\n" // d1_2345 - "ext v9.16b, v6.16b, v7.16b, #8\n" // d1_3450 - - // r3 - "fmla v14.4s, v6.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fadd v12.4s, v12.4s, v10.4s\n" - - "fmla v13.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "fadd v12.4s, v12.4s, v11.4s\n" // out1 - "fadd v13.4s, v13.4s, v14.4s\n" // out2 - "fadd v13.4s, v13.4s, v15.4s\n" // out2 - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); -#else - unsigned int* vmask_ptr = vmask; - float bias_val = flag_bias ? bias[i] : 0.f; - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - "vadd.f32 q4, q4, q10 @ q4 += q10 \n" - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - "vadd.f32 q4, q4, q11 @ q4 += q10 \n" - - "vadd.f32 q5, q5, q8 @ q4 += q10 \n" - "vadd.f32 q5, q5, q9 @ q4 += q10 \n" - - "vst1.32 {d8-d9}, [%[out1]] @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[out2]] @ store result, add pointer\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 - */ - -void conv_depthwise_3x3s2p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 
2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - float out_buf[4]; - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - for (int j = 0; j < h_out; ++j) { - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v13={0,2,4,6} - // v12={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - "and v4.16b, %[bias].16b, %[bias].16b \n" // v10 = vbias - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v10.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v7.16b, v12.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v8.16b, v14.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - - "fmla v4.4s, v10.4s, %[wr0].s[0] \n" // 0246 * w00 - "fmul v5.4s, v11.4s, %[wr0].s[1] \n" // 1357 * w01 - "fmul v16.4s, v6.4s, %[wr0].s[2] \n" // 2468 * w02 - - "fmla v4.4s, v12.4s, %[wr1].s[0] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[1] \n" // v13 * w12 - "fmla v16.4s, v7.4s, %[wr1].s[2] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, %[wr2].s[0] \n" // v14 * w20 - "fmla v5.4s, v15.4s, %[wr2].s[1] \n" // v15 * w21 - "fmla v16.4s, v8.4s, %[wr2].s[2] \n" // v8 * w22 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v16.4s \n" - - // "fadd v4.4s, v4.4s, %[bias].4s \n" - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! 
@ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // q13={0,2,4,6} q12={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,0} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q7 = {2,4,6,0} - "vext.32 q8, q14, q9, #1 @ shift left 1 \n" // q8 = {2,4,6,0} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // {0,2,4,6} - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // {1,3,5,7} - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // {2,4,6,0} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(dmask) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! 
for 4x6 convolution window - const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp1 = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); - uint32x4_t vmask_rp2 = - vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_out; j += 2) { - const float* dr0 = din_channel + j * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - doutr0 = dout_channel + j * w_out; - doutr1 = doutr0 + w_out; - - if (j + 3 >= h_in) { - switch (j + 3 - h_in) { - case 3: - dr1 = zero_ptr; - case 2: - dr2 = zero_ptr; - case 1: - dr3 = zero_ptr; - doutr1 = trash_buf; - case 0: - dr3 = zero_ptr; - doutr1 = trash_buf; - default: - break; - } - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s, v1.4s}, [%[din0]]\n" - "ld1 {v2.4s, v3.4s}, [%[din1]]\n" - "ld1 {v4.4s, v5.4s}, [%[din2]]\n" - "ld1 {v6.4s, v7.4s}, [%[din3]]\n" - - "bif v0.16b, %[zero].16b, %[mask1].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask2].16b\n" // d0_1234 - - "bif v2.16b, %[zero].16b, %[mask1].16b\n" // d1_1234 - "bif v3.16b, %[zero].16b, %[mask2].16b\n" // d1_1234 - - "bif v4.16b, %[zero].16b, %[mask1].16b\n" // d2_1234 - "bif v5.16b, %[zero].16b, %[mask2].16b\n" // d2_1234 - - "bif v6.16b, %[zero].16b, %[mask1].16b\n" // d3_1234 - "bif v7.16b, %[zero].16b, %[mask2].16b\n" // d3_1234 - - "ext v8.16b, v0.16b, v1.16b, #4\n" // d1_2345 - "ext v9.16b, v0.16b, v1.16b, #8\n" // d1_3450 - - "and v12.16b, %[vbias].16b, %[vbias].16b \n" // v12 = vbias - "and v13.16b, %[vbias].16b, %[vbias].16b \n" // v13 = vbias - - // r0 - "fmul v10.4s, v0.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmul v11.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v12.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v2.16b, v3.16b, #4\n" // d1_2345 - "ext v9.16b, v2.16b, v3.16b, #8\n" // d1_3450 - - // r1 - "fmul v14.4s, v2.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmla v10.4s, v2.4s, %[wr1].s[0]\n" // d0_1234 * w0[0] - - "fmul v15.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v4.16b, v5.16b, #4\n" // d1_2345 - "ext v9.16b, v4.16b, v5.16b, #8\n" // d1_3450 - - // r2 - "fmla v14.4s, v4.4s, %[wr1].s[0]\n" // 
d0_1234 * w0[0] - "fmla v10.4s, v4.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v6.16b, v7.16b, #4\n" // d1_2345 - "ext v9.16b, v6.16b, v7.16b, #8\n" // d1_3450 - - // r3 - "fmla v14.4s, v6.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fadd v12.4s, v12.4s, v10.4s\n" - - "fmla v13.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "fadd v12.4s, v12.4s, v11.4s\n" // out1 - "fadd v13.4s, v13.4s, v14.4s\n" // out2 - "fadd v13.4s, v13.4s, v15.4s\n" // out2 - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - "fmax v12.4s, v12.4s, %[zero].4s \n" - "fmax v13.4s, v13.4s, %[zero].4s \n" - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); -#else - unsigned int* vmask_ptr = vmask; - float bias_val = flag_bias ? bias[i] : 0.f; - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - "vadd.f32 q4, q4, q10 @ q4 += q10 \n" - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - "vadd.f32 q4, q4, q11 @ q4 += q10 \n" - - "vadd.f32 q5, q5, q8 @ q4 += q10 \n" - "vadd.f32 q5, q5, q9 @ q4 += q10 \n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vst1.32 {d8-d9}, [%[out1]] @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[out2]] @ store result, add pointer\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - // doutr0 = doutr1; - // doutr1 += w_out; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 7 - */ -void conv_depthwise_3x3s2p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int 
ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - float out_buf[4]; - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - for (int j = 0; j < h_out; ++j) { - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]] \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v13={0,2,4,6} - // v12={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - "and v4.16b, %[bias].16b, %[bias].16b \n" // v10 = vbias - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v10.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v7.16b, v12.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v8.16b, v14.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - - "fmla v4.4s, v10.4s, %[wr0].s[0] \n" // 0246 * w00 - "fmul v5.4s, v11.4s, %[wr0].s[1] \n" // 1357 * w01 - "fmul v16.4s, v6.4s, %[wr0].s[2] \n" // 2468 * w02 - - "fmla v4.4s, v12.4s, %[wr1].s[0] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[1] \n" // v13 * w12 - "fmla v16.4s, v7.4s, %[wr1].s[2] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, %[wr2].s[0] \n" // v14 * w20 - "fmla v5.4s, v15.4s, %[wr2].s[1] \n" // v15 * w21 - "fmla v16.4s, v8.4s, %[wr2].s[2] \n" // v8 * w22 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v16.4s \n" - "fmax v4.4s, v4.4s, v9.4s \n" - - // "fadd v4.4s, v4.4s, %[bias].4s \n" - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf), - [mask_ptr] "r"(mask_ptr) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. 
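-            // A hedged intrinsics sketch of what the v7 block below computes
-            // (names reuse the C++ setup above; illustrative, not a drop-in):
-            //   float32x4x2_t r0 = vld2q_f32(din0_ptr);  // even lanes {0,2,4,6}, odd lanes {1,3,5,7}
-            //   r0.val[0] = vbslq_f32(vmask_rp1, r0.val[0], vdupq_n_f32(0.f));  // zero lanes past w_in
-            //   r0.val[1] = vbslq_f32(vmask_rp2, r0.val[1], vdupq_n_f32(0.f));
-            //   float32x4_t sh0 = vextq_f32(r0.val[0], vdupq_n_f32(0.f), 1);    // {2,4,6,0}
-            //   float32x4_t acc = vdupq_n_f32(bias_c);
-            //   acc = vmlaq_n_f32(acc, r0.val[0], weight_ptr[0]);  // w00 * {0,2,4,6}
-            //   acc = vmlaq_n_f32(acc, r0.val[1], weight_ptr[1]);  // w01 * {1,3,5,7}
-            //   acc = vmlaq_n_f32(acc, sh0, weight_ptr[2]);        // w02 * {2,4,6,0}
-            //   // rows 1 and 2 accumulate likewise with weight_ptr[3..8],
-            //   // then acc = vmaxq_f32(acc, vdupq_n_f32(0.f));    // fused ReLU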
- "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // q13={0,2,4,6} q12={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,0} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q7 = {2,4,6,0} - "vext.32 q8, q14, q9, #1 @ shift left 1 \n" // q8 = {2,4,6,0} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // {0,2,4,6} - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // {1,3,5,7} - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // {2,4,6,0} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(mask_ptr) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_3x3p1.cc b/lite/backends/arm/math/conv_depthwise_3x3p1.cc deleted file mode 100644 index b5de99d7f5..0000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3p1.cc +++ /dev/null @@ -1,4850 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/backends/arm/math/conv_depthwise.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_3x3s1p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s1p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3p1(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - if (stride == 1) { - if (flag_relu) { - if (w_in > 4) { - conv_depthwise_3x3s1p1_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 4) { - conv_depthwise_3x3s1p1_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } else { //! 
-    if (flag_relu) {
-      if (w_in > 7) {
-        conv_depthwise_3x3s2p1_bias_relu(dout, din, weights, bias, flag_bias, num, ch_in, h_in, w_in, h_out, w_out, ctx);
-      } else {
-        conv_depthwise_3x3s2p1_bias_s_relu(dout, din, weights, bias, flag_bias, num, ch_in, h_in, w_in, h_out, w_out, ctx);
-      }
-    } else {
-      if (w_in > 7) {
-        conv_depthwise_3x3s2p1_bias(dout, din, weights, bias, flag_bias, num, ch_in, h_in, w_in, h_out, w_out, ctx);
-      } else {
-        conv_depthwise_3x3s2p1_bias_s(dout, din, weights, bias, flag_bias, num, ch_in, h_in, w_in, h_out, w_out, ctx);
-      }
-    }
-  }
-}
-/**
- * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias,
- * width > 4
- */
-// 4line
-void conv_depthwise_3x3s1p1_bias(float* dout, const float* din, const float* weights, const float* bias, bool flag_bias, const int num, const int ch_in, const int h_in, const int w_in, const int h_out, const int w_out, ARMContext* ctx) {
-  //! pad is done implicit
-  const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
-  //! for 4x6 convolution window
-  const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0};
-
-  float* zero_ptr = ctx->workspace_data<float>();
-  memset(zero_ptr, 0, w_in * sizeof(float));
-  float* write_ptr = zero_ptr + w_in;
-
-  // printf("conv3x3_dw start \n");
-
-  int size_in_channel = w_in * h_in;
-  int size_out_channel = w_out * h_out;
-  int w_stride = 9;
-
-  int tile_w = (w_in + 3) >> 2;
-  int cnt_col = tile_w - 2;
-
-  unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in);
-
-  uint32x4_t vmask_rp1 = vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right));
-  uint32x4_t vmask_rp2 = vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right));
-  uint32x4_t vmask_result = vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right));
-
-  unsigned int vmask[8];
-  vst1q_u32(vmask, vmask_rp1);
-  vst1q_u32(vmask + 4, vmask_rp2);
-
-  unsigned int rmask[4];
-  vst1q_u32(rmask, vmask_result);
-
-  float32x4_t vzero = vdupq_n_f32(0.f);
-
-  for (int n = 0; n < num; ++n) {
-    const float* din_batch = din + n * ch_in * size_in_channel;
-    float* dout_batch = dout + n * ch_in * size_out_channel;
-#pragma omp parallel for
-#ifdef __aarch64__
-    for (int c = 0; c < ch_in; c++) {
-      float* dout_ptr = dout_batch + c * size_out_channel;
-      const float* din_ch_ptr = din_batch + c * size_in_channel;
-      float bias_val = flag_bias ? bias[c] : 0.f;
-      float vbias[4] = {bias_val, bias_val, bias_val, bias_val};
-      const float* wei_ptr = weights + c * w_stride;
-      float32x4_t wr0 = vld1q_f32(wei_ptr);
-      float32x4_t wr1 = vld1q_f32(wei_ptr + 3);
-      float32x4_t wr2 = vld1q_f32(wei_ptr + 6);
-      float* doutr0 = dout_ptr;
-      float* doutr1 = doutr0 + w_out;
-      float* doutr2 = doutr1 + w_out;
-      float* doutr3 = doutr2 + w_out;
-      const float* dr0 = din_ch_ptr;
-      const float* dr1 = dr0 + w_in;
-      const float* dr2 = dr1 + w_in;
-      const float* dr3 = dr2 + w_in;
-      const float* dr4 = dr3 + w_in;
-      const float* dr5 = dr4 + w_in;
-      const float* din_ptr0 = dr0;
-      const float* din_ptr1 = dr1;
-      const float* din_ptr2 = dr2;
-      const float* din_ptr3 = dr3;
-      const float* din_ptr4 = dr4;
-      const float* din_ptr5 = dr5;
-      for (int i = 0; i < h_in; i += 4) {
-        //! 
process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - din_ptr4 = dr3; - din_ptr5 = dr4; - dr0 = dr3; - dr1 = dr4; - dr2 = dr5; - } else { - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - } - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 > h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = cnt_col; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - - // left - // r0 - "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * - w0[1]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * - w0[0]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * - w0[2]*/ - - "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * - w1[1]*/ - "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ - - "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla 
v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ - - // r5 - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" 
/*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ - "cmp %[cnt], #1 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "blt 3f \n" - // mid - "1: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += 
din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 1b \n" - - // right - "3: \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v18.4s}, [%[rmask]] \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, 
%[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? 
bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - // unsigned int* rst_mask = rmask; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" - "vext.32 q7, q8, q9, #1 @ 1234\n" - - // left - // r0 - "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" - - "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" - "vext.32 q7, q10, q11, #1 @ 1234\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" - "vext.32 q7, q12, q13, #1 @ 1234\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" - "vext.32 q7, q14, q15, #1 @ 1234\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "cmp %[cnt], #1 @ check whether has " - "mid cols\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - "blt 3f @ jump to main loop start " - "point\n" - - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "vld1.32 {d19}, [%[vmask]]! 
@ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! @ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! 
end of processing mid rows
-    }
-#endif
-  }
-}
-
-/**
- * \brief depthwise convolution kernel 3x3, stride 2
- */
-// w_in > 7
-void conv_depthwise_3x3s2p1_bias(float* dout, const float* din, const float* weights, const float* bias, bool flag_bias, const int num, const int ch_in, const int h_in, const int w_in, const int h_out, const int w_out, ARMContext* ctx) {
-  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
-  int out_pad_idx[4] = {0, 1, 2, 3};
-  int size_pad_bottom = h_out * 2 - h_in;
-
-  int cnt_col = (w_out >> 2) - 2;
-  int size_right_remain = w_in - (7 + cnt_col * 8);
-  if (size_right_remain >= 9) {
-    cnt_col++;
-    size_right_remain -= 8;
-  }
-  int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4);
-
-  int size_right_pad = w_out * 2 - w_in;
-
-  uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx));      // 0 2 4 6
-  uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
-  uint32x4_t wmask = vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx));                   // 0 1 2 3
-  int size_in_channel = w_in * h_in;
-  int size_out_channel = w_out * h_out;
-
-  float* zero_ptr = ctx->workspace_data<float>();
-  memset(zero_ptr, 0, w_in * sizeof(float));
-  float* write_ptr = zero_ptr + w_in;
-
-  unsigned int dmask[12];
-
-  vst1q_u32(dmask, vmask_rp1);
-  vst1q_u32(dmask + 4, vmask_rp2);
-  vst1q_u32(dmask + 8, wmask);
-
-  for (int n = 0; n < num; ++n) {
-    const float* din_batch = din + n * ch_in * size_in_channel;
-    float* dout_batch = dout + n * ch_in * size_out_channel;
-#pragma omp parallel for
-    for (int i = 0; i < ch_in; ++i) {
-      const float* din_channel = din_batch + i * size_in_channel;
-      float* dout_channel = dout_batch + i * size_out_channel;
-      const float* weight_ptr = weights + i * 9;
-      float32x4_t wr0 = vld1q_f32(weight_ptr);
-      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
-      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
-      float32x4_t vzero = vdupq_n_f32(0.f);
-      float32x4_t wbias;
-      float bias_c = 0.f;
-      if (flag_bias) {
-        wbias = vdupq_n_f32(bias[i]);
-        bias_c = bias[i];
-      } else {
-        wbias = vdupq_n_f32(0.f);
-      }
-      const float* dr0 = din_channel;
-      const float* dr1 = dr0 + w_in;
-      const float* dr2 = dr1 + w_in;
-      const float* dr3 = dr2 + w_in;
-      const float* dr4 = dr3 + w_in;
-      const float* din0_ptr = dr0;
-      const float* din1_ptr = dr1;
-      const float* din2_ptr = dr2;
-      const float* din3_ptr = dr3;
-      const float* din4_ptr = dr4;
-      float* doutr0 = dout_channel;
-      float* doutr0_ptr = nullptr;
-      float* doutr1_ptr = nullptr;
-#ifdef __aarch64__
-      for (int i = 0; i < h_in; i += 4) {
-        din0_ptr = dr0;
-        din1_ptr = dr1;
-        din2_ptr = dr2;
-        din3_ptr = dr3;
-        din4_ptr = dr4;
-        doutr0_ptr = doutr0;
-        doutr1_ptr = doutr0 + w_out;
-        if (i == 0) {
-          din0_ptr = zero_ptr;
-          din1_ptr = dr0;
-          din2_ptr = dr1;
-          din3_ptr = dr2;
-          din4_ptr = dr3;
-          dr0 = dr3;
-          dr1 = dr4;
-        } else {
-          dr0 = dr4;
-          dr1 = dr0 + w_in;
-        }
-        dr2 = dr1 + w_in;
-        dr3 = dr2 + w_in;
-        dr4 = dr3 + w_in;
-        //! process bottom pad
-        if (i + 4 > h_in) {
-          switch (i + 4 - h_in) {
-            case 4: din1_ptr = zero_ptr;
-            case 3: din2_ptr = zero_ptr;
-            case 2: din3_ptr = zero_ptr;
-            case 1: din4_ptr = zero_ptr;
-            default: break;
-          }
-        }
-        //! process output pad
-        if (i / 2 + 2 > h_out) {
-          doutr1_ptr = write_ptr;
-        }
-        int cnt = cnt_col;
-        asm volatile(
-            // top
-            // Load up 12 elements (3 vectors) from each of 8 sources.
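-            // Worked example of the column bookkeeping above, on a
-            // hypothetical w_in = 18 (k = 3, s = 2, p = 1 gives w_out = 9):
-            // cnt_col starts at (9 >> 2) - 2 = 0, size_right_remain =
-            // 18 - 7 = 11 >= 9, so cnt_col becomes 1 and size_right_remain 3;
-            // cnt_remain = 9 % 4 = 1. The 9 outputs split as 4 (left block,
-            // implicit left pad) + 4 * cnt_col (main loop) + 1 (masked tail).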
- "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" // v10 = {0,1,3,5} - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmul v12.4s, v1.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr0], %[inptr0], #4 \n" - "sub %[inptr1], %[inptr1], #4 \n" - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v12.4s, v3.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr2], %[inptr2], #4 \n" - "sub %[inptr3], %[inptr3], #4 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmla v11.4s, v4.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - - "fmul v14.4s, v5.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v12.4s, v5.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - - "fmla v17.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - "fmla v16.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr4], %[inptr4], #4 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v7.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" // v10 = {0,1,3,5} - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v9.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "cmp %[cnt], #1 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "blt 1f \n" - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" 
// {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, 
v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v13.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_in; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = cnt_col; - unsigned int* mask_ptr = dmask; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q10, q11 - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q12, q13 - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v13={0,2,4,6} v14={1,3,5,7}, q14, q15 - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vext.32 q6, q9, q11, #3 @ shift right 1 " - "data\n" // q2 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "sub %[din0_ptr], #4 @ inpitr0 - 1\n" - "sub %[din1_ptr], #4 @ inpitr1 - 1\n" - "sub %[din2_ptr], #4 @ inpitr2 - 1\n" - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "vld2.32 {d24-d27}, [%[din1_ptr]]! 
@ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, " - "out1\n" // q0 * w01 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, " - "out1\n" // q1 * w02 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, " - "out1\n" // q2 * w00 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "cmp %[cnt], #1 \n" - "blt 1f \n" - // mid - "2: \n" - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "subs %[cnt], #1 \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! 
@ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} - -// 4line -void conv_depthwise_3x3s1p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! 
for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - // printf("conv3x3_dw start \n"); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 3) >> 2; - int tile_h = (h_in + 3) >> 2; - int cnt_col = tile_w - 2; - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in); - int size_pad_bottom = (unsigned int)(1 + (tile_h << 2) - h_in); - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_in; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - din_ptr4 = dr3; - din_ptr5 = dr4; - dr0 = dr3; - dr1 = dr4; - dr2 = dr5; - } else { - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - } - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 > h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = cnt_col; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - - // left - // r0 - "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * - w0[1]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * - w0[0]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * - w0[2]*/ - - "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * - w1[1]*/ - "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ - - "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* 
outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - // r5 - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ - "cmp %[cnt], #1 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "blt 3f \n" - // mid - "1: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - 
w0[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" 
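/* Scheduling note for this "mid" block: four output rows are computed per
   pass (accumulators v12-v15) from six input rows. Each weight row is
   applied to the base vector (columns 0123) and to the two windows built
   with `ext` (columns 1234 and 2345); the `ld1` loads for the next
   iteration are interleaved between the `fmla`s, and every accumulator is
   re-seeded from [%[bias_val]] right after its `st1`, so load latency
   hides behind the arithmetic. The fused ReLU is the `fmax` against
   %[vzero] issued just before each store. */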
/* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "subs %[cnt], %[cnt], #1 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 1b \n" - - // right - "3: \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v18.4s}, [%[rmask]] \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext 
v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? 
bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - // unsigned int* rst_mask = rmask; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" - "vext.32 q7, q8, q9, #1 @ 1234\n" - - // left - // r0 - "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" - - "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" - "vext.32 q7, q10, q11, #1 @ 1234\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" - "vext.32 q7, q12, q13, #1 @ 1234\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" - "vext.32 q7, q14, q15, #1 @ 1234\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "cmp %[cnt], #1 @ check whether has " - "mid cols\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - "blt 3f @ jump to main loop start " - "point\n" - - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! @ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add " - "pointer\n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! end of processing mid rows - } -#endif - } -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, with reulu - */ -// w_in > 7 -void conv_depthwise_3x3s2p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - int size_pad_bottom = h_out * 2 - h_in; - - int cnt_col = (w_out >> 2) - 2; - int size_right_remain = w_in - (7 + cnt_col * 8); - if (size_right_remain >= 9) { - cnt_col++; - size_right_remain -= 8; - } - int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // - - int size_right_pad = w_out * 2 - w_in; - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - - float32x4_t wbias; - float bias_c = 0.f; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - bias_c = bias[i]; - } else { - wbias = vdupq_n_f32(0.f); - } - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_in; i += 4) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - din4_ptr = dr3; - dr0 = dr3; - dr1 = dr4; - } else { - dr0 = dr4; - dr1 = dr0 + w_in; - } - dr2 = dr1 + 
w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 > h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i / 2 + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = cnt_col; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" // v10 = {0,1,3,5} - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmul v12.4s, v1.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr0], %[inptr0], #4 \n" - "sub %[inptr1], %[inptr1], #4 \n" - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v12.4s, v3.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr2], %[inptr2], #4 \n" - "sub %[inptr3], %[inptr3], #4 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmla v11.4s, v4.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - - "fmul v14.4s, v5.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v12.4s, v5.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - - "fmla v17.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - "fmla v16.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr4], %[inptr4], #4 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v7.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" // v10 = {0,1,3,5} - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v9.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "cmp %[cnt], #1 \n" - - 
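/* The left-edge block above has just stored its two output rows.
   `cmp %[cnt], #1` decides whether any full four-wide middle blocks
   remain; the `blt 1f` below skips straight to the right-edge code at
   label 1 when none do. The `and` of %[vbias] with itself is in effect a
   vector register copy that re-seeds the v17 accumulator with the bias
   for the next block. */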
"and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "blt 1f \n" - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "subs %[cnt], %[cnt], #1 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext 
v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v14.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - - for (int i = 0; i < h_in; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = cnt_col; - - unsigned int* mask_ptr = dmask; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q10, q11 - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q12, q13 - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r1\n" // v13={0,2,4,6} v14={1,3,5,7}, q14, q15 - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vext.32 q6, q9, q11, #3 @ shift right 1 " - "data\n" // q2 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "sub %[din0_ptr], #4 @ inpitr0 - 1\n" - "sub %[din1_ptr], #4 @ inpitr1 - 1\n" - "sub %[din2_ptr], #4 @ inpitr2 - 1\n" - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, " - "out1\n" // q0 * w01 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, " - "out1\n" // q1 * w02 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, " - "out1\n" // q2 * w00 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "cmp %[cnt], #1 \n" - "blt 1f \n" - // mid - "2: \n" - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "subs %[cnt], #1 \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! 
@ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! 
for 4x6 convolution window - const int right_pad_idx[4] = {3, 2, 1, 0}; - const float zero[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - int hs = -1; - int he = 3; - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - int h_cnt = (h_out + 1) >> 1; - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_cnt; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - if (hs == -1) { - dr0 = zero; - } - - switch (he - h_in) { - case 2: - dr2 = zero; - doutr1 = trash_buf; - case 1: - dr3 = zero; - default: - break; - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s}, [%[din0]], #16\n" - "ld1 {v1.4s}, [%[din1]], #16\n" - "ld1 {v2.4s}, [%[din2]], #16\n" - "ld1 {v3.4s}, [%[din3]], #16\n" - - "bif v0.16b, %[zero].16b, %[mask].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask].16b\n" // d1_1234 - "bif v2.16b, %[zero].16b, %[mask].16b\n" // d2_1234 - "bif v3.16b, %[zero].16b, %[mask].16b\n" // d3_1234 - - "ext v4.16b, %[zero].16b, v0.16b, #12\n" // d0_0123 - "ext v5.16b, %[zero].16b, v1.16b, #12\n" // d1_0123 - "ext v6.16b, %[zero].16b, v2.16b, #12\n" // d2_0123 - "ext v7.16b, %[zero].16b, v3.16b, #12\n" // d3_0123 - - "ext v8.16b, v0.16b, %[zero].16b, #4\n" // d0_2340 - "ext v9.16b, v1.16b, %[zero].16b, #4\n" // d1_2340 - "ext v10.16b, v2.16b, %[zero].16b, #4\n" // d2_2340 - "ext v11.16b, v3.16b, %[zero].16b, #4\n" // d3_2340 - - "fmul v12.4s, v0.4s, %[wr0].s[1]\n" - "fmul v13.4s, v1.4s, %[wr0].s[1]\n" - - "fmul v14.4s, v1.4s, %[wr1].s[1]\n" - "fmul v15.4s, v2.4s, %[wr1].s[1]\n" - - "fmul v16.4s, v2.4s, %[wr2].s[1]\n" - "fmul v17.4s, v3.4s, %[wr2].s[1]\n" - - "fmla v12.4s, v4.4s, %[wr0].s[0]\n" - "fmla v13.4s, v5.4s, %[wr0].s[0]\n" - - "fmla v14.4s, v5.4s, %[wr1].s[0]\n" - "fmla v15.4s, v6.4s, %[wr1].s[0]\n" - - "fmla v16.4s, v6.4s, %[wr2].s[0]\n" - "fmla v17.4s, v7.4s, %[wr2].s[0]\n" - - "fmla v12.4s, v8.4s, %[wr0].s[2]\n" - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" - - "fmla v14.4s, v9.4s, %[wr1].s[2]\n" - "fmla v15.4s, v10.4s, %[wr1].s[2]\n" - - "fmla v16.4s, v10.4s, %[wr2].s[2]\n" - "fmla v17.4s, v11.4s, %[wr2].s[2]\n" - - "fadd v12.4s, v12.4s, v14.4s\n" - "fadd v12.4s, v12.4s, v16.4s\n" - - "fadd v13.4s, v13.4s, v15.4s\n" // out1 - "fadd v13.4s, v13.4s, v17.4s\n" // out2 - - "fadd v12.4s, v12.4s, %[bias].4s\n" // out1 add bias - "fadd v13.4s, v13.4s, %[bias].4s\n" // out2 add bias - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, 
[%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); -#else - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d12-d13}, [%[din0]]!\n" - "vld1.32 {d14-d15}, [%[din1]]!\n" - "vld1.32 {d16-d17}, [%[din2]]!\n" - "vld1.32 {d18-d19}, [%[din3]]!\n" - - "vbif q6, %q[zero], %q[mask]\n" // d0_1234 - "vbif q7, %q[zero], %q[mask]\n" // d1_1234 - "vbif q8, %q[zero], %q[mask]\n" // d2_1234 - "vbif q9, %q[zero], %q[mask]\n" // d3_1234 - - "vmul.f32 q14, q6, %e[wr0][1]\n" - "vmul.f32 q15, q7, %e[wr0][1]\n" - - "vmla.f32 q14, q7, %e[wr1][1]\n" - "vmla.f32 q15, q8, %e[wr1][1]\n" - - "vmla.f32 q14, q8, %e[wr2][1]\n" - "vmla.f32 q15, q9, %e[wr2][1]\n" - - "vext.32 q10, %q[zero], q6, #3\n" // d0_0123 - "vext.32 q11, %q[zero], q7, #3\n" // d1_0123 - "vext.32 q12, %q[zero], q8, #3\n" // d2_0123 - "vext.32 q13, %q[zero], q9, #3\n" // d3_0123 - - "vmla.f32 q14, q10, %e[wr0][0]\n" - "vmla.f32 q15, q11, %e[wr0][0]\n" - - "vmla.f32 q14, q11, %e[wr1][0]\n" - "vmla.f32 q15, q12, %e[wr1][0]\n" - - "vmla.f32 q14, q12, %e[wr2][0]\n" - "vmla.f32 q15, q13, %e[wr2][0]\n" - - "vext.32 q10, q6, %q[zero], #1\n" // d0_2340 - "vext.32 q11, q7, %q[zero], #1\n" // d1_2340 - "vext.32 q12, q8, %q[zero], #1\n" // d2_2340 - "vext.32 q13, q9, %q[zero], #1\n" // d3_2340 - - "vmla.f32 q14, q10, %f[wr0][0]\n" - "vmla.f32 q15, q11, %f[wr0][0]\n" - - "vmla.f32 q14, q11, %f[wr1][0]\n" - "vmla.f32 q15, q12, %f[wr1][0]\n" - - "vmla.f32 q14, q12, %f[wr2][0]\n" // out1 - "vmla.f32 q15, q13, %f[wr2][0]\n" // out2 - - "vadd.f32 q14, q14, %q[bias]\n" // out1 add bias - "vadd.f32 q15, q15, %q[bias]\n" // out2 add bias - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vst1.32 {d28-d29}, [%[out1]]\n" - "vst1.32 {d30-d31}, [%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - doutr0 = doutr1; - doutr1 += w_out; - hs += 2; - he += 2; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 - */ - -void conv_depthwise_3x3s2p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - 
-/**
- * \brief depthwise convolution kernel 3x3, stride 2, width <= 7
- */
-
-void conv_depthwise_3x3s2p1_bias_s(float* dout,
-                                   const float* din,
-                                   const float* weights,
-                                   const float* bias,
-                                   bool flag_bias,
-                                   const int num,
-                                   const int ch_in,
-                                   const int h_in,
-                                   const int w_in,
-                                   const int h_out,
-                                   const int w_out,
-                                   ARMContext* ctx) {
-  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
-  int out_pad_idx[4] = {0, 1, 2, 3};
-  float zeros[8] = {0.0f};
-
-  uint32x4_t vmask_rp1 =
-      vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx));  // 0 2 4 6
-  uint32x4_t vmask_rp2 =
-      vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
-
-  int size_in_channel = w_in * h_in;
-  int size_out_channel = w_out * h_out;
-
-  unsigned int dmask[8];
-  vst1q_u32(dmask, vmask_rp1);
-  vst1q_u32(dmask + 4, vmask_rp2);
-
-  for (int n = 0; n < num; ++n) {
-    const float* din_batch = din + n * ch_in * size_in_channel;
-    float* dout_batch = dout + n * ch_in * size_out_channel;
-#pragma omp parallel for
-    for (int i = 0; i < ch_in; ++i) {
-      const float* din_channel = din_batch + i * size_in_channel;
-      float* dout_channel = dout_batch + i * size_out_channel;
-
-      const float* weight_ptr = weights + i * 9;
-      float32x4_t wr0 = vld1q_f32(weight_ptr);
-      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
-      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
-
-      float bias_c = 0.f;
-
-      if (flag_bias) {
-        bias_c = bias[i];
-      }
-      float32x4_t vbias = vdupq_n_f32(bias_c);
-      int hs = -1;
-      int he = 2;
-      float out_buf[4];
-      for (int j = 0; j < h_out; ++j) {
-        const float* dr0 = din_channel + hs * w_in;
-        const float* dr1 = dr0 + w_in;
-        const float* dr2 = dr1 + w_in;
-        if (hs == -1) {
-          dr0 = zeros;
-        }
-        if (he > h_in) {
-          dr2 = zeros;
-        }
-        const float* din0_ptr = dr0;
-        const float* din1_ptr = dr1;
-        const float* din2_ptr = dr2;
-
-        unsigned int* mask_ptr = dmask;
-#ifdef __aarch64__
-        asm volatile(
-            // load the mask, then 8 elements (2 vectors) from each of 3 rows
-            "movi v9.4s, #0\n"
-            "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32\n"
-
-            "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32\n"  // v10={0,2,4,6} v11={1,3,5,7}
-            "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32\n"  // v12={0,2,4,6} v13={1,3,5,7}
-            "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32\n"  // v14={0,2,4,6} v15={1,3,5,7}
-
-            "bif v10.16b, v9.16b, v6.16b\n"
-            "bif v11.16b, v9.16b, v7.16b\n"
-            "bif v12.16b, v9.16b, v6.16b\n"
-            "bif v13.16b, v9.16b, v7.16b\n"
-            "bif v14.16b, v9.16b, v6.16b\n"
-            "bif v15.16b, v9.16b, v7.16b\n"
-
-            "ext v6.16b, v9.16b, v11.16b, #12\n"  // v6 = {0,1,3,5}
-            "ext v7.16b, v9.16b, v13.16b, #12\n"  // v7 = {0,1,3,5}
-            "ext v8.16b, v9.16b, v15.16b, #12\n"  // v8 = {0,1,3,5}
-
-            "fmul v4.4s, v10.4s, %[wr0].s[1]\n"  // v10 * w01
-            "fmul v5.4s, v11.4s, %[wr0].s[2]\n"  // v11 * w02
-            "fmul v6.4s, v6.4s, %[wr0].s[0]\n"   // v6 * w00
-
-            "fmla v4.4s, v12.4s, %[wr1].s[1]\n"  // v12 * w11
-            "fmla v5.4s, v13.4s, %[wr1].s[2]\n"  // v13 * w12
-            "fmla v6.4s, v7.4s, %[wr1].s[0]\n"   // v7 * w10
-
-            "fmla v4.4s, v14.4s, %[wr2].s[1]\n"  // v14 * w21
-            "fmla v5.4s, v15.4s, %[wr2].s[2]\n"  // v15 * w22
-            "fmla v6.4s, v8.4s, %[wr2].s[0]\n"   // v8 * w20
-
-            "fadd v4.4s, v4.4s, v5.4s\n"
-            "fadd v4.4s, v4.4s, v6.4s\n"
-
-            "fadd v4.4s, v4.4s, %[bias].4s\n"
-
-            "st1 {v4.4s}, [%[out]]\n"
-            : [din0_ptr] "+r"(din0_ptr),
-              [din1_ptr] "+r"(din1_ptr),
-              [din2_ptr] "+r"(din2_ptr),
-              [mask_ptr] "+r"(mask_ptr)
-            : [wr0] "w"(wr0),
-              [wr1] "w"(wr1),
-              [wr2] "w"(wr2),
-              [bias] "w"(vbias),
-              [out] "r"(out_buf)
-            : "cc", "memory",
-              "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
-              "v14", "v15");
-
-#else
-        asm volatile(
-            // load the mask, then 8 elements (2 vectors) from each of 3 rows
-            "vmov.u32 q9, #0\n"
-            "vld1.f32 {d12-d15}, [%[mask_ptr]]!  @ load mask\n"
-            "vdup.32 q3, %[bias]                 @ dup bias\n"  // q3 = vbias
-
-            "vld2.32 {d20-d23}, [%[din0_ptr]]!   @ load din r0\n"  // q10={0,2,4,6} q11={1,3,5,7}
-            "vld2.32 {d24-d27}, [%[din1_ptr]]!   @ load din r1\n"  // q12={0,2,4,6} q13={1,3,5,7}
-            "vld2.32 {d28-d31}, [%[din2_ptr]]!   @ load din r2\n"  // q14={0,2,4,6} q15={1,3,5,7}
-
-            "vbif q10, q9, q6  @ bit select, deal with right pad\n"
-            "vbif q11, q9, q7  @ bit select, deal with right pad\n"
-            "vbif q12, q9, q6  @ bit select, deal with right pad\n"
-            "vbif q13, q9, q7  @ bit select, deal with right pad\n"
-            "vbif q14, q9, q6  @ bit select, deal with right pad\n"
-            "vbif q15, q9, q7  @ bit select, deal with right pad\n"
-
-            "vext.32 q6, q9, q11, #3  @ shift left 1\n"  // q6 = {0,1,3,5}
-            "vext.32 q7, q9, q13, #3  @ shift left 1\n"  // q7 = {0,1,3,5}
-            "vext.32 q8, q9, q15, #3  @ shift left 1\n"  // q8 = {0,1,3,5}
-
-            "vmul.f32 q4, q10, %e[wr0][1]  @ mul weight 0, out0\n"  // q10 * w01
-            "vmul.f32 q5, q11, %f[wr0][0]  @ mul weight 0, out0\n"  // q11 * w02
-            "vmla.f32 q3, q6, %e[wr0][0]   @ mul weight 0, out0\n"  // q6 * w00
-
-            "vmla.f32 q4, q12, %e[wr1][1]  @ mul weight 1, out0\n"  // q12 * w11
-            "vmla.f32 q5, q13, %f[wr1][0]  @ mul weight 1, out0\n"  // q13 * w12
-            "vmla.f32 q3, q7, %e[wr1][0]   @ mul weight 1, out0\n"  // q7 * w10
-
-            "vmla.f32 q4, q14, %e[wr2][1]  @ mul weight 2, out0\n"  // q14 * w21
-            "vmla.f32 q5, q15, %f[wr2][0]  @ mul weight 2, out0\n"  // q15 * w22
-            "vmla.f32 q3, q8, %e[wr2][0]   @ mul weight 2, out0\n"  // q8 * w20
-
-            "vadd.f32 q3, q3, q4  @ add\n"
-            "vadd.f32 q3, q3, q5  @ add\n"
-
-            "vst1.32 {d6-d7}, [%[out]]\n"
-            : [din0_ptr] "+r"(din0_ptr),
-              [din1_ptr] "+r"(din1_ptr),
-              [din2_ptr] "+r"(din2_ptr),
-              [mask_ptr] "+r"(mask_ptr)
-            : [wr0] "w"(wr0),
-              [wr1] "w"(wr1),
-              [wr2] "w"(wr2),
-              [bias] "r"(bias_c),
-              [out] "r"(out_buf)
-            : "cc", "memory",
-              "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12",
-              "q13", "q14", "q15");
-#endif  // __aarch64__
-        for (int w = 0; w < w_out; ++w) {
-          *dout_channel++ = out_buf[w];
-        }
-        hs += 2;
-        he += 2;
-      }
-    }
-  }
-}
-/**
- * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias,
- * width <= 4
- */
-void conv_depthwise_3x3s1p1_bias_s_relu(float* dout,
-                                        const float* din,
-                                        const float* weights,
-                                        const float* bias,
-                                        bool flag_bias,
-                                        const int num,
-                                        const int ch_in,
-                                        const int h_in,
-                                        const int w_in,
-                                        const int h_out,
-                                        const int w_out,
-                                        ARMContext* ctx) {
-  //! 3x3s1 convolution, implemented by direct algorithm
-  //! pad is done implicit
-  //!
for 4x6 convolution window - const int right_pad_idx[4] = {3, 2, 1, 0}; - const float zero[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - int hs = -1; - int he = 3; - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - int h_cnt = (h_out + 1) >> 1; - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_cnt; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - if (hs == -1) { - dr0 = zero; - } - - switch (he - h_in) { - case 2: - dr2 = zero; - doutr1 = trash_buf; - case 1: - dr3 = zero; - default: - break; - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s}, [%[din0]], #16\n" - "ld1 {v1.4s}, [%[din1]], #16\n" - "ld1 {v2.4s}, [%[din2]], #16\n" - "ld1 {v3.4s}, [%[din3]], #16\n" - - "bif v0.16b, %[zero].16b, %[mask].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask].16b\n" // d1_1234 - "bif v2.16b, %[zero].16b, %[mask].16b\n" // d2_1234 - "bif v3.16b, %[zero].16b, %[mask].16b\n" // d3_1234 - - "ext v4.16b, %[zero].16b, v0.16b, #12\n" // d0_0123 - "ext v5.16b, %[zero].16b, v1.16b, #12\n" // d1_0123 - "ext v6.16b, %[zero].16b, v2.16b, #12\n" // d2_0123 - "ext v7.16b, %[zero].16b, v3.16b, #12\n" // d3_0123 - - "ext v8.16b, v0.16b, %[zero].16b, #4\n" // d0_2340 - "ext v9.16b, v1.16b, %[zero].16b, #4\n" // d1_2340 - "ext v10.16b, v2.16b, %[zero].16b, #4\n" // d2_2340 - "ext v11.16b, v3.16b, %[zero].16b, #4\n" // d3_2340 - - "fmul v12.4s, v0.4s, %[wr0].s[1]\n" - "fmul v13.4s, v1.4s, %[wr0].s[1]\n" - - "fmul v14.4s, v1.4s, %[wr1].s[1]\n" - "fmul v15.4s, v2.4s, %[wr1].s[1]\n" - - "fmul v16.4s, v2.4s, %[wr2].s[1]\n" - "fmul v17.4s, v3.4s, %[wr2].s[1]\n" - - "fmla v12.4s, v4.4s, %[wr0].s[0]\n" - "fmla v13.4s, v5.4s, %[wr0].s[0]\n" - - "fmla v14.4s, v5.4s, %[wr1].s[0]\n" - "fmla v15.4s, v6.4s, %[wr1].s[0]\n" - - "fmla v16.4s, v6.4s, %[wr2].s[0]\n" - "fmla v17.4s, v7.4s, %[wr2].s[0]\n" - - "fmla v12.4s, v8.4s, %[wr0].s[2]\n" - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" - - "fmla v14.4s, v9.4s, %[wr1].s[2]\n" - "fmla v15.4s, v10.4s, %[wr1].s[2]\n" - - "fmla v16.4s, v10.4s, %[wr2].s[2]\n" - "fmla v17.4s, v11.4s, %[wr2].s[2]\n" - - "fadd v12.4s, v12.4s, v14.4s\n" - "fadd v12.4s, v12.4s, v16.4s\n" - - "fadd v13.4s, v13.4s, v15.4s\n" // out1 - "fadd v13.4s, v13.4s, v17.4s\n" // out2 - - "fadd v12.4s, v12.4s, %[bias].4s\n" // out1 add bias - "fadd v13.4s, v13.4s, %[bias].4s\n" // out2 add bias - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - - "fmax v12.4s, v12.4s, %[zero].4s\n" // out1 -> relu - 
"fmax v13.4s, v13.4s, %[zero].4s\n" // out2 -> relu - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); -#else - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d12-d13}, [%[din0]]!\n" - "vld1.32 {d14-d15}, [%[din1]]!\n" - "vld1.32 {d16-d17}, [%[din2]]!\n" - "vld1.32 {d18-d19}, [%[din3]]!\n" - - "vbif q6, %q[zero], %q[mask]\n" // d0_1234 - "vbif q7, %q[zero], %q[mask]\n" // d1_1234 - "vbif q8, %q[zero], %q[mask]\n" // d2_1234 - "vbif q9, %q[zero], %q[mask]\n" // d3_1234 - - "vmul.f32 q14, q6, %e[wr0][1]\n" - "vmul.f32 q15, q7, %e[wr0][1]\n" - - "vmla.f32 q14, q7, %e[wr1][1]\n" - "vmla.f32 q15, q8, %e[wr1][1]\n" - - "vmla.f32 q14, q8, %e[wr2][1]\n" - "vmla.f32 q15, q9, %e[wr2][1]\n" - - "vext.32 q10, %q[zero], q6, #3\n" // d0_0123 - "vext.32 q11, %q[zero], q7, #3\n" // d1_0123 - "vext.32 q12, %q[zero], q8, #3\n" // d2_0123 - "vext.32 q13, %q[zero], q9, #3\n" // d3_0123 - - "vmla.f32 q14, q10, %e[wr0][0]\n" - "vmla.f32 q15, q11, %e[wr0][0]\n" - - "vmla.f32 q14, q11, %e[wr1][0]\n" - "vmla.f32 q15, q12, %e[wr1][0]\n" - - "vmla.f32 q14, q12, %e[wr2][0]\n" - "vmla.f32 q15, q13, %e[wr2][0]\n" - - "vext.32 q10, q6, %q[zero], #1\n" // d0_2340 - "vext.32 q11, q7, %q[zero], #1\n" // d1_2340 - "vext.32 q12, q8, %q[zero], #1\n" // d2_2340 - "vext.32 q13, q9, %q[zero], #1\n" // d3_2340 - - "vmla.f32 q14, q10, %f[wr0][0]\n" - "vmla.f32 q15, q11, %f[wr0][0]\n" - - "vmla.f32 q14, q11, %f[wr1][0]\n" - "vmla.f32 q15, q12, %f[wr1][0]\n" - - "vmla.f32 q14, q12, %f[wr2][0]\n" // out1 - "vmla.f32 q15, q13, %f[wr2][0]\n" // out2 - - "vadd.f32 q14, q14, %q[bias]\n" // out1 add bias - "vadd.f32 q15, q15, %q[bias]\n" // out2 add bias - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vmax.f32 q14, q14, %q[zero]\n" // out1 -> relu - "vmax.f32 q15, q15, %q[zero]\n" // out2 -> relu - - "vst1.32 {d28-d29}, [%[out1]]\n" - "vst1.32 {d30-d31}, [%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - doutr0 = doutr1; - doutr1 += w_out; - hs += 2; - he += 2; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 7 - */ -void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - 
uint32x4_t vmask_rp2 =
-      vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
-
-  int size_in_channel = w_in * h_in;
-  int size_out_channel = w_out * h_out;
-
-  unsigned int dmask[8];
-  vst1q_u32(dmask, vmask_rp1);
-  vst1q_u32(dmask + 4, vmask_rp2);
-
-  for (int n = 0; n < num; ++n) {
-    const float* din_batch = din + n * ch_in * size_in_channel;
-    float* dout_batch = dout + n * ch_in * size_out_channel;
-#pragma omp parallel for
-    for (int i = 0; i < ch_in; ++i) {
-      const float* din_channel = din_batch + i * size_in_channel;
-      float* dout_channel = dout_batch + i * size_out_channel;
-
-      const float* weight_ptr = weights + i * 9;
-      float32x4_t wr0 = vld1q_f32(weight_ptr);
-      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
-      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
-
-      float bias_c = 0.f;
-
-      if (flag_bias) {
-        bias_c = bias[i];
-      }
-      float32x4_t vbias = vdupq_n_f32(bias_c);
-      int hs = -1;
-      int he = 2;
-      float out_buf[4];
-      for (int j = 0; j < h_out; ++j) {
-        const float* dr0 = din_channel + hs * w_in;
-        const float* dr1 = dr0 + w_in;
-        const float* dr2 = dr1 + w_in;
-        if (hs == -1) {
-          dr0 = zeros;
-        }
-        if (he > h_in) {
-          dr2 = zeros;
-        }
-        const float* din0_ptr = dr0;
-        const float* din1_ptr = dr1;
-        const float* din2_ptr = dr2;
-
-        unsigned int* mask_ptr = dmask;
-#ifdef __aarch64__
-        asm volatile(
-            // load the mask, then 8 elements (2 vectors) from each of 3 rows
-            "movi v9.4s, #0\n"
-            "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32\n"
-
-            "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32\n"  // v10={0,2,4,6} v11={1,3,5,7}
-            "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32\n"  // v12={0,2,4,6} v13={1,3,5,7}
-            "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32\n"  // v14={0,2,4,6} v15={1,3,5,7}
-
-            "bif v10.16b, v9.16b, v6.16b\n"
-            "bif v11.16b, v9.16b, v7.16b\n"
-            "bif v12.16b, v9.16b, v6.16b\n"
-            "bif v13.16b, v9.16b, v7.16b\n"
-            "bif v14.16b, v9.16b, v6.16b\n"
-            "bif v15.16b, v9.16b, v7.16b\n"
-
-            "ext v6.16b, v9.16b, v11.16b, #12\n"  // v6 = {0,1,3,5}
-            "ext v7.16b, v9.16b, v13.16b, #12\n"  // v7 = {0,1,3,5}
-            "ext v8.16b, v9.16b, v15.16b, #12\n"  // v8 = {0,1,3,5}
-
-            "fmul v4.4s, v10.4s, %[wr0].s[1]\n"  // v10 * w01
-            "fmul v5.4s, v11.4s, %[wr0].s[2]\n"  // v11 * w02
-            "fmul v6.4s, v6.4s, %[wr0].s[0]\n"   // v6 * w00
-
-            "fmla v4.4s, v12.4s, %[wr1].s[1]\n"  // v12 * w11
-            "fmla v5.4s, v13.4s, %[wr1].s[2]\n"  // v13 * w12
-            "fmla v6.4s, v7.4s, %[wr1].s[0]\n"   // v7 * w10
-
-            "fmla v4.4s, v14.4s, %[wr2].s[1]\n"  // v14 * w21
-            "fmla v5.4s, v15.4s, %[wr2].s[2]\n"  // v15 * w22
-            "fmla v6.4s, v8.4s, %[wr2].s[0]\n"   // v8 * w20
-
-            "fadd v4.4s, v4.4s, v5.4s\n"
-            "fadd v4.4s, v4.4s, v6.4s\n"
-
-            "fadd v4.4s, v4.4s, %[bias].4s\n"  // out add bias
-            "fmax v4.4s, v4.4s, v9.4s\n"
-
-            "st1 {v4.4s}, [%[out]]\n"
-            : [din0_ptr] "+r"(din0_ptr),
-              [din1_ptr] "+r"(din1_ptr),
-              [din2_ptr] "+r"(din2_ptr),
-              [mask_ptr] "+r"(mask_ptr)
-            : [wr0] "w"(wr0),
-              [wr1] "w"(wr1),
-              [wr2] "w"(wr2),
-              [bias] "w"(vbias),
-              [out] "r"(out_buf)
-            : "cc", "memory",
-              "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
-              "v14", "v15");
-
-#else
-        asm volatile(
-            // load the mask, then 8 elements (2 vectors) from each of 3 rows
-            "vmov.u32 q9, #0\n"
-            "vld1.f32 {d12-d15}, [%[mask_ptr]]!  @ load mask\n"
-            "vdup.32 q3, %[bias]                 @ dup bias\n"  // q3 = vbias
-
-            "vld2.32 {d20-d23}, [%[din0_ptr]]!   @ load din r0\n"  // q10={0,2,4,6} q11={1,3,5,7}
-            "vld2.32 {d24-d27}, [%[din1_ptr]]!   @ load din r1\n"  // q12={0,2,4,6} q13={1,3,5,7}
-            "vld2.32 {d28-d31}, [%[din2_ptr]]!   @ load din r2\n"  // q14={0,2,4,6} q15={1,3,5,7}
-
-            "vbif q10, q9, q6  @ bit select, deal with right pad\n"
-            "vbif q11, q9, q7  @ bit select, deal with right pad\n"
-            "vbif q12, q9, q6  @ bit select, deal with right pad\n"
-            "vbif q13, q9, q7  @ bit select, deal with right pad\n"
-            "vbif q14, q9, q6  @ bit select, deal with right pad\n"
-            "vbif q15, q9, q7  @ bit select, deal with right pad\n"
-
-            "vext.32 q6, q9, q11, #3  @ shift left 1\n"  // q6 = {0,1,3,5}
-            "vext.32 q7, q9, q13, #3  @ shift left 1\n"  // q7 = {0,1,3,5}
-            "vext.32 q8, q9, q15, #3  @ shift left 1\n"  // q8 = {0,1,3,5}
-
-            "vmul.f32 q4, q10, %e[wr0][1]  @ mul weight 0, out0\n"  // q10 * w01
-            "vmul.f32 q5, q11, %f[wr0][0]  @ mul weight 0, out0\n"  // q11 * w02
-            "vmla.f32 q3, q6, %e[wr0][0]   @ mul weight 0, out0\n"  // q6 * w00
-
-            "vmla.f32 q4, q12, %e[wr1][1]  @ mul weight 1, out0\n"  // q12 * w11
-            "vmla.f32 q5, q13, %f[wr1][0]  @ mul weight 1, out0\n"  // q13 * w12
-            "vmla.f32 q3, q7, %e[wr1][0]   @ mul weight 1, out0\n"  // q7 * w10
-
-            "vmla.f32 q4, q14, %e[wr2][1]  @ mul weight 2, out0\n"  // q14 * w21
-            "vmla.f32 q5, q15, %f[wr2][0]  @ mul weight 2, out0\n"  // q15 * w22
-            "vmla.f32 q3, q8, %e[wr2][0]   @ mul weight 2, out0\n"  // q8 * w20
-
-            "vadd.f32 q3, q3, q4  @ add\n"
-            "vadd.f32 q3, q3, q5  @ add\n"
-
-            "vmax.f32 q3, q3, q9  @ relu\n"
-
-            "vst1.32 {d6-d7}, [%[out]]\n"
-            : [din0_ptr] "+r"(din0_ptr),
-              [din1_ptr] "+r"(din1_ptr),
-              [din2_ptr] "+r"(din2_ptr),
-              [mask_ptr] "+r"(mask_ptr)
-            : [wr0] "w"(wr0),
-              [wr1] "w"(wr1),
-              [wr2] "w"(wr2),
-              [bias] "r"(bias_c),
-              [out] "r"(out_buf)
-            : "cc", "memory",
-              "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12",
-              "q13", "q14", "q15");
-#endif  // __aarch64__
-        for (int w = 0; w < w_out; ++w) {
-          *dout_channel++ = out_buf[w];
-        }
-        hs += 2;
-        he += 2;
-      }
-    }
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/conv_depthwise_5x5s1.cc b/lite/backends/arm/math/conv_depthwise_5x5s1.cc
deleted file mode 100644
index 2b9744665c..0000000000
--- a/lite/backends/arm/math/conv_depthwise_5x5s1.cc
+++ /dev/null
@@ -1,9615 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/conv_depthwise.h"
-#include <arm_neon.h>
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-//! weights layout
-//!        *-----------------------*-----*
-//! w0 <-- | W0   W1   W2   W3     | W4  |
-//!        *-----------------------*     |
-//! w1 <-- | W5   W6   W7   W8     | W9  |
-//!        *-----------------------*     | --> w5
-//! w2 <-- | W10  W11  W12  W13    | W14 |
-//!        *-----------------------*     |
-//! w3 <-- | W15  W16  W17  W18    | W19 |
-//!        *-----------------------*-----*
-//! w4 <-- | W20  W21  W22  W23    | W24 | --> w6[0]
-//!        *-----------------------*-----*
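Concretely, this packing turns one 5x5 output into five 4-wide multiply-accumulates (rows, columns 0-3), one vector for the fifth column (rows 0-3), and a lone corner tap W24, finished by a pairwise horizontal add, which is what the faddp chains in the kernels below compute. A sketch of that reduction for a single output pixel (conv5x5_one_out is an illustrative helper, assuming <arm_neon.h>; the real kernels batch four output rows per asm block):

#include <arm_neon.h>

static inline float conv5x5_one_out(const float* r0, const float* r1,
                                    const float* r2, const float* r3,
                                    const float* r4,
                                    float32x4_t w0, float32x4_t w1,
                                    float32x4_t w2, float32x4_t w3,
                                    float32x4_t w4, float32x4_t w5,
                                    float w24, float bias) {
  // rows x columns 0-3: one fused multiply-accumulate per filter row
  float32x4_t acc = vmulq_f32(w0, vld1q_f32(r0));
  acc = vmlaq_f32(acc, w1, vld1q_f32(r1));
  acc = vmlaq_f32(acc, w2, vld1q_f32(r2));
  acc = vmlaq_f32(acc, w3, vld1q_f32(r3));
  acc = vmlaq_f32(acc, w4, vld1q_f32(r4));
  // column 4, rows 0-3: gather the fifth input column, as the asm does with
  // single-lane ld1 loads
  const float c4[4] = {r0[4], r1[4], r2[4], r3[4]};
  acc = vmlaq_f32(acc, w5, vld1q_f32(c4));
  // horizontal sum (the faddp chain), then the corner tap and the bias
  float32x2_t s = vadd_f32(vget_low_f32(acc), vget_high_f32(acc));
  float sum = vget_lane_f32(vpadd_f32(s, s), 0);
  return sum + r4[4] * w24 + bias;
}

The relu variants below differ only in a final fmax against a zeroed register before the store.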
-
-void conv_depthwise_5x5s1_impl(const float* din,
-                               float* dout,
-                               int num,
-                               int ch_out,
-                               int h_out,
-                               int w_out,
-                               int ch_in,
-                               int h_in,
-                               int w_in,
-                               const float* weights,
-                               const float* bias,
-                               int pad,
-                               bool flag_bias,
-                               bool flag_relu,
-                               ARMContext* ctx);
-
-void conv_depthwise_5x5s1_small_impl(const float* din,
-                                     float* dout,
-                                     int num,
-                                     int ch_out,
-                                     int h_out,
-                                     int w_out,
-                                     int ch_in,
-                                     int h_in,
-                                     int w_in,
-                                     const float* weights,
-                                     const float* bias,
-                                     int pad,
-                                     bool flag_bias,
-                                     bool flag_relu,
-                                     ARMContext* ctx);
-
-void conv_depthwise_5x5s1_relu_impl(const float* din,
-                                    float* dout,
-                                    int num,
-                                    int ch_out,
-                                    int h_out,
-                                    int w_out,
-                                    int ch_in,
-                                    int h_in,
-                                    int w_in,
-                                    const float* weights,
-                                    const float* bias,
-                                    int pad,
-                                    bool flag_bias,
-                                    bool flag_relu,
-                                    ARMContext* ctx);
-
-void conv_depthwise_5x5s1_small_relu_impl(const float* din,
-                                          float* dout,
-                                          int num,
-                                          int ch_out,
-                                          int h_out,
-                                          int w_out,
-                                          int ch_in,
-                                          int h_in,
-                                          int w_in,
-                                          const float* weights,
-                                          const float* bias,
-                                          int pad,
-                                          bool flag_bias,
-                                          bool flag_relu,
-                                          ARMContext* ctx);
-
-static float* prepad_input(
-    const float* input, int num, int ch_in, int h_in, int w_in, int pad) {
-  int h_new = h_in + 2 * pad;
-  int w_new = w_in + 2 * pad;
-  float* new_input =
-      static_cast<float*>(malloc(h_new * w_new * ch_in * num * sizeof(float)));
-  float* new_input_ptr = new_input;
-  for (int c = 0; c < num * ch_in; ++c) {
-    memset(new_input_ptr, 0x00, w_new * pad * sizeof(float));
-    new_input_ptr += w_new * pad;
-    for (int i = 0; i < h_in; ++i) {
-      memset(new_input_ptr, 0x00, pad * sizeof(float));
-      new_input_ptr += pad;
-      memcpy(new_input_ptr, input, w_in * sizeof(float));
-      new_input_ptr += w_in;
-      input += w_in;
-      memset(new_input_ptr, 0x00, pad * sizeof(float));
-      new_input_ptr += pad;
-    }
-    memset(new_input_ptr, 0x00, w_new * pad * sizeof(float));
-    new_input_ptr += w_new * pad;
-  }
-  return new_input;
-}
-
-#ifdef __aarch64__
-
-//! kernel for one out without extracting data mid
-//! deal with four lines out
-void compute_one_out_without_extract(const float* din0,
-                                     const float* din1,
-                                     const float* din2,
-                                     const float* din3,
-                                     const float* din4,
-                                     const float* din5,
-                                     const float* din6,
-                                     const float* din7,
-                                     float* dout0,
-                                     float* dout1,
-                                     float* dout2,
-                                     float* dout3,
-                                     float32x4_t w0,
-                                     float32x4_t w1,
-                                     float32x4_t w2,
-                                     float32x4_t w3,
-                                     float32x4_t w4,
-                                     float32x4_t w5,
-                                     float32x4_t w6,
-                                     const float* bias) {
-  //! din0 - din7: 0-4 v8-v15
-  //! din0 - din7: 5 v20, v21
-  //!
dout0 - dout3: v16-v19 - asm volatile( - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - "ld1 {v20.s}[0], [%[din0]] \n" - "ld1 {v21.s}[0], [%[din4]] \n" - "ld1 {v20.s}[1], [%[din1]] \n" - "ld1 {v21.s}[1], [%[din5]] \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - "ld1 {v20.s}[2], [%[din2]] \n" - "ld1 {v21.s}[2], [%[din6]] \n" - "ld1 {v20.s}[3], [%[din3]] \n" - "ld1 {v21.s}[3], [%[din7]] \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // ext - "ext v22.16b, v20.16b, v21.16b, #4 \n" // 1 2 3 4 - "ext v23.16b, v20.16b, v21.16b, #8 \n" // 2 3 4 5 - "ext v24.16b, v20.16b, v21.16b, #12 \n" // 3 4 5 6 - - // in col5 - "fmla v16.4s, %[w5].4s, v20.4s \n" - "fmla v17.4s, %[w5].4s, v22.4s \n" - "fmla v18.4s, %[w5].4s, v23.4s \n" - "fmla v19.4s, %[w5].4s, v24.4s \n" - - "ld1 {v31.4s}, [%[bias]] \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - - // in[24] * w6[0] - "fmla v25.4s, v21.4s, %[w6].s[0]\n" - "fadd v25.4s, v25.4s, v31.4s \n" - - // write output - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [bias] "r"(bias) - : "memory", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v31"); -} - -//! kernel for one out without extracting data mid -//! deal with four lines out -void compute_one_out_without_extract_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - float32x4_t w5, - float32x4_t w6, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! din0 - din7: 5 v20, v21 - //! 
dout0 - dout3: v16-v19 - asm volatile( - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - "ld1 {v20.s}[0], [%[din0]] \n" - "ld1 {v21.s}[0], [%[din4]] \n" - "ld1 {v20.s}[1], [%[din1]] \n" - "ld1 {v21.s}[1], [%[din5]] \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - "ld1 {v20.s}[2], [%[din2]] \n" - "ld1 {v21.s}[2], [%[din6]] \n" - "ld1 {v20.s}[3], [%[din3]] \n" - "ld1 {v21.s}[3], [%[din7]] \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // ext - "ext v22.16b, v20.16b, v21.16b, #4 \n" // 1 2 3 4 - "ext v23.16b, v20.16b, v21.16b, #8 \n" // 2 3 4 5 - "ext v24.16b, v20.16b, v21.16b, #12 \n" // 3 4 5 6 - - // in col5 - "fmla v16.4s, %[w5].4s, v20.4s \n" - "fmla v17.4s, %[w5].4s, v22.4s \n" - "fmla v18.4s, %[w5].4s, v23.4s \n" - "fmla v19.4s, %[w5].4s, v24.4s \n" - - "ld1 {v31.4s}, [%[bias]] \n" - "movi v30.4s, #0 \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - - // in[24] * w6[0] - "fmla v25.4s, v21.4s, %[w6].s[0] \n" - "fadd v25.4s, v25.4s, v31.4s \n" - "fmax v25.4s, v25.4s, v30.4s \n" - - // write output - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [bias] "r"(bias) - : "memory", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v30", - "v31"); -} - -//! kernel for one out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_one_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - //! 
weights: v0-v4 - asm volatile( - // load weights - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" - "ldr q1, [%[wh]], #20 \n" - "ldr q2, [%[wh]], #20 \n" - "ldr q3, [%[wh]], #20 \n" - "ldr q4, [%[wh]], #20 \n" - - "ld1 {v31.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v31.4s \n" - - // write output - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [bias] "r"(bias) - : "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v25", - "v26", - "v31"); -} - -//! kernel for one out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_one_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - //! 
weights: v0-v4 - asm volatile( - // load weights - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" - "ldr q1, [%[wh]], #20 \n" - "ldr q2, [%[wh]], #20 \n" - "ldr q3, [%[wh]], #20 \n" - "ldr q4, [%[wh]], #20 \n" - - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - "ld1 {v31.4s}, [%[bias]] \n" - "movi v30.4s, #0 \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v31.4s \n" - "fmax v25.4s, v25.4s, v30.4s \n" - - // write output - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [bias] "r"(bias) - : "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v25", - "v26", - "v30", - "v31"); -} - -//! kernel for one out with extracting data post -//! deal with four lines out -void compute_one_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! 
dout0 - dout3: v16-v19 - asm volatile( - "ld1 {v31.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v31.4s \n" - - // write output - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v25", - "v26", - "v31"); -} - -//! kernel for one out with extracting data post -//! deal with four lines out -void compute_one_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! 
dout0 - dout3: v16-v19 - asm volatile( - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - "ld1 {v31.4s}, [%[bias]] \n" - "movi v30.4s, #0 \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v31.4s \n" - "fmax v25.4s, v25.4s, v30.4s \n" - - // write output - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v25", - "v26", - "v30", - "v31"); -} - -//! kernel for two out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_two_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - //! 
weights: v0-v4 - asm volatile( - // load weights - "movi v31.4s, #0 \n" - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" // 1, 2, 3, 4 - "ldr q1, [%[wh]], #20 \n" // 6, 7, 8, 9 - "ldr q2, [%[wh]], #20 \n" // 11, 12, 13, 14 - "ldr q3, [%[wh]], #20 \n" // 16, 17, 18, 19 - "ldr q4, [%[wh]], #20 \n" // 21, 22, 23, 24 - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 2, 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 7, 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 12, 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 17, 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 22, 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v8.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v8.4s \n" - - // zip - "zip1 v6.4s, v7.4s, v5.4s \n" - "zip2 v8.4s, v7.4s, v5.4s \n" - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "str d6, [%[dout0]] \n" - "str d7, [%[dout1]] \n" - "str d8, [%[dout2]] \n" - "str d9, [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [bias] "r"(bias) - : "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - 
"v31"); -} - -//! kernel for two out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_two_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - //! weights: v0-v4 - asm volatile( - // load weights - "movi v31.4s, #0 \n" - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" // 1, 2, 3, 4 - "ldr q1, [%[wh]], #20 \n" // 6, 7, 8, 9 - "ldr q2, [%[wh]], #20 \n" // 11, 12, 13, 14 - "ldr q3, [%[wh]], #20 \n" // 16, 17, 18, 19 - "ldr q4, [%[wh]], #20 \n" // 21, 22, 23, 24 - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 2, 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 7, 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 12, 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 17, 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 22, 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v8.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v8.4s \n" - - // zip - "zip1 v6.4s, v7.4s, v5.4s \n" - "zip2 v8.4s, v7.4s, v5.4s \n" - - // add bias - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - - // relu - "fmax v6.4s, v6.4s, v31.4s \n" - "fmax v8.4s, v8.4s, v31.4s \n" - - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "str 
d6, [%[dout0]] \n" - "str d7, [%[dout1]] \n" - "str d8, [%[dout2]] \n" - "str d9, [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [bias] "r"(bias) - : "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v31"); -} - -//! kernel for two out with extracting data post -//! deal with four lines out -void compute_two_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - asm volatile( - "movi v31.4s, #0 \n" - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, 
v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v8.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v8.4s \n" - - // zip - "zip1 v6.4s, v5.4s, v7.4s \n" - "zip2 v8.4s, v5.4s, v7.4s \n" - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "str d6, [%[dout0]] \n" - "str d7, [%[dout1]] \n" - "str d8, [%[dout2]] \n" - "str d9, [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v31"); -} - -//! kernel for two out with extracting data post -//! deal with four lines out -void compute_two_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - asm volatile( - "movi v31.4s, #0 \n" - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, 
v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v8.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v8.4s \n" - - // zip - "zip1 v6.4s, v5.4s, v7.4s \n" - "zip2 v8.4s, v5.4s, v7.4s \n" - - // add bias - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - - // relu - "fmax v6.4s, v6.4s, v31.4s \n" - "fmax v8.4s, v8.4s, v31.4s \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "str d6, [%[dout0]] \n" - "str d7, [%[dout1]] \n" - "str d8, [%[dout2]] \n" - "str d9, [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v31"); -} - -//! kernel for three out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_three_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - //! 
weights: v0-v4 - asm volatile( - // load weights - "movi v31.4s, #0 \n" - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" // 1, 2, 3, 4 - "ldr q1, [%[wh]], #20 \n" // 6, 7, 8, 9 - "ldr q2, [%[wh]], #20 \n" // 11, 12, 13, 14 - "ldr q3, [%[wh]], #20 \n" // 16, 17, 18, 19 - "ldr q4, [%[wh]], #20 \n" // 21, 22, 23, 24 - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 2, 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 7, 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 12, 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 17, 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 22, 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v6.4s \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, 
v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v20.4s \n" - - // zip - "zip1 v6.4s, v7.4s, v5.4s \n" - "zip2 v8.4s, v7.4s, v5.4s \n" - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "st1 {v25.s}[0], [%[dout0]], #4 \n" - "st1 {v25.s}[1], [%[dout1]], #4 \n" - "st1 {v25.s}[2], [%[dout2]], #4 \n" - "st1 {v25.s}[3], [%[dout3]], #4 \n" - - "str d6, [%[dout0]] \n" - "str d7, [%[dout1]] \n" - "str d8, [%[dout2]] \n" - "str d9, [%[dout3]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [dout2] "+r"(dout2), - [dout3] "+r"(dout3), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [bias] "r"(bias) - : "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v25", - "v26", - "v31"); -} - -//! kernel for three out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_three_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - //! 
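// ---------------------------------------------------------------------
// The "extract pre" kernels handle the left-padding outputs by shifting
// the *weights* instead of the inputs: the 5-tap rows are reloaded from
// element 1 and then shifted toward zero with ext, so each padded output
// uses only the taps that overlap real data. A scalar sketch of one
// row's contribution (ours, illustrative only):
static float edge_out_pre(const float* w_row,   // 5 taps of one kernel row
                          const float* in_row,  // input row, column 0 first
                          int p) {              // 1 <= p <= 4 taps cut off
  float s = 0.f;
  for (int k = p; k < 5; ++k)      // taps 0..p-1 fall on the zero padding
    s += w_row[k] * in_row[k - p];
  return s;
}
// ---------------------------------------------------------------------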
weights: v0-v4 - asm volatile( - // load weights - "movi v31.4s, #0 \n" - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" // 1, 2, 3, 4 - "ldr q1, [%[wh]], #20 \n" // 6, 7, 8, 9 - "ldr q2, [%[wh]], #20 \n" // 11, 12, 13, 14 - "ldr q3, [%[wh]], #20 \n" // 16, 17, 18, 19 - "ldr q4, [%[wh]], #20 \n" // 21, 22, 23, 24 - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 2, 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 7, 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 12, 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 17, 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 22, 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v6.4s \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, 
v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v20.4s \n" - "fmax v25.4s, v25.4s, v31.4s \n" - - // zip - "zip1 v6.4s, v7.4s, v5.4s \n" - "zip2 v8.4s, v7.4s, v5.4s \n" - - // add bias - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - - // relu - "fmax v6.4s, v6.4s, v31.4s \n" - "fmax v8.4s, v8.4s, v31.4s \n" - - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "st1 {v25.s}[0], [%[dout0]], #4 \n" - "st1 {v25.s}[1], [%[dout1]], #4 \n" - "st1 {v25.s}[2], [%[dout2]], #4 \n" - "st1 {v25.s}[3], [%[dout3]], #4 \n" - - "str d6, [%[dout0]] \n" - "str d7, [%[dout1]] \n" - "str d8, [%[dout2]] \n" - "str d9, [%[dout3]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [dout2] "+r"(dout2), - [dout3] "+r"(dout3), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [bias] "r"(bias) - : "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v25", - "v26", - "v31"); -} - -//! kernel for three out with extracting data post -//! deal with four lines out -void compute_three_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! 
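// ---------------------------------------------------------------------
// The _relu variants differ from their plain twins only in the fmax
// instructions against the all-zero register v31: bias add and ReLU are
// fused into the store path rather than run as a second pass over the
// output. Equivalent one-liner in intrinsics (a sketch, ours):
#include <arm_neon.h>
static inline float32x4_t bias_relu(float32x4_t acc, float32x4_t bias) {
  return vmaxq_f32(vaddq_f32(acc, bias), vdupq_n_f32(0.f));  // max(x+b, 0)
}
// ---------------------------------------------------------------------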
dout0 - dout3: v6, v8, v25 - asm volatile( - "movi v31.4s, #0 \n" - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v6.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - 
"fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v20.4s \n" - - // zip - "zip1 v6.4s, v5.4s, v7.4s \n" - "zip2 v8.4s, v5.4s, v7.4s \n" - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "str d6, [%[dout0]], #8 \n" - "str d7, [%[dout1]], #8 \n" - "str d8, [%[dout2]], #8 \n" - "str d9, [%[dout3]], #8 \n" - - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [dout2] "+r"(dout2), - [dout3] "+r"(dout3), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v25", - "v26", - "v31"); -} - -//! kernel for three out with extracting data post -//! deal with four lines out -void compute_three_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! 
dout0 - dout3: v6, v8, v25 - asm volatile( - "movi v31.4s, #0 \n" - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v6.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - 
"fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v20.4s \n" - "fmax v25.4s, v25.4s, v31.4s \n" - - // zip - "zip1 v6.4s, v5.4s, v7.4s \n" - "zip2 v8.4s, v5.4s, v7.4s \n" - - // add bias - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - - // relu - "fmax v6.4s, v6.4s, v31.4s \n" - "fmax v8.4s, v8.4s, v31.4s \n" - - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "str d6, [%[dout0]], #8 \n" - "str d7, [%[dout1]], #8 \n" - "str d8, [%[dout2]], #8 \n" - "str d9, [%[dout3]], #8 \n" - - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [dout2] "+r"(dout2), - [dout3] "+r"(dout3), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v25", - "v26", - "v31"); -} - -//! kernel for four out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_four_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v0-v3 - //! 
weights: v0-v4, v5, v6 - asm volatile( - // load weights - "movi v31.4s, #0 \n" - "mov x0, #20 \n" - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" // 1, 2, 3, 4 - "ldr q1, [%[wh]], #20 \n" // 6, 7, 8, 9 - "ldr q2, [%[wh]], #20 \n" // 11, 12, 13, 14 - "ldr q3, [%[wh]], #20 \n" // 16, 17, 18, 19 - "ldr q4, [%[wh]] \n" // 21, 22, 23, 24 - "sub %[wh], %[wh], #68 \n" - - // load inputs - "ld1 {v8.4s}, [%[din0]] \n" - "ld1 {v9.4s}, [%[din1]] \n" - "ld1 {v10.4s}, [%[din2]] \n" - "ld1 {v11.4s}, [%[din3]] \n" - "ld1 {v12.4s}, [%[din4]] \n" - "ld1 {v13.4s}, [%[din5]] \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]] \n" - "ld1 {v15.4s}, [%[din7]] \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - - // load weights col5 - "ld1 {v5.s}[0], [%[wh]], x0 \n" - "ld1 {v5.s}[1], [%[wh]], x0 \n" - "ld1 {v5.s}[2], [%[wh]], x0 \n" - "ld1 {v5.s}[3], [%[wh]], x0 \n" - "ld1 {v6.s}[0], [%[wh]] \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 2, 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 7, 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 12, 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 17, 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 22, 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v27 - "faddp v27.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v27.4s, v27.4s, v26.4s \n" - - // load in col5 - "ld1 {v20.s}[0], [%[din0]] \n" - "ld1 {v20.s}[1], [%[din1]] \n" - "ld1 {v20.s}[2], [%[din2]] \n" - "ld1 {v20.s}[3], [%[din3]] \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 23, 24 - - "ld1 {v21.s}[0], [%[din4]] \n" - "ld1 {v21.s}[1], [%[din5]] \n" - "ld1 {v21.s}[2], [%[din6]] \n" - "ld1 {v21.s}[3], [%[din7]] \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - 
"fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v26 - "faddp v26.4s, v16.4s, v17.4s \n" - "faddp v28.4s, v18.4s, v19.4s \n" - "faddp v26.4s, v26.4s, v28.4s \n" - - // ext input col5 - "ext v22.16b, v20.16b, v21.16b, #4 \n" - "ext v23.16b, v20.16b, v21.16b, #8 \n" - "ext v24.16b, v20.16b, v21.16b, #12 \n" - - // in col5 - "fmul v16.4s, v5.4s, v20.4s \n" - "fmul v17.4s, v5.4s, v22.4s \n" - "fmul v18.4s, v5.4s, v23.4s \n" - "fmul v19.4s, v5.4s, v24.4s \n" - - // add to out register v28 - "faddp v28.4s, v16.4s, v17.4s \n" - "faddp v29.4s, v18.4s, v19.4s \n" - "faddp v28.4s, v28.4s, v29.4s \n" - "fmla v28.4s, v21.4s, v6.s[0] \n" - - "ld1 {v8.4s}, [%[bias]] \n" - - // zip - "zip1 v0.4s, v28.4s, v26.4s \n" - "zip2 v2.4s, v28.4s, v26.4s \n" - "zip1 v4.4s, v27.4s, v25.4s \n" - "zip2 v6.4s, v27.4s, v25.4s \n" - - "fadd v0.4s, v0.4s, v8.4s \n" - "fadd v2.4s, v2.4s, v8.4s \n" - "fadd v4.4s, v4.4s, v8.4s \n" - "fadd v6.4s, v6.4s, v8.4s \n" - - "ext v1.16b, v0.16b, v31.16b, #8 \n" - "ext v3.16b, v2.16b, v31.16b, #8 \n" - "ext v5.16b, v4.16b, v31.16b, #8 \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - - // write output - "str d0, [%[dout0]], #8 \n" - "str d1, [%[dout1]], #8 \n" - "str d2, [%[dout2]], #8 \n" - "str d3, [%[dout3]], #8 \n" - - "str d4, [%[dout0]] \n" - "str d5, [%[dout1]] \n" - "str d6, [%[dout2]] \n" - "str d7, [%[dout3]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [dout2] "+r"(dout2), - [dout3] "+r"(dout3), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [bias] "r"(bias) - : "memory", - "x0", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v31"); -} - -//! kernel for four out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_four_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v0-v3 - //! 
weights: v0-v4, v5, v6 - asm volatile( - // load weights - "movi v31.4s, #0 \n" - "mov x0, #20 \n" - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" // 1, 2, 3, 4 - "ldr q1, [%[wh]], #20 \n" // 6, 7, 8, 9 - "ldr q2, [%[wh]], #20 \n" // 11, 12, 13, 14 - "ldr q3, [%[wh]], #20 \n" // 16, 17, 18, 19 - "ldr q4, [%[wh]] \n" // 21, 22, 23, 24 - "sub %[wh], %[wh], #68 \n" - - // load inputs - "ld1 {v8.4s}, [%[din0]] \n" - "ld1 {v9.4s}, [%[din1]] \n" - "ld1 {v10.4s}, [%[din2]] \n" - "ld1 {v11.4s}, [%[din3]] \n" - "ld1 {v12.4s}, [%[din4]] \n" - "ld1 {v13.4s}, [%[din5]] \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]] \n" - "ld1 {v15.4s}, [%[din7]] \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - - // load weights col5 - "ld1 {v5.s}[0], [%[wh]], x0 \n" - "ld1 {v5.s}[1], [%[wh]], x0 \n" - "ld1 {v5.s}[2], [%[wh]], x0 \n" - "ld1 {v5.s}[3], [%[wh]], x0 \n" - "ld1 {v6.s}[0], [%[wh]] \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 2, 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 7, 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 12, 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 17, 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 22, 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v27 - "faddp v27.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v27.4s, v27.4s, v26.4s \n" - - // load in col5 - "ld1 {v20.s}[0], [%[din0]] \n" - "ld1 {v20.s}[1], [%[din1]] \n" - "ld1 {v20.s}[2], [%[din2]] \n" - "ld1 {v20.s}[3], [%[din3]] \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 23, 24 - - "ld1 {v21.s}[0], [%[din4]] \n" - "ld1 {v21.s}[1], [%[din5]] \n" - "ld1 {v21.s}[2], [%[din6]] \n" - "ld1 {v21.s}[3], [%[din7]] \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - 
"fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v26 - "faddp v26.4s, v16.4s, v17.4s \n" - "faddp v28.4s, v18.4s, v19.4s \n" - "faddp v26.4s, v26.4s, v28.4s \n" - - // ext input col5 - "ext v22.16b, v20.16b, v21.16b, #4 \n" - "ext v23.16b, v20.16b, v21.16b, #8 \n" - "ext v24.16b, v20.16b, v21.16b, #12 \n" - - // in col5 - "fmul v16.4s, v5.4s, v20.4s \n" - "fmul v17.4s, v5.4s, v22.4s \n" - "fmul v18.4s, v5.4s, v23.4s \n" - "fmul v19.4s, v5.4s, v24.4s \n" - - // add to out register v28 - "faddp v28.4s, v16.4s, v17.4s \n" - "faddp v29.4s, v18.4s, v19.4s \n" - "faddp v28.4s, v28.4s, v29.4s \n" - "fmla v28.4s, v21.4s, v6.s[0] \n" - - "ld1 {v8.4s}, [%[bias]] \n" - - // zip - "zip1 v0.4s, v28.4s, v26.4s \n" - "zip2 v2.4s, v28.4s, v26.4s \n" - "zip1 v4.4s, v27.4s, v25.4s \n" - "zip2 v6.4s, v27.4s, v25.4s \n" - - // add bias - "fadd v0.4s, v0.4s, v8.4s \n" - "fadd v2.4s, v2.4s, v8.4s \n" - "fadd v4.4s, v4.4s, v8.4s \n" - "fadd v6.4s, v6.4s, v8.4s \n" - - // relu - "fmax v0.4s, v0.4s, v31.4s \n" - "fmax v2.4s, v2.4s, v31.4s \n" - "fmax v4.4s, v4.4s, v31.4s \n" - "fmax v6.4s, v6.4s, v31.4s \n" - - "ext v1.16b, v0.16b, v31.16b, #8 \n" - "ext v3.16b, v2.16b, v31.16b, #8 \n" - "ext v5.16b, v4.16b, v31.16b, #8 \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - - // write output - "str d0, [%[dout0]], #8 \n" - "str d1, [%[dout1]], #8 \n" - "str d2, [%[dout2]], #8 \n" - "str d3, [%[dout3]], #8 \n" - - "str d4, [%[dout0]] \n" - "str d5, [%[dout1]] \n" - "str d6, [%[dout2]] \n" - "str d7, [%[dout3]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [dout2] "+r"(dout2), - [dout3] "+r"(dout3), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [bias] "r"(bias) - : "memory", - "x0", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v31"); -} - -//! kernel for four out with extracting data post -//! deal with four lines out -void compute_four_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! 
dout0 - dout3: v0-v3 - const int64_t s_12 = 12; - const float* doutl[4] = {dout0, dout1, dout2, dout3}; - void* doutl_ptr = reinterpret_cast(doutl); - asm volatile( - "movi v31.4s, #0 \n" - "ldp x0, x1, [%[doutl]], #16 \n" - "ldp x2, x3, [%[doutl]] \n" - - // load inputs - "ld1 {v8.4s}, [%[din0]], %[s_12] \n" - "ld1 {v9.4s}, [%[din1]], %[s_12] \n" - "ld1 {v10.4s}, [%[din2]], %[s_12] \n" - "ld1 {v11.4s}, [%[din3]], %[s_12] \n" - "ld1 {v12.4s}, [%[din4]], %[s_12] \n" - "ld1 {v13.4s}, [%[din5]], %[s_12] \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], %[s_12] \n" - "ld1 {v15.4s}, [%[din7]], %[s_12] \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - - // load input col5 - "ld1 {v20.s}[0], [%[din0]] \n" - "ld1 {v20.s}[1], [%[din1]] \n" - "ld1 {v20.s}[2], [%[din2]] \n" - "ld1 {v20.s}[3], [%[din3]] \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // load input col5 - "ld1 {v21.s}[0], [%[din4]] \n" - "ld1 {v21.s}[1], [%[din5]] \n" - "ld1 {v21.s}[2], [%[din6]] \n" - "ld1 {v21.s}[3], [%[din7]] \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v27 - "faddp v27.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v27.4s, v27.4s, v26.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, 
%[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v26 - "faddp v26.4s, v16.4s, v17.4s \n" - "faddp v28.4s, v18.4s, v19.4s \n" - "faddp v26.4s, v26.4s, v28.4s \n" - - // ext input col5 - "ext v8.16b, v20.16b, v21.16b, #4 \n" - "ext v9.16b, v20.16b, v21.16b, #8 \n" - "ext v10.16b, v20.16b, v21.16b, #12 \n" - - // ext weights col0 - "ins v5.s[0], %[w0].s[0] \n" - "ins v5.s[1], %[w1].s[0] \n" - "ins v5.s[2], %[w2].s[0] \n" - "ins v5.s[3], %[w3].s[0] \n" - - // in col5 - "fmul v16.4s, v5.4s, v20.4s \n" - "fmul v17.4s, v5.4s, v8.4s \n" - "fmul v18.4s, v5.4s, v9.4s \n" - "fmul v19.4s, v5.4s, v10.4s \n" - - // add to out register v28 - "faddp v28.4s, v16.4s, v17.4s \n" - "faddp v29.4s, v18.4s, v19.4s \n" - "faddp v28.4s, v28.4s, v29.4s \n" - "fmla v28.4s, v21.4s, %[w4].s[0] \n" - - "ld1 {v8.4s}, [%[bias]] \n" - - // zip - "zip1 v0.4s, v25.4s, v27.4s \n" - "zip2 v2.4s, v25.4s, v27.4s \n" - "zip1 v4.4s, v26.4s, v28.4s \n" - "zip2 v6.4s, v26.4s, v28.4s \n" - - "fadd v0.4s, v0.4s, v8.4s \n" - "fadd v2.4s, v2.4s, v8.4s \n" - "fadd v4.4s, v4.4s, v8.4s \n" - "fadd v6.4s, v6.4s, v8.4s \n" - - "ext v1.16b, v0.16b, v31.16b, #8 \n" - "ext v3.16b, v2.16b, v31.16b, #8 \n" - "ext v5.16b, v4.16b, v31.16b, #8 \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - - // write output - "str d0, [x0], #8 \n" - "str d1, [x1], #8 \n" - "str d2, [x2], #8 \n" - "str d3, [x3], #8 \n" - - "str d4, [x0] \n" - "str d5, [x1] \n" - "str d6, [x2] \n" - "str d7, [x3] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [doutl] "+r"(doutl_ptr) - : [s_12] "r"(s_12), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "x0", - "x1", - "x2", - "x3", - "v0", - "v1", - "v2", - "v3", - "v5", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v25", - "v26", - "v27", - "v28", - "v29", - "v31"); -} - -//! kernel for four out with extracting data post -//! deal with four lines out -void compute_four_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! 
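// ---------------------------------------------------------------------
// The four-out "post" kernels run out of asm operands (five weight
// vectors plus eight din pointers), so the four dout rows travel through
// one array whose address the asm unpacks with ldp into x0-x3. A sketch
// of the calling-side pattern, with the cast spelled the way such code
// is normally written (ours, illustrative):
static inline void* pack_dout_rows(const float* d0, const float* d1,
                                   const float* d2, const float* d3,
                                   const float* rows[4]) {
  rows[0] = d0; rows[1] = d1; rows[2] = d2; rows[3] = d3;
  return reinterpret_cast<void*>(rows);  // handed to the asm as %[doutl]
}
// ---------------------------------------------------------------------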
dout0 - dout3: v0-v3 - const int64_t s_12 = 12; - const float* doutl[4] = {dout0, dout1, dout2, dout3}; - void* doutl_ptr = reinterpret_cast(doutl); - asm volatile( - "movi v31.4s, #0 \n" - "ldp x0, x1, [%[doutl]], #16 \n" - "ldp x2, x3, [%[doutl]] \n" - - // load inputs - "ld1 {v8.4s}, [%[din0]], %[s_12] \n" - "ld1 {v9.4s}, [%[din1]], %[s_12] \n" - "ld1 {v10.4s}, [%[din2]], %[s_12] \n" - "ld1 {v11.4s}, [%[din3]], %[s_12] \n" - "ld1 {v12.4s}, [%[din4]], %[s_12] \n" - "ld1 {v13.4s}, [%[din5]], %[s_12] \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], %[s_12] \n" - "ld1 {v15.4s}, [%[din7]], %[s_12] \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - - // load input col5 - "ld1 {v20.s}[0], [%[din0]] \n" - "ld1 {v20.s}[1], [%[din1]] \n" - "ld1 {v20.s}[2], [%[din2]] \n" - "ld1 {v20.s}[3], [%[din3]] \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // load input col5 - "ld1 {v21.s}[0], [%[din4]] \n" - "ld1 {v21.s}[1], [%[din5]] \n" - "ld1 {v21.s}[2], [%[din6]] \n" - "ld1 {v21.s}[3], [%[din7]] \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v27 - "faddp v27.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v27.4s, v27.4s, v26.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, 
%[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v26 - "faddp v26.4s, v16.4s, v17.4s \n" - "faddp v28.4s, v18.4s, v19.4s \n" - "faddp v26.4s, v26.4s, v28.4s \n" - - // ext input col5 - "ext v8.16b, v20.16b, v21.16b, #4 \n" - "ext v9.16b, v20.16b, v21.16b, #8 \n" - "ext v10.16b, v20.16b, v21.16b, #12 \n" - - // ext weights col0 - "ins v5.s[0], %[w0].s[0] \n" - "ins v5.s[1], %[w1].s[0] \n" - "ins v5.s[2], %[w2].s[0] \n" - "ins v5.s[3], %[w3].s[0] \n" - - // in col5 - "fmul v16.4s, v5.4s, v20.4s \n" - "fmul v17.4s, v5.4s, v8.4s \n" - "fmul v18.4s, v5.4s, v9.4s \n" - "fmul v19.4s, v5.4s, v10.4s \n" - - // add to out register v28 - "faddp v28.4s, v16.4s, v17.4s \n" - "faddp v29.4s, v18.4s, v19.4s \n" - "faddp v28.4s, v28.4s, v29.4s \n" - "fmla v28.4s, v21.4s, %[w4].s[0] \n" - - "ld1 {v8.4s}, [%[bias]] \n" - - // zip - "zip1 v0.4s, v25.4s, v27.4s \n" - "zip2 v2.4s, v25.4s, v27.4s \n" - "zip1 v4.4s, v26.4s, v28.4s \n" - "zip2 v6.4s, v26.4s, v28.4s \n" - - // add bias - "fadd v0.4s, v0.4s, v8.4s \n" - "fadd v2.4s, v2.4s, v8.4s \n" - "fadd v4.4s, v4.4s, v8.4s \n" - "fadd v6.4s, v6.4s, v8.4s \n" - - // relu - "fmax v0.4s, v0.4s, v31.4s \n" - "fmax v2.4s, v2.4s, v31.4s \n" - "fmax v4.4s, v4.4s, v31.4s \n" - "fmax v6.4s, v6.4s, v31.4s \n" - - "ext v1.16b, v0.16b, v31.16b, #8 \n" - "ext v3.16b, v2.16b, v31.16b, #8 \n" - "ext v5.16b, v4.16b, v31.16b, #8 \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - - // write output - "str d0, [x0], #8 \n" - "str d1, [x1], #8 \n" - "str d2, [x2], #8 \n" - "str d3, [x3], #8 \n" - - "str d4, [x0] \n" - "str d5, [x1] \n" - "str d6, [x2] \n" - "str d7, [x3] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [doutl] "+r"(doutl_ptr) - : [s_12] "r"(s_12), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "x0", - "x1", - "x2", - "x3", - "v0", - "v1", - "v2", - "v3", - "v5", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v25", - "v26", - "v27", - "v28", - "v29", - "v31"); -} - -void conv_depthwise_5x5s1_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - int pad_new = pad > 4 ? 
4 : pad; - int pad_0 = pad - pad_new; - int h_out_new = h_out - 2 * pad_0; - int mid_out = w_out - 2 * pad; - int mid_cnt = mid_out >> 2; - int mid_remain = mid_out - (mid_cnt << 2); - int pad_cnt = pad_0 >> 2; - int pad_remain = pad_0 - (pad_cnt << 2); - int bias_cnt = (w_out * pad_0) >> 2; - int bias_remain = (w_out * pad_0) - (bias_cnt << 2); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_spatial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - float bias_c = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - float32x4_t vbias_c = vdupq_n_f32(bias_c); - if (flag_bias) { - //! deal with h_out pad_0 line with bias - for (int i = 0; i < bias_cnt; ++i) { - vst1q_f32(dout_ch, vbias_c); - dout_ch += 4; - } - for (int i = 0; i < bias_remain; ++i) { - *dout_ch++ = bias_c; - } - } else { - //! deal with h_out pad_0 line without bias - for (int i = 0; i < pad_0; ++i) { - memset(dout_ch, 0x00, w_out * sizeof(float)); - dout_ch += w_out; - } - } - const float* din_list[8]; - const float* dinl[8]; - //! set din ptr with zero buffer - for (int i = 0; i < pad_new; ++i) { - din_list[i] = zero_ptr; - } - //! set din ptr with input data - for (int i = pad_new; i < 8; ++i) { - din_list[i] = din_ch; - din_ch += w_in; - } - - //! every h loop, deal with 4 line output - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - float* dout2 = dout1 + w_out; - float* dout3 = dout2 + w_out; - - //! load weights to neon register - const float* weights_c = weights + c * weights_spatial_size; - - float32x4_t w5 = vdupq_n_f32(0.f); - float32x4_t w6 = vdupq_n_f32(0.f); - float32x4_t w0 = vld1q_f32(weights_c); - float32x4_t w1 = vld1q_f32(weights_c + 5); - float32x4_t w2 = vld1q_f32(weights_c + 10); - float32x4_t w3 = vld1q_f32(weights_c + 15); - float32x4_t w4 = vld1q_f32(weights_c + 20); - w5 = vsetq_lane_f32(weights_c[4], w5, 0); - w5 = vsetq_lane_f32(weights_c[9], w5, 1); - w5 = vsetq_lane_f32(weights_c[14], w5, 2); - w5 = vsetq_lane_f32(weights_c[19], w5, 3); - w6 = vsetq_lane_f32(weights_c[24], w6, 0); - - //! h loop - for (int h = 0; h < h_out_new; h += 4) { - //! (h - pad_new) + 7 > h_in - 1 - if (h + 8 - pad_new > h_in) { - switch (h + 8 - pad_new - h_in) { - case 7: - din_list[1] = zero_ptr; - case 6: - din_list[2] = zero_ptr; - case 5: - din_list[3] = zero_ptr; - case 4: - din_list[4] = zero_ptr; - case 3: - din_list[5] = zero_ptr; - case 2: - din_list[6] = zero_ptr; - case 1: - din_list[7] = zero_ptr; - default: - break; - } - } - if (h + 4 > h_out_new) { - switch (h + 4 - h_out_new) { - case 3: - dout1 = write_ptr; - case 2: - dout2 = write_ptr; - case 1: - dout3 = write_ptr; - default: - break; - } - } - - //! every h loop, deal with 8 line input - dinl[0] = din_list[0]; - dinl[1] = din_list[1]; - dinl[2] = din_list[2]; - dinl[3] = din_list[3]; - dinl[4] = din_list[4]; - dinl[5] = din_list[5]; - dinl[6] = din_list[6]; - dinl[7] = din_list[7]; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - float* dout_ptr2 = dout2; - float* dout_ptr3 = dout3; - if (flag_bias) { - //!
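// ---------------------------------------------------------------------
// The driver splits the requested padding in two because a 5x5 window
// can reach at most 4 cells past the input: pad_new is the part the
// edge-extract kernels actually compute, and pad_0 is pure fill (bias,
// or zero without bias). Sketch of the derived sizes, mirroring the
// code above (ours):
struct Pad5x5 {
  int pad_new;    // padding overlapped by real 5x5 windows (<= 4)
  int pad_0;      // fully-padded ring: constant output, no convolution
  int h_out_new;  // output rows that still need any computation
  int mid_out;    // output columns covered by a full 5x5 window
};
static inline Pad5x5 split_pad_5x5(int pad, int h_out, int w_out) {
  Pad5x5 p;
  p.pad_new = pad > 4 ? 4 : pad;
  p.pad_0 = pad - p.pad_new;  // e.g. pad = 5 -> pad_new = 4, pad_0 = 1
  p.h_out_new = h_out - 2 * p.pad_0;
  p.mid_out = w_out - 2 * pad;
  return p;
}
// ---------------------------------------------------------------------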
deal with w_out pad_0 column pre with bias - for (int i = 0; i < pad_cnt; i++) { - vst1q_f32(dout_ptr0, vbias_c); - vst1q_f32(dout_ptr1, vbias_c); - vst1q_f32(dout_ptr2, vbias_c); - vst1q_f32(dout_ptr3, vbias_c); - dout_ptr0 += 4; - dout_ptr1 += 4; - dout_ptr2 += 4; - dout_ptr3 += 4; - } - for (int i = 0; i < pad_remain; ++i) { - *dout_ptr0++ = bias_c; - *dout_ptr1++ = bias_c; - *dout_ptr2++ = bias_c; - *dout_ptr3++ = bias_c; - } - } else { - //! deal with w_out pad_0 column pre without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr2, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr3, 0x00, pad_0 * sizeof(float)); - dout_ptr0 += pad_0; - dout_ptr1 += pad_0; - dout_ptr2 += pad_0; - dout_ptr3 += pad_0; - } - //! deal with w_out pad_new column pre - switch (pad_new) { - case 4: - compute_four_out_extract_pre(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - weights_ptr, - vbias); - dout_ptr0 += 4; - dout_ptr1 += 4; - dout_ptr2 += 4; - dout_ptr3 += 4; - break; - case 3: - compute_three_out_extract_pre(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - weights_ptr, - vbias); - dout_ptr0 += 3; - dout_ptr1 += 3; - dout_ptr2 += 3; - dout_ptr3 += 3; - break; - case 2: - compute_two_out_extract_pre(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - weights_ptr, - vbias); - dout_ptr0 += 2; - dout_ptr1 += 2; - dout_ptr2 += 2; - dout_ptr3 += 2; - break; - case 1: - compute_one_out_extract_pre(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - weights_ptr, - vbias); - dout_ptr0 += 1; - dout_ptr1 += 1; - dout_ptr2 += 1; - dout_ptr3 += 1; - break; - } - //! mid loop - if (mid_cnt > 0) { - void* dinl_ptr = reinterpret_cast(dinl); - int mid_loop = mid_cnt; - asm volatile( - //! din: v7-v14 - //! dout: v15-v18 - "mov x0, #0 \n" - "mov x1, #4 \n" - "ldp x2, x3, [%[dinl]], #16 \n" - "ldp x4, x5, [%[dinl]], #16 \n" - "ldp x6, x7, [%[dinl]], #16 \n" - "ldp x8, x9, [%[dinl]], #16 \n" - - "ld1 {v7.4s} , [x2], x1 \n" - "ld1 {v8.4s} , [x3], x1 \n" - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - //! load bias - "ld1 {v19.4s}, [%[bias]] \n" - - "1: \n" - //! add bias to output - "mov v15.16b, v19.16b \n" - "mov v16.16b, v19.16b \n" - "mov v17.16b, v19.16b \n" - "mov v18.16b, v19.16b \n" - - //! 
loop cnt is even, prefetch 64 Byte to l1 cache - "cmp x0, #1 \n" - "bne 2f \n" - "mov x0, #0 \n" - "prfm pldl1keep, [x2] \n" - "prfm pldl1keep, [x3] \n" - "prfm pldl1keep, [x4] \n" - "prfm pldl1keep, [x5] \n" - "prfm pldl1keep, [x6] \n" - "prfm pldl1keep, [x7] \n" - "prfm pldl1keep, [x8] \n" - "prfm pldl1keep, [x9] \n" - - "2: \n" - // weights col 0 - "fmla v15.4s, v7.4s , %[w0].s[0] \n" - "fmla v16.4s, v8.4s , %[w0].s[0] \n" - "fmla v17.4s, v9.4s , %[w0].s[0] \n" - "fmla v18.4s, v10.4s, %[w0].s[0] \n" - - "fmla v15.4s, v8.4s , %[w1].s[0] \n" - "fmla v16.4s, v9.4s , %[w1].s[0] \n" - "fmla v17.4s, v10.4s, %[w1].s[0] \n" - "fmla v18.4s, v11.4s, %[w1].s[0] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[0] \n" - "fmla v16.4s, v10.4s, %[w2].s[0] \n" - "fmla v17.4s, v11.4s, %[w2].s[0] \n" - "fmla v18.4s, v12.4s, %[w2].s[0] \n" - - "fmla v15.4s, v10.4s, %[w3].s[0] \n" - "fmla v16.4s, v11.4s, %[w3].s[0] \n" - "fmla v17.4s, v12.4s, %[w3].s[0] \n" - "fmla v18.4s, v13.4s, %[w3].s[0] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[0] \n" - "fmla v16.4s, v12.4s, %[w4].s[0] \n" - "fmla v17.4s, v13.4s, %[w4].s[0] \n" - "fmla v18.4s, v14.4s, %[w4].s[0] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 1 - "fmla v15.4s, v7.4s , %[w0].s[1] \n" - "fmla v16.4s, v8.4s , %[w0].s[1] \n" - "fmla v17.4s, v9.4s , %[w0].s[1] \n" - "fmla v18.4s, v10.4s, %[w0].s[1] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s , %[w1].s[1] \n" - "fmla v16.4s, v9.4s , %[w1].s[1] \n" - "fmla v17.4s, v10.4s, %[w1].s[1] \n" - "fmla v18.4s, v11.4s, %[w1].s[1] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[1] \n" - "fmla v16.4s, v10.4s, %[w2].s[1] \n" - "fmla v17.4s, v11.4s, %[w2].s[1] \n" - "fmla v18.4s, v12.4s, %[w2].s[1] \n" - - "fmla v15.4s, v10.4s, %[w3].s[1] \n" - "fmla v16.4s, v11.4s, %[w3].s[1] \n" - "fmla v17.4s, v12.4s, %[w3].s[1] \n" - "fmla v18.4s, v13.4s, %[w3].s[1] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[1] \n" - "fmla v16.4s, v12.4s, %[w4].s[1] \n" - "fmla v17.4s, v13.4s, %[w4].s[1] \n" - "fmla v18.4s, v14.4s, %[w4].s[1] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 2 - "fmla v15.4s, v7.4s , %[w0].s[2] \n" - "fmla v16.4s, v8.4s , %[w0].s[2] \n" - "fmla v17.4s, v9.4s , %[w0].s[2] \n" - "fmla v18.4s, v10.4s, %[w0].s[2] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s , %[w1].s[2] \n" - "fmla v16.4s, v9.4s , %[w1].s[2] \n" - "fmla v17.4s, v10.4s, %[w1].s[2] \n" - "fmla v18.4s, v11.4s, %[w1].s[2] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[2] \n" - "fmla v16.4s, v10.4s, %[w2].s[2] \n" - "fmla v17.4s, v11.4s, %[w2].s[2] \n" - "fmla v18.4s, v12.4s, %[w2].s[2] \n" - - "fmla v15.4s, v10.4s, %[w3].s[2] \n" - "fmla v16.4s, v11.4s, %[w3].s[2] \n" - "fmla v17.4s, v12.4s, %[w3].s[2] \n" - "fmla v18.4s, v13.4s, %[w3].s[2] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[2] \n" - "fmla v16.4s, v12.4s, %[w4].s[2] \n" - "fmla v17.4s, v13.4s, %[w4].s[2] \n" - "fmla v18.4s, v14.4s, %[w4].s[2] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 3 - "fmla v15.4s, v7.4s , %[w0].s[3] \n" - "fmla v16.4s, v8.4s , %[w0].s[3] \n" - "fmla v17.4s, v9.4s 
, %[w0].s[3] \n" - "fmla v18.4s, v10.4s, %[w0].s[3] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s , %[w1].s[3] \n" - "fmla v16.4s, v9.4s , %[w1].s[3] \n" - "fmla v17.4s, v10.4s, %[w1].s[3] \n" - "fmla v18.4s, v11.4s, %[w1].s[3] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[3] \n" - "fmla v16.4s, v10.4s, %[w2].s[3] \n" - "fmla v17.4s, v11.4s, %[w2].s[3] \n" - "fmla v18.4s, v12.4s, %[w2].s[3] \n" - - "fmla v15.4s, v10.4s, %[w3].s[3] \n" - "fmla v16.4s, v11.4s, %[w3].s[3] \n" - "fmla v17.4s, v12.4s, %[w3].s[3] \n" - "fmla v18.4s, v13.4s, %[w3].s[3] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[3] \n" - "fmla v16.4s, v12.4s, %[w4].s[3] \n" - "fmla v17.4s, v13.4s, %[w4].s[3] \n" - "fmla v18.4s, v14.4s, %[w4].s[3] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 4 - "fmla v15.4s, v7.4s, %[w5].s[0] \n" - "fmla v16.4s, v8.4s, %[w5].s[0] \n" - "fmla v17.4s, v9.4s, %[w5].s[0] \n" - "fmla v18.4s, v10.4s, %[w5].s[0] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s, %[w5].s[1] \n" - "fmla v16.4s, v9.4s, %[w5].s[1] \n" - "fmla v17.4s, v10.4s, %[w5].s[1] \n" - "fmla v18.4s, v11.4s, %[w5].s[1] \n" - - "fmla v15.4s, v9.4s , %[w5].s[2] \n" - "fmla v16.4s, v10.4s, %[w5].s[2] \n" - "fmla v17.4s, v11.4s, %[w5].s[2] \n" - "fmla v18.4s, v12.4s, %[w5].s[2] \n" - - "fmla v15.4s, v10.4s, %[w5].s[3] \n" - "fmla v16.4s, v11.4s, %[w5].s[3] \n" - "fmla v17.4s, v12.4s, %[w5].s[3] \n" - "fmla v18.4s, v13.4s, %[w5].s[3] \n" - - "fmla v15.4s, v11.4s, %[w6].s[0] \n" - "fmla v16.4s, v12.4s, %[w6].s[0] \n" - "fmla v17.4s, v13.4s, %[w6].s[0] \n" - "fmla v18.4s, v14.4s, %[w6].s[0] \n" - - "st1 {v15.4s}, [%[dout0]], #16 \n" - "st1 {v16.4s}, [%[dout1]], #16 \n" - "st1 {v17.4s}, [%[dout2]], #16 \n" - "st1 {v18.4s}, [%[dout3]], #16 \n" - - "subs %w[cnt], %w[cnt], #1 \n" - "add x0, x0, #1 \n" - "bne 1b \n" - - : [dout0] "+r"(dout_ptr0), - [dout1] "+r"(dout_ptr1), - [dout2] "+r"(dout_ptr2), - [dout3] "+r"(dout_ptr3), - [cnt] "+r"(mid_loop), - [dinl] "+r"(dinl_ptr) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [bias] "r"(vbias) - : "cc", - "memory", - "x0", - "x1", - "x2", - "x3", - "x4", - "x5", - "x6", - "x7", - "x8", - "x9", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19"); - } - dinl[0] += 4 * mid_cnt; - dinl[1] += 4 * mid_cnt; - dinl[2] += 4 * mid_cnt; - dinl[3] += 4 * mid_cnt; - dinl[4] += 4 * mid_cnt; - dinl[5] += 4 * mid_cnt; - dinl[6] += 4 * mid_cnt; - dinl[7] += 4 * mid_cnt; - //! deal with mid remain - for (int i = 0; i < mid_remain; ++i) { - compute_one_out_without_extract(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - w0, - w1, - w2, - w3, - w4, - w5, - w6, - vbias); - dinl[0]++; - dinl[1]++; - dinl[2]++; - dinl[3]++; - dinl[4]++; - dinl[5]++; - dinl[6]++; - dinl[7]++; - - dout_ptr0++; - dout_ptr1++; - dout_ptr2++; - dout_ptr3++; - } - //! 
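// ---------------------------------------------------------------------
// What the unrolled mid-loop above produces, 4 columns x 4 rows per
// iteration (note the x1 = #4 post-increment: every ld1 advances one
// float, sliding the window across the 5 kernel columns). A scalar
// reference for a single interior output point (ours, for comparison
// and testing only):
static float conv5x5_point(const float* const din[5],  // 5 input rows
                           const float* w,             // 25 row-major taps
                           int x, float bias) {
  float s = bias;
  for (int r = 0; r < 5; ++r)
    for (int k = 0; k < 5; ++k)
      s += din[r][x + k] * w[5 * r + k];
  return s;
}
// ---------------------------------------------------------------------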
deal with w_out pad_new column post
-        switch (pad_new) {
-          case 4:
-            compute_four_out_extract_post(dinl[0], dinl[1], dinl[2], dinl[3],
-                                          dinl[4], dinl[5], dinl[6], dinl[7],
-                                          dout_ptr0, dout_ptr1, dout_ptr2,
-                                          dout_ptr3, w0, w1, w2, w3, w4, vbias);
-            dout_ptr0 += 4;
-            dout_ptr1 += 4;
-            dout_ptr2 += 4;
-            dout_ptr3 += 4;
-            break;
-          case 3:
-            compute_three_out_extract_post(dinl[0], dinl[1], dinl[2], dinl[3],
-                                           dinl[4], dinl[5], dinl[6], dinl[7],
-                                           dout_ptr0, dout_ptr1, dout_ptr2,
-                                           dout_ptr3, w0, w1, w2, w3, w4, vbias);
-            dout_ptr0 += 3;
-            dout_ptr1 += 3;
-            dout_ptr2 += 3;
-            dout_ptr3 += 3;
-            break;
-          case 2:
-            compute_two_out_extract_post(dinl[0], dinl[1], dinl[2], dinl[3],
-                                         dinl[4], dinl[5], dinl[6], dinl[7],
-                                         dout_ptr0, dout_ptr1, dout_ptr2,
-                                         dout_ptr3, w0, w1, w2, w3, w4, vbias);
-            dout_ptr0 += 2;
-            dout_ptr1 += 2;
-            dout_ptr2 += 2;
-            dout_ptr3 += 2;
-            break;
-          case 1:
-            compute_one_out_extract_post(dinl[0], dinl[1], dinl[2], dinl[3],
-                                         dinl[4], dinl[5], dinl[6], dinl[7],
-                                         dout_ptr0, dout_ptr1, dout_ptr2,
-                                         dout_ptr3, w0, w1, w2, w3, w4, vbias);
-            dout_ptr0 += 1;
-            dout_ptr1 += 1;
-            dout_ptr2 += 1;
-            dout_ptr3 += 1;
-            break;
-        }
-
-        if (flag_bias) {
-          //! deal with w_out pad_0 column post with bias
-          memcpy(dout_ptr0, dout0, pad_0 * sizeof(float));
-          memcpy(dout_ptr1, dout1, pad_0 * sizeof(float));
-          memcpy(dout_ptr2, dout2, pad_0 * sizeof(float));
-          memcpy(dout_ptr3, dout3, pad_0 * sizeof(float));
-        } else {
-          //! deal with w_out pad_0 column post without bias
-          memset(dout_ptr0, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr1, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr2, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr3, 0x00, pad_0 * sizeof(float));
-        }
-
-        din_list[0] = din_list[4];
-        din_list[1] = din_list[5];
-        din_list[2] = din_list[6];
-        din_list[3] = din_list[7];
-        din_list[4] = din_list[3] + w_in;
-        din_list[5] = din_list[4] + w_in;
-        din_list[6] = din_list[5] + w_in;
-        din_list[7] = din_list[6] + w_in;
-
-        dout0 = dout3 + w_out;
-        dout1 = dout0 + w_out;
-        dout2 = dout1 + w_out;
-        dout3 = dout2 + w_out;
-      }
-      float* dout_pad_end = dout_ch + h_out_new * w_out;
-      if (flag_bias) {
-        //! deal with h_out pad_0 line with bias
-        memcpy(reinterpret_cast<float*>(dout_pad_end),
-               dout_ch - pad_0 * w_out,
-               pad_0 * w_out * sizeof(float));
-      } else {
-        //! deal with h_out pad_0 line without bias
-        memset(reinterpret_cast<float*>(dout_pad_end),
-               0x00,
-               pad_0 * w_out * sizeof(float));
-      }
-    }
-  }
-}
-
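// A plain scalar rendering of what one channel of the 5x5, stride-1 depthwise
// convolution above computes, useful as a reference when validating the
// assembly kernels. This is a minimal sketch added for illustration:
// `conv_depthwise_5x5s1_ref` and its parameter names are hypothetical, not
// identifiers from this file, and padding is assumed to be handled by the
// caller (the input is already framed so only the "valid" region is read).
static void conv_depthwise_5x5s1_ref(const float* din, float* dout, int h_in,
                                     int w_in, const float* w25, float bias_v,
                                     bool relu) {
  int h_out = h_in - 4;  // 5x5 window, stride 1, no implicit padding
  int w_out = w_in - 4;
  for (int oh = 0; oh < h_out; ++oh) {
    for (int ow = 0; ow < w_out; ++ow) {
      float sum = bias_v;  // bias folded into the accumulator, as in the asm
      for (int kh = 0; kh < 5; ++kh) {
        for (int kw = 0; kw < 5; ++kw) {
          sum += din[(oh + kh) * w_in + (ow + kw)] * w25[kh * 5 + kw];
        }
      }
      dout[oh * w_out + ow] = relu ? (sum > 0.f ? sum : 0.f) : sum;
    }
  }
}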
-void conv_depthwise_5x5s1_relu_impl(const float* din,
-                                    float* dout,
-                                    int num,
-                                    int ch_out,
-                                    int h_out,
-                                    int w_out,
-                                    int ch_in,
-                                    int h_in,
-                                    int w_in,
-                                    const float* weights,
-                                    const float* bias,
-                                    int pad,
-                                    bool flag_bias,
-                                    bool flag_relu,
-                                    ARMContext* ctx) {
-  float* zero_ptr = ctx->workspace_data<float>();
-  memset(zero_ptr, 0, w_in * sizeof(float));
-  float* write_ptr = zero_ptr + w_in;
-  int pad_new = pad > 4 ? 4 : pad;
-  int pad_0 = pad - pad_new;
-  int h_out_new = h_out - 2 * pad_0;
-  int mid_out = w_out - 2 * pad;
-  int mid_cnt = mid_out >> 2;
-  int mid_remain = mid_out - (mid_cnt << 2);
-  int pad_cnt = pad_0 >> 2;
-  int pad_remain = pad_0 - (pad_cnt << 2);
-  int bias_cnt = (w_out * pad_0) >> 2;
-  int bias_remain = (w_out * pad_0) - (bias_cnt << 2);
-  int in_spatial_size = w_in * h_in;
-  int out_spatial_size = w_out * h_out;
-  int weights_spatial_size = 25;
-
-  for (int n = 0; n < num; ++n) {
-    const float* din_batch = din + n * in_spatial_size * ch_in;
-    float* dout_batch = dout + n * out_spatial_size * ch_out;
-#pragma omp parallel for
-    for (int c = 0; c < ch_in; ++c) {
-      const float* din_ch = din_batch + c * in_spatial_size;
-      float* dout_ch = dout_batch + c * out_spatial_size;
-      float bias_c = flag_bias ? bias[c] : 0.f;
-      float bias_relu = bias_c > 0.f ? bias_c : 0.f;
-      float vbias[4] = {bias_c, bias_c, bias_c, bias_c};
-      float32x4_t vbias_c = vdupq_n_f32(bias_relu);
-      if (flag_bias) {
-        //! deal with h_out pad_0 line with bias
-        for (int i = 0; i < bias_cnt; ++i) {
-          vst1q_f32(dout_ch, vbias_c);
-          dout_ch += 4;
-        }
-        for (int i = 0; i < bias_remain; ++i) {
-          *dout_ch++ = bias_relu;
-        }
-      } else {
-        //! deal with h_out pad_0 line without bias
-        for (int i = 0; i < pad_0; ++i) {
-          memset(dout_ch, 0x00, w_out * sizeof(float));
-          dout_ch += w_out;
-        }
-      }
-      const float* din_list[8];
-      const float* dinl[8];
-      //! set din ptr with zero buffer
-      for (int i = 0; i < pad_new; ++i) {
-        din_list[i] = zero_ptr;
-      }
-      //! set din ptr with input data
-      for (int i = pad_new; i < 8; ++i) {
-        din_list[i] = din_ch;
-        din_ch += w_in;
-      }
-
-      //! every h loop, deal with 4 line output
-      float* dout0 = dout_ch;
-      float* dout1 = dout0 + w_out;
-      float* dout2 = dout1 + w_out;
-      float* dout3 = dout2 + w_out;
-
-      //! load weights to neon register
-      const float* weights_c = weights + c * weights_spatial_size;
-
-      float32x4_t w5;
-      float32x4_t w6;
-      float32x4_t w0 = vld1q_f32(weights_c);
-      float32x4_t w1 = vld1q_f32(weights_c + 5);
-      float32x4_t w2 = vld1q_f32(weights_c + 10);
-      float32x4_t w3 = vld1q_f32(weights_c + 15);
-      float32x4_t w4 = vld1q_f32(weights_c + 20);
-      w5 = vsetq_lane_f32(weights_c[4], w5, 0);
-      w5 = vsetq_lane_f32(weights_c[9], w5, 1);
-      w5 = vsetq_lane_f32(weights_c[14], w5, 2);
-      w5 = vsetq_lane_f32(weights_c[19], w5, 3);
-      w6 = vsetq_lane_f32(weights_c[24], w6, 0);
-
-      //! h loop
-      for (int h = 0; h < h_out_new; h += 4) {
-        //! (h - pad_new) + 7 > h_in - 1
-        if (h + 8 - pad_new > h_in) {
-          switch (h + 8 - pad_new - h_in) {
-            case 7:
-              din_list[1] = zero_ptr;
-            case 6:
-              din_list[2] = zero_ptr;
-            case 5:
-              din_list[3] = zero_ptr;
-            case 4:
-              din_list[4] = zero_ptr;
-            case 3:
-              din_list[5] = zero_ptr;
-            case 2:
-              din_list[6] = zero_ptr;
-            case 1:
-              din_list[7] = zero_ptr;
-            default:
-              break;
-          }
-        }
-        if (h + 4 > h_out_new) {
-          switch (h + 4 - h_out_new) {
-            case 3:
-              dout1 = write_ptr;
-            case 2:
-              dout2 = write_ptr;
-            case 1:
-              dout3 = write_ptr;
-            default:
-              break;
-          }
-        }
-
-        //! every h loop, deal with 8 line input
-        dinl[0] = din_list[0];
-        dinl[1] = din_list[1];
-        dinl[2] = din_list[2];
-        dinl[3] = din_list[3];
-        dinl[4] = din_list[4];
-        dinl[5] = din_list[5];
-        dinl[6] = din_list[6];
-        dinl[7] = din_list[7];
-
-        const float* weights_ptr = weights_c;
-        float* dout_ptr0 = dout0;
-        float* dout_ptr1 = dout1;
-        float* dout_ptr2 = dout2;
-        float* dout_ptr3 = dout3;
-        if (flag_bias) {
-          //! deal with w_out pad_0 column pre with bias
-          for (int i = 0; i < pad_cnt; i++) {
-            vst1q_f32(dout_ptr0, vbias_c);
-            vst1q_f32(dout_ptr1, vbias_c);
-            vst1q_f32(dout_ptr2, vbias_c);
-            vst1q_f32(dout_ptr3, vbias_c);
-            dout_ptr0 += 4;
-            dout_ptr1 += 4;
-            dout_ptr2 += 4;
-            dout_ptr3 += 4;
-          }
-          for (int i = 0; i < pad_remain; ++i) {
-            *dout_ptr0++ = bias_relu;
-            *dout_ptr1++ = bias_relu;
-            *dout_ptr2++ = bias_relu;
-            *dout_ptr3++ = bias_relu;
-          }
-        } else {
-          //! deal with w_out pad_0 column pre without bias
-          memset(dout_ptr0, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr1, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr2, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr3, 0x00, pad_0 * sizeof(float));
-          dout_ptr0 += pad_0;
-          dout_ptr1 += pad_0;
-          dout_ptr2 += pad_0;
-          dout_ptr3 += pad_0;
-        }
-        //! deal with w_out pad_new column pre
-        switch (pad_new) {
-          case 4:
-            compute_four_out_extract_pre_relu(dinl[0], dinl[1], dinl[2],
-                                              dinl[3], dinl[4], dinl[5],
-                                              dinl[6], dinl[7], dout_ptr0,
-                                              dout_ptr1, dout_ptr2, dout_ptr3,
-                                              weights_ptr, vbias);
-            dout_ptr0 += 4;
-            dout_ptr1 += 4;
-            dout_ptr2 += 4;
-            dout_ptr3 += 4;
-            break;
-          case 3:
-            compute_three_out_extract_pre_relu(dinl[0], dinl[1], dinl[2],
-                                               dinl[3], dinl[4], dinl[5],
-                                               dinl[6], dinl[7], dout_ptr0,
-                                               dout_ptr1, dout_ptr2, dout_ptr3,
-                                               weights_ptr, vbias);
-            dout_ptr0 += 3;
-            dout_ptr1 += 3;
-            dout_ptr2 += 3;
-            dout_ptr3 += 3;
-            break;
-          case 2:
-            compute_two_out_extract_pre_relu(dinl[0], dinl[1], dinl[2],
-                                             dinl[3], dinl[4], dinl[5],
-                                             dinl[6], dinl[7], dout_ptr0,
-                                             dout_ptr1, dout_ptr2, dout_ptr3,
-                                             weights_ptr, vbias);
-            dout_ptr0 += 2;
-            dout_ptr1 += 2;
-            dout_ptr2 += 2;
-            dout_ptr3 += 2;
-            break;
-          case 1:
-            compute_one_out_extract_pre_relu(dinl[0], dinl[1], dinl[2],
-                                             dinl[3], dinl[4], dinl[5],
-                                             dinl[6], dinl[7], dout_ptr0,
-                                             dout_ptr1, dout_ptr2, dout_ptr3,
-                                             weights_ptr, vbias);
-            dout_ptr0 += 1;
-            dout_ptr1 += 1;
-            dout_ptr2 += 1;
-            dout_ptr3 += 1;
-            break;
-        }
-        //! mid loop
-        if (mid_cnt > 0) {
-          void* dinl_ptr = reinterpret_cast<void*>(dinl);
-          int mid_loop = mid_cnt;
-          asm volatile(
-              //! din: v7-v14
-              //! dout: v15-v18
-              "mov x0, #0                   \n"
-              "mov x1, #4                   \n"
-              "movi v31.4s, #0              \n"
-              "ldp x2, x3, [%[dinl]], #16   \n"
-              "ldp x4, x5, [%[dinl]], #16   \n"
-              "ldp x6, x7, [%[dinl]], #16   \n"
-              "ldp x8, x9, [%[dinl]], #16   \n"
-
-              "ld1 {v7.4s} , [x2], x1       \n"
-              "ld1 {v8.4s} , [x3], x1       \n"
-              "ld1 {v9.4s} , [x4], x1       \n"
-              "ld1 {v10.4s}, [x5], x1       \n"
-              "ld1 {v11.4s}, [x6], x1       \n"
-              "ld1 {v12.4s}, [x7], x1       \n"
-              "ld1 {v13.4s}, [x8], x1       \n"
-              "ld1 {v14.4s}, [x9], x1       \n"
-
-              //! load bias
-              "ld1 {v19.4s}, [%[bias]]      \n"
-
-              "1:                           \n"
-              //! add bias to output
-              "mov v15.16b, v19.16b         \n"
-              "mov v16.16b, v19.16b         \n"
-              "mov v17.16b, v19.16b         \n"
-              "mov v18.16b, v19.16b         \n"
-
-              //!
loop cnt is even, prefetch 64 Byte to l1 cache - "cmp x0, #1 \n" - "bne 2f \n" - "mov x0, #0 \n" - "prfm pldl1keep, [x2] \n" - "prfm pldl1keep, [x3] \n" - "prfm pldl1keep, [x4] \n" - "prfm pldl1keep, [x5] \n" - "prfm pldl1keep, [x6] \n" - "prfm pldl1keep, [x7] \n" - "prfm pldl1keep, [x8] \n" - "prfm pldl1keep, [x9] \n" - - "2: \n" - // weights col 0 - "fmla v15.4s, v7.4s , %[w0].s[0] \n" - "fmla v16.4s, v8.4s , %[w0].s[0] \n" - "fmla v17.4s, v9.4s , %[w0].s[0] \n" - "fmla v18.4s, v10.4s, %[w0].s[0] \n" - - "fmla v15.4s, v8.4s , %[w1].s[0] \n" - "fmla v16.4s, v9.4s , %[w1].s[0] \n" - "fmla v17.4s, v10.4s, %[w1].s[0] \n" - "fmla v18.4s, v11.4s, %[w1].s[0] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[0] \n" - "fmla v16.4s, v10.4s, %[w2].s[0] \n" - "fmla v17.4s, v11.4s, %[w2].s[0] \n" - "fmla v18.4s, v12.4s, %[w2].s[0] \n" - - "fmla v15.4s, v10.4s, %[w3].s[0] \n" - "fmla v16.4s, v11.4s, %[w3].s[0] \n" - "fmla v17.4s, v12.4s, %[w3].s[0] \n" - "fmla v18.4s, v13.4s, %[w3].s[0] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[0] \n" - "fmla v16.4s, v12.4s, %[w4].s[0] \n" - "fmla v17.4s, v13.4s, %[w4].s[0] \n" - "fmla v18.4s, v14.4s, %[w4].s[0] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 1 - "fmla v15.4s, v7.4s , %[w0].s[1] \n" - "fmla v16.4s, v8.4s , %[w0].s[1] \n" - "fmla v17.4s, v9.4s , %[w0].s[1] \n" - "fmla v18.4s, v10.4s, %[w0].s[1] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s , %[w1].s[1] \n" - "fmla v16.4s, v9.4s , %[w1].s[1] \n" - "fmla v17.4s, v10.4s, %[w1].s[1] \n" - "fmla v18.4s, v11.4s, %[w1].s[1] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[1] \n" - "fmla v16.4s, v10.4s, %[w2].s[1] \n" - "fmla v17.4s, v11.4s, %[w2].s[1] \n" - "fmla v18.4s, v12.4s, %[w2].s[1] \n" - - "fmla v15.4s, v10.4s, %[w3].s[1] \n" - "fmla v16.4s, v11.4s, %[w3].s[1] \n" - "fmla v17.4s, v12.4s, %[w3].s[1] \n" - "fmla v18.4s, v13.4s, %[w3].s[1] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[1] \n" - "fmla v16.4s, v12.4s, %[w4].s[1] \n" - "fmla v17.4s, v13.4s, %[w4].s[1] \n" - "fmla v18.4s, v14.4s, %[w4].s[1] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 2 - "fmla v15.4s, v7.4s , %[w0].s[2] \n" - "fmla v16.4s, v8.4s , %[w0].s[2] \n" - "fmla v17.4s, v9.4s , %[w0].s[2] \n" - "fmla v18.4s, v10.4s, %[w0].s[2] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s , %[w1].s[2] \n" - "fmla v16.4s, v9.4s , %[w1].s[2] \n" - "fmla v17.4s, v10.4s, %[w1].s[2] \n" - "fmla v18.4s, v11.4s, %[w1].s[2] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[2] \n" - "fmla v16.4s, v10.4s, %[w2].s[2] \n" - "fmla v17.4s, v11.4s, %[w2].s[2] \n" - "fmla v18.4s, v12.4s, %[w2].s[2] \n" - - "fmla v15.4s, v10.4s, %[w3].s[2] \n" - "fmla v16.4s, v11.4s, %[w3].s[2] \n" - "fmla v17.4s, v12.4s, %[w3].s[2] \n" - "fmla v18.4s, v13.4s, %[w3].s[2] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[2] \n" - "fmla v16.4s, v12.4s, %[w4].s[2] \n" - "fmla v17.4s, v13.4s, %[w4].s[2] \n" - "fmla v18.4s, v14.4s, %[w4].s[2] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 3 - "fmla v15.4s, v7.4s , %[w0].s[3] \n" - "fmla v16.4s, v8.4s , %[w0].s[3] \n" - "fmla v17.4s, v9.4s 
, %[w0].s[3] \n" - "fmla v18.4s, v10.4s, %[w0].s[3] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s , %[w1].s[3] \n" - "fmla v16.4s, v9.4s , %[w1].s[3] \n" - "fmla v17.4s, v10.4s, %[w1].s[3] \n" - "fmla v18.4s, v11.4s, %[w1].s[3] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[3] \n" - "fmla v16.4s, v10.4s, %[w2].s[3] \n" - "fmla v17.4s, v11.4s, %[w2].s[3] \n" - "fmla v18.4s, v12.4s, %[w2].s[3] \n" - - "fmla v15.4s, v10.4s, %[w3].s[3] \n" - "fmla v16.4s, v11.4s, %[w3].s[3] \n" - "fmla v17.4s, v12.4s, %[w3].s[3] \n" - "fmla v18.4s, v13.4s, %[w3].s[3] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[3] \n" - "fmla v16.4s, v12.4s, %[w4].s[3] \n" - "fmla v17.4s, v13.4s, %[w4].s[3] \n" - "fmla v18.4s, v14.4s, %[w4].s[3] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 4 - "fmla v15.4s, v7.4s, %[w5].s[0] \n" - "fmla v16.4s, v8.4s, %[w5].s[0] \n" - "fmla v17.4s, v9.4s, %[w5].s[0] \n" - "fmla v18.4s, v10.4s, %[w5].s[0] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s, %[w5].s[1] \n" - "fmla v16.4s, v9.4s, %[w5].s[1] \n" - "fmla v17.4s, v10.4s, %[w5].s[1] \n" - "fmla v18.4s, v11.4s, %[w5].s[1] \n" - - "fmla v15.4s, v9.4s , %[w5].s[2] \n" - "fmla v16.4s, v10.4s, %[w5].s[2] \n" - "fmla v17.4s, v11.4s, %[w5].s[2] \n" - "fmla v18.4s, v12.4s, %[w5].s[2] \n" - - "fmla v15.4s, v10.4s, %[w5].s[3] \n" - "fmla v16.4s, v11.4s, %[w5].s[3] \n" - "fmla v17.4s, v12.4s, %[w5].s[3] \n" - "fmla v18.4s, v13.4s, %[w5].s[3] \n" - - "fmla v15.4s, v11.4s, %[w6].s[0] \n" - "fmla v16.4s, v12.4s, %[w6].s[0] \n" - "fmla v17.4s, v13.4s, %[w6].s[0] \n" - "fmla v18.4s, v14.4s, %[w6].s[0] \n" - - "fmax v15.4s, v15.4s, v31.4s \n" - "fmax v16.4s, v16.4s, v31.4s \n" - "fmax v17.4s, v17.4s, v31.4s \n" - "fmax v18.4s, v18.4s, v31.4s \n" - - "st1 {v15.4s}, [%[dout0]], #16 \n" - "st1 {v16.4s}, [%[dout1]], #16 \n" - "st1 {v17.4s}, [%[dout2]], #16 \n" - "st1 {v18.4s}, [%[dout3]], #16 \n" - - "subs %w[cnt], %w[cnt], #1 \n" - "add x0, x0, #1 \n" - "bne 1b \n" - - : [dout0] "+r"(dout_ptr0), - [dout1] "+r"(dout_ptr1), - [dout2] "+r"(dout_ptr2), - [dout3] "+r"(dout_ptr3), - [cnt] "+r"(mid_loop), - [dinl] "+r"(dinl_ptr) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [bias] "r"(vbias) - : "cc", - "memory", - "x0", - "x1", - "x2", - "x3", - "x4", - "x5", - "x6", - "x7", - "x8", - "x9", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v31"); - } - dinl[0] += 4 * mid_cnt; - dinl[1] += 4 * mid_cnt; - dinl[2] += 4 * mid_cnt; - dinl[3] += 4 * mid_cnt; - dinl[4] += 4 * mid_cnt; - dinl[5] += 4 * mid_cnt; - dinl[6] += 4 * mid_cnt; - dinl[7] += 4 * mid_cnt; - //! deal with mid remain - for (int i = 0; i < mid_remain; ++i) { - compute_one_out_without_extract_relu(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - w0, - w1, - w2, - w3, - w4, - w5, - w6, - vbias); - dinl[0]++; - dinl[1]++; - dinl[2]++; - dinl[3]++; - dinl[4]++; - dinl[5]++; - dinl[6]++; - dinl[7]++; - - dout_ptr0++; - dout_ptr1++; - dout_ptr2++; - dout_ptr3++; - } - //! 
deal with w_out pad_new column post
-        switch (pad_new) {
-          case 4:
-            compute_four_out_extract_post_relu(dinl[0], dinl[1], dinl[2],
-                                               dinl[3], dinl[4], dinl[5],
-                                               dinl[6], dinl[7], dout_ptr0,
-                                               dout_ptr1, dout_ptr2, dout_ptr3,
-                                               w0, w1, w2, w3, w4, vbias);
-            dout_ptr0 += 4;
-            dout_ptr1 += 4;
-            dout_ptr2 += 4;
-            dout_ptr3 += 4;
-            break;
-          case 3:
-            compute_three_out_extract_post_relu(dinl[0], dinl[1], dinl[2],
-                                                dinl[3], dinl[4], dinl[5],
-                                                dinl[6], dinl[7], dout_ptr0,
-                                                dout_ptr1, dout_ptr2, dout_ptr3,
-                                                w0, w1, w2, w3, w4, vbias);
-            dout_ptr0 += 3;
-            dout_ptr1 += 3;
-            dout_ptr2 += 3;
-            dout_ptr3 += 3;
-            break;
-          case 2:
-            compute_two_out_extract_post_relu(dinl[0], dinl[1], dinl[2],
-                                              dinl[3], dinl[4], dinl[5],
-                                              dinl[6], dinl[7], dout_ptr0,
-                                              dout_ptr1, dout_ptr2, dout_ptr3,
-                                              w0, w1, w2, w3, w4, vbias);
-            dout_ptr0 += 2;
-            dout_ptr1 += 2;
-            dout_ptr2 += 2;
-            dout_ptr3 += 2;
-            break;
-          case 1:
-            compute_one_out_extract_post_relu(dinl[0], dinl[1], dinl[2],
-                                              dinl[3], dinl[4], dinl[5],
-                                              dinl[6], dinl[7], dout_ptr0,
-                                              dout_ptr1, dout_ptr2, dout_ptr3,
-                                              w0, w1, w2, w3, w4, vbias);
-            dout_ptr0 += 1;
-            dout_ptr1 += 1;
-            dout_ptr2 += 1;
-            dout_ptr3 += 1;
-            break;
-        }
-
-        if (flag_bias) {
-          //! deal with w_out pad_0 column post with bias
-          memcpy(dout_ptr0, dout0, pad_0 * sizeof(float));
-          memcpy(dout_ptr1, dout1, pad_0 * sizeof(float));
-          memcpy(dout_ptr2, dout2, pad_0 * sizeof(float));
-          memcpy(dout_ptr3, dout3, pad_0 * sizeof(float));
-        } else {
-          //! deal with w_out pad_0 column post without bias
-          memset(dout_ptr0, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr1, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr2, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr3, 0x00, pad_0 * sizeof(float));
-        }
-
-        din_list[0] = din_list[4];
-        din_list[1] = din_list[5];
-        din_list[2] = din_list[6];
-        din_list[3] = din_list[7];
-        din_list[4] = din_list[3] + w_in;
-        din_list[5] = din_list[4] + w_in;
-        din_list[6] = din_list[5] + w_in;
-        din_list[7] = din_list[6] + w_in;
-
-        dout0 = dout3 + w_out;
-        dout1 = dout0 + w_out;
-        dout2 = dout1 + w_out;
-        dout3 = dout2 + w_out;
-      }
-      float* dout_pad_end = dout_ch + h_out_new * w_out;
-      if (flag_bias) {
-        //! deal with h_out pad_0 line with bias
-        memcpy(reinterpret_cast<float*>(dout_pad_end),
-               dout_ch - pad_0 * w_out,
-               pad_0 * w_out * sizeof(float));
-      } else {
-        //! deal with h_out pad_0 line without bias
-        memset(reinterpret_cast<float*>(dout_pad_end),
-               0x00,
-               pad_0 * w_out * sizeof(float));
-      }
-    }
-  }
-}
-
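// The _relu variants above differ from the plain kernels only in the epilogue:
// the accumulators, which start out holding the broadcast bias (v19), pass
// through an fmax against a zero vector (v31) before the st1 stores, fusing
// ReLU into the same pass over the data. The same step in intrinsics form — a
// minimal sketch assuming <arm_neon.h>, with `store4_bias_relu` a hypothetical
// name, not an identifier from this file:
static inline void store4_bias_relu(float* dst, float32x4_t acc) {
  // acc already includes the bias, mirroring "mov v15.16b, v19.16b" at loop
  // entry in the assembly block
  float32x4_t vzero = vdupq_n_f32(0.f);   // "movi v31.4s, #0"
  vst1q_f32(dst, vmaxq_f32(acc, vzero));  // "fmax" then "st1"
}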
-void conv_depthwise_5x5s1_small_impl(const float* din,
-                                     float* dout,
-                                     int num,
-                                     int ch_out,
-                                     int h_out,
-                                     int w_out,
-                                     int ch_in,
-                                     int h_in,
-                                     int w_in,
-                                     const float* weights,
-                                     const float* bias,
-                                     int pad,
-                                     bool flag_bias,
-                                     bool flag_relu,
-                                     ARMContext* ctx) {
-  int pad_new = pad > 4 ? 4 : pad;
-  int pad_0 = pad - pad_new;
-  int h_in_new = h_in + 2 * pad_new;
-  int w_in_new = w_in + 2 * pad_new;
-  int h_out_new = h_out - 2 * pad_0;
-  int w_out_new = w_out - 2 * pad_0;
-  float zero_ptr[w_in_new + w_out];
-  memset(zero_ptr, 0, w_in_new * sizeof(float));
-  float* write_ptr = zero_ptr + w_in_new;
-  int pad_cnt = pad_0 >> 2;
-  int pad_remain = pad_0 - (pad_cnt << 2);
-  int bias_cnt = (w_out * pad_0) >> 2;
-  int bias_remain = (w_out * pad_0) - (bias_cnt << 2);
-  int in_spatial_size = w_in_new * h_in_new;
-  int out_spatial_size = w_out * h_out;
-  int weights_spatial_size = 25;
-
-  float* din_new = prepad_input(din, num, ch_in, h_in, w_in, pad_new);
-  for (int n = 0; n < num; ++n) {
-    const float* din_batch = din_new + n * in_spatial_size * ch_in;
-    float* dout_batch = dout + n * out_spatial_size * ch_out;
-#pragma omp parallel for
-    for (int c = 0; c < ch_in; ++c) {
-      const float* din_ch = din_batch + c * in_spatial_size;
-      float* dout_ch = dout_batch + c * out_spatial_size;
-      float bias_c = flag_bias ? bias[c] : 0.f;
-      float vbias[4] = {bias_c, bias_c, bias_c, bias_c};
-      float32x4_t vbias_c = vdupq_n_f32(bias_c);
-      if (flag_bias) {
-        //! deal with h_out pad_0 line with bias
-        for (int i = 0; i < bias_cnt; ++i) {
-          vst1q_f32(dout_ch, vbias_c);
-          dout_ch += 4;
-        }
-        for (int i = 0; i < bias_remain; ++i) {
-          *dout_ch++ = bias_c;
-        }
-      } else {
-        //! deal with h_out pad_0 line without bias
-        for (int i = 0; i < pad_0; ++i) {
-          memset(dout_ch, 0x00, w_out * sizeof(float));
-          dout_ch += w_out;
-        }
-      }
-      //! every h loop, deal with 8 line input
-      const float* din0 = din_ch;
-      const float* din1 = din0 + w_in_new;
-      const float* din2 = din1 + w_in_new;
-      const float* din3 = din2 + w_in_new;
-      const float* din4 = din3 + w_in_new;
-      const float* din5 = din4 + w_in_new;
-      const float* din6 = din5 + w_in_new;
-      const float* din7 = din6 + w_in_new;
-      //! every h loop, deal with 4 line output
-      float* dout0 = dout_ch;
-      float* dout1 = dout0 + w_out;
-      float* dout2 = dout1 + w_out;
-      float* dout3 = dout2 + w_out;
-
-      //! load weights to neon register
-      const float* weights_c = weights + c * weights_spatial_size;
-
-      float32x4_t w5;
-      float32x4_t w6;
-      float32x4_t w0 = vld1q_f32(weights_c);
-      float32x4_t w1 = vld1q_f32(weights_c + 5);
-      float32x4_t w2 = vld1q_f32(weights_c + 10);
-      float32x4_t w3 = vld1q_f32(weights_c + 15);
-      float32x4_t w4 = vld1q_f32(weights_c + 20);
-      w5 = vsetq_lane_f32(weights_c[4], w5, 0);
-      w5 = vsetq_lane_f32(weights_c[9], w5, 1);
-      w5 = vsetq_lane_f32(weights_c[14], w5, 2);
-      w5 = vsetq_lane_f32(weights_c[19], w5, 3);
-      w6 = vsetq_lane_f32(weights_c[24], w6, 0);
-      //! h loop
-      for (int h = 0; h < h_out_new; h += 4) {
-        //! (h - pad_new) + 7 > h_in - 1
-        if (h + 8 > h_in_new) {
-          switch (h + 8 - h_in_new) {
-            case 7:
-              din1 = zero_ptr;
-            case 6:
-              din2 = zero_ptr;
-            case 5:
-              din3 = zero_ptr;
-            case 4:
-              din4 = zero_ptr;
-            case 3:
-              din5 = zero_ptr;
-            case 2:
-              din6 = zero_ptr;
-            case 1:
-              din7 = zero_ptr;
-            default:
-              break;
-          }
-        }
-        if (h + 4 > h_out_new) {
-          switch (h + 4 - h_out_new) {
-            case 3:
-              dout1 = write_ptr;
-            case 2:
-              dout2 = write_ptr;
-            case 1:
-              dout3 = write_ptr;
-            default:
-              break;
-          }
-        }
-        const float* din_ptr0 = din0;
-        const float* din_ptr1 = din1;
-        const float* din_ptr2 = din2;
-        const float* din_ptr3 = din3;
-        const float* din_ptr4 = din4;
-        const float* din_ptr5 = din5;
-        const float* din_ptr6 = din6;
-        const float* din_ptr7 = din7;
-
-        const float* weights_ptr = weights_c;
-        float* dout_ptr0 = dout0;
-        float* dout_ptr1 = dout1;
-        float* dout_ptr2 = dout2;
-        float* dout_ptr3 = dout3;
-
-        if (flag_bias) {
-          //! deal with w_out pad_0 column pre with bias
-          for (int i = 0; i < pad_cnt; i++) {
-            vst1q_f32(dout_ptr0, vbias_c);
-            vst1q_f32(dout_ptr1, vbias_c);
-            vst1q_f32(dout_ptr2, vbias_c);
-            vst1q_f32(dout_ptr3, vbias_c);
-            dout_ptr0 += 4;
-            dout_ptr1 += 4;
-            dout_ptr2 += 4;
-            dout_ptr3 += 4;
-          }
-          for (int i = 0; i < pad_remain; ++i) {
-            *dout_ptr0++ = bias_c;
-            *dout_ptr1++ = bias_c;
-            *dout_ptr2++ = bias_c;
-            *dout_ptr3++ = bias_c;
-          }
-        } else {
-          //! deal with w_out pad_0 column pre without bias
-          memset(dout_ptr0, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr1, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr2, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr3, 0x00, pad_0 * sizeof(float));
-          dout_ptr0 += pad_0;
-          dout_ptr1 += pad_0;
-          dout_ptr2 += pad_0;
-          dout_ptr3 += pad_0;
-        }
-        //! mid loop
-        for (int i = 0; i < w_out_new; ++i) {
-          compute_one_out_without_extract(din_ptr0, din_ptr1, din_ptr2,
-                                          din_ptr3, din_ptr4, din_ptr5,
-                                          din_ptr6, din_ptr7, dout_ptr0,
-                                          dout_ptr1, dout_ptr2, dout_ptr3,
-                                          w0, w1, w2, w3, w4, w5, w6, vbias);
-          din_ptr0++;
-          din_ptr1++;
-          din_ptr2++;
-          din_ptr3++;
-          din_ptr4++;
-          din_ptr5++;
-          din_ptr6++;
-          din_ptr7++;
-
-          dout_ptr0++;
-          dout_ptr1++;
-          dout_ptr2++;
-          dout_ptr3++;
-        }
-        if (flag_bias) {
-          //! deal with w_out pad_0 column post with bias
-          memcpy(dout_ptr0, dout0, pad_0 * sizeof(float));
-          memcpy(dout_ptr1, dout1, pad_0 * sizeof(float));
-          memcpy(dout_ptr2, dout2, pad_0 * sizeof(float));
-          memcpy(dout_ptr3, dout3, pad_0 * sizeof(float));
-        } else {
-          //! deal with w_out pad_0 column post without bias
-          memset(dout_ptr0, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr1, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr2, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr3, 0x00, pad_0 * sizeof(float));
-        }
-
-        din0 = din4;
-        din1 = din5;
-        din2 = din6;
-        din3 = din7;
-        din4 = din3 + w_in_new;
-        din5 = din4 + w_in_new;
-        din6 = din5 + w_in_new;
-        din7 = din6 + w_in_new;
-
-        dout0 = dout3 + w_out;
-        dout1 = dout0 + w_out;
-        dout2 = dout1 + w_out;
-        dout3 = dout2 + w_out;
-      }
-      float* dout_pad_end = dout_ch + h_out_new * w_out;
-      if (flag_bias) {
-        //! deal with h_out pad_0 line with bias
-        memcpy(reinterpret_cast<float*>(dout_pad_end),
-               dout_ch - pad_0 * w_out,
-               pad_0 * w_out * sizeof(float));
-      } else {
-        //! deal with h_out pad_0 line without bias
-        memset(reinterpret_cast<float*>(dout_pad_end),
-               0x00,
-               pad_0 * w_out * sizeof(float));
-      }
-    }
-  }
-  free(din_new);
-}
-
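// The "small" path avoids the left/right extract kernels altogether: instead
// of special-casing the pad_new border columns, it first copies the input into
// a zero-framed buffer via prepad_input() and then runs the plain
// compute_one_out_without_extract kernel for every interior output.
// prepad_input is defined elsewhere in this file; judging only from how it is
// called here (and from the later free(din_new)), it plausibly behaves like
// the following sketch — an assumption, with `prepad_input_sketch` a
// hypothetical name:
static float* prepad_input_sketch(
    const float* din, int num, int ch, int h_in, int w_in, int pad) {
  int h_new = h_in + 2 * pad;
  int w_new = w_in + 2 * pad;
  size_t bytes = sizeof(float) * num * ch * h_new * w_new;
  float* out = static_cast<float*>(malloc(bytes));  // caller calls free()
  memset(out, 0, bytes);                            // zero frame all around
  for (int nc = 0; nc < num * ch; ++nc) {
    const float* src = din + nc * h_in * w_in;
    // skip `pad` framed rows, then `pad` framed columns in each row
    float* dst = out + nc * h_new * w_new + pad * w_new + pad;
    for (int r = 0; r < h_in; ++r) {
      memcpy(dst + r * w_new, src + r * w_in, w_in * sizeof(float));
    }
  }
  return out;
}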
-void conv_depthwise_5x5s1_small_relu_impl(const float* din,
-                                          float* dout,
-                                          int num,
-                                          int ch_out,
-                                          int h_out,
-                                          int w_out,
-                                          int ch_in,
-                                          int h_in,
-                                          int w_in,
-                                          const float* weights,
-                                          const float* bias,
-                                          int pad,
-                                          bool flag_bias,
-                                          bool flag_relu,
-                                          ARMContext* ctx) {
-  int pad_new = pad > 4 ? 4 : pad;
-  int pad_0 = pad - pad_new;
-  int h_in_new = h_in + 2 * pad_new;
-  int w_in_new = w_in + 2 * pad_new;
-  float zero_ptr[w_in_new + w_out];
-  memset(zero_ptr, 0, w_in_new * sizeof(float));
-  float* write_ptr = zero_ptr + w_in_new;
-  int h_out_new = h_out - 2 * pad_0;
-  int w_out_new = w_out - 2 * pad_0;
-  int pad_cnt = pad_0 >> 2;
-  int pad_remain = pad_0 - (pad_cnt << 2);
-  int bias_cnt = (w_out * pad_0) >> 2;
-  int bias_remain = (w_out * pad_0) - (bias_cnt << 2);
-  int in_spatial_size = w_in_new * h_in_new;
-  int out_spatial_size = w_out * h_out;
-  int weights_spatial_size = 25;
-
-  float* din_new = prepad_input(din, num, ch_in, h_in, w_in, pad_new);
-  for (int n = 0; n < num; ++n) {
-    const float* din_batch = din_new + n * in_spatial_size * ch_in;
-    float* dout_batch = dout + n * out_spatial_size * ch_out;
-#pragma omp parallel for
-    for (int c = 0; c < ch_in; ++c) {
-      const float* din_ch = din_batch + c * in_spatial_size;
-      float* dout_ch = dout_batch + c * out_spatial_size;
-      float bias_c = flag_bias ? bias[c] : 0.f;
-      float bias_relu = bias_c > 0.f ? bias_c : 0.f;
-      float vbias[4] = {bias_c, bias_c, bias_c, bias_c};
-      float32x4_t vbias_c = vdupq_n_f32(bias_relu);
-      if (flag_bias) {
-        //! deal with h_out pad_0 line with bias
-        for (int i = 0; i < bias_cnt; ++i) {
-          vst1q_f32(dout_ch, vbias_c);
-          dout_ch += 4;
-        }
-        for (int i = 0; i < bias_remain; ++i) {
-          *dout_ch++ = bias_relu;
-        }
-      } else {
-        //! deal with h_out pad_0 line without bias
-        for (int i = 0; i < pad_0; ++i) {
-          memset(dout_ch, 0x00, w_out * sizeof(float));
-          dout_ch += w_out;
-        }
-      }
-
-      //! every h loop, deal with 8 line input
-      const float* din0 = din_ch;
-      const float* din1 = din0 + w_in_new;
-      const float* din2 = din1 + w_in_new;
-      const float* din3 = din2 + w_in_new;
-      const float* din4 = din3 + w_in_new;
-      const float* din5 = din4 + w_in_new;
-      const float* din6 = din5 + w_in_new;
-      const float* din7 = din6 + w_in_new;
-      //! every h loop, deal with 4 line output
-      float* dout0 = dout_ch;
-      float* dout1 = dout0 + w_out;
-      float* dout2 = dout1 + w_out;
-      float* dout3 = dout2 + w_out;
-
-      //! load weights to neon register
-      const float* weights_c = weights + c * weights_spatial_size;
-
-      float32x4_t w5;
-      float32x4_t w6;
-      float32x4_t w0 = vld1q_f32(weights_c);
-      float32x4_t w1 = vld1q_f32(weights_c + 5);
-      float32x4_t w2 = vld1q_f32(weights_c + 10);
-      float32x4_t w3 = vld1q_f32(weights_c + 15);
-      float32x4_t w4 = vld1q_f32(weights_c + 20);
-      w5 = vsetq_lane_f32(weights_c[4], w5, 0);
-      w5 = vsetq_lane_f32(weights_c[9], w5, 1);
-      w5 = vsetq_lane_f32(weights_c[14], w5, 2);
-      w5 = vsetq_lane_f32(weights_c[19], w5, 3);
-      w6 = vsetq_lane_f32(weights_c[24], w6, 0);
-
-      //! h loop
-      for (int h = 0; h < h_out_new; h += 4) {
-        //! (h - pad_new) + 7 > h_in - 1
-        if (h + 8 > h_in_new) {
-          switch (h + 8 - h_in_new) {
-            case 7:
-              din1 = zero_ptr;
-            case 6:
-              din2 = zero_ptr;
-            case 5:
-              din3 = zero_ptr;
-            case 4:
-              din4 = zero_ptr;
-            case 3:
-              din5 = zero_ptr;
-            case 2:
-              din6 = zero_ptr;
-            case 1:
-              din7 = zero_ptr;
-            default:
-              break;
-          }
-        }
-        if (h + 4 > h_out_new) {
-          switch (h + 4 - h_out_new) {
-            case 3:
-              dout1 = write_ptr;
-            case 2:
-              dout2 = write_ptr;
-            case 1:
-              dout3 = write_ptr;
-            default:
-              break;
-          }
-        }
-        const float* din_ptr0 = din0;
-        const float* din_ptr1 = din1;
-        const float* din_ptr2 = din2;
-        const float* din_ptr3 = din3;
-        const float* din_ptr4 = din4;
-        const float* din_ptr5 = din5;
-        const float* din_ptr6 = din6;
-        const float* din_ptr7 = din7;
-
-        float* dout_ptr0 = dout0;
-        float* dout_ptr1 = dout1;
-        float* dout_ptr2 = dout2;
-        float* dout_ptr3 = dout3;
-
-        if (flag_bias) {
-          //! deal with w_out pad_0 column pre with bias
-          for (int i = 0; i < pad_cnt; i++) {
-            vst1q_f32(dout_ptr0, vbias_c);
-            vst1q_f32(dout_ptr1, vbias_c);
-            vst1q_f32(dout_ptr2, vbias_c);
-            vst1q_f32(dout_ptr3, vbias_c);
-            dout_ptr0 += 4;
-            dout_ptr1 += 4;
-            dout_ptr2 += 4;
-            dout_ptr3 += 4;
-          }
-          for (int i = 0; i < pad_remain; ++i) {
-            *dout_ptr0++ = bias_relu;
-            *dout_ptr1++ = bias_relu;
-            *dout_ptr2++ = bias_relu;
-            *dout_ptr3++ = bias_relu;
-          }
-        } else {
-          //! deal with w_out pad_0 column pre without bias
-          memset(dout_ptr0, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr1, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr2, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr3, 0x00, pad_0 * sizeof(float));
-          dout_ptr0 += pad_0;
-          dout_ptr1 += pad_0;
-          dout_ptr2 += pad_0;
-          dout_ptr3 += pad_0;
-        }
-
-        //! mid loop
-        for (int i = 0; i < w_out_new; ++i) {
-          compute_one_out_without_extract_relu(din_ptr0, din_ptr1, din_ptr2,
-                                               din_ptr3, din_ptr4, din_ptr5,
-                                               din_ptr6, din_ptr7, dout_ptr0,
-                                               dout_ptr1, dout_ptr2, dout_ptr3,
-                                               w0, w1, w2, w3, w4, w5, w6,
-                                               vbias);
-          din_ptr0++;
-          din_ptr1++;
-          din_ptr2++;
-          din_ptr3++;
-          din_ptr4++;
-          din_ptr5++;
-          din_ptr6++;
-          din_ptr7++;
-
-          dout_ptr0++;
-          dout_ptr1++;
-          dout_ptr2++;
-          dout_ptr3++;
-        }
-
-        if (flag_bias) {
-          //! deal with w_out pad_0 column post with bias
-          memcpy(dout_ptr0, dout0, pad_0 * sizeof(float));
-          memcpy(dout_ptr1, dout1, pad_0 * sizeof(float));
-          memcpy(dout_ptr2, dout2, pad_0 * sizeof(float));
-          memcpy(dout_ptr3, dout3, pad_0 * sizeof(float));
-        } else {
-          //! deal with w_out pad_0 column post without bias
-          memset(dout_ptr0, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr1, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr2, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr3, 0x00, pad_0 * sizeof(float));
-        }
-
-        din0 = din4;
-        din1 = din5;
-        din2 = din6;
-        din3 = din7;
-        din4 = din3 + w_in_new;
-        din5 = din4 + w_in_new;
-        din6 = din5 + w_in_new;
-        din7 = din6 + w_in_new;
-
-        dout0 = dout3 + w_out;
-        dout1 = dout0 + w_out;
-        dout2 = dout1 + w_out;
-        dout3 = dout2 + w_out;
-      }
-      float* dout_pad_end = dout_ch + h_out_new * w_out;
-      if (flag_bias) {
-        //! deal with h_out pad_0 line with bias
-        memcpy(reinterpret_cast<float*>(dout_pad_end),
-               dout_ch - pad_0 * w_out,
-               pad_0 * w_out * sizeof(float));
-      } else {
-        //! deal with h_out pad_0 line without bias
-        memset(reinterpret_cast<float*>(dout_pad_end),
-               0x00,
-               pad_0 * w_out * sizeof(float));
-      }
-    }
-  }
-  free(din_new);
-}
-
-#else
-
-//! kernel for one out without extracting data mid
-//!
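// On armv7 (this #else branch) there are only 16 NEON q registers, against 32
// v registers on armv8, so there is no room for a 4x4 output tile: each kernel
// below computes a single output column for two rows at once. q9 and q10
// accumulate the lane-wise weight-by-input products for the two rows, and
// three pairwise adds collapse them into two scalars landing in d18[0] and
// d18[1]. That vpadd reduction in intrinsics form — a minimal sketch assuming
// <arm_neon.h>, with `hsum_two_rows` a hypothetical name:
static inline float32x2_t hsum_two_rows(float32x4_t acc0, float32x4_t acc1) {
  float32x2_t s0 = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
  float32x2_t s1 = vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
  return vpadd_f32(s0, s1);  // lane 0 = sum(acc0), lane 1 = sum(acc1)
}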
deal with two lines out -void compute_one_out_without_extract(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d6[0]}, [%[din0]] \n" - "vld1.32 {d6[1]}, [%[din1]] \n" - "vld1.32 {d7[0]}, [%[din2]] \n" - "vld1.32 {d7[1]}, [%[din3]] \n" - - // weights r2 - "vmla.f32 q9, q0, q4 \n" - "vmla.f32 q10, q0, q5 \n" - - "vld1.32 {d8[0]}, [%[din4]] \n" - "vld1.32 {d8[1]}, [%[din5]] \n" - - "vld1.32 {d0-d1}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q1, q5 \n" - "vmla.f32 q10, q1, q6 \n" - - // weights col4 - "sub %[wh], #64 \n" - "vld1.32 {d4[0]}, [%[wh]], r0 \n" - "vld1.32 {d4[1]}, [%[wh]], r0 \n" - "vld1.32 {d5[0]}, [%[wh]], r0 \n" - "vld1.32 {d5[1]}, [%[wh]], r0 \n" - - // weights r4 - "vmla.f32 q9, q0, q6 \n" - "vmla.f32 q10, q0, q7 \n" - - "vext.32 q5, q3, q4, #1 \n" - - "vmla.f32 q9, q2, q3 \n" - "vmla.f32 q10, q2, q5 \n" - - "vld1.32 {d4[0]}, [%[wh]] \n" - "vld1.32 {d6}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - "vmla.f32 d18, d8, d4[0] \n" - - // add bias - "vadd.f32 d18, d18, d6 \n" - - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); -} - -//! kernel for one out without extracting data mid -//! deal with two lines out -void compute_one_out_without_extract_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vmov.i32 q15, #0x0 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! 
\n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d6[0]}, [%[din0]] \n" - "vld1.32 {d6[1]}, [%[din1]] \n" - "vld1.32 {d7[0]}, [%[din2]] \n" - "vld1.32 {d7[1]}, [%[din3]] \n" - - // weights r2 - "vmla.f32 q9, q0, q4 \n" - "vmla.f32 q10, q0, q5 \n" - - "vld1.32 {d8[0]}, [%[din4]] \n" - "vld1.32 {d8[1]}, [%[din5]] \n" - - "vld1.32 {d0-d1}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q1, q5 \n" - "vmla.f32 q10, q1, q6 \n" - - // weights col4 - "sub %[wh], #64 \n" - "vld1.32 {d4[0]}, [%[wh]], r0 \n" - "vld1.32 {d4[1]}, [%[wh]], r0 \n" - "vld1.32 {d5[0]}, [%[wh]], r0 \n" - "vld1.32 {d5[1]}, [%[wh]], r0 \n" - - // weights r4 - "vmla.f32 q9, q0, q6 \n" - "vmla.f32 q10, q0, q7 \n" - - "vext.32 q5, q3, q4, #1 \n" - - "vmla.f32 q9, q2, q3 \n" - "vmla.f32 q10, q2, q5 \n" - - "vld1.32 {d4[0]}, [%[wh]] \n" - "vld1.32 {d6}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - "vmla.f32 d18, d8, d4[0] \n" - - // add bias - "vadd.f32 d18, d18, d6 \n" - - // relu - "vmax.f32 d18, d18, d30 \n" - - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q15"); -} - -//! kernel for one out without extracting data pre -//! deal with two lines out -void compute_one_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "add %[wh], #4 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q0, q4 \n" - "vmla.f32 q10, q0, q5 \n" - - "vld1.32 {d0-d1}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q1, q5 \n" - "vmla.f32 q10, q1, q6 \n" - - // weights r4 - "vmla.f32 q9, q0, q6 \n" - "vmla.f32 q10, q0, q7 \n" - - // load bias - "vld1.32 {d0}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - // add bias - "vadd.f32 d18, d18, d0 \n" - - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); -} - -//! kernel for one out without extracting data pre -//! 
deal with two lines out -void compute_one_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "add %[wh], #4 \n" - "vmov.i32 q15, #0x0 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q0, q4 \n" - "vmla.f32 q10, q0, q5 \n" - - "vld1.32 {d0-d1}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q1, q5 \n" - "vmla.f32 q10, q1, q6 \n" - - // weights r4 - "vmla.f32 q9, q0, q6 \n" - "vmla.f32 q10, q0, q7 \n" - - // load bias - "vld1.32 {d0}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - // add bias - "vadd.f32 d18, d18, d0 \n" - - // relu - "vmax.f32 d18, d18, d30 \n" - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q15"); -} - -//! kernel for one out with extracting data post -//! deal with two lines out -void compute_one_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q0, q4 \n" - "vmla.f32 q10, q0, q5 \n" - - "vld1.32 {d0-d1}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q1, q5 \n" - "vmla.f32 q10, q1, q6 \n" - - // weights r4 - "vmla.f32 q9, q0, q6 \n" - "vmla.f32 q10, q0, q7 \n" - - "vld1.32 {d0}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - // add bias - "vadd.f32 d18, d18, d0 \n" - - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); -} - -//! kernel for one out with extracting data post -//! 
deal with two lines out -void compute_one_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vmov.i32 q15, #0x0 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q0, q4 \n" - "vmla.f32 q10, q0, q5 \n" - - "vld1.32 {d0-d1}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q1, q5 \n" - "vmla.f32 q10, q1, q6 \n" - - // weights r4 - "vmla.f32 q9, q0, q6 \n" - "vmla.f32 q10, q0, q7 \n" - - "vld1.32 {d0}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - // add bias - "vadd.f32 d18, d18, d0 \n" - - // relu - "vmax.f32 d18, d18, d30 \n" - - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q15"); -} - -//! kernel for two out with extracting data pre -//! deal with two lines out -void compute_two_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "mov r1, #0 \n" - "add %[wh], #8 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vmov.32 d1[1], r1 \n" - "vmov.32 d3[1], r1 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! 
\n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - "vmov.32 d25[1], r1 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vmov.32 d27[1], r1 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - "vmov.32 d29[1], r1 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - "vpadd.f32 d22, d18, d19 \n" - "vpadd.f32 d23, d20, d21 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vld1.32 {d28-d29}, [%[wh]]\n" - - "vpadd.f32 d22, d22, d23 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // store result - "vst1.32 {d22}, [%[dout0]] \n" - "vst1.32 {d23}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for two out with extracting data pre -//! deal with two lines out -void compute_two_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "mov r1, #0 \n" - "add %[wh], #8 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vmov.32 d1[1], r1 \n" - "vmov.32 d3[1], r1 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! 
\n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - "vmov.32 d25[1], r1 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vmov.32 d27[1], r1 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - "vmov.32 d29[1], r1 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - "vpadd.f32 d22, d18, d19 \n" - "vpadd.f32 d23, d20, d21 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vld1.32 {d28-d29}, [%[wh]]\n" - - "vpadd.f32 d22, d22, d23 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - "vmov.i32 q9, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // relu - "vmax.f32 q11, q11, q9 \n" - // store result - "vst1.32 {d22}, [%[dout0]] \n" - "vst1.32 {d23}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for two out with extracting data post -//! deal with two lines out -void compute_two_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - //! out zero - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vpadd.f32 d22, d18, d19 \n" - "vpadd.f32 d23, d20, d21 \n" - "vpadd.f32 d22, d22, d23 \n" - - "vmov.f32 q15, #0.0 \n" - "vext.32 q2, q2, q15, #1 \n" - "vext.32 q3, q3, q15, #1 \n" - "vext.32 q4, q4, q15, #1 \n" - "vext.32 q5, q5, q15, #1 \n" - "vext.32 q6, q6, q15, #1 \n" - "vext.32 q7, q7, q15, #1 \n" - "vext.32 q8, q8, q15, #1 \n" - - //! 
out one - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // store result - "vst1.32 {d22}, [%[dout0]] \n" - "vst1.32 {d23}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for two out with extracting data post -//! deal with two lines out -void compute_two_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - //! out zero - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vpadd.f32 d22, d18, d19 \n" - "vpadd.f32 d23, d20, d21 \n" - "vpadd.f32 d22, d22, d23 \n" - - "vmov.f32 q15, #0.0 \n" - "vext.32 q2, q2, q15, #1 \n" - "vext.32 q3, q3, q15, #1 \n" - "vext.32 q4, q4, q15, #1 \n" - "vext.32 q5, q5, q15, #1 \n" - "vext.32 q6, q6, q15, #1 \n" - "vext.32 q7, q7, q15, #1 \n" - "vext.32 q8, q8, q15, #1 \n" - - //! 
out one - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - "vmov.i32 q9, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // relu - "vmax.f32 q11, q11, q9 \n" - - // store result - "vst1.32 {d22}, [%[dout0]] \n" - "vst1.32 {d23}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for three out with extracting data pre -//! deal with two lines out -void compute_three_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "add %[wh], #12 \n" - "vld1.32 {d0}, [%[wh]], r0 \n" - "vld1.32 {d2}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]] \n" - "vld1.32 {d6-d7}, [%[din1]] \n" - "vld1.32 {d8-d9}, [%[din2]] \n" - "vld1.32 {d10-d11}, [%[din3]] \n" - "vld1.32 {d12-d13}, [%[din4]] \n" - "vld1.32 {d14-d15}, [%[din5]] \n" - - //! out zero - // weights r0 - "vmul.f32 d18, d0, d4 \n" - "vmul.f32 d20, d0, d6 \n" - - "vld1.32 {d24}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 d18, d2, d6 \n" - "vmla.f32 d20, d2, d8 \n" - - "vld1.32 {d26}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 d18, d24, d8 \n" - "vmla.f32 d20, d24, d10 \n" - - "vld1.32 {d28}, [%[wh]] \n" - - // weights r3 - "vmla.f32 d18, d26, d10 \n" - "vmla.f32 d20, d26, d12 \n" - - // load bias - "vld1.32 {d30-d31}, [%[bias]] \n" - - // weights r4 - "vmla.f32 d18, d28, d12 \n" - "vmla.f32 d20, d28, d14 \n" - "vpadd.f32 d22, d18, d20 \n" - - //! 
out one - "mov r1, #0 \n" - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vmov.32 d1[1], r1 \n" - "vmov.32 d3[1], r1 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - "vmov.32 d25[1], r1 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vmov.32 d27[1], r1 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - "vmov.32 d29[1], r1 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vld1.32 {d28-d29}, [%[wh]]\n" - - "vpadd.f32 d23, d18, d19 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // store result - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - - //! out two - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - // add bias - "vadd.f32 d18, d18, d30 \n" - - // store result - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [dout0] "+r"(dout0), [dout1] "+r"(dout1), [wh] "+r"(weights) - : [din0] "r"(din0), - [din1] "r"(din1), - [din2] "r"(din2), - [din3] "r"(din3), - [din4] "r"(din4), - [din5] "r"(din5), - [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for three out with extracting data pre -//! deal with two lines out -void compute_three_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "add %[wh], #12 \n" - "vld1.32 {d0}, [%[wh]], r0 \n" - "vld1.32 {d2}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]] \n" - "vld1.32 {d6-d7}, [%[din1]] \n" - "vld1.32 {d8-d9}, [%[din2]] \n" - "vld1.32 {d10-d11}, [%[din3]] \n" - "vld1.32 {d12-d13}, [%[din4]] \n" - "vld1.32 {d14-d15}, [%[din5]] \n" - - //! out zero - // weights r0 - "vmul.f32 d18, d0, d4 \n" - "vmul.f32 d20, d0, d6 \n" - - "vld1.32 {d24}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 d18, d2, d6 \n" - "vmla.f32 d20, d2, d8 \n" - - "vld1.32 {d26}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 d18, d24, d8 \n" - "vmla.f32 d20, d24, d10 \n" - - "vld1.32 {d28}, [%[wh]] \n" - - // weights r3 - "vmla.f32 d18, d26, d10 \n" - "vmla.f32 d20, d26, d12 \n" - - // load bias - "vld1.32 {d30-d31}, [%[bias]] \n" - - // weights r4 - "vmla.f32 d18, d28, d12 \n" - "vmla.f32 d20, d28, d14 \n" - "vpadd.f32 d22, d18, d20 \n" - - //! 
out one - "mov r1, #0 \n" - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vmov.32 d1[1], r1 \n" - "vmov.32 d3[1], r1 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - "vmov.32 d25[1], r1 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vmov.32 d27[1], r1 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - "vmov.32 d29[1], r1 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vld1.32 {d28-d29}, [%[wh]]\n" - - "vpadd.f32 d23, d18, d19 \n" - "vmov.i32 q8, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // relu - "vmax.f32 q11, q11, q8 \n" - - // store result - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - - //! out two - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - // add bias - "vadd.f32 d18, d18, d30 \n" - - // relu - "vmax.f32 d18, d18, d16 \n" - - // store result - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [dout0] "+r"(dout0), [dout1] "+r"(dout1), [wh] "+r"(weights) - : [din0] "r"(din0), - [din1] "r"(din1), - [din2] "r"(din2), - [din3] "r"(din3), - [din4] "r"(din4), - [din5] "r"(din5), - [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for three out with extracting data post -//! deal with two lines out -void compute_three_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]] \n" - "vld1.32 {d6-d7}, [%[din1]] \n" - "vld1.32 {d8-d9}, [%[din2]] \n" - "vld1.32 {d10-d11}, [%[din3]] \n" - "vld1.32 {d12-d13}, [%[din4]] \n" - "vld1.32 {d14-d15}, [%[din5]] \n" - - //! 
out zero && two - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - "vmul.f32 d16, d0, d5 \n" - "vmul.f32 d17, d0, d7 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - "vmla.f32 d16, d2, d7 \n" - "vmla.f32 d17, d2, d9 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - "vmla.f32 d16, d24, d9 \n" - "vmla.f32 d17, d24, d11 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - "vmla.f32 d16, d26, d11 \n" - "vmla.f32 d17, d26, d13 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vmla.f32 d16, d28, d13 \n" - "vmla.f32 d17, d28, d15 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d16, d16, d17 \n" - "vpadd.f32 d22, d18, d19 \n" - - "vmov.f32 q15, #0.0 \n" - "vext.32 q2, q2, q15, #1 \n" - "vext.32 q3, q3, q15, #1 \n" - "vext.32 q4, q4, q15, #1 \n" - "vext.32 q5, q5, q15, #1 \n" - "vext.32 q6, q6, q15, #1 \n" - "vext.32 q7, q7, q15, #1 \n" - - //! out one - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - // load bias - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - "vmov.i32 q9, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - "vadd.f32 d16, d16, d30 \n" - - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - "vst1.32 {d16[0]}, [%[dout0]]! \n" - "vst1.32 {d16[1]}, [%[dout1]]! \n" - - : [dout0] "+r"(dout0), [dout1] "+r"(dout1), [wh] "+r"(weights) - : [din0] "r"(din0), - [din1] "r"(din1), - [din2] "r"(din2), - [din3] "r"(din3), - [din4] "r"(din4), - [din5] "r"(din5), - [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for three out with extracting data post -//! deal with two lines out -void compute_three_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]] \n" - "vld1.32 {d6-d7}, [%[din1]] \n" - "vld1.32 {d8-d9}, [%[din2]] \n" - "vld1.32 {d10-d11}, [%[din3]] \n" - "vld1.32 {d12-d13}, [%[din4]] \n" - "vld1.32 {d14-d15}, [%[din5]] \n" - - //! 
out zero && two - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - "vmul.f32 d16, d0, d5 \n" - "vmul.f32 d17, d0, d7 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - "vmla.f32 d16, d2, d7 \n" - "vmla.f32 d17, d2, d9 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - "vmla.f32 d16, d24, d9 \n" - "vmla.f32 d17, d24, d11 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - "vmla.f32 d16, d26, d11 \n" - "vmla.f32 d17, d26, d13 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vmla.f32 d16, d28, d13 \n" - "vmla.f32 d17, d28, d15 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d16, d16, d17 \n" - "vpadd.f32 d22, d18, d19 \n" - - "vmov.f32 q15, #0.0 \n" - "vext.32 q2, q2, q15, #1 \n" - "vext.32 q3, q3, q15, #1 \n" - "vext.32 q4, q4, q15, #1 \n" - "vext.32 q5, q5, q15, #1 \n" - "vext.32 q6, q6, q15, #1 \n" - "vext.32 q7, q7, q15, #1 \n" - - //! out one - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - // load bias - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - "vmov.i32 q9, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - "vadd.f32 d16, d16, d30 \n" - - // relu - "vmax.f32 q11, q11, q9 \n" - "vmax.f32 d16, d16, d18 \n" - - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - "vst1.32 {d16[0]}, [%[dout0]]! \n" - "vst1.32 {d16[1]}, [%[dout1]]! \n" - - : [dout0] "+r"(dout0), [dout1] "+r"(dout1), [wh] "+r"(weights) - : [din0] "r"(din0), - [din1] "r"(din1), - [din2] "r"(din2), - [din3] "r"(din3), - [din4] "r"(din4), - [din5] "r"(din5), - [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for four out with extracting data pre -//! deal with two lines out -void compute_four_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "add %[wh], #16 \n" - - //! 
out zero - // load input - "vld1.32 {d4[0]}, [%[din0]] \n" - "vld1.32 {d4[1]}, [%[din1]] \n" - "vld1.32 {d5[0]}, [%[din2]] \n" - "vld1.32 {d5[1]}, [%[din3]] \n" - "vld1.32 {d6[0]}, [%[din4]] \n" - "vld1.32 {d6[1]}, [%[din5]] \n" - - "vext.32 q4, q2, q3, #1 \n" - - // load weights - "vld1.32 d0[0], [%[wh]], r0 \n" - "vld1.32 d0[1], [%[wh]], r0 \n" - "vld1.32 d1[0], [%[wh]], r0 \n" - "vld1.32 d1[1], [%[wh]], r0 \n" - "vld1.32 d2[0], [%[wh]]\n" - - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q4 \n" - - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d22, d18, d19 \n" - - "vmla.f32 d22, d6, d2[0] \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0}, [%[wh]], r0 \n" - "vld1.32 {d2}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]] \n" - "vld1.32 {d6-d7}, [%[din1]] \n" - "vld1.32 {d8-d9}, [%[din2]] \n" - "vld1.32 {d10-d11}, [%[din3]] \n" - "vld1.32 {d12-d13}, [%[din4]] \n" - "vld1.32 {d14-d15}, [%[din5]] \n" - - //! out one - // weights r0 - "vmul.f32 d18, d0, d4 \n" - "vmul.f32 d20, d0, d6 \n" - - "vld1.32 {d24}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 d18, d2, d6 \n" - "vmla.f32 d20, d2, d8 \n" - - "vld1.32 {d26}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 d18, d24, d8 \n" - "vmla.f32 d20, d24, d10 \n" - - "vld1.32 {d28}, [%[wh]] \n" - - // weights r3 - "vmla.f32 d18, d26, d10 \n" - "vmla.f32 d20, d26, d12 \n" - - // weights r4 - "vmla.f32 d18, d28, d12 \n" - "vmla.f32 d20, d28, d14 \n" - - "vpadd.f32 d23, d18, d20 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // store result - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - - //! out two - "mov r1, #0 \n" - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vmov.32 d1[1], r1 \n" - "vmov.32 d3[1], r1 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - "vmov.32 d25[1], r1 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vmov.32 d27[1], r1 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - "vmov.32 d29[1], r1 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vld1.32 {d28-d29}, [%[wh]]\n" - - "vpadd.f32 d22, d18, d19 \n" - - //! 
out three - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // store result - "vst1.32 {d22}, [%[dout0]] \n" - "vst1.32 {d23}, [%[dout1]] \n" - - : [dout0] "+r"(dout0), [dout1] "+r"(dout1), [wh] "+r"(weights) - : [din0] "r"(din0), - [din1] "r"(din1), - [din2] "r"(din2), - [din3] "r"(din3), - [din4] "r"(din4), - [din5] "r"(din5), - [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for four out with extracting data pre -//! deal with two lines out -void compute_four_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "add %[wh], #16 \n" - - //! out zero - // load input - "vld1.32 {d4[0]}, [%[din0]] \n" - "vld1.32 {d4[1]}, [%[din1]] \n" - "vld1.32 {d5[0]}, [%[din2]] \n" - "vld1.32 {d5[1]}, [%[din3]] \n" - "vld1.32 {d6[0]}, [%[din4]] \n" - "vld1.32 {d6[1]}, [%[din5]] \n" - - "vext.32 q4, q2, q3, #1 \n" - - // load weights - "vld1.32 d0[0], [%[wh]], r0 \n" - "vld1.32 d0[1], [%[wh]], r0 \n" - "vld1.32 d1[0], [%[wh]], r0 \n" - "vld1.32 d1[1], [%[wh]], r0 \n" - "vld1.32 d2[0], [%[wh]]\n" - - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q4 \n" - - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d22, d18, d19 \n" - - "vmla.f32 d22, d6, d2[0] \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0}, [%[wh]], r0 \n" - "vld1.32 {d2}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]] \n" - "vld1.32 {d6-d7}, [%[din1]] \n" - "vld1.32 {d8-d9}, [%[din2]] \n" - "vld1.32 {d10-d11}, [%[din3]] \n" - "vld1.32 {d12-d13}, [%[din4]] \n" - "vld1.32 {d14-d15}, [%[din5]] \n" - - //! out one - // weights r0 - "vmul.f32 d18, d0, d4 \n" - "vmul.f32 d20, d0, d6 \n" - - "vld1.32 {d24}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 d18, d2, d6 \n" - "vmla.f32 d20, d2, d8 \n" - - "vld1.32 {d26}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 d18, d24, d8 \n" - "vmla.f32 d20, d24, d10 \n" - - "vld1.32 {d28}, [%[wh]] \n" - - // weights r3 - "vmla.f32 d18, d26, d10 \n" - "vmla.f32 d20, d26, d12 \n" - - // weights r4 - "vmla.f32 d18, d28, d12 \n" - "vmla.f32 d20, d28, d14 \n" - - "vpadd.f32 d23, d18, d20 \n" - "vmov.i32 q8, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // relu - "vmax.f32 q11, q11, q8 \n" - - // store result - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - - //! 
out two - "mov r1, #0 \n" - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vmov.32 d1[1], r1 \n" - "vmov.32 d3[1], r1 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - "vmov.32 d25[1], r1 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vmov.32 d27[1], r1 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - "vmov.32 d29[1], r1 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vld1.32 {d28-d29}, [%[wh]]\n" - - "vpadd.f32 d22, d18, d19 \n" - - //! out three - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // relu - "vmax.f32 q11, q11, q8 \n" - - // store result - "vst1.32 {d22}, [%[dout0]] \n" - "vst1.32 {d23}, [%[dout1]] \n" - - : [dout0] "+r"(dout0), [dout1] "+r"(dout1), [wh] "+r"(weights) - : [din0] "r"(din0), - [din1] "r"(din1), - [din2] "r"(din2), - [din3] "r"(din3), - [din4] "r"(din4), - [din5] "r"(din5), - [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for three out with extracting data post -//! deal with two lines out -void compute_four_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "mov r1, #12 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]], r1 \n" - "vld1.32 {d6-d7}, [%[din1]], r1 \n" - "vld1.32 {d8-d9}, [%[din2]], r1 \n" - "vld1.32 {d10-d11}, [%[din3]], r1 \n" - "vld1.32 {d12-d13}, [%[din4]], r1 \n" - "vld1.32 {d14-d15}, [%[din5]], r1 \n" - - //! 
out zero && two - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - "vmul.f32 d16, d0, d5 \n" - "vmul.f32 d17, d0, d7 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - "vmla.f32 d16, d2, d7 \n" - "vmla.f32 d17, d2, d9 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - "vmla.f32 d16, d24, d9 \n" - "vmla.f32 d17, d24, d11 \n" - - "vld1.32 {d28-d29}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - "vmla.f32 d16, d26, d11 \n" - "vmla.f32 d17, d26, d13 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vmla.f32 d16, d28, d13 \n" - "vmla.f32 d17, d28, d15 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d16, d16, d17 \n" - "vpadd.f32 d22, d18, d19 \n" - - //! out one - "vmov.f32 q15, #0.0 \n" - "vext.32 q2, q2, q15, #1 \n" - "vext.32 q3, q3, q15, #1 \n" - "vext.32 q4, q4, q15, #1 \n" - "vext.32 q5, q5, q15, #1 \n" - "vext.32 q6, q6, q15, #1 \n" - "vext.32 q7, q7, q15, #1 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // store result - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - - //! out three - "sub %[wh], #80 \n" - "vld1.32 {d4[0]}, [%[din0]] \n" - "vld1.32 {d4[1]}, [%[din1]] \n" - "vld1.32 {d5[0]}, [%[din2]] \n" - "vld1.32 {d5[1]}, [%[din3]] \n" - "vld1.32 {d6[0]}, [%[din4]] \n" - "vld1.32 {d6[1]}, [%[din5]] \n" - - "vext.32 q4, q2, q3, #1 \n" - - "vld1.32 {d0[0]}, [%[wh]], r0 \n" - "vld1.32 {d0[1]}, [%[wh]], r0 \n" - "vld1.32 {d1[0]}, [%[wh]], r0 \n" - "vld1.32 {d1[1]}, [%[wh]], r0 \n" - "vld1.32 {d2[0]}, [%[wh]] \n" - - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q4 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d20, d20, d21 \n" - "vpadd.f32 d17, d18, d20 \n" - - "vmla.f32 d17, d6, d2[0] \n" - - // trn out neon register - "vtrn.32 d16, d17 \n" - - // add bias - "vadd.f32 q8, q8, q15 \n" - - // store result - "vst1.32 {d16}, [%[dout0]] \n" - "vst1.32 {d17}, [%[dout1]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for three out with extracting data post -//! 
deal with two lines out -void compute_four_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "mov r1, #12 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]], r1 \n" - "vld1.32 {d6-d7}, [%[din1]], r1 \n" - "vld1.32 {d8-d9}, [%[din2]], r1 \n" - "vld1.32 {d10-d11}, [%[din3]], r1 \n" - "vld1.32 {d12-d13}, [%[din4]], r1 \n" - "vld1.32 {d14-d15}, [%[din5]], r1 \n" - - //! out zero && two - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - "vmul.f32 d16, d0, d5 \n" - "vmul.f32 d17, d0, d7 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - "vmla.f32 d16, d2, d7 \n" - "vmla.f32 d17, d2, d9 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - "vmla.f32 d16, d24, d9 \n" - "vmla.f32 d17, d24, d11 \n" - - "vld1.32 {d28-d29}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - "vmla.f32 d16, d26, d11 \n" - "vmla.f32 d17, d26, d13 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vmla.f32 d16, d28, d13 \n" - "vmla.f32 d17, d28, d15 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d16, d16, d17 \n" - "vpadd.f32 d22, d18, d19 \n" - - //! out one - "vmov.f32 q15, #0.0 \n" - "vext.32 q2, q2, q15, #1 \n" - "vext.32 q3, q3, q15, #1 \n" - "vext.32 q4, q4, q15, #1 \n" - "vext.32 q5, q5, q15, #1 \n" - "vext.32 q6, q6, q15, #1 \n" - "vext.32 q7, q7, q15, #1 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - "vmov.i32 q5, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // relu - "vmax.f32 q11, q11, q5 \n" - - // store result - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - - //! 
out three - "sub %[wh], #80 \n" - "vld1.32 {d4[0]}, [%[din0]] \n" - "vld1.32 {d4[1]}, [%[din1]] \n" - "vld1.32 {d5[0]}, [%[din2]] \n" - "vld1.32 {d5[1]}, [%[din3]] \n" - "vld1.32 {d6[0]}, [%[din4]] \n" - "vld1.32 {d6[1]}, [%[din5]] \n" - - "vext.32 q4, q2, q3, #1 \n" - - "vld1.32 {d0[0]}, [%[wh]], r0 \n" - "vld1.32 {d0[1]}, [%[wh]], r0 \n" - "vld1.32 {d1[0]}, [%[wh]], r0 \n" - "vld1.32 {d1[1]}, [%[wh]], r0 \n" - "vld1.32 {d2[0]}, [%[wh]] \n" - - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q4 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d20, d20, d21 \n" - "vpadd.f32 d17, d18, d20 \n" - - "vmla.f32 d17, d6, d2[0] \n" - - // trn out neon register - "vtrn.32 d16, d17 \n" - - // add bias - "vadd.f32 q8, q8, q15 \n" - - // relu - "vmax.f32 q8, q8, q5 \n" - - // store result - "vst1.32 {d16}, [%[dout0]] \n" - "vst1.32 {d17}, [%[dout1]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -void conv_depthwise_5x5s1_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - float* zero_ptr = ctx->workspace_data<float>(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - int pad_new = pad > 4 ? 4 : pad; - int pad_0 = pad - pad_new; - int h_out_new = h_out - 2 * pad_0; - int mid_out = w_out - 2 * pad; - int mid_cnt = mid_out >> 2; - int mid_remain = mid_out - (mid_cnt << 2); - int pad_cnt = pad_0 >> 2; - int pad_remain = pad_0 - (pad_cnt << 2); - int bias_cnt = (w_out * pad_0) >> 2; - int bias_remain = (w_out * pad_0) - (bias_cnt << 2); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - float bias_c = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - float32x4_t vbias_c = vdupq_n_f32(bias_c); - if (flag_bias) { - //! deal with h_out pad_0 line with bias - for (int i = 0; i < bias_cnt; ++i) { - vst1q_f32(dout_ch, vbias_c); - dout_ch += 4; - } - for (int i = 0; i < bias_remain; ++i) { - *dout_ch++ = bias_c; - } - } else { - //! deal with h_out pad_0 line without bias - for (int i = 0; i < pad_0; ++i) { - memset(dout_ch, 0x00, w_out * sizeof(float)); - dout_ch += w_out; - } - } - const float* din_list[6]; - //! set din ptr with zero buffer - for (int i = 0; i < pad_new; ++i) { - din_list[i] = zero_ptr; - } - //! set din ptr with input data - for (int i = pad_new; i < 6; ++i) { - din_list[i] = din_ch; - din_ch += w_in; - } - //! every h loop, deal with 6 line input - const float* din0 = din_list[0]; - const float* din1 = din_list[1]; - const float* din2 = din_list[2]; - const float* din3 = din_list[3]; - const float* din4 = din_list[4]; - const float* din5 = din_list[5]; - - //! 
every h loop, deal with 2 line output - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - //! load weights to neon register - const float* weights_c = weights + c * weights_saptial_size; - - //! h loop - for (int h = 0; h < h_out_new; h += 2) { - //! (h - pad_new) + 7 > h_in - 1 - if (h + 6 - pad_new > h_in) { - switch (h + 6 - pad_new - h_in) { - case 5: - din1 = zero_ptr; - case 4: - din2 = zero_ptr; - case 3: - din3 = zero_ptr; - case 2: - din4 = zero_ptr; - case 1: - din5 = zero_ptr; - default: - break; - } - } - if (h + 2 > h_out_new) { - dout1 = write_ptr; - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - if (flag_bias) { - //! deal with w_out pad_0 column pre with bias - for (int i = 0; i < pad_cnt; i++) { - vst1q_f32(dout_ptr0, vbias_c); - vst1q_f32(dout_ptr1, vbias_c); - dout_ptr0 += 4; - dout_ptr1 += 4; - } - for (int i = 0; i < pad_remain; ++i) { - *dout_ptr0++ = bias_c; - *dout_ptr1++ = bias_c; - } - } else { - //! deal with w_out pad_0 column pre without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - dout_ptr0 += pad_0; - dout_ptr1 += pad_0; - } - - //! deal with w_out pad_new column pre - switch (pad_new) { - case 4: - compute_four_out_extract_pre(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 4; - dout_ptr1 += 4; - break; - case 3: - compute_three_out_extract_pre(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 3; - dout_ptr1 += 3; - break; - case 2: - compute_two_out_extract_pre(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 2; - dout_ptr1 += 2; - break; - case 1: - compute_one_out_extract_pre(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 1; - dout_ptr1 += 1; - break; - } - - //! mid loop - if (mid_cnt > 0) { - int mid_loop = mid_cnt; - const float* weights_ptr = weights_c; - asm volatile( - //! din: q7-q12 - //! dout: q13, q14 - "mov r1, #20 \n" - //! load weights - "vld1.32 {d0-d1}, [%[wh]], r1 \n" - "vld1.32 {d2-d3}, [%[wh]], r1 \n" - "vld1.32 {d4-d5}, [%[wh]], r1 \n" - "vld1.32 {d6-d7}, [%[wh]], r1 \n" - "vld1.32 {d8-d9}, [%[wh]] \n" - - "sub %[wh], #64 \n" - "vld1.32 {d10[0]}, [%[wh]], r1 \n" - "vld1.32 {d10[1]}, [%[wh]], r1 \n" - "vld1.32 {d11[0]}, [%[wh]], r1 \n" - "vld1.32 {d11[1]}, [%[wh]], r1 \n" - "vld1.32 {d12[0]}, [%[wh]] \n" - - //! load input - "mov r1, #4 \n" - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - //! load bias - "vld1.32 {d30-d31}, [%[bias]] \n" - - "1: \n" - //! 
add bias to output - "vmov.32 q13, q15 \n" - "vmov.32 q14, q15 \n" - - "pld [%[din0]] \n" - "pld [%[din1]] \n" - "pld [%[din2]] \n" - "pld [%[din3]] \n" - "pld [%[din4]] \n" - "pld [%[din5]] \n" - - // weights col 0 - "vmla.f32 q13, q7, d0[0] \n" - "vmla.f32 q14, q8, d0[0] \n" - - "vmla.f32 q13, q8, d2[0] \n" - "vmla.f32 q14, q9, d2[0] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d4[0] \n" - "vmla.f32 q14, q10, d4[0] \n" - - "vmla.f32 q13, q10, d6[0] \n" - "vmla.f32 q14, q11, d6[0] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d8[0] \n" - "vmla.f32 q14, q12, d8[0] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 1 - "vmla.f32 q13, q7, d0[1] \n" - "vmla.f32 q14, q8, d0[1] \n" - - "vmla.f32 q13, q8, d2[1] \n" - "vmla.f32 q14, q9, d2[1] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d4[1] \n" - "vmla.f32 q14, q10, d4[1] \n" - - "vmla.f32 q13, q10, d6[1] \n" - "vmla.f32 q14, q11, d6[1] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d8[1] \n" - "vmla.f32 q14, q12, d8[1] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 2 - "vmla.f32 q13, q7, d1[0] \n" - "vmla.f32 q14, q8, d1[0] \n" - - "vmla.f32 q13, q8, d3[0] \n" - "vmla.f32 q14, q9, d3[0] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d5[0] \n" - "vmla.f32 q14, q10, d5[0] \n" - - "vmla.f32 q13, q10, d7[0] \n" - "vmla.f32 q14, q11, d7[0] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d9[0] \n" - "vmla.f32 q14, q12, d9[0] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 3 - "vmla.f32 q13, q7, d1[1] \n" - "vmla.f32 q14, q8, d1[1] \n" - - "vmla.f32 q13, q8, d3[1] \n" - "vmla.f32 q14, q9, d3[1] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d5[1] \n" - "vmla.f32 q14, q10, d5[1] \n" - - "vmla.f32 q13, q10, d7[1] \n" - "vmla.f32 q14, q11, d7[1] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d9[1] \n" - "vmla.f32 q14, q12, d9[1] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 4 - "vmla.f32 q13, q7, d10[0] \n" - "vmla.f32 q14, q8, d10[0] \n" - - "vmla.f32 q13, q8, d10[1] \n" - "vmla.f32 q14, q9, d10[1] \n" - - "vmla.f32 q13, q9, d11[0] \n" - "vmla.f32 q14, q10, d11[0] \n" - - "vmla.f32 q13, q10, d11[1] \n" - "vmla.f32 q14, q11, d11[1] \n" - - "vmla.f32 q13, q11, d12[0] \n" - "vmla.f32 q14, q12, d12[0] \n" - - // store reslult - "vst1.32 {d26-d27}, [%[out0]]! \n" - "vst1.32 {d28-d29}, [%[out1]]! 
\n" - - "subs %[cnt], #1 \n" - "bne 1b \n" - - "sub %[din0], r1 \n" - "sub %[din1], r1 \n" - "sub %[din2], r1 \n" - "sub %[din3], r1 \n" - "sub %[din4], r1 \n" - "sub %[din5], r1 \n" - - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3), - [din4] "+r"(din_ptr4), - [din5] "+r"(din_ptr5), - [out0] "+r"(dout_ptr0), - [out1] "+r"(dout_ptr1), - [wh] "+r"(weights_ptr), - [cnt] "+r"(mid_loop) - : [bias] "r"(vbias) - : "cc", - "memory", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - //! deal with mid remain - for (int i = 0; i < mid_remain; ++i) { - compute_one_out_without_extract(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - din_ptr0++; - din_ptr1++; - din_ptr2++; - din_ptr3++; - din_ptr4++; - din_ptr5++; - - dout_ptr0++; - dout_ptr1++; - } - //! deal with w_out pad_new column post - switch (pad_new) { - case 4: - compute_four_out_extract_post(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 4; - dout_ptr1 += 4; - break; - case 3: - compute_three_out_extract_post(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 3; - dout_ptr1 += 3; - break; - case 2: - compute_two_out_extract_post(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 2; - dout_ptr1 += 2; - break; - case 1: - compute_one_out_extract_post(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 1; - dout_ptr1 += 1; - break; - } - - if (flag_bias) { - //! deal with w_out pad_0 column post with bias - memcpy(dout_ptr0, dout0, pad_0 * sizeof(float)); - memcpy(dout_ptr1, dout1, pad_0 * sizeof(float)); - } else { - //! deal with w_out pad_0 column post without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - } - - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din5; - din4 = din3 + w_in; - din5 = din4 + w_in; - - dout0 = dout1 + w_out; - dout1 = dout0 + w_out; - } - float* dout_pad_end = dout_ch + h_out_new * w_out; - if (flag_bias) { - //! deal with h_out pad_0 line with bias - memcpy(reinterpret_cast<void*>(dout_pad_end), - dout_ch - pad_0 * w_out, - pad_0 * w_out * sizeof(float)); - } else { - //! deal with h_out pad_0 line without bias - memset(reinterpret_cast<void*>(dout_pad_end), - 0x00, - pad_0 * w_out * sizeof(float)); - } - } - } -} - -void conv_depthwise_5x5s1_relu_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - float* zero_ptr = ctx->workspace_data<float>(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - int pad_new = pad > 4 ? 
4 : pad; - int pad_0 = pad - pad_new; - int h_out_new = h_out - 2 * pad_0; - int mid_out = w_out - 2 * pad; - int mid_cnt = mid_out >> 2; - int mid_remain = mid_out - (mid_cnt << 2); - int pad_cnt = pad_0 >> 2; - int pad_remain = pad_0 - (pad_cnt << 2); - int bias_cnt = (w_out * pad_0) >> 2; - int bias_remain = (w_out * pad_0) - (bias_cnt << 2); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - float bias_c = flag_bias ? bias[c] : 0.f; - float bias_relu = bias_c > 0.f ? bias_c : 0.f; - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - float32x4_t vbias_c = vdupq_n_f32(bias_relu); - if (flag_bias) { - //! deal with h_out pad_0 line with bias - for (int i = 0; i < bias_cnt; ++i) { - vst1q_f32(dout_ch, vbias_c); - dout_ch += 4; - } - for (int i = 0; i < bias_remain; ++i) { - *dout_ch++ = bias_relu; - } - } else { - //! deal with h_out pad_0 line without bias - for (int i = 0; i < pad_0; ++i) { - memset(dout_ch, 0x00, w_out * sizeof(float)); - dout_ch += w_out; - } - } - const float* din_list[6]; - //! set din ptr with zero buffer - for (int i = 0; i < pad_new; ++i) { - din_list[i] = zero_ptr; - } - //! set din ptr with input data - for (int i = pad_new; i < 6; ++i) { - din_list[i] = din_ch; - din_ch += w_in; - } - //! every h loop, deal with 6 line input - const float* din0 = din_list[0]; - const float* din1 = din_list[1]; - const float* din2 = din_list[2]; - const float* din3 = din_list[3]; - const float* din4 = din_list[4]; - const float* din5 = din_list[5]; - - //! every h loop, deal with 2 line output - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - //! load weights to neon register - const float* weights_c = weights + c * weights_saptial_size; - - //! h loop - for (int h = 0; h < h_out_new; h += 2) { - //! (h - pad_new) + 7 > h_in - 1 - if (h + 6 - pad_new > h_in) { - switch (h + 6 - pad_new - h_in) { - case 5: - din1 = zero_ptr; - case 4: - din2 = zero_ptr; - case 3: - din3 = zero_ptr; - case 2: - din4 = zero_ptr; - case 1: - din5 = zero_ptr; - default: - break; - } - } - if (h + 2 > h_out_new) { - dout1 = write_ptr; - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - if (flag_bias) { - //! deal with w_out pad_0 column pre with bias - for (int i = 0; i < pad_cnt; i++) { - vst1q_f32(dout_ptr0, vbias_c); - vst1q_f32(dout_ptr1, vbias_c); - dout_ptr0 += 4; - dout_ptr1 += 4; - } - for (int i = 0; i < pad_remain; ++i) { - *dout_ptr0++ = bias_relu; - *dout_ptr1++ = bias_relu; - } - } else { - //! deal with w_out pad_0 column pre without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - dout_ptr0 += pad_0; - dout_ptr1 += pad_0; - } - - //! 
deal with w_out pad_new column pre - switch (pad_new) { - case 4: - compute_four_out_extract_pre_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 4; - dout_ptr1 += 4; - break; - case 3: - compute_three_out_extract_pre_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 3; - dout_ptr1 += 3; - break; - case 2: - compute_two_out_extract_pre_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 2; - dout_ptr1 += 2; - break; - case 1: - compute_one_out_extract_pre_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 1; - dout_ptr1 += 1; - break; - } - - //! mid loop - if (mid_cnt > 0) { - int mid_loop = mid_cnt; - const float* weights_ptr = weights_c; - asm volatile( - //! din: q7-q12 - //! dout: q13, q14 - "mov r1, #20 \n" - "vmov.i32 q15, #0x0 \n" - //! load weights - "vld1.32 {d0-d1}, [%[wh]], r1 \n" - "vld1.32 {d2-d3}, [%[wh]], r1 \n" - "vld1.32 {d4-d5}, [%[wh]], r1 \n" - "vld1.32 {d6-d7}, [%[wh]], r1 \n" - "vld1.32 {d8-d9}, [%[wh]] \n" - - "sub %[wh], #64 \n" - "vld1.32 {d10[0]}, [%[wh]], r1 \n" - "vld1.32 {d10[1]}, [%[wh]], r1 \n" - "vld1.32 {d11[0]}, [%[wh]], r1 \n" - "vld1.32 {d11[1]}, [%[wh]], r1 \n" - "vld1.32 {d12[0]}, [%[wh]] \n" - - //! load input - "mov r1, #4 \n" - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - "1: \n" - - //! 
load bias to output - "vld1.32 {d26-d27}, [%[bias]] \n" - "vld1.32 {d28-d29}, [%[bias]] \n" - - "pld [%[din0]] \n" - "pld [%[din1]] \n" - "pld [%[din2]] \n" - "pld [%[din3]] \n" - "pld [%[din4]] \n" - "pld [%[din5]] \n" - - // weights col 0 - "vmla.f32 q13, q7, d0[0] \n" - "vmla.f32 q14, q8, d0[0] \n" - - "vmla.f32 q13, q8, d2[0] \n" - "vmla.f32 q14, q9, d2[0] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d4[0] \n" - "vmla.f32 q14, q10, d4[0] \n" - - "vmla.f32 q13, q10, d6[0] \n" - "vmla.f32 q14, q11, d6[0] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d8[0] \n" - "vmla.f32 q14, q12, d8[0] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 1 - "vmla.f32 q13, q7, d0[1] \n" - "vmla.f32 q14, q8, d0[1] \n" - - "vmla.f32 q13, q8, d2[1] \n" - "vmla.f32 q14, q9, d2[1] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d4[1] \n" - "vmla.f32 q14, q10, d4[1] \n" - - "vmla.f32 q13, q10, d6[1] \n" - "vmla.f32 q14, q11, d6[1] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d8[1] \n" - "vmla.f32 q14, q12, d8[1] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 2 - "vmla.f32 q13, q7, d1[0] \n" - "vmla.f32 q14, q8, d1[0] \n" - - "vmla.f32 q13, q8, d3[0] \n" - "vmla.f32 q14, q9, d3[0] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d5[0] \n" - "vmla.f32 q14, q10, d5[0] \n" - - "vmla.f32 q13, q10, d7[0] \n" - "vmla.f32 q14, q11, d7[0] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d9[0] \n" - "vmla.f32 q14, q12, d9[0] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 3 - "vmla.f32 q13, q7, d1[1] \n" - "vmla.f32 q14, q8, d1[1] \n" - - "vmla.f32 q13, q8, d3[1] \n" - "vmla.f32 q14, q9, d3[1] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d5[1] \n" - "vmla.f32 q14, q10, d5[1] \n" - - "vmla.f32 q13, q10, d7[1] \n" - "vmla.f32 q14, q11, d7[1] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d9[1] \n" - "vmla.f32 q14, q12, d9[1] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 4 - "vmla.f32 q13, q7, d10[0] \n" - "vmla.f32 q14, q8, d10[0] \n" - - "vmla.f32 q13, q8, d10[1] \n" - "vmla.f32 q14, q9, d10[1] \n" - - "vmla.f32 q13, q9, d11[0] \n" - "vmla.f32 q14, q10, d11[0] \n" - - "vmla.f32 q13, q10, d11[1] \n" - "vmla.f32 q14, q11, d11[1] \n" - - "vmla.f32 q13, q11, d12[0] \n" - "vmla.f32 q14, q12, d12[0] \n" - - // relu - "vmax.f32 q13, q13, q15 \n" - "vmax.f32 q14, q14, q15 \n" - - // store result - "vst1.32 {d26-d27}, [%[out0]]! \n" - "vst1.32 {d28-d29}, [%[out1]]! 
\n" - - "subs %[cnt], #1 \n" - "bne 1b \n" - - "sub %[din0], r1 \n" - "sub %[din1], r1 \n" - "sub %[din2], r1 \n" - "sub %[din3], r1 \n" - "sub %[din4], r1 \n" - "sub %[din5], r1 \n" - - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3), - [din4] "+r"(din_ptr4), - [din5] "+r"(din_ptr5), - [out0] "+r"(dout_ptr0), - [out1] "+r"(dout_ptr1), - [wh] "+r"(weights_ptr), - [cnt] "+r"(mid_loop) - : [bias] "r"(vbias) - : "cc", - "memory", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - //! deal with mid remain - for (int i = 0; i < mid_remain; ++i) { - compute_one_out_without_extract_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - din_ptr0++; - din_ptr1++; - din_ptr2++; - din_ptr3++; - din_ptr4++; - din_ptr5++; - - dout_ptr0++; - dout_ptr1++; - } - //! deal with w_out pad_new column post - switch (pad_new) { - case 4: - compute_four_out_extract_post_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 4; - dout_ptr1 += 4; - break; - case 3: - compute_three_out_extract_post_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 3; - dout_ptr1 += 3; - break; - case 2: - compute_two_out_extract_post_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 2; - dout_ptr1 += 2; - break; - case 1: - compute_one_out_extract_post_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 1; - dout_ptr1 += 1; - break; - } - - if (flag_bias) { - //! deal with w_out pad_0 column post with bias - memcpy(dout_ptr0, dout0, pad_0 * sizeof(float)); - memcpy(dout_ptr1, dout1, pad_0 * sizeof(float)); - } else { - //! deal with w_out pad_0 column post without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - } - - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din5; - din4 = din3 + w_in; - din5 = din4 + w_in; - - dout0 = dout1 + w_out; - dout1 = dout0 + w_out; - } - float* dout_pad_end = dout_ch + h_out_new * w_out; - if (flag_bias) { - //! deal with h_out pad_0 line with bias - memcpy(reinterpret_cast<void*>(dout_pad_end), - dout_ch - pad_0 * w_out, - pad_0 * w_out * sizeof(float)); - } else { - //! deal with h_out pad_0 line without bias - memset(reinterpret_cast<void*>(dout_pad_end), - 0x00, - pad_0 * w_out * sizeof(float)); - } - } - } -} - -void conv_depthwise_5x5s1_small_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - int pad_new = pad > 4 ? 
4 : pad; - int pad_0 = pad - pad_new; - int h_in_new = h_in + 2 * pad_new; - int w_in_new = w_in + 2 * pad_new; - int h_out_new = h_out - 2 * pad_0; - int w_out_new = w_out - 2 * pad_0; - float zero_ptr[w_in_new + w_out]; - memset(zero_ptr, 0, w_in_new * sizeof(float)); - float* write_ptr = zero_ptr + w_in_new; - int pad_cnt = pad_0 >> 2; - int pad_remain = pad_0 - (pad_cnt << 2); - int bias_cnt = (w_out * pad_0) >> 2; - int bias_remain = (w_out * pad_0) - (bias_cnt << 2); - int in_spatial_size = w_in_new * h_in_new; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - float* din_new = prepad_input(din, num, ch_in, h_in, w_in, pad_new); - for (int n = 0; n < num; ++n) { - const float* din_batch = din_new + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - float bias_c = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - float32x4_t vbias_c = vdupq_n_f32(bias_c); - if (flag_bias) { - //! deal with h_out pad_0 line with bias - for (int i = 0; i < bias_cnt; ++i) { - vst1q_f32(dout_ch, vbias_c); - dout_ch += 4; - } - for (int i = 0; i < bias_remain; ++i) { - *dout_ch++ = bias_c; - } - } else { - //! deal with h_out pad_0 line without bias - for (int i = 0; i < pad_0; ++i) { - memset(dout_ch, 0x00, w_out * sizeof(float)); - dout_ch += w_out; - } - } - //! every h loop, deal with 6 line input - const float* din0 = din_ch; - const float* din1 = din0 + w_in_new; - const float* din2 = din1 + w_in_new; - const float* din3 = din2 + w_in_new; - const float* din4 = din3 + w_in_new; - const float* din5 = din4 + w_in_new; - //! every h loop, deal with 2 line output - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - - //! h loop - for (int h = 0; h < h_out_new; h += 2) { - //! (h - pad_new) + 6 > h_in - 1 - if (h + 6 > h_in_new) { - switch (h + 6 - h_in_new) { - case 5: - din1 = zero_ptr; - case 4: - din2 = zero_ptr; - case 3: - din3 = zero_ptr; - case 2: - din4 = zero_ptr; - case 1: - din5 = zero_ptr; - default: - break; - } - } - if (h + 2 > h_out_new) { - dout1 = write_ptr; - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - - if (flag_bias) { - //! deal with w_out pad_0 column pre with bias - for (int i = 0; i < pad_cnt; i++) { - vst1q_f32(dout_ptr0, vbias_c); - vst1q_f32(dout_ptr1, vbias_c); - dout_ptr0 += 4; - dout_ptr1 += 4; - } - for (int i = 0; i < pad_remain; ++i) { - *dout_ptr0++ = bias_c; - *dout_ptr1++ = bias_c; - } - } else { - //! deal with w_out pad_0 column pre without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - dout_ptr0 += pad_0; - dout_ptr1 += pad_0; - } - //! mid loop - for (int i = 0; i < w_out_new; ++i) { - compute_one_out_without_extract(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - din_ptr0++; - din_ptr1++; - din_ptr2++; - din_ptr3++; - din_ptr4++; - din_ptr5++; - - dout_ptr0++; - dout_ptr1++; - } - if (flag_bias) { - //! 
deal with w_out pad_0 column post with bias - memcpy(dout_ptr0, dout0, pad_0 * sizeof(float)); - memcpy(dout_ptr1, dout1, pad_0 * sizeof(float)); - } else { - //! deal with w_out pad_0 column post without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - } - - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din5; - din4 = din3 + w_in_new; - din5 = din4 + w_in_new; - - dout0 = dout1 + w_out; - dout1 = dout0 + w_out; - } - float* dout_pad_end = dout_ch + h_out_new * w_out; - if (flag_bias) { - //! deal with h_out pad_0 line with bias - memcpy(reinterpret_cast<void*>(dout_pad_end), - dout_ch - pad_0 * w_out, - pad_0 * w_out * sizeof(float)); - } else { - //! deal with h_out pad_0 line without bias - memset(reinterpret_cast<void*>(dout_pad_end), - 0x00, - pad_0 * w_out * sizeof(float)); - } - } - } - free(din_new); -} - -void conv_depthwise_5x5s1_small_relu_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - int pad_new = pad > 4 ? 4 : pad; - int pad_0 = pad - pad_new; - int h_in_new = h_in + 2 * pad_new; - int w_in_new = w_in + 2 * pad_new; - int h_out_new = h_out - 2 * pad_0; - int w_out_new = w_out - 2 * pad_0; - float zero_ptr[w_in_new + w_out]; - memset(zero_ptr, 0, w_in_new * sizeof(float)); - float* write_ptr = zero_ptr + w_in_new; - int pad_cnt = pad_0 >> 2; - int pad_remain = pad_0 - (pad_cnt << 2); - int bias_cnt = (w_out * pad_0) >> 2; - int bias_remain = (w_out * pad_0) - (bias_cnt << 2); - int in_spatial_size = w_in_new * h_in_new; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - float* din_new = prepad_input(din, num, ch_in, h_in, w_in, pad_new); - for (int n = 0; n < num; ++n) { - const float* din_batch = din_new + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - float bias_c = flag_bias ? bias[c] : 0.f; - float bias_relu = bias_c > 0.f ? bias_c : 0.f; - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - float32x4_t vbias_c = vdupq_n_f32(bias_relu); - if (flag_bias) { - //! deal with h_out pad_0 line with bias - for (int i = 0; i < bias_cnt; ++i) { - vst1q_f32(dout_ch, vbias_c); - dout_ch += 4; - } - for (int i = 0; i < bias_remain; ++i) { - *dout_ch++ = bias_relu; - } - } else { - //! deal with h_out pad_0 line without bias - for (int i = 0; i < pad_0; ++i) { - memset(dout_ch, 0x00, w_out * sizeof(float)); - dout_ch += w_out; - } - } - //! every h loop, deal with 6 line input - const float* din0 = din_ch; - const float* din1 = din0 + w_in_new; - const float* din2 = din1 + w_in_new; - const float* din3 = din2 + w_in_new; - const float* din4 = din3 + w_in_new; - const float* din5 = din4 + w_in_new; - //! every h loop, deal with 2 line output - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - - //! h loop - for (int h = 0; h < h_out_new; h += 2) { - //! 
(h - pad_new) + 6 > h_in - 1 - if (h + 6 > h_in_new) { - switch (h + 6 - h_in_new) { - case 5: - din1 = zero_ptr; - case 4: - din2 = zero_ptr; - case 3: - din3 = zero_ptr; - case 2: - din4 = zero_ptr; - case 1: - din5 = zero_ptr; - default: - break; - } - } - if (h + 2 > h_out_new) { - dout1 = write_ptr; - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - - if (flag_bias) { - //! deal with w_out pad_0 column pre with bias - for (int i = 0; i < pad_cnt; i++) { - vst1q_f32(dout_ptr0, vbias_c); - vst1q_f32(dout_ptr1, vbias_c); - dout_ptr0 += 4; - dout_ptr1 += 4; - } - for (int i = 0; i < pad_remain; ++i) { - *dout_ptr0++ = bias_relu; - *dout_ptr1++ = bias_relu; - } - } else { - //! deal with w_out pad_0 column pre without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - dout_ptr0 += pad_0; - dout_ptr1 += pad_0; - } - //! mid loop - for (int i = 0; i < w_out_new; ++i) { - compute_one_out_without_extract_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - din_ptr0++; - din_ptr1++; - din_ptr2++; - din_ptr3++; - din_ptr4++; - din_ptr5++; - - dout_ptr0++; - dout_ptr1++; - } - if (flag_bias) { - //! deal with w_out pad_0 column post with bias - memcpy(dout_ptr0, dout0, pad_0 * sizeof(float)); - memcpy(dout_ptr1, dout1, pad_0 * sizeof(float)); - } else { - //! deal with w_out pad_0 column post without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - } - - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din5; - din4 = din3 + w_in_new; - din5 = din4 + w_in_new; - - dout0 = dout1 + w_out; - dout1 = dout0 + w_out; - } - float* dout_pad_end = dout_ch + h_out_new * w_out; - if (flag_bias) { - //! deal with h_out pad_0 line with bias - memcpy(reinterpret_cast<void*>(dout_pad_end), - dout_ch - pad_0 * w_out, - pad_0 * w_out * sizeof(float)); - } else { - //! 
deal with h_out pad_0 line without bias - memset(reinterpret_cast<void*>(dout_pad_end), - 0x00, - pad_0 * w_out * sizeof(float)); - } - } - } - free(din_new); -} -#endif // __aarch64__ - -void conv_depthwise_5x5s1(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - if (win < 4) { - if (flag_relu) { - conv_depthwise_5x5s1_small_relu_impl(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - pad, - flag_bias, - flag_relu, - ctx); - } else { - conv_depthwise_5x5s1_small_impl(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - pad, - flag_bias, - flag_relu, - ctx); - } - } else { - if (flag_relu) { - conv_depthwise_5x5s1_relu_impl(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - pad, - flag_bias, - flag_relu, - ctx); - } else { - conv_depthwise_5x5s1_impl(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - pad, - flag_bias, - flag_relu, - ctx); - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_5x5s1_int8.cc b/lite/backends/arm/math/conv_depthwise_5x5s1_int8.cc deleted file mode 100644 index 0d0034dd85..0000000000 --- a/lite/backends/arm/math/conv_depthwise_5x5s1_int8.cc +++ /dev/null @@ -1,618 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
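// Reference semantics for the kernels removed in these two files: a 5x5,
// stride-1 depthwise convolution over NCHW data with an optional per-channel
// bias and an optionally fused ReLU. The scalar sketch below is illustrative
// only; the helper name, the layout assumptions (ch x 25 row-major weights),
// and the zero-padded border handling are editorial assumptions, not code
// carried by this patch.
#include <algorithm>

static void conv_dw_5x5s1_ref(const float* din, float* dout, int ch,
                              int h_in, int w_in, int pad,
                              const float* weights,  // ch x 25, row-major 5x5
                              const float* bias,     // ch entries, or nullptr
                              bool flag_relu) {
  // stride 1 with symmetric padding: out = in + 2 * pad - 4
  int h_out = h_in + 2 * pad - 4;
  int w_out = w_in + 2 * pad - 4;
  for (int c = 0; c < ch; ++c) {
    const float* din_c = din + c * h_in * w_in;
    const float* w_c = weights + c * 25;  // one 5x5 filter per channel
    float* dout_c = dout + c * h_out * w_out;
    float bias_c = bias ? bias[c] : 0.f;
    for (int oh = 0; oh < h_out; ++oh) {
      for (int ow = 0; ow < w_out; ++ow) {
        float sum = bias_c;
        for (int kh = 0; kh < 5; ++kh) {
          for (int kw = 0; kw < 5; ++kw) {
            int ih = oh - pad + kh;  // input row; reads as zero off-image
            int iw = ow - pad + kw;  // input col; reads as zero off-image
            if (ih >= 0 && ih < h_in && iw >= 0 && iw < w_in) {
              sum += din_c[ih * w_in + iw] * w_c[kh * 5 + kw];
            }
          }
        }
        dout_c[oh * w_out + ow] = flag_relu ? std::max(sum, 0.f) : sum;
      }
    }
  }
}
// The NEON kernels compute the same result per element; they only reorder
// the work (two output rows per pass, four outputs per inner iteration, and
// dedicated border kernels for the padded columns).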
- -#include <arm_neon.h> -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/operators/op_params.h" -#ifdef ARM_WITH_OMP -#include <omp.h> -#endif - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_5x5s1_int8(int32_t* dout, - const int8_t* din, - const int8_t* weights, - const int* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int chin, - const int hin, - const int win, - const int hout, - const int wout, - ARMContext* ctx, - PrecisionType out_type, - const float* scale); - -void conv_depthwise_5x5_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - ARMContext* ctx, - PrecisionType out_type, - const float* scale) { - int stride_h = param.strides[0]; - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - // if (param.activation_param.has_active){ - // if (param.activation_param.active == Active_relu || - // fabs(param.activation_param.negative_slope) > 1e-6f){ - // flag_relu = true; - // } - // } - if (stride_h == 1) { -#ifdef __aarch64__ - conv_depthwise_5x5s1_int8(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - chin, - hin, - win, - hout, - wout, - ctx, - out_type, - scale); -#else - - LOG(FATAL) << "the 5x5 dw conv int8 kernel has not been implemented for armv7"; -#endif - } -} - -/** - * \brief depthwise convolution, kernel size 5x5, stride 1, pad 2, with bias, - * width > 4 - */ -// 3 output lines per iteration -#ifdef __aarch64__ - -template <typename Dtype> -inline void prefetch(const Dtype* din) { -#ifdef __aarch64__ - asm volatile("PRFM PLDL1KEEP, [%[din]] \n" : : [din] "r"(din) : "memory"); -#else - asm volatile("pld [%[din]] \n" : : [din] "r"(din) : "memory"); -#endif -} - -void conv_depthwise_5x5s1_int8( - int32_t* dout, - const int8_t* din, - const int8_t* weights, - const int32_t* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int chin, - const int hin, - const int win, - const int hout, - const int wout, - ARMContext* ctx, - PrecisionType od_type, - float const* scales) { /// scale_size = channel-out - - // printf("5*5 multiply\n"); - int size_in_channel = win * hin; - int size_out_channel = wout * hout; - int w_stride = 5 * 5; - - static int const stride_w = 1; - int const stride_h = stride_w; - int const chout = chin; - int const pad_w = 2; - int const pad_h = pad_w; - - int const wout_round = ((wout + 7) / 8) * 8; - int const win_round = wout_round * stride_w + 5 - 1; - int const hout_round = ((hout + 2) / 3) * 3; - int const hin_round = hout_round * stride_h + 5 - 1; - int const tile_h = hout_round / 3; - int const tile_w = wout_round / 8; - - int const pre_in_size = hin_round * win_round; - int const pre_out_size = hout_round * wout_round; - int const pre_io_size = pre_in_size + pre_out_size * sizeof(int); - - int const hs = -pad_h; - int const he = hs + hin_round; - int const ws = -pad_w; - int const we = ws + win_round; - - // signed char* tmp_work_space = new signed char [1024*5]; - signed char* tmp_work_space = ctx->workspace_data<signed char>(); - signed char* ptr_zero = tmp_work_space; - int* ptr_write = reinterpret_cast<int*>(ptr_zero + win_round); - signed char* pre_data = - reinterpret_cast<signed char*>(ptr_write + wout_round); - - memset(ptr_zero, 0, win_round * sizeof(signed char)); - - for (int n = 0; n < num; ++n) { - signed char const* din_batch = din + n * chin * size_in_channel; - int* dout_batch
= dout + n * chout * size_out_channel; - - // #pragma omp parallel for - for (int c = 0; c < chout; c++) { -#ifdef ARM_WITH_OMP - int const thno = omp_get_thread_num(); -#else - int const thno = 0; -#endif - signed char const* din_channel = din_batch + c * size_in_channel; - signed char* pre_din = pre_data + thno * pre_io_size; - int* pre_out = reinterpret_cast(pre_din + pre_in_size); - int* dout_ptr = pre_out; - - prepack_input_nxw(din_channel, - pre_din, - c, - c + 1, - hs, - he, - ws, - we, - 1, - win, - hin, - ptr_zero); - - signed char const* wei_ptr = weights + c * w_stride; - int bias_val = flag_bias ? bias[c] : 0.f; - - int8x8_t wr00 = vdup_n_s8(wei_ptr[0 * 5 + 0]); - int8x8_t wr01 = vdup_n_s8(wei_ptr[0 * 5 + 1]); - int8x8_t wr02 = vdup_n_s8(wei_ptr[0 * 5 + 2]); - int8x8_t wr03 = vdup_n_s8(wei_ptr[0 * 5 + 3]); - int8x8_t wr04 = vdup_n_s8(wei_ptr[0 * 5 + 4]); - - int8x8_t wr10 = vdup_n_s8(wei_ptr[1 * 5 + 0]); - int8x8_t wr11 = vdup_n_s8(wei_ptr[1 * 5 + 1]); - int8x8_t wr12 = vdup_n_s8(wei_ptr[1 * 5 + 2]); - int8x8_t wr13 = vdup_n_s8(wei_ptr[1 * 5 + 3]); - int8x8_t wr14 = vdup_n_s8(wei_ptr[1 * 5 + 4]); - - int8x8_t wr20 = vdup_n_s8(wei_ptr[2 * 5 + 0]); - int8x8_t wr21 = vdup_n_s8(wei_ptr[2 * 5 + 1]); - int8x8_t wr22 = vdup_n_s8(wei_ptr[2 * 5 + 2]); - int8x8_t wr23 = vdup_n_s8(wei_ptr[2 * 5 + 3]); - int8x8_t wr24 = vdup_n_s8(wei_ptr[2 * 5 + 4]); - - int8x8_t wr30 = vdup_n_s8(wei_ptr[3 * 5 + 0]); - int8x8_t wr31 = vdup_n_s8(wei_ptr[3 * 5 + 1]); - int8x8_t wr32 = vdup_n_s8(wei_ptr[3 * 5 + 2]); - int8x8_t wr33 = vdup_n_s8(wei_ptr[3 * 5 + 3]); - int8x8_t wr34 = vdup_n_s8(wei_ptr[3 * 5 + 4]); - - int8x8_t wr40 = vdup_n_s8(wei_ptr[4 * 5 + 0]); - int8x8_t wr41 = vdup_n_s8(wei_ptr[4 * 5 + 1]); - int8x8_t wr42 = vdup_n_s8(wei_ptr[4 * 5 + 2]); - int8x8_t wr43 = vdup_n_s8(wei_ptr[4 * 5 + 3]); - int8x8_t wr44 = vdup_n_s8(wei_ptr[4 * 5 + 4]); - - int* doutr0 = nullptr; - int* doutr1 = nullptr; - int* doutr2 = nullptr; - - signed char const* dr0 = pre_din; - signed char const* dr1 = dr0 + win_round; - signed char const* dr2 = dr1 + win_round; - signed char const* dr3 = dr2 + win_round; - signed char const* dr4 = dr3 + win_round; - signed char const* dr5 = dr4 + win_round; - signed char const* dr6 = dr5 + win_round; - - signed char const* din_ptr0 = nullptr; - signed char const* din_ptr1 = nullptr; - signed char const* din_ptr2 = nullptr; - signed char const* din_ptr3 = nullptr; - signed char const* din_ptr4 = nullptr; - signed char const* din_ptr5 = nullptr; - signed char const* din_ptr6 = nullptr; - - for (int h = 0; h < tile_h; h++) { - // printf("c:%d h:%d\n", c, h); - doutr0 = dout_ptr; - doutr1 = doutr0 + wout_round; - doutr2 = doutr1 + wout_round; - - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - din_ptr6 = dr6; - - prefetch(doutr0); - prefetch(doutr1); - prefetch(doutr2); - prefetch(din_ptr0); - prefetch(din_ptr1); - prefetch(din_ptr2); - prefetch(din_ptr3); - prefetch(din_ptr4); - prefetch(din_ptr5); - prefetch(din_ptr6); - - for (int j = 0; j < tile_w; ++j) { - // printf("j:%d\n", j); - int32x4_t voutr00 = vdupq_n_s32(bias_val); - int32x4_t voutr01 = vdupq_n_s32(bias_val); - int32x4_t voutr10 = vdupq_n_s32(bias_val); - int32x4_t voutr11 = vdupq_n_s32(bias_val); - int32x4_t voutr20 = vdupq_n_s32(bias_val); - int32x4_t voutr21 = vdupq_n_s32(bias_val); - - // din data - int8x8_t vinr00 = vld1_s8(din_ptr0 + 0); - int8x8_t vinr01 = vld1_s8(din_ptr0 + 8); - int8x8_t vinr10 = vld1_s8(din_ptr1 + 0); - int8x8_t vinr11 = vld1_s8(din_ptr1 
+ 8); - int8x8_t vinr20 = vld1_s8(din_ptr2 + 0); - int8x8_t vinr21 = vld1_s8(din_ptr2 + 8); - int8x8_t vinr30 = vld1_s8(din_ptr3 + 0); - int8x8_t vinr31 = vld1_s8(din_ptr3 + 8); - int8x8_t vinr40 = vld1_s8(din_ptr4 + 0); - int8x8_t vinr41 = vld1_s8(din_ptr4 + 8); - int8x8_t vinr50 = vld1_s8(din_ptr5 + 0); - int8x8_t vinr51 = vld1_s8(din_ptr5 + 8); - int8x8_t vinr60 = vld1_s8(din_ptr6 + 0); - int8x8_t vinr61 = vld1_s8(din_ptr6 + 8); - - /// the first row - // r0 - int8x8_t vtmp1 = vext_s8(vinr00, vinr01, 1); // 12345678 - int8x8_t vtmp2 = vext_s8(vinr00, vinr01, 2); // 2345678 - int8x8_t vtmp3 = vext_s8(vinr00, vinr01, 3); // 345678 - int8x8_t vtmp4 = vext_s8(vinr00, vinr01, 4); // 45678 - - int16x8_t tvoutr0 = vmull_s8(vinr00, wr00); - tvoutr0 = vmlal_s8(tvoutr0, vtmp1, wr01); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp2, wr02); - tvoutr0 = vmlal_s8(tvoutr0, vtmp3, wr03); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp4, wr04); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - - // r1 - vtmp1 = vext_s8(vinr10, vinr11, 1); // 12345678 - vtmp2 = vext_s8(vinr10, vinr11, 2); // 2345678 - vtmp3 = vext_s8(vinr10, vinr11, 3); // 345678 - vtmp4 = vext_s8(vinr10, vinr11, 4); // 45678 - - tvoutr0 = vmull_s8(vinr10, wr10); - tvoutr0 = vmlal_s8(tvoutr0, vtmp1, wr11); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp2, wr12); - tvoutr0 = vmlal_s8(tvoutr0, vtmp3, wr13); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp4, wr14); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - - int16x8_t tvoutr1 = vmull_s8(vinr10, wr00); - tvoutr1 = vmlal_s8(tvoutr1, vtmp1, wr01); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp2, wr02); - tvoutr1 = vmlal_s8(tvoutr1, vtmp3, wr03); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp4, wr04); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - - // r2 - vtmp1 = vext_s8(vinr20, vinr21, 1); // 12345678 - vtmp2 = vext_s8(vinr20, vinr21, 2); // 2345678 - vtmp3 = vext_s8(vinr20, vinr21, 3); // 345678 - vtmp4 = vext_s8(vinr20, vinr21, 4); // 45678 - - tvoutr0 = vmull_s8(vinr20, wr20); - tvoutr0 = vmlal_s8(tvoutr0, vtmp1, wr21); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp2, wr22); - tvoutr0 = vmlal_s8(tvoutr0, vtmp3, wr23); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp4, wr24); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - - tvoutr1 = vmull_s8(vinr20, wr10); - tvoutr1 = vmlal_s8(tvoutr1, vtmp1, wr11); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp2, wr12); - tvoutr1 = vmlal_s8(tvoutr1, vtmp3, wr13); - voutr10 = vaddw_s16(voutr10, 
vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp4, wr14); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - - int16x8_t tvoutr2 = vmull_s8(vinr20, wr00); - tvoutr2 = vmlal_s8(tvoutr2, vtmp1, wr01); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp2, wr02); - tvoutr2 = vmlal_s8(tvoutr2, vtmp3, wr03); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp4, wr04); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - - // r3 - vtmp1 = vext_s8(vinr30, vinr31, 1); // 12345678 - vtmp2 = vext_s8(vinr30, vinr31, 2); // 2345678 - vtmp3 = vext_s8(vinr30, vinr31, 3); // 345678 - vtmp4 = vext_s8(vinr30, vinr31, 4); // 45678 - - tvoutr0 = vmull_s8(vinr30, wr30); - tvoutr0 = vmlal_s8(tvoutr0, vtmp1, wr31); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp2, wr32); - tvoutr0 = vmlal_s8(tvoutr0, vtmp3, wr33); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp4, wr34); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - - tvoutr1 = vmull_s8(vinr30, wr20); - tvoutr1 = vmlal_s8(tvoutr1, vtmp1, wr21); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp2, wr22); - tvoutr1 = vmlal_s8(tvoutr1, vtmp3, wr23); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp4, wr24); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - - tvoutr2 = vmull_s8(vinr30, wr10); - tvoutr2 = vmlal_s8(tvoutr2, vtmp1, wr11); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp2, wr12); - tvoutr2 = vmlal_s8(tvoutr2, vtmp3, wr13); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp4, wr14); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - - // r4 - vtmp1 = vext_s8(vinr40, vinr41, 1); // 12345678 - vtmp2 = vext_s8(vinr40, vinr41, 2); // 2345678 - vtmp3 = vext_s8(vinr40, vinr41, 3); // 345678 - vtmp4 = vext_s8(vinr40, vinr41, 4); // 45678 - - tvoutr0 = vmull_s8(vinr40, wr40); - tvoutr0 = vmlal_s8(tvoutr0, vtmp1, wr41); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp2, wr42); - tvoutr0 = vmlal_s8(tvoutr0, vtmp3, wr43); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp4, wr44); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - - tvoutr1 = vmull_s8(vinr40, wr30); - tvoutr1 = vmlal_s8(tvoutr1, vtmp1, wr31); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp2, wr32); - tvoutr1 = vmlal_s8(tvoutr1, vtmp3, 
wr33); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp4, wr34); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - - tvoutr2 = vmull_s8(vinr40, wr20); - tvoutr2 = vmlal_s8(tvoutr2, vtmp1, wr21); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp2, wr22); - tvoutr2 = vmlal_s8(tvoutr2, vtmp3, wr23); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp4, wr24); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - - // r5 - vtmp1 = vext_s8(vinr50, vinr51, 1); // 12345678 - vtmp2 = vext_s8(vinr50, vinr51, 2); // 2345678 - vtmp3 = vext_s8(vinr50, vinr51, 3); // 345678 - vtmp4 = vext_s8(vinr50, vinr51, 4); // 45678 - - tvoutr1 = vmull_s8(vinr50, wr40); - tvoutr1 = vmlal_s8(tvoutr1, vtmp1, wr41); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp2, wr42); - tvoutr1 = vmlal_s8(tvoutr1, vtmp3, wr43); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp4, wr44); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - - tvoutr2 = vmull_s8(vinr50, wr30); - tvoutr2 = vmlal_s8(tvoutr2, vtmp1, wr31); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp2, wr32); - tvoutr2 = vmlal_s8(tvoutr2, vtmp3, wr33); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp4, wr34); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - - // r6 - vtmp1 = vext_s8(vinr60, vinr61, 1); // 12345678 - vtmp2 = vext_s8(vinr60, vinr61, 2); // 2345678 - vtmp3 = vext_s8(vinr60, vinr61, 3); // 345678 - vtmp4 = vext_s8(vinr60, vinr61, 4); // 45678 - - tvoutr2 = vmull_s8(vinr60, wr40); - tvoutr2 = vmlal_s8(tvoutr2, vtmp1, wr41); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp2, wr42); - tvoutr2 = vmlal_s8(tvoutr2, vtmp3, wr43); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp4, wr44); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - - /// data shift 8 bytes - din_ptr0 += 8; - din_ptr1 += 8; - din_ptr2 += 8; - din_ptr3 += 8; - din_ptr4 += 8; - din_ptr5 += 8; - din_ptr6 += 8; - - /// store - vst1q_s32(doutr0, voutr00); - vst1q_s32(doutr1, voutr10); - vst1q_s32(doutr2, voutr20); - doutr0 += 4; - doutr1 += 4; - doutr2 += 4; - vst1q_s32(doutr0, voutr01); - vst1q_s32(doutr1, voutr11); - vst1q_s32(doutr2, voutr21); - doutr0 += 4; - doutr1 += 4; - doutr2 += 4; - } /// end of tile_w - - dr0 = dr3; - dr1 = dr4; - dr2 = dr5; - dr3 = dr6; - dr4 = dr3 + win_round; - dr5 = dr4 + win_round; - dr6 = dr5 + win_round; - - dout_ptr = dout_ptr + 3 * wout_round; - } /// end of tile_h - - if (scales == 0) { - write_to_output_numc(pre_out, - dout_batch, - 1, - hout_round, - c, - c + 1, - 0, - hout, - 0, - 
wout_round, - chout, - hout, - wout, - flag_relu, - ptr_write); - } else if (od_type == PRECISION(kFloat)) { - write2_to_output_numc(pre_out, - reinterpret_cast<float*>(dout_batch), - 1, - hout_round, - c, - c + 1, - 0, - hout, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - reinterpret_cast<float*>(ptr_write), - scales); - } else if (od_type == PRECISION(kInt8)) { - write2_to_output_numc(pre_out, - reinterpret_cast<signed char*>(dout_batch), - 1, - hout_round, - c, - c + 1, - 0, - hout, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - reinterpret_cast<signed char*>(ptr_write), - scales); - } - // else if (od_type == AK_INT32) { - // write2_to_output_numc(pre_out, (int*)dout_batch, 1, hout_round, c, - // c+1, - // 0, hout, 0, wout_round, chout, hout, wout, flag_relu, - // (int*)ptr_write, scales); - // } - } /// end of chout - } /// end of batch num -} - -#endif // __aarch64__ - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_5x5s2.cc b/lite/backends/arm/math/conv_depthwise_5x5s2.cc deleted file mode 100644 index dd715fd534..0000000000 --- a/lite/backends/arm/math/conv_depthwise_5x5s2.cc +++ /dev/null @@ -1,3746 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
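[Note] The int8 kernel that ends above repeats a single pattern for each of the five taps of a kernel row: load 16 input bytes, build shifted views with vext_s8, widen the s8 x s8 products to s16 with vmull_s8/vmlal_s8, then fold them into two s32 accumulators with vaddw_s16. A condensed sketch of one row's contribution to 8 outputs (hypothetical helper, not from this patch; assumes an AArch64 NEON target and quantized weights in [-127, 127]):

#include <arm_neon.h>

// One 5-tap row contribution for 8 outputs, in the style of the kernel
// above: acc_lo/acc_hi accumulate outputs 0..3 and 4..7 in int32 and are
// seeded with the bias by the caller.
static inline void mac_5tap_row(const int8_t* din,
                                const int8_t* w5,
                                int32x4_t* acc_lo,
                                int32x4_t* acc_hi) {
  int8x8_t v0 = vld1_s8(din);        // input columns 0..7
  int8x8_t v1 = vld1_s8(din + 8);    // input columns 8..15
  int8x8_t s1 = vext_s8(v0, v1, 1);  // columns 1..8
  int8x8_t s2 = vext_s8(v0, v1, 2);  // columns 2..9
  int8x8_t s3 = vext_s8(v0, v1, 3);  // columns 3..10
  int8x8_t s4 = vext_s8(v0, v1, 4);  // columns 4..11
  // Widen at most two s8*s8 products in the s16 temporary, then fold into
  // the s32 accumulators before continuing with the next tap pair.
  int16x8_t t = vmull_s8(v0, vdup_n_s8(w5[0]));
  t = vmlal_s8(t, s1, vdup_n_s8(w5[1]));
  *acc_lo = vaddw_s16(*acc_lo, vget_low_s16(t));
  *acc_hi = vaddw_s16(*acc_hi, vget_high_s16(t));
  t = vmull_s8(s2, vdup_n_s8(w5[2]));
  t = vmlal_s8(t, s3, vdup_n_s8(w5[3]));
  *acc_lo = vaddw_s16(*acc_lo, vget_low_s16(t));
  *acc_hi = vaddw_s16(*acc_hi, vget_high_s16(t));
  t = vmull_s8(s4, vdup_n_s8(w5[4]));
  *acc_lo = vaddw_s16(*acc_lo, vget_low_s16(t));
  *acc_hi = vaddw_s16(*acc_hi, vget_high_s16(t));
}

Folding at most two products into the s16 temporary before widening keeps the partial sum within int16 range (with weights bounded by 127, 2 * 127 * 128 = 32512), which is presumably why the unrolled kernel above also alternates vmull/vmlal with vaddw at every second tap.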
- -#include "lite/backends/arm/math/conv_depthwise.h" -#include <arm_neon.h> - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_5x5s2p2(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_5x5s2p2_relu(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_5x5s2p2_s(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_5x5s2p2_relu_s(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_5x5s2(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - if (pad == 2) { - if (win >= 9) { - if (flag_relu) { - conv_depthwise_5x5s2p2_relu(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - flag_bias, - flag_relu, - ctx); - } else { - conv_depthwise_5x5s2p2(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - flag_bias, - flag_relu, - ctx); - } - } else { - if (flag_relu) { - conv_depthwise_5x5s2p2_relu_s(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - flag_bias, - flag_relu, - ctx); - } else { - conv_depthwise_5x5s2p2_s(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - flag_bias, - flag_relu, - ctx); - } - } - } -} - -#ifdef __aarch64__ - -//!
larger depthwise, win >= 9; -void conv_depthwise_5x5s2p2(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - CHECK_GE(w_in, 9) << "only support win >= 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int cnt = (w_out_round - 4) / 4; - int mid_cnt = cnt - 1; - int right_start = cnt * 2 * 4 - 2; - int mask_cnt = 12 - (w_in - right_start); - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data<float>(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_spatial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - const float* din5 = din4 + w_in; - const float* din6 = din5 + w_in; - - float out_buf0[4]; - float out_buf1[4]; - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_spatial_size; - for (int h = 0; h < h_out; h += 2) { - //! (h * 2 - 2) + 6 > h_in - 1 - if (h * 2 + 5 > h_in) { - switch (h * 2 + 5 - h_in) { - case 6: - din1 = zero_ptr; - case 5: - din2 = zero_ptr; - case 4: - din3 = zero_ptr; - case 3: - din4 = zero_ptr; - case 2: - din5 = zero_ptr; - case 1: - din6 = zero_ptr; - default: - break; - } - } - if (h + 2 > h_out) { - switch (h + 2 - h_out) { - case 1: - dout1 = write_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - const float* din_ptr6 = din6; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - int loop = mid_cnt; - const int s_8 = 8; - const int s_16 = 16; - - //! in r0, r1/r4, r2/r5, r3/r6: x 0 2 4 -- v8 v13 v18 v23 - //! in r0, r1/r4, r2/r5, r3/r6: x 1 3 5 -- v9 v14 v19 v24 - //! in r0, r1/r4, r2/r5, r3/r6: 0 2 4 6 -- v6 v11 v16 v21 - //! in r0, r1/r4, r2/r5, r3/r6: 1 3 5 7 -- v7 v12 v17 v22 - //! in r0, r1/r4, r2/r5, r3/r6: 2 4 6 8 -- v10 v15 v20 v25 - //!
out r0, r1 -- v26, v27 - asm volatile( - "movi v31.4s, #0x0\n" - "prfm pldl1keep, [%[din_ptr0]] \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - "prfm pldl1keep, [%[din_ptr3]] \n" - "prfm pldl1keep, [%[din_ptr4]] \n" - "prfm pldl1keep, [%[din_ptr5]] \n" - "prfm pldl1keep, [%[din_ptr6]] \n" - "prfm pldl1keep, [%[weights]] \n" - "prfm pldl1keep, [%[mask]] \n" - // left - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], #32 \n" // r0 v6: 0 - // 2 4 6, - // v7: 1 3 - // 5 7 - "ext v8.16b, v31.16b, v6.16b, #12 \n" // r0 v8: x - // 0 2 4 - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], #32 \n" // r1 v11: - // 0 2 4 6, - // v12: 1 3 - // 5 7 - "ext v9.16b, v31.16b, v7.16b, #12 \n" // r0 v9: x - // 1 3 5 - "ld1 {v0.4s, v1.4s}, [%[weights]], #32 \n" // load - // weights - // 0-7 - "ext v10.16b, v6.16b, v31.16b, #4 \n" - "ld1 {v10.s}[3], [%[din_ptr0]] \n" // r0 v10: - // 2 4 6 8 - "sub %[din_ptr0], %[din_ptr0], #8 \n" - "ext v13.16b, v31.16b, v11.16b, #12 \n" // r1 v13: - // x 0 2 4 - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], #32 \n" // r2 v16: - // 0 2 4 6, - // v17: 1 3 - // 5 7 - "ext v14.16b, v31.16b, v12.16b, #12 \n" // r1 v14: - // x 1 3 5 - "ld1 {v2.4s, v3.4s}, [%[weights]], #32 \n" // load - // weights - // 8-15 - "ext v15.16b, v11.16b, v31.16b, #4 \n" - "ld1 {v15.s}[3], [%[din_ptr1]] \n" // r1 v15: - // 2 4 6 - "sub %[din_ptr1], %[din_ptr1], #8 \n" - "ext v18.16b, v31.16b, v16.16b, #12 \n" // r2 v18: - // x 0 2 4 - "ld1 {v4.4s, v5.4s}, [%[weights]], #32 \n" // load - // weights - // 16-23 - "ext v19.16b, v31.16b, v17.16b, #12 \n" // r2 v19: - // x 1 3 5 - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], #32 \n" // r3 v21: - // 0 2 4 6, - // v22: 1 3 - // 5 7 - "ext v20.16b, v16.16b, v31.16b, #4 \n" - "ld1 {v20.s}[3], [%[din_ptr2]] \n" // r2 v20: - // 2 4 6 8 - "sub %[din_ptr2], %[din_ptr2], #8 \n" - "ext v23.16b, v31.16b, v21.16b, #12 \n" // r3 v23: - // x 0 2 4 - "ld1 {v30.4s}, [%[weights]] \n" // load - // weights - // 24 - "ext v24.16b, v31.16b, v22.16b, #12 \n" // r3 v24: - // x 1 3 5 - "ld1 {v26.4s}, [%[vbias]] \n" // load - // bias to - // out_r0 - "ext v25.16b, v21.16b, v31.16b, #4 \n" - "ld1 {v25.s}[3], [%[din_ptr3]] \n" // r2 v25: - // 2 4 6 8 - "sub %[din_ptr3], %[din_ptr3], #8 \n" - "mov v27.16b, v26.16b \n" // load - // bias to - // out_r1 - "mov v28.16b, v31.16b \n" // load - // zero to - // out_r0 - "mov v29.16b, v31.16b \n" // load - // zero to - // out_r1 - - "fmla v26.4s, v8.4s, v0.s[0] \n" // out r0: - // w0 - "fmla v28.4s, v9.4s, v0.s[1] \n" // out r0: - // w1 - "fmla v26.4s, v6.4s, v0.s[2] \n" // out r0: - // w2 - "fmla v28.4s, v7.4s, v0.s[3] \n" // out r0: - // w3 - - "ld2 {v8.4s, v9.4s}, [%[din_ptr0]], %[s_8] \n" // next r0 - // v8: 0 2 - // 4 6, v9: - // 1 3 5 7 - - "fmla v26.4s, v10.4s, v1.s[0] \n" // out r0: - // w4 - "fmla v28.4s, v13.4s, v1.s[1] \n" // out r0: - // w5 - "fmla v26.4s, v14.4s, v1.s[2] \n" // out r0: - // w6 - "fmla v28.4s, v11.4s, v1.s[3] \n" // out r0: - // w7 - - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], %[s_8] \n" // next r0 - // v6: 2 4 - // 6 8, v7: - // 3 5 7 9 - - "fmla v26.4s, v12.4s, v2.s[0] \n" // out r0: - // w8 - "fmla v28.4s, v15.4s, v2.s[1] \n" // out r0: - // w9 - "fmla v26.4s, v18.4s, v2.s[2] \n" // out r0: - // w10 - "fmla v28.4s, v19.4s, v2.s[3] \n" // out r0: - // w11 - - "ld2 {v10.4s, v11.4s}, [%[din_ptr0]], %[s_16] \n" // next r0 - // v10: 4 6 - // 8 10, - // v11: - // trash - // register - - "fmla v26.4s, v16.4s, v3.s[0] \n" // out r0: - // w12 - "fmla v28.4s, v17.4s, v3.s[1] \n" // out r0: - // w13 - "fmla v26.4s, v20.4s, v3.s[2] \n" // 
out r0: - // w14 - "fmla v28.4s, v23.4s, v3.s[3] \n" // out r0: - // w15 - "prfm pldl1keep, [%[din_ptr0]] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], #32 \n" // r4 v11: - // 0 2 4 6, - // v12: 1 3 - // 5 7 - - "fmla v26.4s, v24.4s, v4.s[0] \n" // out r0: - // w16 - "fmla v28.4s, v21.4s, v4.s[1] \n" // out r0: - // w17 - - "ext v13.16b, v31.16b, v11.16b, #12 \n" // r4 v13: - // x 0 2 4 - "ext v14.16b, v31.16b, v12.16b, #12 \n" // r4 v14: - // x 1 3 5 - "ext v15.16b, v11.16b, v31.16b, #4 \n" - - "fmla v26.4s, v22.4s, v4.s[2] \n" // out r0: - // w18 - "fmla v28.4s, v25.4s, v4.s[3] \n" // out r0: - // w19 - - "ld1 {v15.s}[3], [%[din_ptr4]] \n" // r4 v15: - // 2 4 6 - - "fmla v27.4s, v18.4s, v0.s[0] \n" // out r1: - // w0 - "fmla v29.4s, v19.4s, v0.s[1] \n" // out r1: - // w1 - - "sub %[din_ptr4], %[din_ptr4], #8 \n" - - "fmla v27.4s, v16.4s, v0.s[2] \n" // out r1: - // w2 - "fmla v29.4s, v17.4s, v0.s[3] \n" // out r1: - // w3 - "fmla v27.4s, v20.4s, v1.s[0] \n" // out r1: - // w4 - "fmla v29.4s, v23.4s, v1.s[1] \n" // out r1: - // w5 - - "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], #32 \n" // r5 v16: - // 0 2 4 6, - // v17: 1 3 - // 5 7 - - "fmla v27.4s, v24.4s, v1.s[2] \n" // out r1: - // w6 - "fmla v29.4s, v21.4s, v1.s[3] \n" // out r1: - // w7 - - "ext v18.16b, v31.16b, v16.16b, #12 \n" // r5 v18: - // x 0 2 4 - "ext v19.16b, v31.16b, v17.16b, #12 \n" // r5 v19: - // x 1 3 5 - "ext v20.16b, v16.16b, v31.16b, #4 \n" - - "fmla v27.4s, v22.4s, v2.s[0] \n" // out r1: - // w8 - "fmla v29.4s, v25.4s, v2.s[1] \n" // out r1: - // w9 - - "ld1 {v20.s}[3], [%[din_ptr5]] \n" // r5 v20: - // 2 4 6 - "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], #32 \n" // r6 v21: - // 0 2 4 6, - // v22: 1 3 - // 5 7 - - "ext v23.16b, v31.16b, v21.16b, #12 \n" // r6 v23: - // x 0 2 4 - "ext v24.16b, v31.16b, v22.16b, #12 \n" // r6 v24: - // x 1 3 5 - "ext v25.16b, v21.16b, v31.16b, #4 \n" - "sub %[din_ptr5], %[din_ptr5], #8 \n" - - "fmla v26.4s, v11.4s, v5.s[2] \n" // out r0: - // w22 - "fmla v28.4s, v12.4s, v5.s[3] \n" // out r0: - // w23 - - "ld1 {v25.s}[3], [%[din_ptr6]] \n" // r6 v25: - // 2 4 6 - - "fmla v26.4s, v13.4s, v5.s[0] \n" // out r0: - // w20 - "fmla v28.4s, v14.4s, v5.s[1] \n" // out r0: - // w21 - - "sub %[din_ptr6], %[din_ptr6], #8 \n" - - "fmla v26.4s, v15.4s, v30.s[0] \n" // out r0: - // w24 - "fmla v27.4s, v13.4s, v2.s[2] \n" // out r1: - // w10 - - "fadd v26.4s, v26.4s, v28.4s \n" - "fmla v29.4s, v14.4s, v2.s[3] \n" // out r1: - // w11 - - "ld2 {v13.4s, v14.4s}, [%[din_ptr1]], %[s_8] \n" // next r1 - // v13: 0 2 - // 4 6, - // v14: 1 3 - // 5 7 - "fmla v27.4s, v11.4s, v3.s[0] \n" // out r1: - // w12 - "fmla v29.4s, v12.4s, v3.s[1] \n" // out r1: - // w13 - - "st1 {v26.4s}, [%[dout_ptr0]], %[s_16] \n" // store - // output - // r0 - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], %[s_8] \n" // next r1 - // v11: 2 4 - // 6 8, - // v12: 3 5 - // 7 9 - - "fmla v27.4s, v15.4s, v3.s[2] \n" // out r1: - // w14 - "fmla v29.4s, v16.4s, v4.s[1] \n" // out r1: - // w17 - "fmla v27.4s, v18.4s, v3.s[3] \n" // out r1: - // w15 - "fmla v29.4s, v19.4s, v4.s[0] \n" // out r1: - // w16 - - "ld2 {v15.4s, v16.4s}, [%[din_ptr1]], %[s_16] \n" // next r1 - // v15: 4 6 - // 8 10, - // v16: - // trash - // register - - "fmla v27.4s, v17.4s, v4.s[2] \n" // out r1: - // w18 - "fmla v29.4s, v20.4s, v4.s[3] \n" // out r1: - // w19 - - "ld2 {v18.4s, v19.4s}, [%[din_ptr2]], %[s_8] \n" // next r2 - // v18: 0 2 - // 4 6, - // v19: 1 3 - // 5 7 - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], %[s_8] \n" // next r2 - // v16: 2 4 - // 6 8, - // v11: 3 5 - // 7 9 - - 
"fmla v27.4s, v23.4s, v5.s[0] \n" // out r1: - // w20 - "fmla v29.4s, v21.4s, v5.s[2] \n" // out r1: - // w22 - "fmla v27.4s, v24.4s, v5.s[1] \n" // out r1: - // w21 - "fmla v29.4s, v22.4s, v5.s[3] \n" // out r1: - // w23 - - "ld2 {v20.4s, v21.4s}, [%[din_ptr2]], %[s_16] \n" // next r2 - // v20: 4 6 - // 8 10, - // v21: - // trash - // register - "ld2 {v23.4s, v24.4s}, [%[din_ptr3]], %[s_8] \n" // next r3 - // v23: 0 2 - // 4 6, - // v24: 1 3 - // 5 7 - - "fmla v27.4s, v25.4s, v30.s[0] \n" // out r1: - // w24 - - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], %[s_8] \n" // next r3 - // v21: 2 4 - // 6 8, - // v22: 3 5 - // 7 9 - "ld2 {v25.4s, v26.4s}, [%[din_ptr3]], %[s_16] \n" // next r3 - // v25: 4 6 - // 8 10, - // v26: - // trash - // register - - "fadd v27.4s, v27.4s, v29.4s \n" - "cmp %w[mid_cnt], #1 \n" - - "prfm pldl1keep, [%[din_ptr1]] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - "prfm pldl1keep, [%[din_ptr3]] \n" - - "st1 {v27.4s}, [%[dout_ptr1]], #16 \n" - "blt 2f \n" - - // mid loop - "1: \n" - "ld1 {v26.4s}, [%[vbias]] \n" - "mov v27.16b, v26.16b \n" - "mov v28.16b, v31.16b \n" - "mov v29.16b, v31.16b \n" - - // out_r0 r0-r3 - "fmla v26.4s, v8.4s, v0.s[0] \n" - "fmla v28.4s, v9.4s, v0.s[1] \n" - "fmla v26.4s, v6.4s, v0.s[2] \n" - "fmla v28.4s, v7.4s, v0.s[3] \n" - - "ld2 {v8.4s, v9.4s}, [%[din_ptr0]], %[s_8] \n" - - "fmla v26.4s, v10.4s, v1.s[0] \n" - "fmla v28.4s, v11.4s, v1.s[3] \n" - - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], %[s_8] \n" - - "fmla v26.4s, v14.4s, v1.s[2] \n" - "fmla v28.4s, v13.4s, v1.s[1] \n" - - "ld2 {v10.4s, v11.4s}, [%[din_ptr0]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr0]] \n" - - "fmla v26.4s, v12.4s, v2.s[0] \n" - "fmla v28.4s, v15.4s, v2.s[1] \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v26.4s, v16.4s, v3.s[0] \n" - "fmla v27.4s, v16.4s, v0.s[2] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v28.4s, v19.4s, v2.s[3] \n" - "fmla v29.4s, v19.4s, v0.s[1] \n" - - "ld2 {v15.4s, v16.4s}, [%[din_ptr4]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr4]] \n" - - "fmla v26.4s, v18.4s, v2.s[2] \n" - "fmla v27.4s, v18.4s, v0.s[0] \n" - - "fmla v28.4s, v17.4s, v3.s[1] \n" - "fmla v29.4s, v17.4s, v0.s[3] \n" - - "ld2 {v18.4s, v19.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v26.4s, v20.4s, v3.s[2] \n" - "fmla v27.4s, v20.4s, v1.s[0] \n" - - "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v29.4s, v21.4s, v1.s[3] \n" - "fmla v28.4s, v21.4s, v4.s[1] \n" - "fmla v28.4s, v23.4s, v3.s[3] \n" - "fmla v29.4s, v23.4s, v1.s[1] \n" - - "ld2 {v20.4s, v21.4s}, [%[din_ptr5]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr5]] \n" - - "fmla v26.4s, v24.4s, v4.s[0] \n" - "fmla v27.4s, v24.4s, v1.s[2] \n" - - "ld2 {v23.4s, v24.4s}, [%[din_ptr6]], %[s_8] \n" - - "fmla v27.4s, v22.4s, v2.s[0] \n" - "fmla v26.4s, v22.4s, v4.s[2] \n" - - "fmla v28.4s, v25.4s, v4.s[3] \n" - "fmla v29.4s, v25.4s, v2.s[1] \n" - - "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], %[s_8] \n" - "fadd v28.4s, v26.4s, v28.4s \n" - - "ld2 {v25.4s, v26.4s}, [%[din_ptr6]], %[s_16] \n" - "mov v26.16b, v31.16b \n" - "prfm pldl1keep, [%[din_ptr6]] \n" - - "fmla v26.4s, v13.4s, v5.s[0] \n" - "fmla v28.4s, v14.4s, v5.s[1] \n" - "fmla v27.4s, v13.4s, v2.s[2] \n" - "fmla v29.4s, v14.4s, v2.s[3] \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr1]], %[s_8] \n" - - "fmla v26.4s, v11.4s, v5.s[2] \n" - "fmla v28.4s, v12.4s, v5.s[3] \n" - "fmla v27.4s, v11.4s, v3.s[0] \n" - "fmla v29.4s, v12.4s, v3.s[1] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], %[s_8] \n" - - "fmla v26.4s, v15.4s, v30.s[0] \n" - "fmla v27.4s, v15.4s, 
v3.s[2] \n" - "fmla v29.4s, v16.4s, v4.s[1] \n" - "fmla v27.4s, v17.4s, v4.s[2] \n" - - "ld2 {v15.4s, v16.4s}, [%[din_ptr1]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - - "fmla v29.4s, v18.4s, v3.s[3] \n" - "fmla v27.4s, v19.4s, v4.s[0] \n" - - "ld2 {v18.4s, v19.4s}, [%[din_ptr2]], %[s_8] \n" - - "fmla v29.4s, v20.4s, v4.s[3] \n" - - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], %[s_8] \n" - - "fmla v27.4s, v23.4s, v5.s[0] \n" - "fmla v27.4s, v21.4s, v5.s[2] \n" - - "ld2 {v20.4s, v21.4s}, [%[din_ptr2]], %[s_16] \n" - - "fmla v29.4s, v24.4s, v5.s[1] \n" - - "ld2 {v23.4s, v24.4s}, [%[din_ptr3]], %[s_8] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - - "fmla v29.4s, v22.4s, v5.s[3] \n" - - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], %[s_8] \n" - - "fmla v27.4s, v25.4s, v30.s[0] \n" - - "fadd v26.4s, v26.4s, v28.4s \n" - - "prfm pldl1keep, [%[din_ptr3]] \n" - - "fadd v27.4s, v27.4s, v29.4s \n" - - "st1 {v26.4s}, [%[dout_ptr0]], #16 \n" - "st1 {v27.4s}, [%[dout_ptr1]], #16 \n" - - "ld2 {v25.4s, v26.4s}, [%[din_ptr3]], %[s_16] \n" - "subs %w[mid_cnt], %w[mid_cnt], #1 \n" - "bne 1b \n" - - "2: \n" - "ld2 {v26.4s, v27.4s}, [%[mask]], %[s_8] \n" - "ld2 {v28.4s, v29.4s}, [%[mask]], %[s_8] \n" - "bif v8.16b, v31.16b, v26.16b \n" - "bif v9.16b, v31.16b, v27.16b \n" - "bif v6.16b, v31.16b, v28.16b \n" - "bif v7.16b, v31.16b, v29.16b \n" - - "bif v13.16b, v31.16b, v26.16b \n" - "bif v14.16b, v31.16b, v27.16b \n" - "bif v11.16b, v31.16b, v28.16b \n" - "bif v12.16b, v31.16b, v29.16b \n" - - "bif v18.16b, v31.16b, v26.16b \n" - "bif v19.16b, v31.16b, v27.16b \n" - "bif v16.16b, v31.16b, v28.16b \n" - "bif v17.16b, v31.16b, v29.16b \n" - - "bif v23.16b, v31.16b, v26.16b \n" - "bif v24.16b, v31.16b, v27.16b \n" - "bif v21.16b, v31.16b, v28.16b \n" - "bif v22.16b, v31.16b, v29.16b \n" - - "ld2 {v28.4s, v29.4s}, [%[mask]] \n" - "ld1 {v26.4s}, [%[vbias]] \n" - "mov v29.16b, v31.16b \n" - - "bif v10.16b, v31.16b, v28.16b \n" - "bif v15.16b, v31.16b, v28.16b \n" - - "mov v27.16b, v26.16b \n" - - "bif v20.16b, v31.16b, v28.16b \n" - "bif v25.16b, v31.16b, v28.16b \n" - "mov v28.16b, v31.16b \n" - - "fmla v26.4s, v8.4s, v0.s[0] \n" - "fmla v28.4s, v9.4s, v0.s[1] \n" - "fmla v26.4s, v6.4s, v0.s[2] \n" - "fmla v28.4s, v7.4s, v0.s[3] \n" - - "fmla v26.4s, v10.4s, v1.s[0] \n" - "fmla v28.4s, v13.4s, v1.s[1] \n" - "fmla v26.4s, v14.4s, v1.s[2] \n" - "fmla v28.4s, v11.4s, v1.s[3] \n" - - "sub %[mask], %[mask], #16 \n" - "ld2 {v6.4s, v7.4s}, [%[mask]], %[s_8] \n" - "ld2 {v8.4s, v9.4s}, [%[mask]], %[s_8] \n" - "ld2 {v10.4s, v11.4s}, [%[mask]] \n" - - "fmla v26.4s, v12.4s, v2.s[0] \n" - "fmla v28.4s, v15.4s, v2.s[1] \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v26.4s, v16.4s, v3.s[0] \n" - "fmla v28.4s, v17.4s, v3.s[1] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v27.4s, v16.4s, v0.s[2] \n" - "fmla v29.4s, v17.4s, v0.s[3] \n" - - "ld2 {v15.4s, v16.4s}, [%[din_ptr4]] \n" - - "fmla v26.4s, v18.4s, v2.s[2] \n" - "fmla v28.4s, v19.4s, v2.s[3] \n" - "fmla v27.4s, v18.4s, v0.s[0] \n" - "fmla v29.4s, v19.4s, v0.s[1] \n" - - "bif v13.16b, v31.16b, v6.16b \n" - "bif v14.16b, v31.16b, v7.16b \n" - "bif v11.16b, v31.16b, v8.16b \n" - "bif v12.16b, v31.16b, v9.16b \n" - "bif v15.16b, v31.16b, v10.16b \n" - - "ld2 {v18.4s, v19.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v26.4s, v20.4s, v3.s[2] \n" - "fmla v27.4s, v20.4s, v1.s[0] \n" - - "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v29.4s, v21.4s, v1.s[3] \n" - "fmla v28.4s, v21.4s, v4.s[1] \n" - - "ld2 {v20.4s, v21.4s}, [%[din_ptr5]] \n" - - 
"fmla v28.4s, v23.4s, v3.s[3] \n" - "fmla v29.4s, v23.4s, v1.s[1] \n" - "fmla v27.4s, v24.4s, v1.s[2] \n" - "fmla v26.4s, v24.4s, v4.s[0] \n" - - "bif v18.16b, v31.16b, v6.16b \n" - "bif v19.16b, v31.16b, v7.16b \n" - "bif v16.16b, v31.16b, v8.16b \n" - "bif v17.16b, v31.16b, v9.16b \n" - "bif v20.16b, v31.16b, v10.16b \n" - - "ld2 {v23.4s, v24.4s}, [%[din_ptr6]], %[s_8] \n" - - "fmla v27.4s, v22.4s, v2.s[0] \n" - "fmla v26.4s, v22.4s, v4.s[2] \n" - - "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], %[s_8] \n" - - "fmla v28.4s, v25.4s, v4.s[3] \n" - "fmla v29.4s, v25.4s, v2.s[1] \n" - "fadd v28.4s, v28.4s, v26.4s \n" - - "ld2 {v25.4s, v26.4s}, [%[din_ptr6]] \n" - "mov v26.16b, v31.16b \n" - - "bif v23.16b, v31.16b, v6.16b \n" - "bif v24.16b, v31.16b, v7.16b \n" - "bif v21.16b, v31.16b, v8.16b \n" - "bif v22.16b, v31.16b, v9.16b \n" - "bif v25.16b, v31.16b, v10.16b \n" - - "fmla v26.4s, v13.4s, v5.s[0] \n" - "fmla v28.4s, v14.4s, v5.s[1] \n" - "fmla v26.4s, v11.4s, v5.s[2] \n" - "fmla v28.4s, v12.4s, v5.s[3] \n" - "fmla v26.4s, v15.4s, v30.s[0] \n" - - "fmla v27.4s, v13.4s, v2.s[2] \n" - "fmla v29.4s, v14.4s, v2.s[3] \n" - "fmla v27.4s, v11.4s, v3.s[0] \n" - "fmla v29.4s, v12.4s, v3.s[1] \n" - - "fadd v26.4s, v26.4s, v28.4s \n" - "fmla v27.4s, v15.4s, v3.s[2] \n" - "fmla v29.4s, v18.4s, v3.s[3] \n" - "fmla v27.4s, v19.4s, v4.s[0] \n" - "fmla v29.4s, v16.4s, v4.s[1] \n" - - "st1 {v26.4s}, [%[out_buf0]] \n" - "fmla v27.4s, v17.4s, v4.s[2] \n" - "fmla v29.4s, v20.4s, v4.s[3] \n" - "fmla v27.4s, v23.4s, v5.s[0] \n" - "fmla v29.4s, v24.4s, v5.s[1] \n" - - "fmla v27.4s, v21.4s, v5.s[2] \n" - "fmla v29.4s, v22.4s, v5.s[3] \n" - "fmla v27.4s, v25.4s, v30.s[0] \n" - "fadd v27.4s, v27.4s, v29.4s \n" - - "st1 {v27.4s}, [%[out_buf1]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [dout_ptr1] "+r"(dout_ptr1), - [mid_cnt] "+r"(loop), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [din_ptr6] "+r"(din_ptr6), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [out_buf1] "r"(out_buf1), - [s_8] "r"(s_8), - [s_16] "r"(s_16) - : "memory", - "cc", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); - - int remain_cnt = w_out - (mid_cnt + 1) * 4; - for (int i = 0; i < remain_cnt; ++i) { - dout_ptr0[i] = out_buf0[i]; - dout_ptr1[i] = out_buf1[i]; - } - din0 = din4; - din1 = din5; - din2 = din6; - din3 = din6 + w_in; - din4 = din3 + w_in; - din5 = din4 + w_in; - din6 = din5 + w_in; - dout0 = dout1 + w_out; - dout1 = dout0 + w_out; - } - } - } -} - -//! 
larger depthwise, win >= 9; -void conv_depthwise_5x5s2p2_relu(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - CHECK_GE(w_in, 9) << "only support win >= 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int cnt = (w_out_round - 4) / 4; - int mid_cnt = cnt - 1; - int right_start = cnt * 2 * 4 - 2; - int mask_cnt = 12 - (w_in - right_start); - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data<float>(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_spatial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; - -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - const float* din5 = din4 + w_in; - const float* din6 = din5 + w_in; - - float out_buf0[4]; - float out_buf1[4]; - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_spatial_size; - for (int h = 0; h < h_out; h += 2) { - //! (h * 2 - 2) + 6 > h_in - 1 - if (h * 2 + 5 > h_in) { - switch (h * 2 + 5 - h_in) { - case 6: - din1 = zero_ptr; - case 5: - din2 = zero_ptr; - case 4: - din3 = zero_ptr; - case 3: - din4 = zero_ptr; - case 2: - din5 = zero_ptr; - case 1: - din6 = zero_ptr; - default: - break; - } - } - if (h + 2 > h_out) { - switch (h + 2 - h_out) { - case 1: - dout1 = write_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - const float* din_ptr6 = din6; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - int loop = mid_cnt; - const int s_8 = 8; - const int s_16 = 16; - - //! in r0, r1/r4, r2/r5, r3/r6: x 0 2 4 -- v8 v13 v18 v23 - //! in r0, r1/r4, r2/r5, r3/r6: x 1 3 5 -- v9 v14 v19 v24 - //! in r0, r1/r4, r2/r5, r3/r6: 0 2 4 6 -- v6 v11 v16 v21 - //! in r0, r1/r4, r2/r5, r3/r6: 1 3 5 7 -- v7 v12 v17 v22 - //! in r0, r1/r4, r2/r5, r3/r6: 2 4 6 8 -- v10 v15 v20 v25 - //!
out r0, r1 -- v26, v27 - asm volatile( - "movi v31.4s, #0x0\n" - "prfm pldl1keep, [%[din_ptr0]] \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - "prfm pldl1keep, [%[din_ptr3]] \n" - "prfm pldl1keep, [%[din_ptr4]] \n" - "prfm pldl1keep, [%[din_ptr5]] \n" - "prfm pldl1keep, [%[din_ptr6]] \n" - "prfm pldl1keep, [%[weights]] \n" - "prfm pldl1keep, [%[mask]] \n" - // left - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], #32 \n" // r0 v6: 0 - // 2 4 6, - // v7: 1 3 - // 5 7 - "ext v8.16b, v31.16b, v6.16b, #12 \n" // r0 v8: x - // 0 2 4 - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], #32 \n" // r1 v11: - // 0 2 4 6, - // v12: 1 3 - // 5 7 - "ext v9.16b, v31.16b, v7.16b, #12 \n" // r0 v9: x - // 1 3 5 - "ld1 {v0.4s, v1.4s}, [%[weights]], #32 \n" // load - // weights - // 0-7 - "ext v10.16b, v6.16b, v31.16b, #4 \n" - "ld1 {v10.s}[3], [%[din_ptr0]] \n" // r0 v10: - // 2 4 6 8 - "sub %[din_ptr0], %[din_ptr0], #8 \n" - "ext v13.16b, v31.16b, v11.16b, #12 \n" // r1 v13: - // x 0 2 4 - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], #32 \n" // r2 v16: - // 0 2 4 6, - // v17: 1 3 - // 5 7 - "ext v14.16b, v31.16b, v12.16b, #12 \n" // r1 v14: - // x 1 3 5 - "ld1 {v2.4s, v3.4s}, [%[weights]], #32 \n" // load - // weights - // 8-15 - "ext v15.16b, v11.16b, v31.16b, #4 \n" - "ld1 {v15.s}[3], [%[din_ptr1]] \n" // r1 v15: - // 2 4 6 - "sub %[din_ptr1], %[din_ptr1], #8 \n" - "ext v18.16b, v31.16b, v16.16b, #12 \n" // r2 v18: - // x 0 2 4 - "ld1 {v4.4s, v5.4s}, [%[weights]], #32 \n" // load - // weights - // 16-23 - "ext v19.16b, v31.16b, v17.16b, #12 \n" // r2 v19: - // x 1 3 5 - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], #32 \n" // r3 v21: - // 0 2 4 6, - // v22: 1 3 - // 5 7 - "ext v20.16b, v16.16b, v31.16b, #4 \n" - "ld1 {v20.s}[3], [%[din_ptr2]] \n" // r2 v20: - // 2 4 6 8 - "sub %[din_ptr2], %[din_ptr2], #8 \n" - "ext v23.16b, v31.16b, v21.16b, #12 \n" // r3 v23: - // x 0 2 4 - "ld1 {v30.4s}, [%[weights]] \n" // load - // weights - // 24 - "ext v24.16b, v31.16b, v22.16b, #12 \n" // r3 v24: - // x 1 3 5 - "ld1 {v26.4s}, [%[vbias]] \n" // load - // bias to - // out_r0 - "ext v25.16b, v21.16b, v31.16b, #4 \n" - "ld1 {v25.s}[3], [%[din_ptr3]] \n" // r2 v25: - // 2 4 6 8 - "sub %[din_ptr3], %[din_ptr3], #8 \n" - "mov v27.16b, v26.16b \n" // load - // bias to - // out_r1 - "mov v28.16b, v31.16b \n" // load - // zero to - // out_r0 - "mov v29.16b, v31.16b \n" // load - // zero to - // out_r1 - - "fmla v26.4s, v8.4s, v0.s[0] \n" // out r0: - // w0 - "fmla v28.4s, v9.4s, v0.s[1] \n" // out r0: - // w1 - "fmla v26.4s, v6.4s, v0.s[2] \n" // out r0: - // w2 - "fmla v28.4s, v7.4s, v0.s[3] \n" // out r0: - // w3 - - "ld2 {v8.4s, v9.4s}, [%[din_ptr0]], %[s_8] \n" // next r0 - // v8: 0 2 - // 4 6, v9: - // 1 3 5 7 - - "fmla v26.4s, v10.4s, v1.s[0] \n" // out r0: - // w4 - "fmla v28.4s, v13.4s, v1.s[1] \n" // out r0: - // w5 - "fmla v26.4s, v14.4s, v1.s[2] \n" // out r0: - // w6 - "fmla v28.4s, v11.4s, v1.s[3] \n" // out r0: - // w7 - - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], %[s_8] \n" // next r0 - // v6: 2 4 - // 6 8, v7: - // 3 5 7 9 - - "fmla v26.4s, v12.4s, v2.s[0] \n" // out r0: - // w8 - "fmla v28.4s, v15.4s, v2.s[1] \n" // out r0: - // w9 - "fmla v26.4s, v18.4s, v2.s[2] \n" // out r0: - // w10 - "fmla v28.4s, v19.4s, v2.s[3] \n" // out r0: - // w11 - - "ld2 {v10.4s, v11.4s}, [%[din_ptr0]], %[s_16] \n" // next r0 - // v10: 4 6 - // 8 10, - // v11: - // trash - // register - - "fmla v26.4s, v16.4s, v3.s[0] \n" // out r0: - // w12 - "fmla v28.4s, v17.4s, v3.s[1] \n" // out r0: - // w13 - "fmla v26.4s, v20.4s, v3.s[2] \n" // 
out r0: - // w14 - "fmla v28.4s, v23.4s, v3.s[3] \n" // out r0: - // w15 - "prfm pldl1keep, [%[din_ptr0]] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], #32 \n" // r4 v11: - // 0 2 4 6, - // v12: 1 3 - // 5 7 - - "fmla v26.4s, v24.4s, v4.s[0] \n" // out r0: - // w16 - "fmla v28.4s, v21.4s, v4.s[1] \n" // out r0: - // w17 - - "ext v13.16b, v31.16b, v11.16b, #12 \n" // r4 v13: - // x 0 2 4 - "ext v14.16b, v31.16b, v12.16b, #12 \n" // r4 v14: - // x 1 3 5 - "ext v15.16b, v11.16b, v31.16b, #4 \n" - - "fmla v26.4s, v22.4s, v4.s[2] \n" // out r0: - // w18 - "fmla v28.4s, v25.4s, v4.s[3] \n" // out r0: - // w19 - - "ld1 {v15.s}[3], [%[din_ptr4]] \n" // r4 v15: - // 2 4 6 - - "fmla v27.4s, v18.4s, v0.s[0] \n" // out r1: - // w0 - "fmla v29.4s, v19.4s, v0.s[1] \n" // out r1: - // w1 - - "sub %[din_ptr4], %[din_ptr4], #8 \n" - - "fmla v27.4s, v16.4s, v0.s[2] \n" // out r1: - // w2 - "fmla v29.4s, v17.4s, v0.s[3] \n" // out r1: - // w3 - "fmla v27.4s, v20.4s, v1.s[0] \n" // out r1: - // w4 - "fmla v29.4s, v23.4s, v1.s[1] \n" // out r1: - // w5 - - "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], #32 \n" // r5 v16: - // 0 2 4 6, - // v17: 1 3 - // 5 7 - - "fmla v27.4s, v24.4s, v1.s[2] \n" // out r1: - // w6 - "fmla v29.4s, v21.4s, v1.s[3] \n" // out r1: - // w7 - - "ext v18.16b, v31.16b, v16.16b, #12 \n" // r5 v18: - // x 0 2 4 - "ext v19.16b, v31.16b, v17.16b, #12 \n" // r5 v19: - // x 1 3 5 - "ext v20.16b, v16.16b, v31.16b, #4 \n" - - "fmla v27.4s, v22.4s, v2.s[0] \n" // out r1: - // w8 - "fmla v29.4s, v25.4s, v2.s[1] \n" // out r1: - // w9 - - "ld1 {v20.s}[3], [%[din_ptr5]] \n" // r5 v20: - // 2 4 6 - "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], #32 \n" // r6 v21: - // 0 2 4 6, - // v22: 1 3 - // 5 7 - - "ext v23.16b, v31.16b, v21.16b, #12 \n" // r6 v23: - // x 0 2 4 - "ext v24.16b, v31.16b, v22.16b, #12 \n" // r6 v24: - // x 1 3 5 - "ext v25.16b, v21.16b, v31.16b, #4 \n" - "sub %[din_ptr5], %[din_ptr5], #8 \n" - - "fmla v26.4s, v11.4s, v5.s[2] \n" // out r0: - // w22 - "fmla v28.4s, v12.4s, v5.s[3] \n" // out r0: - // w23 - - "ld1 {v25.s}[3], [%[din_ptr6]] \n" // r6 v25: - // 2 4 6 - - "fmla v26.4s, v13.4s, v5.s[0] \n" // out r0: - // w20 - "fmla v28.4s, v14.4s, v5.s[1] \n" // out r0: - // w21 - - "sub %[din_ptr6], %[din_ptr6], #8 \n" - - "fmla v26.4s, v15.4s, v30.s[0] \n" // out r0: - // w24 - "fmla v27.4s, v13.4s, v2.s[2] \n" // out r1: - // w10 - - "fadd v26.4s, v26.4s, v28.4s \n" - "fmla v29.4s, v14.4s, v2.s[3] \n" // out r1: - // w11 - "fmax v26.4s, v26.4s, v31.4s \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr1]], %[s_8] \n" // next r1 - // v13: 0 2 - // 4 6, - // v14: 1 3 - // 5 7 - "fmla v27.4s, v11.4s, v3.s[0] \n" // out r1: - // w12 - "fmla v29.4s, v12.4s, v3.s[1] \n" // out r1: - // w13 - - "st1 {v26.4s}, [%[dout_ptr0]], %[s_16] \n" // store - // output - // r0 - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], %[s_8] \n" // next r1 - // v11: 2 4 - // 6 8, - // v12: 3 5 - // 7 9 - - "fmla v27.4s, v15.4s, v3.s[2] \n" // out r1: - // w14 - "fmla v29.4s, v16.4s, v4.s[1] \n" // out r1: - // w17 - "fmla v27.4s, v18.4s, v3.s[3] \n" // out r1: - // w15 - "fmla v29.4s, v19.4s, v4.s[0] \n" // out r1: - // w16 - - "ld2 {v15.4s, v16.4s}, [%[din_ptr1]], %[s_16] \n" // next r1 - // v15: 4 6 - // 8 10, - // v16: - // trash - // register - - "fmla v27.4s, v17.4s, v4.s[2] \n" // out r1: - // w18 - "fmla v29.4s, v20.4s, v4.s[3] \n" // out r1: - // w19 - - "ld2 {v18.4s, v19.4s}, [%[din_ptr2]], %[s_8] \n" // next r2 - // v18: 0 2 - // 4 6, - // v19: 1 3 - // 5 7 - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], %[s_8] \n" // next r2 - // v16: 2 4 - 
// 6 8, - // v11: 3 5 - // 7 9 - - "fmla v27.4s, v23.4s, v5.s[0] \n" // out r1: - // w20 - "fmla v29.4s, v21.4s, v5.s[2] \n" // out r1: - // w22 - "fmla v27.4s, v24.4s, v5.s[1] \n" // out r1: - // w21 - "fmla v29.4s, v22.4s, v5.s[3] \n" // out r1: - // w23 - - "ld2 {v20.4s, v21.4s}, [%[din_ptr2]], %[s_16] \n" // next r2 - // v20: 4 6 - // 8 10, - // v21: - // trash - // register - "ld2 {v23.4s, v24.4s}, [%[din_ptr3]], %[s_8] \n" // next r3 - // v23: 0 2 - // 4 6, - // v24: 1 3 - // 5 7 - - "fmla v27.4s, v25.4s, v30.s[0] \n" // out r1: - // w24 - - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], %[s_8] \n" // next r3 - // v21: 2 4 - // 6 8, - // v22: 3 5 - // 7 9 - "ld2 {v25.4s, v26.4s}, [%[din_ptr3]], %[s_16] \n" // next r3 - // v25: 4 6 - // 8 10, - // v26: - // trash - // register - - "fadd v27.4s, v27.4s, v29.4s \n" - "fmax v27.4s, v27.4s, v31.4s \n" - "cmp %w[mid_cnt], #1 \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - "prfm pldl1keep, [%[din_ptr3]] \n" - "st1 {v27.4s}, [%[dout_ptr1]], #16 \n" - "blt 2f \n" - - // mid loop - "1: \n" - "ld1 {v26.4s}, [%[vbias]] \n" - "mov v27.16b, v26.16b \n" - "mov v28.16b, v31.16b \n" - "mov v29.16b, v31.16b \n" - - // out_r0 r0-r3 - "fmla v26.4s, v8.4s, v0.s[0] \n" - "fmla v28.4s, v9.4s, v0.s[1] \n" - "fmla v26.4s, v6.4s, v0.s[2] \n" - "fmla v28.4s, v7.4s, v0.s[3] \n" - - "ld2 {v8.4s, v9.4s}, [%[din_ptr0]], %[s_8] \n" - - "fmla v26.4s, v10.4s, v1.s[0] \n" - "fmla v28.4s, v11.4s, v1.s[3] \n" - - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], %[s_8] \n" - - "fmla v26.4s, v14.4s, v1.s[2] \n" - "fmla v28.4s, v13.4s, v1.s[1] \n" - - "ld2 {v10.4s, v11.4s}, [%[din_ptr0]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr0]] \n" - - "fmla v26.4s, v12.4s, v2.s[0] \n" - "fmla v28.4s, v15.4s, v2.s[1] \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v26.4s, v16.4s, v3.s[0] \n" - "fmla v27.4s, v16.4s, v0.s[2] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v28.4s, v19.4s, v2.s[3] \n" - "fmla v29.4s, v19.4s, v0.s[1] \n" - - "ld2 {v15.4s, v16.4s}, [%[din_ptr4]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr4]] \n" - - "fmla v26.4s, v18.4s, v2.s[2] \n" - "fmla v27.4s, v18.4s, v0.s[0] \n" - - "fmla v28.4s, v17.4s, v3.s[1] \n" - "fmla v29.4s, v17.4s, v0.s[3] \n" - - "ld2 {v18.4s, v19.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v26.4s, v20.4s, v3.s[2] \n" - "fmla v27.4s, v20.4s, v1.s[0] \n" - - "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v29.4s, v21.4s, v1.s[3] \n" - "fmla v28.4s, v21.4s, v4.s[1] \n" - "fmla v28.4s, v23.4s, v3.s[3] \n" - "fmla v29.4s, v23.4s, v1.s[1] \n" - - "ld2 {v20.4s, v21.4s}, [%[din_ptr5]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr5]] \n" - - "fmla v26.4s, v24.4s, v4.s[0] \n" - "fmla v27.4s, v24.4s, v1.s[2] \n" - - "ld2 {v23.4s, v24.4s}, [%[din_ptr6]], %[s_8] \n" - - "fmla v27.4s, v22.4s, v2.s[0] \n" - "fmla v26.4s, v22.4s, v4.s[2] \n" - - "fmla v28.4s, v25.4s, v4.s[3] \n" - "fmla v29.4s, v25.4s, v2.s[1] \n" - - "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], %[s_8] \n" - "fadd v28.4s, v26.4s, v28.4s \n" - - "ld2 {v25.4s, v26.4s}, [%[din_ptr6]], %[s_16] \n" - "mov v26.16b, v31.16b \n" - "prfm pldl1keep, [%[din_ptr6]] \n" - - "fmla v26.4s, v13.4s, v5.s[0] \n" - "fmla v28.4s, v14.4s, v5.s[1] \n" - "fmla v27.4s, v13.4s, v2.s[2] \n" - "fmla v29.4s, v14.4s, v2.s[3] \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr1]], %[s_8] \n" - - "fmla v26.4s, v11.4s, v5.s[2] \n" - "fmla v28.4s, v12.4s, v5.s[3] \n" - "fmla v27.4s, v11.4s, v3.s[0] \n" - "fmla v29.4s, v12.4s, v3.s[1] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], %[s_8] 
\n" - - "fmla v26.4s, v15.4s, v30.s[0] \n" - "fmla v27.4s, v15.4s, v3.s[2] \n" - "fmla v29.4s, v16.4s, v4.s[1] \n" - "fmla v27.4s, v17.4s, v4.s[2] \n" - - "ld2 {v15.4s, v16.4s}, [%[din_ptr1]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - - "fmla v29.4s, v18.4s, v3.s[3] \n" - "fmla v27.4s, v19.4s, v4.s[0] \n" - - "ld2 {v18.4s, v19.4s}, [%[din_ptr2]], %[s_8] \n" - - "fmla v29.4s, v20.4s, v4.s[3] \n" - - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], %[s_8] \n" - - "fmla v27.4s, v23.4s, v5.s[0] \n" - "fmla v27.4s, v21.4s, v5.s[2] \n" - - "ld2 {v20.4s, v21.4s}, [%[din_ptr2]], %[s_16] \n" - - "fmla v29.4s, v24.4s, v5.s[1] \n" - - "ld2 {v23.4s, v24.4s}, [%[din_ptr3]], %[s_8] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - - "fmla v29.4s, v22.4s, v5.s[3] \n" - - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], %[s_8] \n" - - "fmla v27.4s, v25.4s, v30.s[0] \n" - - "fadd v26.4s, v26.4s, v28.4s \n" - "fadd v27.4s, v27.4s, v29.4s \n" - "fmax v26.4s, v26.4s, v31.4s \n" - "fmax v27.4s, v27.4s, v31.4s \n" - - "prfm pldl1keep, [%[din_ptr3]] \n" - "st1 {v26.4s}, [%[dout_ptr0]], #16 \n" - "st1 {v27.4s}, [%[dout_ptr1]], #16 \n" - - "ld2 {v25.4s, v26.4s}, [%[din_ptr3]], %[s_16] \n" - "subs %w[mid_cnt], %w[mid_cnt], #1 \n" - "bne 1b \n" - - "2: \n" - "ld2 {v26.4s, v27.4s}, [%[mask]], %[s_8] \n" - "ld2 {v28.4s, v29.4s}, [%[mask]], %[s_8] \n" - "bif v8.16b, v31.16b, v26.16b \n" - "bif v9.16b, v31.16b, v27.16b \n" - "bif v6.16b, v31.16b, v28.16b \n" - "bif v7.16b, v31.16b, v29.16b \n" - - "bif v13.16b, v31.16b, v26.16b \n" - "bif v14.16b, v31.16b, v27.16b \n" - "bif v11.16b, v31.16b, v28.16b \n" - "bif v12.16b, v31.16b, v29.16b \n" - - "bif v18.16b, v31.16b, v26.16b \n" - "bif v19.16b, v31.16b, v27.16b \n" - "bif v16.16b, v31.16b, v28.16b \n" - "bif v17.16b, v31.16b, v29.16b \n" - - "bif v23.16b, v31.16b, v26.16b \n" - "bif v24.16b, v31.16b, v27.16b \n" - "bif v21.16b, v31.16b, v28.16b \n" - "bif v22.16b, v31.16b, v29.16b \n" - - "ld2 {v28.4s, v29.4s}, [%[mask]] \n" - "ld1 {v26.4s}, [%[vbias]] \n" - "mov v29.16b, v31.16b \n" - - "bif v10.16b, v31.16b, v28.16b \n" - "bif v15.16b, v31.16b, v28.16b \n" - - "mov v27.16b, v26.16b \n" - - "bif v20.16b, v31.16b, v28.16b \n" - "bif v25.16b, v31.16b, v28.16b \n" - "mov v28.16b, v31.16b \n" - - "fmla v26.4s, v8.4s, v0.s[0] \n" - "fmla v28.4s, v9.4s, v0.s[1] \n" - "fmla v26.4s, v6.4s, v0.s[2] \n" - "fmla v28.4s, v7.4s, v0.s[3] \n" - - "fmla v26.4s, v10.4s, v1.s[0] \n" - "fmla v28.4s, v13.4s, v1.s[1] \n" - "fmla v26.4s, v14.4s, v1.s[2] \n" - "fmla v28.4s, v11.4s, v1.s[3] \n" - - "sub %[mask], %[mask], #16 \n" - "ld2 {v6.4s, v7.4s}, [%[mask]], %[s_8] \n" - "ld2 {v8.4s, v9.4s}, [%[mask]], %[s_8] \n" - "ld2 {v10.4s, v11.4s}, [%[mask]] \n" - - "fmla v26.4s, v12.4s, v2.s[0] \n" - "fmla v28.4s, v15.4s, v2.s[1] \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v26.4s, v16.4s, v3.s[0] \n" - "fmla v28.4s, v17.4s, v3.s[1] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v27.4s, v16.4s, v0.s[2] \n" - "fmla v29.4s, v17.4s, v0.s[3] \n" - - "ld2 {v15.4s, v16.4s}, [%[din_ptr4]] \n" - - "fmla v26.4s, v18.4s, v2.s[2] \n" - "fmla v28.4s, v19.4s, v2.s[3] \n" - "fmla v27.4s, v18.4s, v0.s[0] \n" - "fmla v29.4s, v19.4s, v0.s[1] \n" - - "bif v13.16b, v31.16b, v6.16b \n" - "bif v14.16b, v31.16b, v7.16b \n" - "bif v11.16b, v31.16b, v8.16b \n" - "bif v12.16b, v31.16b, v9.16b \n" - "bif v15.16b, v31.16b, v10.16b \n" - - "ld2 {v18.4s, v19.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v26.4s, v20.4s, v3.s[2] \n" - "fmla v27.4s, v20.4s, v1.s[0] \n" - - "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], 
%[s_8] \n" - - "fmla v29.4s, v21.4s, v1.s[3] \n" - "fmla v28.4s, v21.4s, v4.s[1] \n" - - "ld2 {v20.4s, v21.4s}, [%[din_ptr5]] \n" - - "fmla v28.4s, v23.4s, v3.s[3] \n" - "fmla v29.4s, v23.4s, v1.s[1] \n" - "fmla v27.4s, v24.4s, v1.s[2] \n" - "fmla v26.4s, v24.4s, v4.s[0] \n" - - "bif v18.16b, v31.16b, v6.16b \n" - "bif v19.16b, v31.16b, v7.16b \n" - "bif v16.16b, v31.16b, v8.16b \n" - "bif v17.16b, v31.16b, v9.16b \n" - "bif v20.16b, v31.16b, v10.16b \n" - - "ld2 {v23.4s, v24.4s}, [%[din_ptr6]], %[s_8] \n" - - "fmla v27.4s, v22.4s, v2.s[0] \n" - "fmla v26.4s, v22.4s, v4.s[2] \n" - - "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], %[s_8] \n" - - "fmla v28.4s, v25.4s, v4.s[3] \n" - "fmla v29.4s, v25.4s, v2.s[1] \n" - "fadd v28.4s, v28.4s, v26.4s \n" - - "ld2 {v25.4s, v26.4s}, [%[din_ptr6]] \n" - "mov v26.16b, v31.16b \n" - - "bif v23.16b, v31.16b, v6.16b \n" - "bif v24.16b, v31.16b, v7.16b \n" - "bif v21.16b, v31.16b, v8.16b \n" - "bif v22.16b, v31.16b, v9.16b \n" - "bif v25.16b, v31.16b, v10.16b \n" - - "fmla v26.4s, v13.4s, v5.s[0] \n" - "fmla v28.4s, v14.4s, v5.s[1] \n" - "fmla v26.4s, v11.4s, v5.s[2] \n" - "fmla v28.4s, v12.4s, v5.s[3] \n" - "fmla v26.4s, v15.4s, v30.s[0] \n" - - "fmla v27.4s, v13.4s, v2.s[2] \n" - "fmla v29.4s, v14.4s, v2.s[3] \n" - "fmla v27.4s, v11.4s, v3.s[0] \n" - "fmla v29.4s, v12.4s, v3.s[1] \n" - - "fadd v26.4s, v26.4s, v28.4s \n" - "fmla v27.4s, v15.4s, v3.s[2] \n" - "fmla v29.4s, v18.4s, v3.s[3] \n" - "fmla v27.4s, v19.4s, v4.s[0] \n" - "fmla v29.4s, v16.4s, v4.s[1] \n" - - "fmax v26.4s, v26.4s, v31.4s \n" - "fmla v27.4s, v17.4s, v4.s[2] \n" - "fmla v29.4s, v20.4s, v4.s[3] \n" - "fmla v27.4s, v23.4s, v5.s[0] \n" - "fmla v29.4s, v24.4s, v5.s[1] \n" - - "st1 {v26.4s}, [%[out_buf0]] \n" - "fmla v27.4s, v21.4s, v5.s[2] \n" - "fmla v29.4s, v22.4s, v5.s[3] \n" - "fmla v27.4s, v25.4s, v30.s[0] \n" - "fadd v27.4s, v27.4s, v29.4s \n" - - "fmax v27.4s, v27.4s, v31.4s \n" - "st1 {v27.4s}, [%[out_buf1]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [dout_ptr1] "+r"(dout_ptr1), - [mid_cnt] "+r"(loop), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [din_ptr6] "+r"(din_ptr6), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [out_buf1] "r"(out_buf1), - [s_8] "r"(s_8), - [s_16] "r"(s_16) - : "memory", - "cc", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); - - int remain_cnt = w_out - (mid_cnt + 1) * 4; - for (int i = 0; i < remain_cnt; ++i) { - dout_ptr0[i] = out_buf0[i]; - dout_ptr1[i] = out_buf1[i]; - } - din0 = din4; - din1 = din5; - din2 = din6; - din3 = din6 + w_in; - din4 = din3 + w_in; - din5 = din4 + w_in; - din6 = din5 + w_in; - dout0 = dout1 + w_out; - dout1 = dout0 + w_out; - } - } - } -} - -//! 
small depthwise, win < 9; -void conv_depthwise_5x5s2p2_s(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - CHECK_LT(w_in, 9) << "only support win < 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int mask_cnt = 12 - w_in - 2; - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - - float out_buf0[4]; - float out_buf1[4]; - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - for (int h = 0; h < h_out; h += 1) { - //! (h * 2 - 2) + 4 > h_in - 1 - if (h * 2 + 3 > h_in) { - switch (h * 2 + 3 - h_in) { - case 4: - din1 = zero_ptr; - case 3: - din2 = zero_ptr; - case 2: - din3 = zero_ptr; - case 1: - din4 = zero_ptr; - default: - break; - } - } - - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - const int s_8 = 8; - //! in r0/r4, r1, r2, r3: x 0 2 4 -- v8 v13 v18 v23 v28 - //! in r0/r4, r1, r2, r3: x 1 3 5 -- v9 v14 v19 v24 v29 - //! in r0/r4, r1, r2, r3: 0 2 4 6 -- v6 v11 v16 v21 v26 - //! in r0/r4, r1, r2, r3: 1 3 5 7 -- v7 v12 v17 v22 v27 - //! in r0/r4, r1, r2, r3: 2 4 6 8 -- v10 v15 v20 v25 v30 - //! out r0 -- v4 - asm volatile( - "movi v31.4s, #0x0\n" - "prfm pldl1keep, [%[din_ptr0]] \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - "prfm pldl1keep, [%[din_ptr3]] \n" - "prfm pldl1keep, [%[din_ptr4]] \n" - "prfm pldl1keep, [%[weights]] \n" - "prfm pldl1keep, [%[mask]] \n" - - //! load mask - "ld2 {v0.4s, v1.4s}, [%[mask]], %[s_8] \n" - "ld2 {v2.4s, v3.4s}, [%[mask]], %[s_8] \n" - "ld2 {v4.4s, v5.4s}, [%[mask]] \n" - - //! 
load and extract input - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], #32 \n" - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], #32 \n" - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], #32 \n" - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], #32 \n" - "ld2 {v26.4s, v27.4s}, [%[din_ptr4]], #32 \n" - - "ext v8.16b, v31.16b, v6.16b, #12 \n" - "ext v9.16b, v31.16b, v7.16b, #12 \n" - "ext v13.16b, v31.16b, v11.16b, #12 \n" - "ext v14.16b, v31.16b, v12.16b, #12 \n" - - "ext v18.16b, v31.16b, v16.16b, #12 \n" - "ext v19.16b, v31.16b, v17.16b, #12 \n" - "ext v23.16b, v31.16b, v21.16b, #12 \n" - "ext v24.16b, v31.16b, v22.16b, #12 \n" - "ext v28.16b, v31.16b, v26.16b, #12 \n" - "ext v29.16b, v31.16b, v27.16b, #12 \n" - - "ext v10.16b, v6.16b, v31.16b, #4 \n" - "ext v15.16b, v11.16b, v31.16b, #4 \n" - "ext v20.16b, v16.16b, v31.16b, #4 \n" - "ext v25.16b, v21.16b, v31.16b, #4 \n" - "ext v30.16b, v26.16b, v31.16b, #4 \n" - - "bif v8.16b, v31.16b, v0.16b \n" - "bif v9.16b, v31.16b, v1.16b \n" - "bif v6.16b, v31.16b, v2.16b \n" - "bif v7.16b, v31.16b, v3.16b \n" - - "bif v13.16b, v31.16b, v0.16b \n" - "bif v14.16b, v31.16b, v1.16b \n" - "bif v11.16b, v31.16b, v2.16b \n" - "bif v12.16b, v31.16b, v3.16b \n" - - "bif v18.16b, v31.16b, v0.16b \n" - "bif v19.16b, v31.16b, v1.16b \n" - "bif v16.16b, v31.16b, v2.16b \n" - "bif v17.16b, v31.16b, v3.16b \n" - - "ld1 {v10.s}[3], [%[din_ptr0]] \n" - "ld1 {v15.s}[3], [%[din_ptr1]] \n" - "ld1 {v20.s}[3], [%[din_ptr2]] \n" - "ld1 {v25.s}[3], [%[din_ptr3]] \n" - "ld1 {v30.s}[3], [%[din_ptr4]] \n" - - "bif v23.16b, v31.16b, v0.16b \n" - "bif v24.16b, v31.16b, v1.16b \n" - "bif v21.16b, v31.16b, v2.16b \n" - "bif v22.16b, v31.16b, v3.16b \n" - - "bif v28.16b, v31.16b, v0.16b \n" - "bif v29.16b, v31.16b, v1.16b \n" - "bif v26.16b, v31.16b, v2.16b \n" - "bif v27.16b, v31.16b, v3.16b \n" - - "bif v10.16b, v31.16b, v4.16b \n" - "bif v15.16b, v31.16b, v4.16b \n" - "bif v20.16b, v31.16b, v4.16b \n" - "bif v25.16b, v31.16b, v4.16b \n" - "bif v30.16b, v31.16b, v4.16b \n" - - "ld1 {v4.4s}, [%[vbias]] \n" - "mov v5.16b, v31.16b \n" - - "ld1 {v0.4s, v1.4s}, [%[weights]], #32 \n" // load weights 0-7 - "ld1 {v2.4s, v3.4s}, [%[weights]], #32 \n" // load weights 8-15 - - //! 
compute - "fmla v4.4s, v8.4s, v0.s[0] \n" // out r0: w0 - "fmla v5.4s, v9.4s, v0.s[1] \n" // out r0: w1 - "fmla v4.4s, v6.4s, v0.s[2] \n" // out r0: w2 - "fmla v5.4s, v7.4s, v0.s[3] \n" // out r0: w3 - - "fmla v4.4s, v10.4s, v1.s[0] \n" // out r0: w4 - "fmla v5.4s, v13.4s, v1.s[1] \n" // out r0: w5 - "fmla v4.4s, v14.4s, v1.s[2] \n" // out r0: w6 - "fmla v5.4s, v11.4s, v1.s[3] \n" // out r0: w7 - - "ld1 {v6.4s, v7.4s}, [%[weights]], #32 \n" // load weights 16-23 - "ld1 {v8.s}[0], [%[weights]] \n" // load weights 24 - - "fmla v4.4s, v12.4s, v2.s[0] \n" // out r0: w8 - "fmla v5.4s, v15.4s, v2.s[1] \n" // out r0: w9 - "fmla v4.4s, v18.4s, v2.s[2] \n" // out r0: w10 - "fmla v5.4s, v19.4s, v2.s[3] \n" // out r0: w11 - - "fmla v4.4s, v16.4s, v3.s[0] \n" // out r0: w12 - "fmla v5.4s, v17.4s, v3.s[1] \n" // out r0: w13 - "fmla v4.4s, v20.4s, v3.s[2] \n" // out r0: w14 - "fmla v5.4s, v23.4s, v3.s[3] \n" // out r0: w15 - - "fmla v4.4s, v24.4s, v6.s[0] \n" // out r0: w16 - "fmla v5.4s, v21.4s, v6.s[1] \n" // out r0: w17 - "fmla v4.4s, v22.4s, v6.s[2] \n" // out r0: w18 - "fmla v5.4s, v25.4s, v6.s[3] \n" // out r0: w19 - - "fmla v4.4s, v28.4s, v7.s[0] \n" // out r0: w20 - "fmla v5.4s, v29.4s, v7.s[1] \n" // out r0: w21 - "fmla v4.4s, v26.4s, v7.s[2] \n" // out r0: w22 - "fmla v5.4s, v27.4s, v7.s[3] \n" // out r0: w23 - "fmla v4.4s, v30.4s, v8.s[0] \n" // out r0: w24 - - "fadd v4.4s, v4.4s, v5.4s \n" // add out to v4 - "st1 {v4.4s}, [%[out_buf0]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [out_buf1] "r"(out_buf1), - [s_8] "r"(s_8) - : "memory", - "cc", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); - for (int i = 0; i < w_out; ++i) { - dout_ptr0[i] = out_buf0[i]; - } - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din2 + w_in; - din4 = din3 + w_in; - dout0 += w_out; - } - } - } -} - -//! 
small depthwise, win < 9; -void conv_depthwise_5x5s2p2_relu_s(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - CHECK_LT(w_in, 9) << "only support win < 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int mask_cnt = 12 - w_in - 2; - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - - float out_buf0[4]; - float out_buf1[4]; - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - for (int h = 0; h < h_out; h += 1) { - //! (h * 2 - 2) + 4 > h_in - 1 - if (h * 2 + 3 > h_in) { - switch (h * 2 + 3 - h_in) { - case 4: - din1 = zero_ptr; - case 3: - din2 = zero_ptr; - case 2: - din3 = zero_ptr; - case 1: - din4 = zero_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - const int s_8 = 8; - //! in r0/r4, r1, r2, r3: x 0 2 4 -- v8 v13 v18 v23 v28 - //! in r0/r4, r1, r2, r3: x 1 3 5 -- v9 v14 v19 v24 v29 - //! in r0/r4, r1, r2, r3: 0 2 4 6 -- v6 v11 v16 v21 v26 - //! in r0/r4, r1, r2, r3: 1 3 5 7 -- v7 v12 v17 v22 v27 - //! in r0/r4, r1, r2, r3: 2 4 6 8 -- v10 v15 v20 v25 v30 - //! out r0 -- v4 - asm volatile( - "movi v31.4s, #0x0\n" - "prfm pldl1keep, [%[din_ptr0]] \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - "prfm pldl1keep, [%[din_ptr3]] \n" - "prfm pldl1keep, [%[din_ptr4]] \n" - "prfm pldl1keep, [%[weights]] \n" - "prfm pldl1keep, [%[mask]] \n" - - //! load mask - "ld2 {v0.4s, v1.4s}, [%[mask]], %[s_8] \n" - "ld2 {v2.4s, v3.4s}, [%[mask]], %[s_8] \n" - "ld2 {v4.4s, v5.4s}, [%[mask]] \n" - - //! 
load and extract input - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], #32 \n" - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], #32 \n" - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], #32 \n" - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], #32 \n" - "ld2 {v26.4s, v27.4s}, [%[din_ptr4]], #32 \n" - - "ext v8.16b, v31.16b, v6.16b, #12 \n" - "ext v9.16b, v31.16b, v7.16b, #12 \n" - "ext v13.16b, v31.16b, v11.16b, #12 \n" - "ext v14.16b, v31.16b, v12.16b, #12 \n" - - "ext v18.16b, v31.16b, v16.16b, #12 \n" - "ext v19.16b, v31.16b, v17.16b, #12 \n" - "ext v23.16b, v31.16b, v21.16b, #12 \n" - "ext v24.16b, v31.16b, v22.16b, #12 \n" - "ext v28.16b, v31.16b, v26.16b, #12 \n" - "ext v29.16b, v31.16b, v27.16b, #12 \n" - - "ext v10.16b, v6.16b, v31.16b, #4 \n" - "ext v15.16b, v11.16b, v31.16b, #4 \n" - "ext v20.16b, v16.16b, v31.16b, #4 \n" - "ext v25.16b, v21.16b, v31.16b, #4 \n" - "ext v30.16b, v26.16b, v31.16b, #4 \n" - - "bif v8.16b, v31.16b, v0.16b \n" - "bif v9.16b, v31.16b, v1.16b \n" - "bif v6.16b, v31.16b, v2.16b \n" - "bif v7.16b, v31.16b, v3.16b \n" - - "bif v13.16b, v31.16b, v0.16b \n" - "bif v14.16b, v31.16b, v1.16b \n" - "bif v11.16b, v31.16b, v2.16b \n" - "bif v12.16b, v31.16b, v3.16b \n" - - "bif v18.16b, v31.16b, v0.16b \n" - "bif v19.16b, v31.16b, v1.16b \n" - "bif v16.16b, v31.16b, v2.16b \n" - "bif v17.16b, v31.16b, v3.16b \n" - - "ld1 {v10.s}[3], [%[din_ptr0]] \n" - "ld1 {v15.s}[3], [%[din_ptr1]] \n" - "ld1 {v20.s}[3], [%[din_ptr2]] \n" - "ld1 {v25.s}[3], [%[din_ptr3]] \n" - "ld1 {v30.s}[3], [%[din_ptr4]] \n" - - "bif v23.16b, v31.16b, v0.16b \n" - "bif v24.16b, v31.16b, v1.16b \n" - "bif v21.16b, v31.16b, v2.16b \n" - "bif v22.16b, v31.16b, v3.16b \n" - - "bif v28.16b, v31.16b, v0.16b \n" - "bif v29.16b, v31.16b, v1.16b \n" - "bif v26.16b, v31.16b, v2.16b \n" - "bif v27.16b, v31.16b, v3.16b \n" - - "bif v10.16b, v31.16b, v4.16b \n" - "bif v15.16b, v31.16b, v4.16b \n" - "bif v20.16b, v31.16b, v4.16b \n" - "bif v25.16b, v31.16b, v4.16b \n" - "bif v30.16b, v31.16b, v4.16b \n" - - "ld1 {v4.4s}, [%[vbias]] \n" - "mov v5.16b, v31.16b \n" - - "ld1 {v0.4s, v1.4s}, [%[weights]], #32 \n" // load weights 0-7 - "ld1 {v2.4s, v3.4s}, [%[weights]], #32 \n" // load weights 8-15 - - //! 
compute - "fmla v4.4s, v8.4s, v0.s[0] \n" // out r0: w0 - "fmla v5.4s, v9.4s, v0.s[1] \n" // out r0: w1 - "fmla v4.4s, v6.4s, v0.s[2] \n" // out r0: w2 - "fmla v5.4s, v7.4s, v0.s[3] \n" // out r0: w3 - - "fmla v4.4s, v10.4s, v1.s[0] \n" // out r0: w4 - "fmla v5.4s, v13.4s, v1.s[1] \n" // out r0: w5 - "fmla v4.4s, v14.4s, v1.s[2] \n" // out r0: w6 - "fmla v5.4s, v11.4s, v1.s[3] \n" // out r0: w7 - - "ld1 {v6.4s, v7.4s}, [%[weights]], #32 \n" // load weights 16-23 - "ld1 {v8.s}[0], [%[weights]] \n" // load weights 24 - - "fmla v4.4s, v12.4s, v2.s[0] \n" // out r0: w8 - "fmla v5.4s, v15.4s, v2.s[1] \n" // out r0: w9 - "fmla v4.4s, v18.4s, v2.s[2] \n" // out r0: w10 - "fmla v5.4s, v19.4s, v2.s[3] \n" // out r0: w11 - - "fmla v4.4s, v16.4s, v3.s[0] \n" // out r0: w12 - "fmla v5.4s, v17.4s, v3.s[1] \n" // out r0: w13 - "fmla v4.4s, v20.4s, v3.s[2] \n" // out r0: w14 - "fmla v5.4s, v23.4s, v3.s[3] \n" // out r0: w15 - - "fmla v4.4s, v24.4s, v6.s[0] \n" // out r0: w16 - "fmla v5.4s, v21.4s, v6.s[1] \n" // out r0: w17 - "fmla v4.4s, v22.4s, v6.s[2] \n" // out r0: w18 - "fmla v5.4s, v25.4s, v6.s[3] \n" // out r0: w19 - - "fmla v4.4s, v28.4s, v7.s[0] \n" // out r0: w20 - "fmla v5.4s, v29.4s, v7.s[1] \n" // out r0: w21 - "fmla v4.4s, v26.4s, v7.s[2] \n" // out r0: w22 - "fmla v5.4s, v27.4s, v7.s[3] \n" // out r0: w23 - "fmla v4.4s, v30.4s, v8.s[0] \n" // out r0: w24 - - "fadd v4.4s, v4.4s, v5.4s \n" // add out to v4 - "fmax v4.4s, v4.4s, v31.4s \n" - "st1 {v4.4s}, [%[out_buf0]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [out_buf1] "r"(out_buf1), - [s_8] "r"(s_8) - : "memory", - "cc", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); - for (int i = 0; i < w_out; ++i) { - dout_ptr0[i] = out_buf0[i]; - } - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din2 + w_in; - din4 = din3 + w_in; - dout0 += w_out; - } - } - } -} - -#else - -//! 
larger depthwise, win >= 9; -void conv_depthwise_5x5s2p2(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - // printf("invoke 5x5s2p2 armv7\n"); - CHECK_GE(w_in, 9) << "only support win >= 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int cnt = (w_out_round - 4) / 4; - int mid_cnt = cnt - 1; - int right_start = cnt * 2 * 4 - 2; - int mask_cnt = 12 - (w_in - right_start); - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - - float out_buf0[4]; - float* dout0 = dout_ch; - - const float* weights_c = weights + c * weights_saptial_size; - float32x4_t w0 = vld1q_f32(weights_c); - float32x4_t w1 = vld1q_f32(weights_c + 4); - float32x4_t w2 = vld1q_f32(weights_c + 8); - float32x4_t w3 = vld1q_f32(weights_c + 12); - float32x4_t w4 = vld1q_f32(weights_c + 16); - float32x4_t w5 = vld1q_f32(weights_c + 20); - for (int h = 0; h < h_out; h += 1) { - //! (h * 2 - 2) + 4 > h_in - 1 - if (h * 2 + 3 > h_in) { - switch (h * 2 + 3 - h_in) { - case 4: - din1 = zero_ptr; - case 3: - din2 = zero_ptr; - case 2: - din3 = zero_ptr; - case 1: - din4 = zero_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - - const float* weights_ptr = weights_c + 24; - float* dout_ptr0 = dout0; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - int loop = mid_cnt; - const int s_8 = 8; - const int s_16 = 16; - - asm volatile( - "vmov.i32 q15, #0x0 \n" - "pld [%[din_ptr0]] \n" - "pld [%[din_ptr1]] \n" - "pld [%[din_ptr2]] \n" - "pld [%[din_ptr3]] \n" - "pld [%[din_ptr4]] \n" - "pld [%[mask]] \n" - - // left - "vld2.32 {d16-d19}, [%[din_ptr0]]! \n" - "vld1.32 {d26-d29}, [%[vbias]] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vmov.32 q14, q15 \n" - - // r0 - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld1.32 {d21[1]}, [%[din_ptr0]] \n" - "vld2.32 {d16-d19}, [%[din_ptr1]]! \n" - "sub %[din_ptr0], #8 \n" - - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - "vmla.f32 q13, q10, %e[w1][0] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r1 - "vmla.f32 q13, q8, %f[w1][1] \n" - "vmla.f32 q14, q9, %e[w2][0] \n" - - "vld1.32 {d21[1]}, [%[din_ptr1]] \n" - "vld2.32 {d16-d19}, [%[din_ptr2]]! 
\n" - "sub %[din_ptr1], #8 \n" - - "vmla.f32 q13, q6, %e[w1][1] \n" - "vmla.f32 q14, q7, %f[w1][0] \n" - "vmla.f32 q13, q10, %e[w2][1] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r2 - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld1.32 {d21[1]}, [%[din_ptr2]] \n" - "vld2.32 {d16-d19}, [%[din_ptr3]]! \n" - "sub %[din_ptr2], #8 \n" - - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - "vmla.f32 q13, q10, %f[w3][0] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r3 - "vmla.f32 q13, q8, %e[w4][1] \n" - "vmla.f32 q14, q9, %f[w4][0] \n" - - "vld1.32 {d21[1]}, [%[din_ptr3]] \n" - "vld2.32 {d16-d19}, [%[din_ptr4]]! \n" - "sub %[din_ptr3], #8 \n" - - "vmla.f32 q13, q6, %f[w3][1] \n" - "vmla.f32 q14, q7, %e[w4][0] \n" - "vmla.f32 q13, q10, %f[w4][1] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r4 - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - - "vld1.32 {d21[1]}, [%[din_ptr4]] \n" - "vld2.32 {d12-d15}, [%[din_ptr0]], %[s_8] \n" - "sub %[din_ptr4], #8 \n" - - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr0]], %[s_8] \n" - - "vmov.32 q12, %q[w0] \n" - "vld1.32 {%e[w0][0]}, [%[weights]] \n" - "vmla.f32 q13, q10, %e[w0][0] \n" - "vadd.f32 q13, q13, q14 \n" - "vmov.32 %q[w0], q12 \n" - "cmp %[mid_cnt], #1 \n" - "vld2.32 {d20-d23}, [%[din_ptr0]], %[s_16] \n" - "vst1.32 {d26-d27}, [%[dout_ptr0]]! \n" - "pld [%[din_ptr0]] \n" - "blt 2f \n" - - // mid - "1: \n" - "vld1.32 {d26-d27}, [%[vbias]] \n" - "vmov.32 q14, q15 \n" - - // r0 - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - - "vld2.32 {d12-d15}, [%[din_ptr1]], %[s_8] \n" - - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr1]], %[s_8] \n" - - "vmla.f32 q13, q10, %e[w1][0] \n" - - "vld2.32 {d20-d23}, [%[din_ptr1]], %[s_16] \n" - - // r1 - "vmla.f32 q13, q6, %e[w1][1] \n" - "vmla.f32 q14, q7, %f[w1][0] \n" - "pld [%[din_ptr1]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr2]], %[s_8] \n" - - "vmla.f32 q13, q8, %f[w1][1] \n" - "vmla.f32 q14, q9, %e[w2][0] \n" - - "vld2.32 {d16-d19}, [%[din_ptr2]], %[s_8] \n" - - "vmla.f32 q13, q10, %e[w2][1] \n" - - "vld2.32 {d20-d23}, [%[din_ptr2]], %[s_16] \n" - - // r2 - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - "pld [%[din_ptr2]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr3]], %[s_8] \n" - - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr3]], %[s_8] \n" - - "vmla.f32 q13, q10, %f[w3][0] \n" - - "vld2.32 {d20-d23}, [%[din_ptr3]], %[s_16] \n" - - // r3 - "vmla.f32 q13, q6, %f[w3][1] \n" - "vmla.f32 q14, q7, %e[w4][0] \n" - "pld [%[din_ptr3]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr4]], %[s_8] \n" - - "vmla.f32 q13, q8, %e[w4][1] \n" - "vmla.f32 q14, q9, %f[w4][0] \n" - - "vld2.32 {d16-d19}, [%[din_ptr4]], %[s_8] \n" - - "vmla.f32 q13, q10, %f[w4][1] \n" - - "vld2.32 {d20-d23}, [%[din_ptr4]], %[s_16] \n" - - // r4 - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - "pld [%[din_ptr4]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr0]], %[s_8] \n" - "vld1.32 {%e[w0][0]}, [%[weights]] \n" - - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr0]], %[s_8] \n" - - "vmla.f32 q13, q10, %e[w0][0] \n" - - "vld2.32 
{d20-d23}, [%[din_ptr0]], %[s_16] \n" - - "vmov.32 %q[w0], q12 \n" - "vadd.f32 q13, q13, q14 \n" - "subs %[mid_cnt], #1 \n" - "vst1.32 {d26-d27}, [%[dout_ptr0]]! \n" - "bne 1b \n" - - "2: \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld1.32 {d26-d27}, [%[vbias]] \n" - "vmov.32 q14, q15 \n" - - // r0 - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld2.32 {d12-d15}, [%[din_ptr1]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr1]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %e[w1][0] \n" - - // r1 - "vld2.32 {d20-d23}, [%[din_ptr1]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w1][1] \n" - "vmla.f32 q14, q7, %f[w1][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld2.32 {d12-d15}, [%[din_ptr2]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w1][1] \n" - "vmla.f32 q14, q9, %e[w2][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr2]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %e[w2][1] \n" - - // r2 - "vld2.32 {d20-d23}, [%[din_ptr2]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - - "vld2.32 {d12-d15}, [%[din_ptr3]], %[s_8] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr3]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %f[w3][0] \n" - - // r3 - "vld2.32 {d20-d23}, [%[din_ptr3]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %f[w3][1] \n" - "vmla.f32 q14, q7, %e[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld2.32 {d12-d15}, [%[din_ptr4]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %e[w4][1] \n" - "vmla.f32 q14, q9, %f[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr4]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %f[w4][1] \n" - - // r4 - "vld2.32 {d20-d23}, [%[din_ptr4]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld1.32 {d12[0]}, [%[weights]] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, d12[0] \n" - - "vadd.f32 q13, q13, q14 \n" - "vst1.32 {d26-d27}, [%[out_buf0]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [mid_cnt] "+r"(loop), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [w0] "w"(w0), - 
[w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [s_8] "r"(s_8), - [s_16] "r"(s_16) - : "memory", - "cc", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - int remain_cnt = w_out - (mid_cnt + 1) * 4; - for (int i = 0; i < remain_cnt; ++i) { - dout_ptr0[i] = out_buf0[i]; - } - - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din2 + w_in; - din4 = din3 + w_in; - dout0 += w_out; - } - } - } -} - -//! larger depthwise, win >= 9; -void conv_depthwise_5x5s2p2_relu(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - // printf("invoke 5x5s2p2 armv7\n"); - CHECK_GE(w_in, 9) << "only support win >= 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int cnt = (w_out_round - 4) / 4; - int mid_cnt = cnt - 1; - int right_start = cnt * 2 * 4 - 2; - int mask_cnt = 12 - (w_in - right_start); - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - - float out_buf0[4]; - float* dout0 = dout_ch; - - const float* weights_c = weights + c * weights_saptial_size; - float32x4_t w0 = vld1q_f32(weights_c); - float32x4_t w1 = vld1q_f32(weights_c + 4); - float32x4_t w2 = vld1q_f32(weights_c + 8); - float32x4_t w3 = vld1q_f32(weights_c + 12); - float32x4_t w4 = vld1q_f32(weights_c + 16); - float32x4_t w5 = vld1q_f32(weights_c + 20); - for (int h = 0; h < h_out; h += 1) { - //! (h * 2 - 2) + 4 > h_in - 1 - if (h * 2 + 3 > h_in) { - switch (h * 2 + 3 - h_in) { - case 4: - din1 = zero_ptr; - case 3: - din2 = zero_ptr; - case 2: - din3 = zero_ptr; - case 1: - din4 = zero_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - - const float* weights_ptr = weights_c + 24; - float* dout_ptr0 = dout0; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - int loop = mid_cnt; - const int s_8 = 8; - const int s_16 = 16; - - asm volatile( - "vmov.i32 q15, #0x0 \n" - "pld [%[din_ptr0]] \n" - "pld [%[din_ptr1]] \n" - "pld [%[din_ptr2]] \n" - "pld [%[din_ptr3]] \n" - "pld [%[din_ptr4]] \n" - "pld [%[mask]] \n" - - // left - "vld2.32 {d16-d19}, [%[din_ptr0]]! \n" - "vld1.32 {d26-d29}, [%[vbias]] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vmov.32 q14, q15 \n" - - // r0 - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld1.32 {d21[1]}, [%[din_ptr0]] \n" - "vld2.32 {d16-d19}, [%[din_ptr1]]! 
\n" - "sub %[din_ptr0], #8 \n" - - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - "vmla.f32 q13, q10, %e[w1][0] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r1 - "vmla.f32 q13, q8, %f[w1][1] \n" - "vmla.f32 q14, q9, %e[w2][0] \n" - - "vld1.32 {d21[1]}, [%[din_ptr1]] \n" - "vld2.32 {d16-d19}, [%[din_ptr2]]! \n" - "sub %[din_ptr1], #8 \n" - - "vmla.f32 q13, q6, %e[w1][1] \n" - "vmla.f32 q14, q7, %f[w1][0] \n" - "vmla.f32 q13, q10, %e[w2][1] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r2 - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld1.32 {d21[1]}, [%[din_ptr2]] \n" - "vld2.32 {d16-d19}, [%[din_ptr3]]! \n" - "sub %[din_ptr2], #8 \n" - - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - "vmla.f32 q13, q10, %f[w3][0] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r3 - "vmla.f32 q13, q8, %e[w4][1] \n" - "vmla.f32 q14, q9, %f[w4][0] \n" - - "vld1.32 {d21[1]}, [%[din_ptr3]] \n" - "vld2.32 {d16-d19}, [%[din_ptr4]]! \n" - "sub %[din_ptr3], #8 \n" - - "vmla.f32 q13, q6, %f[w3][1] \n" - "vmla.f32 q14, q7, %e[w4][0] \n" - "vmla.f32 q13, q10, %f[w4][1] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r4 - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - - "vld1.32 {d21[1]}, [%[din_ptr4]] \n" - "vld2.32 {d12-d15}, [%[din_ptr0]], %[s_8] \n" - "sub %[din_ptr4], #8 \n" - - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr0]], %[s_8] \n" - - "vmov.32 q12, %q[w0] \n" - "vld1.32 {%e[w0][0]}, [%[weights]] \n" - "vmla.f32 q13, q10, %e[w0][0] \n" - "vadd.f32 q13, q13, q14 \n" - "vmov.f32 %q[w0], q12 \n" - "vmax.f32 q13, q13, q15 \n" - "cmp %[mid_cnt], #1 \n" - "vld2.32 {d20-d23}, [%[din_ptr0]], %[s_16] \n" - "vst1.32 {d26-d27}, [%[dout_ptr0]]! 
\n" - "pld [%[din_ptr0]] \n" - "blt 2f \n" - - // mid - "1: \n" - "vld1.32 {d26-d27}, [%[vbias]] \n" - "vmov.32 q14, q15 \n" - - // r0 - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - - "vld2.32 {d12-d15}, [%[din_ptr1]], %[s_8] \n" - - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr1]], %[s_8] \n" - - "vmla.f32 q13, q10, %e[w1][0] \n" - - "vld2.32 {d20-d23}, [%[din_ptr1]], %[s_16] \n" - - // r1 - "vmla.f32 q13, q6, %e[w1][1] \n" - "vmla.f32 q14, q7, %f[w1][0] \n" - "pld [%[din_ptr1]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr2]], %[s_8] \n" - - "vmla.f32 q13, q8, %f[w1][1] \n" - "vmla.f32 q14, q9, %e[w2][0] \n" - - "vld2.32 {d16-d19}, [%[din_ptr2]], %[s_8] \n" - - "vmla.f32 q13, q10, %e[w2][1] \n" - - "vld2.32 {d20-d23}, [%[din_ptr2]], %[s_16] \n" - - // r2 - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - "pld [%[din_ptr2]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr3]], %[s_8] \n" - - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr3]], %[s_8] \n" - - "vmla.f32 q13, q10, %f[w3][0] \n" - - "vld2.32 {d20-d23}, [%[din_ptr3]], %[s_16] \n" - - // r3 - "vmla.f32 q13, q6, %f[w3][1] \n" - "vmla.f32 q14, q7, %e[w4][0] \n" - "pld [%[din_ptr3]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr4]], %[s_8] \n" - - "vmla.f32 q13, q8, %e[w4][1] \n" - "vmla.f32 q14, q9, %f[w4][0] \n" - - "vld2.32 {d16-d19}, [%[din_ptr4]], %[s_8] \n" - - "vmla.f32 q13, q10, %f[w4][1] \n" - - "vld2.32 {d20-d23}, [%[din_ptr4]], %[s_16] \n" - - // r4 - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - "pld [%[din_ptr4]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr0]], %[s_8] \n" - "vld1.32 {%e[w0][0]}, [%[weights]] \n" - - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr0]], %[s_8] \n" - - "vmla.f32 q13, q10, %e[w0][0] \n" - - "vld2.32 {d20-d23}, [%[din_ptr0]], %[s_16] \n" - - "vmov.32 %q[w0], q12 \n" - "vadd.f32 q13, q13, q14 \n" - "vmax.f32 q13, q13, q15 \n" - "subs %[mid_cnt], #1 \n" - "vst1.32 {d26-d27}, [%[dout_ptr0]]! 
\n" - "bne 1b \n" - - "2: \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld1.32 {d26-d27}, [%[vbias]] \n" - "vmov.32 q14, q15 \n" - - // r0 - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld2.32 {d12-d15}, [%[din_ptr1]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr1]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %e[w1][0] \n" - - // r1 - "vld2.32 {d20-d23}, [%[din_ptr1]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w1][1] \n" - "vmla.f32 q14, q7, %f[w1][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld2.32 {d12-d15}, [%[din_ptr2]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w1][1] \n" - "vmla.f32 q14, q9, %e[w2][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr2]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %e[w2][1] \n" - - // r2 - "vld2.32 {d20-d23}, [%[din_ptr2]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - - "vld2.32 {d12-d15}, [%[din_ptr3]], %[s_8] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr3]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %f[w3][0] \n" - - // r3 - "vld2.32 {d20-d23}, [%[din_ptr3]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %f[w3][1] \n" - "vmla.f32 q14, q7, %e[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld2.32 {d12-d15}, [%[din_ptr4]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %e[w4][1] \n" - "vmla.f32 q14, q9, %f[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr4]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %f[w4][1] \n" - - // r4 - "vld2.32 {d20-d23}, [%[din_ptr4]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld1.32 {d12[0]}, [%[weights]] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, d12[0] \n" - - "vadd.f32 q13, q13, q14 \n" - "vmax.f32 q13, q13, q15 \n" - "vst1.32 {d26-d27}, [%[out_buf0]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [mid_cnt] "+r"(loop), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - 
[s_8] "r"(s_8), - [s_16] "r"(s_16) - : "memory", - "cc", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - int remain_cnt = w_out - (mid_cnt + 1) * 4; - for (int i = 0; i < remain_cnt; ++i) { - dout_ptr0[i] = out_buf0[i]; - } - - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din2 + w_in; - din4 = din3 + w_in; - dout0 += w_out; - } - } - } -} - -//! small depthwise, win < 9; -void conv_depthwise_5x5s2p2_s(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - CHECK_LT(w_in, 9) << "only support win < 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int mask_cnt = 12 - w_in - 2; - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - - float out_buf0[4]; - float out_buf1[4]; - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - float32x4_t w0 = vld1q_f32(weights_c); - float32x4_t w1 = vld1q_f32(weights_c + 4); - float32x4_t w2 = vld1q_f32(weights_c + 8); - float32x4_t w3 = vld1q_f32(weights_c + 12); - float32x4_t w4 = vld1q_f32(weights_c + 16); - float32x4_t w5 = vld1q_f32(weights_c + 20); - for (int h = 0; h < h_out; h += 1) { - //! (h * 2 - 2) + 4 > h_in - 1 - if (h * 2 + 3 > h_in) { - switch (h * 2 + 3 - h_in) { - case 4: - din1 = zero_ptr; - case 3: - din2 = zero_ptr; - case 2: - din3 = zero_ptr; - case 1: - din4 = zero_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - - const float* weights_ptr = weights_c + 24; - float* dout_ptr0 = dout0; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - const int s_8 = 8; - - asm volatile( - "vmov.i32 q15, #0x0 \n" - "pld [%[din_ptr0]] \n" - "pld [%[din_ptr1]] \n" - "pld [%[din_ptr2]] \n" - "pld [%[din_ptr3]] \n" - "pld [%[din_ptr4]] \n" - "vld1.32 {d26-d27}, [%[vbias]] \n" - "vmov.32 q14, q15 \n" - "vld2.32 {d16-d19}, [%[din_ptr0]]! \n" - - // r0 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr0]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr1]]! 
\n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %e[w1][0] \n" - - // r1 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr1]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q14, q6, %e[w1][1] \n" - "vmla.f32 q13, q7, %f[w1][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q14, q8, %f[w1][1] \n" - "vmla.f32 q13, q9, %e[w2][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr2]]! \n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q14, q10, %e[w2][1] \n" - - // r2 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr2]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr3]]! \n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %f[w3][0] \n" - - // r3 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr3]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q14, q6, %f[w3][1] \n" - "vmla.f32 q13, q7, %e[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q14, q8, %e[w4][1] \n" - "vmla.f32 q13, q9, %f[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr4]]! \n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q14, q10, %f[w4][1] \n" - - // r4 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr4]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld1.32 {d12[0]}, [%[weights]] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, d12[0] \n" - - "vadd.f32 q13, q13, q14 \n" - "vst1.32 {d26-d27}, [%[out_buf0]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [s_8] "r"(s_8), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5) - : "memory", - "cc", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - for (int i = 0; i < w_out; ++i) { - dout_ptr0[i] = out_buf0[i]; - } - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din2 + w_in; - din4 = din3 + w_in; - dout0 += w_out; - } - } - } -} - -//! 
small depthwise, win < 9; -void conv_depthwise_5x5s2p2_relu_s(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - CHECK_LT(w_in, 9) << "only support win < 9\n"; - int w_out_round = (w_out + 3) / 4 * 4; - int mask_cnt = 12 - w_in - 2; - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - - float out_buf0[4]; - float out_buf1[4]; - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - float32x4_t w0 = vld1q_f32(weights_c); - float32x4_t w1 = vld1q_f32(weights_c + 4); - float32x4_t w2 = vld1q_f32(weights_c + 8); - float32x4_t w3 = vld1q_f32(weights_c + 12); - float32x4_t w4 = vld1q_f32(weights_c + 16); - float32x4_t w5 = vld1q_f32(weights_c + 20); - for (int h = 0; h < h_out; h += 1) { - //! (h * 2 - 2) + 4 > h_in - 1 - if (h * 2 + 3 > h_in) { - switch (h * 2 + 3 - h_in) { - case 4: - din1 = zero_ptr; - case 3: - din2 = zero_ptr; - case 2: - din3 = zero_ptr; - case 1: - din4 = zero_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - - const float* weights_ptr = weights_c + 24; - float* dout_ptr0 = dout0; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - const int s_8 = 8; - - asm volatile( - "vmov.i32 q15, #0x0 \n" - "pld [%[din_ptr0]] \n" - "pld [%[din_ptr1]] \n" - "pld [%[din_ptr2]] \n" - "pld [%[din_ptr3]] \n" - "pld [%[din_ptr4]] \n" - "vld1.32 {d26-d27}, [%[vbias]] \n" - "vmov.32 q14, q15 \n" - "vld2.32 {d16-d19}, [%[din_ptr0]]! \n" - - // r0 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr0]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr1]]! 
\n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %e[w1][0] \n" - - // r1 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr1]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q14, q6, %e[w1][1] \n" - "vmla.f32 q13, q7, %f[w1][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q14, q8, %f[w1][1] \n" - "vmla.f32 q13, q9, %e[w2][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr2]]! \n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q14, q10, %e[w2][1] \n" - - // r2 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr2]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr3]]! \n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %f[w3][0] \n" - - // r3 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr3]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q14, q6, %f[w3][1] \n" - "vmla.f32 q13, q7, %e[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q14, q8, %e[w4][1] \n" - "vmla.f32 q13, q9, %f[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr4]]! 
\n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q14, q10, %f[w4][1] \n" - - // r4 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr4]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld1.32 {d12[0]}, [%[weights]] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, d12[0] \n" - - "vadd.f32 q13, q13, q14 \n" - "vmax.f32 q13, q13, q15 \n" - "vst1.32 {d26-d27}, [%[out_buf0]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [s_8] "r"(s_8), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5) - : "memory", - "cc", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - for (int i = 0; i < w_out; ++i) { - dout_ptr0[i] = out_buf0[i]; - } - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din2 + w_in; - din4 = din3 + w_in; - dout0 += w_out; - } - } - } -} -#endif // __aarch64__ - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_direct.cc b/lite/backends/arm/math/conv_direct.cc deleted file mode 100644 index 51526aa2b3..0000000000 --- a/lite/backends/arm/math/conv_direct.cc +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/backends/arm/math/conv_direct.h" -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -bool DirectConv::create(const operators::ConvParam& param, - ARMContext* ctx) { - this->ctx_ = ctx; - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ic = x_dims[1]; - int ow = o_dims[3]; - int oc = o_dims[1]; - int kw = w_dims[3]; - int sw = param.strides[1]; - // select dw conv kernel - const auto* w_data = param.filter->data(); - if (kw == 3 && sw == 1) { - VLOG(5) << "invoke 3x3s1 direct conv"; - impl_ = conv_3x3s1_direct_fp32; - - constexpr int cblock = 4; - int cround = (oc + cblock - 1) / cblock * cblock; - weights_trans_.Resize({cround, ic, kw, kw}); - float* transed_w_data = weights_trans_.mutable_data(); - - conv_trans_weights_numc(w_data, transed_w_data, oc, ic, cblock, kw * kw); - is_weights_transed_ = true; - } else if (kw == 3 && sw == 2) { - VLOG(5) << "invoke 3x3s2 direct conv"; - impl_ = conv_3x3s2_direct_fp32; - - constexpr int cblock = 4; - int cround = (oc + cblock - 1) / cblock * cblock; - weights_trans_.Resize({cround, ic, kw, kw}); - float* transed_w_data = weights_trans_.mutable_data(); - conv_trans_weights_numc(w_data, transed_w_data, oc, ic, cblock, kw * kw); - is_weights_transed_ = true; - } else { - LOG(ERROR) << "this type direct conv not impl"; - return false; - } - return true; -} - -template <> -bool DirectConv::init(const operators::ConvParam& param, - Context* ctx) { - this->ctx_ = ctx; - return create(param, ctx); -} - -template <> -bool DirectConv::run(const operators::ConvParam& param) { - // start timer - const auto* i_data = param.x->data(); - const auto* w_data = param.filter->data(); - const auto* b_data = param.bias ? param.bias->data() : nullptr; - auto* o_data = param.output->mutable_data(); - - if (is_weights_transed_ == true) { - w_data = weights_trans_.data(); - } - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ih = x_dims[2]; - int ic = x_dims[1]; - int bs = x_dims[0]; - int oh = o_dims[2]; - int ow = o_dims[3]; - int oc = o_dims[1]; - - impl_(i_data, - o_data, - bs, - oc, - oh, - ow, - ic, - ih, - iw, - w_data, - b_data, - param, - this->ctx_); - - // timer end - return true; -} - -template -bool DirectConvInt8::create(const operators::ConvParam& param, - ARMContext* ctx) { - this->ctx_ = ctx; - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ic = x_dims[1]; - int ow = o_dims[3]; - int oc = o_dims[1]; - int kw = w_dims[3]; - int sw = param.strides[1]; - // select dw conv kernel - w_scale_ = param.weight_scale; - //! 
- const auto* w_data = param.filter->data<int8_t>(); - if (Ptype_out == PRECISION(kInt8) || Ptype_out == PRECISION(kFloat)) { - CHECK_EQ(this->w_scale_.size(), oc) << "weights scale size must equal chout"; - float input_scale = param.input_scale; - for (auto& w_s : w_scale_) { - w_s *= input_scale; - if (Ptype_out == PRECISION(kInt8)) { - w_s /= param.output_scale; - } - } - } - if (kw == 3 && sw == 1) { - VLOG(5) << "invoke 3x3s1 direct conv"; - impl_int8_ = conv_3x3s1_direct_int8; - - constexpr int cblock = 4; - int inpad = 4; - int cround = (oc + cblock - 1) / cblock * cblock; - weights_trans_.Resize({cround, ic, kw, kw}); - int8_t* transed_w_data = weights_trans_.mutable_data<int8_t>(); - conv_trans_weights_numc(w_data, transed_w_data, oc, ic, cblock, kw * kw); - - int wout_round = ((ow + 3) / 4) * 4; - int win_round = wout_round * sw + inpad; - int row_out = 2; - int row_in = 4; - int tmp_size_out = wout_round * row_out * cblock; - int in_len = win_round * ic; - int tmp_size_in = row_in * in_len; - ctx_->ExtendWorkspace(ctx_->threads() * tmp_size_out + - (tmp_size_in + 3) / 4 * 4 + wout_round + win_round); - is_weights_transed_ = true; - - } else if (kw == 3 && sw == 2) { - VLOG(5) << "invoke 3x3s2 direct conv"; - impl_int8_ = conv_3x3s2_direct_int8; - - // cblock is queried from the s2 int8 kernel instead of fixed at 4 - int cblock = conv_3x3s2_direct_int8_c_num(); - int cround = (oc + cblock - 1) / cblock * cblock; - weights_trans_.Resize({cround, ic, kw, kw}); - int8_t* transed_w_data = weights_trans_.mutable_data<int8_t>(); - conv_trans_weights_numc(w_data, transed_w_data, oc, ic, cblock, kw * kw); - is_weights_transed_ = true; - - } else { - LOG(ERROR) << "this type of direct conv is not implemented"; - return false; - } - return true; -} - -template <PrecisionType Ptype_out> -bool DirectConvInt8<Ptype_out>::init(const operators::ConvParam& param, - Context* ctx) { - this->ctx_ = ctx; - return create(param, ctx); -} - -template <PrecisionType Ptype_out> -bool DirectConvInt8<Ptype_out>::run(const operators::ConvParam& param) { - const auto* i_data = param.x->data<int8_t>(); - const auto* w_data = param.filter->data<int8_t>(); - const auto* b_data = param.bias ? param.bias->data<int32_t>() : nullptr; - auto* o_data = param.output->mutable_data<int32_t>(); - if (is_weights_transed_ == true) { - w_data = weights_trans_.data<int8_t>(); - } - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ih = x_dims[2]; - int ic = x_dims[1]; - int bs = x_dims[0]; - int oh = o_dims[2]; - int ow = o_dims[3]; - int oc = o_dims[1]; - - impl_int8_(i_data, - o_data, - bs, - oc, - oh, - ow, - ic, - ih, - iw, - w_data, - b_data, - param, - this->ctx_, - Ptype_out, - w_scale_.data()); - - // The output is materialized as int32 above for debug convenience; - // reinterpret it as int8 for the final output when required. - if (Ptype_out == PRECISION(kInt8)) param.output->mutable_data<int8_t>(); - return true; -} - -template class DirectConvInt8<PRECISION(kInt8)>; -template class DirectConvInt8<PRECISION(kFloat)>; -template class DirectConvInt8<PRECISION(kInt32)>; - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_direct.h b/lite/backends/arm/math/conv_direct.h deleted file mode 100644 index e6132dca5e..0000000000 --- a/lite/backends/arm/math/conv_direct.h +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/core/target_wrapper.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -class DirectConv : public ImplBase { - public: - typedef void (*conv_direct_impl)(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - Context* ctx); - - DirectConv() = default; - ~DirectConv() {} - - virtual bool init(const operators::ConvParam& param, - Context* ctx); - - virtual bool create(const operators::ConvParam& param, - Context* ctx); - - virtual bool run(const operators::ConvParam& param); - - protected: - bool is_weights_transed_{false}; - Tensor weights_trans_; - Tensor _tmp_out; - - private: - conv_direct_impl impl_{nullptr}; -}; - -template -class DirectConvInt8 - : public ImplBase { - public: - typedef void (*conv_direct_int8_impl)(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale); - - DirectConvInt8() = default; - ~DirectConvInt8() {} - - virtual bool init(const operators::ConvParam& param, - Context* ctx); - - virtual bool create(const operators::ConvParam& param, - Context* ctx); - - virtual bool run(const operators::ConvParam& param); - - private: - bool is_weights_transed_{false}; - Tensor weights_trans_; - Tensor _tmp_out; - conv_direct_int8_impl impl_int8_{nullptr}; - std::vector w_scale_; -}; - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_direct_3x3s1.cc b/lite/backends/arm/math/conv_direct_3x3s1.cc deleted file mode 100644 index 6991481ee1..0000000000 --- a/lite/backends/arm/math/conv_direct_3x3s1.cc +++ /dev/null @@ -1,1067 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
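conv_direct_3x3s1.cc, deleted below, sizes its output-row tile so that the packed input block plus one packed output block per thread stays within the L2 budget. A hedged restatement of that sizing rule as a standalone helper (the name pick_hout_r_block is ours; the kernel inlines this arithmetic):

```cpp
// Sketch of the row-tiling rule used by conv_3x3s1_direct_fp32 below:
// pick the largest hout_r_block (a multiple of the 2-row asm pass)
// such that
//   (hout_r_block + 2) * win_round * ic                  // packed input
// + hout_r_block * wout_round * hout_c_block * threads   // packed output
// <= l2_size (counted in floats).
static int pick_hout_r_block(int l2_size, int ic, int oh, int win_round,
                             int wout_round, int threads) {
  const int hout_c_block = 4;   // output channels packed per block
  const int hout_r_kernel = 2;  // output rows produced per asm pass
  int hr = (l2_size - 2 * win_round * ic) /
           (win_round * ic + hout_c_block * wout_round * threads);
  if (hr > oh) hr = oh;                        // no more rows than exist
  hr = (hr / hout_r_kernel) * hout_r_kernel;   // multiple of kernel rows
  if (hr < hout_r_kernel) hr = hout_r_kernel;  // at least one full pass
  return hr;
}
```

The denominator counts one packed input row plus the per-thread packed output it produces, so the division directly yields how many output rows fit; the two extra input rows in the numerator are the 3x3 halo.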
- -#include -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/operators/op_params.h" -#ifdef ARM_WITH_OMP -#include -#endif - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_3x3s1_direct_fp32(const float* i_data, - float* o_data, - int bs, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx) { - const int threads = ctx->threads(); - int l2_size = ctx->llc_size() / sizeof(float); - - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; - const int hout_c_block = 4; - const int hout_r_kernel = 2; - const int wout_block = 4; - const int wout_round = ((ow + wout_block - 1) / wout_block) * wout_block; - const int win_round = wout_round + 2; - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - // if (param.activation_param.has_active) { - // if (param.activation_param.active == Active_relu && - // fabs(param.activation_param.negative_slope) < 1e-6f) { - // flag_relu = true; - // } - // } - int hout_r_block = (l2_size - 2 * win_round * ic) / - (win_round * ic + hout_c_block * wout_round * threads); - hout_r_block = hout_r_block > oh ? oh : hout_r_block; - hout_r_block = (hout_r_block / hout_r_kernel) * hout_r_kernel; - hout_r_block = hout_r_block < hout_r_kernel ? hout_r_kernel : hout_r_block; - - const int hin_r_block = hout_r_block + 2; - - float* tmp_work_space = ctx->workspace_data(); - float ptr_zero[win_round]; // NOLINT - memset(ptr_zero, 0, sizeof(float) * win_round); - float ptr_write[wout_round]; // NOLINT - - int in_len = win_round * ic; - int pre_in_size = hin_r_block * in_len; - int pre_out_size = hout_c_block * hout_r_block * wout_round; - - float* pre_din = tmp_work_space; - - int size_in_channel = win * ih; - int size_out_channel = ow * oh; - int w_stride = ic * 9; // kernel_w * kernel_h; - int w_stride_chin = hout_c_block * 9; // kernel_w * kernel_h * - - int ws = -pad_w; - int we = ws + win_round; - int w_loop = wout_round / 4; - - int c_remain = oc - (oc / hout_c_block) * hout_c_block; - int c_round_down = (oc / hout_c_block) * hout_c_block; - - int out_row_stride = hout_c_block * wout_round; - for (int n = 0; n < bs; ++n) { - const float* din_batch = i_data + n * ic * size_in_channel; - float* dout_batch = o_data + n * oc * size_out_channel; - for (int h = 0; h < oh; h += hout_r_block) { - int h_kernel = hout_r_block; - if (h + hout_r_block > oh) { - h_kernel = oh - h; - } - int hs = h - pad_h; - int he = hs + h_kernel + 2; - prepack_input_nxw( - din_batch, pre_din, 0, ic, hs, he, ws, we, ic, win, ih, ptr_zero); -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < oc - (hout_c_block - 1); c += hout_c_block) { -#ifdef ARM_WITH_OMP - float* pre_out = - pre_din + pre_in_size + omp_get_thread_num() * pre_out_size; -#else - float* pre_out = pre_din + pre_in_size; -#endif - const float* block_inr0 = pre_din; - const float* block_inr1 = block_inr0 + in_len; - const float* block_inr2 = block_inr1 + in_len; - const float* block_inr3 = block_inr2 + in_len; - - const float* weight_c = weights + c * w_stride; - const float* bias_ptr = ptr_zero; - if (flag_bias) { - bias_ptr = bias + c; - } - fill_packed_biasc4( - pre_out, bias_ptr, wout_round * hout_c_block * h_kernel); - - for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) { - const float* wc0 = weight_c; - - const float* inr0 = 
block_inr0; - const float* inr1 = block_inr1; - const float* inr2 = block_inr2; - const float* inr3 = block_inr3; - - float* pre_out0 = pre_out + hk * out_row_stride; - float* pre_out1 = pre_out0 + out_row_stride; -#ifdef __aarch64__ - for (int i = 0; i < ic; ++i) { - float* ptr_out0 = pre_out0; - float* ptr_out1 = pre_out1; - - float32x4_t w0 = vld1q_f32(wc0); // w0, v23 - float32x4_t w1 = vld1q_f32(wc0 + 4); // w1, v24 - float32x4_t w2 = vld1q_f32(wc0 + 8); // w2, v25 - float32x4_t w3 = vld1q_f32(wc0 + 12); // w3, v26 - float32x4_t w4 = vld1q_f32(wc0 + 16); // w4, v27 - float32x4_t w5 = vld1q_f32(wc0 + 20); // w5, v28 - float32x4_t w6 = vld1q_f32(wc0 + 24); // w6, v29 - float32x4_t w7 = vld1q_f32(wc0 + 28); // w7, v30 - float32x4_t w8 = vld1q_f32(wc0 + 32); // w8, v31 - - const float* r0 = inr0; - const float* r1 = inr1; - const float* r2 = inr2; - const float* r3 = inr3; - - int cnt = w_loop; - asm volatile( - "ldp q15, q16, [%[ptr_out0]] \n" /* load outr00, - outr01*/ - "ldp q17, q18, [%[ptr_out0], #32]\n" /* load outr02, outr03*/ - "ldp q19, q20, [%[ptr_out1]] \n" /* load outr10, outr11*/ - "ldp q21, q22, [%[ptr_out1], #32]\n" /* load outr10, outr11*/ - "ldp q0, q1, [%[r0]], #16 \n" /* load input r0*/ - "ldp q2, q3, [%[r1]], #16 \n" /* load input r1*/ - "2: \n" /* main loop*/ - /* r0, r1, mul w0, get out r0, r1 */ - "fmla v15.4s , %[w0].4s, v0.s[0]\n" /* outr00 = w0 * r0[0]*/ - "fmla v16.4s , %[w0].4s, v0.s[1]\n" /* outr01 = w0 * r0[1]*/ - "fmla v17.4s , %[w0].4s, v0.s[2]\n" /* outr02 = w0 * r0[2]*/ - "fmla v18.4s , %[w0].4s, v0.s[3]\n" /* outr03 = w0 * r0[3]*/ - "fmla v19.4s , %[w0].4s, v2.s[0]\n" /* outr10 = w0 * r1[0]*/ - "fmla v20.4s , %[w0].4s, v2.s[1]\n" /* outr11 = w0 * r1[1]*/ - "fmla v21.4s , %[w0].4s, v2.s[2]\n" /* outr12 = w0 * r1[2]*/ - "fmla v22.4s , %[w0].4s, v2.s[3]\n" /* outr13 = w0 * r1[3]*/ - - /* r0, r1, mul w1, get out r0, r1 */ - "fmla v15.4s , %[w1].4s, v0.s[1]\n" /* outr00 = w1 * r0[1]*/ - "fmla v16.4s , %[w1].4s, v0.s[2]\n" /* outr01 = w1 * r0[2]*/ - "fmla v17.4s , %[w1].4s, v0.s[3]\n" /* outr02 = w1 * r0[3]*/ - "fmla v18.4s , %[w1].4s, v1.s[0]\n" /* outr03 = w1 * r0[4]*/ - "fmla v19.4s , %[w1].4s, v2.s[1]\n" /* outr10 = w1 * r1[1]*/ - "fmla v20.4s , %[w1].4s, v2.s[2]\n" /* outr11 = w1 * r1[2]*/ - "fmla v21.4s , %[w1].4s, v2.s[3]\n" /* outr12 = w1 * r1[3]*/ - "fmla v22.4s , %[w1].4s, v3.s[0]\n" /* outr13 = w1 * r1[4]*/ - - "ldp q4, q5, [%[r2]], #16 \n" /* load input r2*/ - - /* r0, r1, mul w2, get out r0, r1 */ - "fmla v15.4s , %[w2].4s, v0.s[2]\n" /* outr00 = w2 * r0[2]*/ - "fmla v16.4s , %[w2].4s, v0.s[3]\n" /* outr01 = w2 * r0[3]*/ - "fmla v17.4s , %[w2].4s, v1.s[0]\n" /* outr02 = w2 * r0[0]*/ - "fmla v18.4s , %[w2].4s, v1.s[1]\n" /* outr03 = w2 * r0[1]*/ - "fmla v19.4s , %[w2].4s, v2.s[2]\n" /* outr10 = w2 * r1[2]*/ - "fmla v20.4s , %[w2].4s, v2.s[3]\n" /* outr11 = w2 * r1[3]*/ - "fmla v21.4s , %[w2].4s, v3.s[0]\n" /* outr12 = w2 * r1[0]*/ - "fmla v22.4s , %[w2].4s, v3.s[1]\n" /* outr13 = w2 * r1[1]*/ - - /* r1, r2, mul w3, get out r0, r1 */ - "fmla v15.4s , %[w3].4s, v2.s[0]\n" /* outr00 = w3 * r1[0]*/ - "fmla v16.4s , %[w3].4s, v2.s[1]\n" /* outr01 = w3 * r1[1]*/ - "fmla v17.4s , %[w3].4s, v2.s[2]\n" /* outr02 = w3 * r1[2]*/ - "fmla v18.4s , %[w3].4s, v2.s[3]\n" /* outr03 = w3 * r1[3]*/ - "fmla v19.4s , %[w3].4s, v4.s[0]\n" /* outr10 = w3 * r2[0]*/ - "fmla v20.4s , %[w3].4s, v4.s[1]\n" /* outr11 = w3 * r2[1]*/ - "fmla v21.4s , %[w3].4s, v4.s[2]\n" /* outr12 = w3 * r2[2]*/ - "fmla v22.4s , %[w3].4s, v4.s[3]\n" /* outr13 = w3 * r2[3]*/ - - "ldp q0, q1, 
[%[r0]], #16 \n" /* load next input r0*/ - - /* r1, r2, mul w4, get out r0, r1 */ - "fmla v15.4s , %[w4].4s, v2.s[1]\n" /* outr00 = w4 * r1[1]*/ - "fmla v16.4s , %[w4].4s, v2.s[2]\n" /* outr01 = w4 * r1[2]*/ - "fmla v17.4s , %[w4].4s, v2.s[3]\n" /* outr02 = w4 * r1[3]*/ - "fmla v18.4s , %[w4].4s, v3.s[0]\n" /* outr03 = w4 * r1[4]*/ - "fmla v19.4s , %[w4].4s, v4.s[1]\n" /* outr10 = w4 * r2[1]*/ - "fmla v20.4s , %[w4].4s, v4.s[2]\n" /* outr11 = w4 * r2[2]*/ - "fmla v21.4s , %[w4].4s, v4.s[3]\n" /* outr12 = w4 * r2[3]*/ - "fmla v22.4s , %[w4].4s, v5.s[0]\n" /* outr13 = w4 * r2[4]*/ - - "ldp q6, q7, [%[r3]], #16 \n" /* load input r3*/ - - /* r1, r2, mul w5, get out r0, r1 */ - "fmla v15.4s , %[w5].4s, v2.s[2]\n" /* outr00 = w5 * r1[2]*/ - "fmla v16.4s , %[w5].4s, v2.s[3]\n" /* outr01 = w5 * r1[3]*/ - "fmla v17.4s , %[w5].4s, v3.s[0]\n" /* outr02 = w5 * r1[0]*/ - "fmla v18.4s , %[w5].4s, v3.s[1]\n" /* outr03 = w5 * r1[1]*/ - "fmla v19.4s , %[w5].4s, v4.s[2]\n" /* outr10 = w5 * r2[2]*/ - "fmla v20.4s , %[w5].4s, v4.s[3]\n" /* outr11 = w5 * r2[3]*/ - "fmla v21.4s , %[w5].4s, v5.s[0]\n" /* outr12 = w5 * r2[0]*/ - "fmla v22.4s , %[w5].4s, v5.s[1]\n" /* outr13 = w5 * r2[1]*/ - - /* r2, r3, mul w6, get out r0, r1 */ - "fmla v15.4s , %[w6].4s, v4.s[0]\n" /* outr00 = w6 * r2[0]*/ - "fmla v16.4s , %[w6].4s, v4.s[1]\n" /* outr01 = w6 * r2[1]*/ - "fmla v17.4s , %[w6].4s, v4.s[2]\n" /* outr02 = w6 * r2[2]*/ - "fmla v18.4s , %[w6].4s, v4.s[3]\n" /* outr03 = w6 * r2[3]*/ - "fmla v19.4s , %[w6].4s, v6.s[0]\n" /* outr10 = w6 * r3[0]*/ - "fmla v20.4s , %[w6].4s, v6.s[1]\n" /* outr11 = w6 * r3[1]*/ - "fmla v21.4s , %[w6].4s, v6.s[2]\n" /* outr12 = w6 * r3[2]*/ - "fmla v22.4s , %[w6].4s, v6.s[3]\n" /* outr13 = w6 * r3[3]*/ - - "ldp q2, q3, [%[r1]], #16 \n" /* load next input r1*/ - - /* r2, r3, mul w7, get out r0, r1 */ - "fmla v15.4s , %[w7].4s, v4.s[1]\n" /* outr00 = w7 * r2[1]*/ - "fmla v16.4s , %[w7].4s, v4.s[2]\n" /* outr01 = w7 * r2[2]*/ - "fmla v17.4s , %[w7].4s, v4.s[3]\n" /* outr02 = w7 * r2[3]*/ - "fmla v18.4s , %[w7].4s, v5.s[0]\n" /* outr03 = w7 * r2[4]*/ - "fmla v19.4s , %[w7].4s, v6.s[1]\n" /* outr10 = w7 * r3[1]*/ - "fmla v20.4s , %[w7].4s, v6.s[2]\n" /* outr11 = w7 * r3[2]*/ - "fmla v21.4s , %[w7].4s, v6.s[3]\n" /* outr12 = w7 * r3[3]*/ - "fmla v22.4s , %[w7].4s, v7.s[0]\n" /* outr13 = w7 * r3[4]*/ - - "subs %w[cnt], %w[cnt], #1 \n" /*loop count -1*/ - - /* r2, r3, mul w8, get out r0, r1 */ - "fmla v15.4s , %[w8].4s, v4.s[2]\n" /* outr00 = w8 * r2[2]*/ - "fmla v16.4s , %[w8].4s, v4.s[3]\n" /* outr01 = w8 * r2[3]*/ - "fmla v17.4s , %[w8].4s, v5.s[0]\n" /* outr02 = w8 * r2[0]*/ - "fmla v18.4s , %[w8].4s, v5.s[1]\n" /* outr03 = w8 * r2[1]*/ - - "stp q15, q16, [%[ptr_out0]], #32\n" /* save outr00, outr01*/ - "fmla v19.4s , %[w8].4s, v6.s[2]\n" /* outr10 = w8 * r3[2]*/ - "stp q17, q18, [%[ptr_out0]], #32\n" /* save outr02, outr03*/ - "fmla v20.4s , %[w8].4s, v6.s[3]\n" /* outr11 = w8 * r3[3]*/ - "ldp q15, q16, [%[ptr_out0]] \n" /* load outr00, outr01*/ - "fmla v21.4s , %[w8].4s, v7.s[0]\n" /* outr12 = w8 * r3[0]*/ - "ldp q17, q18, [%[ptr_out0], #32]\n" /* load outr02, outr03*/ - "fmla v22.4s , %[w8].4s, v7.s[1]\n" /* outr13 = w8 * r3[1]*/ - "stp q19, q20, [%[ptr_out1]], #32\n" /* save outr10, outr11*/ - "stp q21, q22, [%[ptr_out1]], #32\n" /* save outr12, outr13*/ - "ldp q19, q20, [%[ptr_out1]] \n" /* load outr10, outr11*/ - "ldp q21, q22, [%[ptr_out1], #32]\n" /* load outr12, outr13*/ - "bne 2b \n" /* jump to main loop*/ - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] 
"+r"(r3), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [w7] "w"(w7), - [w8] "w"(w8) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); - - wc0 += 9 * hout_c_block; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - } -#else // not __aarch64__ - for (int i = 0; i < ic; ++i) { - const float* wc0 = weight_c + i * w_stride_chin; - - float* ptr_out0 = pre_out0; - float* ptr_out1 = pre_out1; - - const float* r0 = inr0; - const float* r1 = inr1; - const float* r2 = inr2; - const float* r3 = inr3; - - int cnt = w_loop; - asm volatile( - "vld1.32 {d16-d19}, [%[ptr_out0]]! @ " - "load outr0, w0, w1, c0~c3\n" - "vld1.32 {d20-d23}, [%[ptr_out0]] @ load " - "outr0, w2, w3, c0~c3\n" - - /* load weights */ - "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, " - "w1, to q5, q6\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, " - "to q7\n" - - /* load r0, r1 */ - "vld1.32 {d0-d1}, [%[r0]]! @ load r0, " - "4 float\n" - "vld1.32 {d2}, [%[r0]] @ load r0, " - "2 float\n" - - "sub %[ptr_out0], %[ptr_out0], #32 @ ptr_out0 " - "- 32, to start address\n" - - /* main loop */ - "0: @ main " - "loop\n" - /* mul r0 with w0, w1, w2, get out r0 */ - "vld1.32 {d24-d27}, [%[ptr_out1]]! @ load " - "outr1, w0, w1, c0~c3\n" - "vmla.f32 q8, q5, d0[0] @ w0 * " - "inr00\n" - "vld1.32 {d28-d31}, [%[ptr_out1]] @ load " - "outr1, w2, w3, c0~c3\n" - "vmla.f32 q9, q5, d0[1] @ w0 * " - "inr01\n" - "vmla.f32 q10, q5, d1[0] @ w0 * " - "inr02\n" - "vmla.f32 q11, q5, d1[1] @ w0 * " - "inr03\n" - "vld1.32 {d3-d4}, [%[r1]]! @ load r1, " - "4 float\n" - "vmla.f32 q8, q6, d0[1] @ w1 * " - "inr01\n" - "vmla.f32 q9, q6, d1[0] @ w1 * " - "inr02\n" - "vmla.f32 q10, q6, d1[1] @ w1 * " - "inr03\n" - "vmla.f32 q11, q6, d2[0] @ w1 * " - "inr04\n" - "vld1.32 {d5}, [%[r1]] @ load r0, " - "2 float\n" - "vmla.f32 q8, q7, d1[0] @ w2 * " - "inr02\n" - "vmla.f32 q9, q7, d1[1] @ w2 * " - "inr03\n" - "vmla.f32 q10, q7, d2[0] @ w2 * " - "inr04\n" - "vmla.f32 q11, q7, d2[1] @ w2 * " - "inr05\n" - - "sub %[ptr_out1], %[ptr_out1], #32 @ ptr_out1 " - "- 32, to start address\n" - - /* mul r1 with w0, w1, w2, get out r1 */ - "vmla.f32 q12, q5, d3[0] @ w0 * " - "inr10\n" - "vmla.f32 q13, q5, d3[1] @ w0 * " - "inr11\n" - "vmla.f32 q14, q5, d4[0] @ w0 * " - "inr12\n" - "vmla.f32 q15, q5, d4[1] @ w0 * " - "inr13\n" - "vmla.f32 q12, q6, d3[1] @ w1 * " - "inr11\n" - "vmla.f32 q13, q6, d4[0] @ w1 * " - "inr12\n" - "vmla.f32 q14, q6, d4[1] @ w1 * " - "inr13\n" - "vmla.f32 q15, q6, d5[0] @ w1 * " - "inr14\n" - "vld1.32 {d10-d13}, [%[wc0]]! @ load w3, " - "w4, to q5, q6\n" - "vmla.f32 q12, q7, d4[0] @ w2 * " - "inr12\n" - "vmla.f32 q13, q7, d4[1] @ w2 * " - "inr13\n" - "vmla.f32 q14, q7, d5[0] @ w2 * " - "inr14\n" - "vmla.f32 q15, q7, d5[1] @ w2 * " - "inr15\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w5, " - "to q7\n" - - /* mul r1 with w3, w4, w5, get out r0 */ - "vmla.f32 q8, q5, d3[0] @ w3 * " - "inr10\n" - "vmla.f32 q9, q5, d3[1] @ w3 * " - "inr11\n" - "vmla.f32 q10, q5, d4[0] @ w3 * " - "inr12\n" - "vmla.f32 q11, q5, d4[1] @ w3 * " - "inr13\n" - "vld1.32 {d0-d1}, [%[r2]]! 
@ load r2, " - "4 float\n" - "vmla.f32 q8, q6, d3[1] @ w4 * " - "inr11\n" - "vmla.f32 q9, q6, d4[0] @ w4 * " - "inr12\n" - "vmla.f32 q10, q6, d4[1] @ w4 * " - "inr13\n" - "vmla.f32 q11, q6, d5[0] @ w4 * " - "inr14\n" - "vld1.32 {d2}, [%[r2]] @ load r2, " - "2 float\n" - "vmla.f32 q8, q7, d4[0] @ w5 * " - "inr12\n" - "vmla.f32 q9, q7, d4[1] @ w5 * " - "inr13\n" - "vmla.f32 q10, q7, d5[0] @ w5 * " - "inr14\n" - "vmla.f32 q11, q7, d5[1] @ w5 * " - "inr15\n" - - /* mul r2 with w3, w4, w5, get out r1 */ - "vmla.f32 q12, q5, d0[0] @ w3 * " - "inr20\n" - "vmla.f32 q13, q5, d0[1] @ w3 * " - "inr21\n" - "vmla.f32 q14, q5, d1[0] @ w3 * " - "inr22\n" - "vmla.f32 q15, q5, d1[1] @ w3 * " - "inr23\n" - "vmla.f32 q12, q6, d0[1] @ w4 * " - "inr21\n" - "vmla.f32 q13, q6, d1[0] @ w4 * " - "inr22\n" - "vmla.f32 q14, q6, d1[1] @ w4 * " - "inr23\n" - "vmla.f32 q15, q6, d2[0] @ w4 * " - "inr24\n" - "vld1.32 {d10-d13}, [%[wc0]]! @ load w6, " - "w7, to q5, q6\n" - "vmla.f32 q12, q7, d1[0] @ w5 * " - "inr22\n" - "vmla.f32 q13, q7, d1[1] @ w5 * " - "inr23\n" - "vmla.f32 q14, q7, d2[0] @ w5 * " - "inr24\n" - "vmla.f32 q15, q7, d2[1] @ w5 * " - "inr25\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w8, " - "to q7\n" - - "sub %[wc0], %[wc0], #144 @ wc0 - " - "144 to start address\n" - - /* mul r2 with w6, w7, w8, get out r0 */ - "vmla.f32 q8, q5, d0[0] @ w6 * " - "inr20\n" - "vmla.f32 q9, q5, d0[1] @ w6 * " - "inr21\n" - "vld1.32 {d3-d4}, [%[r3]]! @ load r3, " - "4 float\n" - "vmla.f32 q10, q5, d1[0] @ w6 * " - "inr22\n" - "vmla.f32 q11, q5, d1[1] @ w6 * " - "inr23\n" - "vmla.f32 q8, q6, d0[1] @ w7 * " - "inr21\n" - "vmla.f32 q9, q6, d1[0] @ w7 * " - "inr22\n" - "vld1.32 {d5}, [%[r3]] @ load r3, " - "2 float\n" - "vmla.f32 q10, q6, d1[1] @ w7 * " - "inr23\n" - "vmla.f32 q11, q6, d2[0] @ w7 * " - "inr24\n" - "vmla.f32 q8, q7, d1[0] @ w8 * " - "inr22\n" - "vmla.f32 q9, q7, d1[1] @ w8 * " - "inr23\n" - "vld1.32 {d0-d1}, [%[r0]]! @ load r0, " - "4 float\n" - "vmla.f32 q10, q7, d2[0] @ w8 * " - "inr24\n" - "vmla.f32 q11, q7, d2[1] @ w8 * " - "inr25\n" - "vld1.32 {d2}, [%[r0]] @ load r0, " - "2 float\n" - - /* mul r3 with w6, w7, w8, get out r1 */ - "vmla.f32 q12, q5, d3[0] @ w6 * " - "inr20\n" - "vmla.f32 q13, q5, d3[1] @ w6 * " - "inr21\n" - "vst1.32 {d16-d19}, [%[ptr_out0]]! @ save " - "r00, r01, c0~c3\n" - "vmla.f32 q14, q5, d4[0] @ w6 * " - "inr22\n" - "vmla.f32 q15, q5, d4[1] @ w6 * " - "inr23\n" - "vst1.32 {d20-d23}, [%[ptr_out0]]! @ save " - "r02, r03, c0~c3\n" - "vmla.f32 q12, q6, d3[1] @ w7 * " - "inr21\n" - "vmla.f32 q13, q6, d4[0] @ w7 * " - "inr22\n" - "vld1.32 {d16-d19}, [%[ptr_out0]]! @ load " - "outr0, w0, w1, c0~c3\n" - "vmla.f32 q14, q6, d4[1] @ w7 * " - "inr23\n" - "vmla.f32 q15, q6, d5[0] @ w7 * " - "inr24\n" - "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, " - "w1, to q5, q6\n" - "vmla.f32 q12, q7, d4[0] @ w8 * " - "inr22\n" - "vmla.f32 q13, q7, d4[1] @ w8 * " - "inr23\n" - "vld1.32 {d20-d23}, [%[ptr_out0]] @ load " - "outr0, w2, w3, c0~c3\n" - "vmla.f32 q14, q7, d5[0] @ w8 * " - "inr24\n" - "vmla.f32 q15, q7, d5[1] @ w8 * " - "inr25\n" - - "vst1.32 {d24-d27}, [%[ptr_out1]]! @ save " - "r10, r11, c0~c3\n" - "vst1.32 {d28-d31}, [%[ptr_out1]]! @ save " - "r12, r13, c0~c3\n" - "vld1.32 {d14-d15}, [%[wc0]]! 
@ load w2, " - "to q7\n" - - "sub %[ptr_out0], %[ptr_out0], #32 @ ptr_out0 " - "- 32, to start address\n" - - "subs %[cnt], #1 @ loop " - "count--\n" - "bne 0b @ jump to " - "main loop\n" - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1), - [wc0] "+r"(wc0) - : - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - } -#endif // __aarch64__ - block_inr0 = block_inr2; - block_inr1 = block_inr3; - block_inr2 = block_inr1 + in_len; - block_inr3 = block_inr2 + in_len; - } - write_to_output_c4_fp32(pre_out, - dout_batch, - c, - c + hout_c_block, - h, - h + h_kernel, - 0, - wout_round, - oc, - oh, - ow, - flag_relu, - ptr_write); - } - const float* weight_remain_ptr = weights + c_round_down * w_stride; -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < c_remain; ++c) { -#ifdef ARM_WITH_OMP - float* pre_out = - pre_din + pre_in_size + omp_get_thread_num() * pre_out_size; -#else - float* pre_out = pre_din + pre_in_size; -#endif - - int c_idx = c_round_down + c; - - int h_kernel = hout_r_block; - if (h + hout_r_block > oh) { - h_kernel = oh - h; - } - - const float* block_inr0 = pre_din; - const float* block_inr1 = block_inr0 + in_len; - const float* block_inr2 = block_inr1 + in_len; - const float* block_inr3 = block_inr2 + in_len; - - const float* bias_ptr = ptr_zero; - if (flag_bias) { - bias_ptr = bias + c_idx; - } - fill_bias(pre_out, bias_ptr, 1, wout_round * h_kernel); - - for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) { - const float* wc0 = weight_remain_ptr; - - const float* inr0 = block_inr0; - const float* inr1 = block_inr1; - const float* inr2 = block_inr2; - const float* inr3 = block_inr3; - - float* pre_out0 = pre_out + hk * wout_round; - float* pre_out1 = pre_out0 + wout_round; -#ifdef __aarch64__ - for (int i = 0; i < ic; ++i) { - float* ptr_out0 = pre_out0; - float* ptr_out1 = pre_out1; - - float32x4_t w0 = vdupq_n_f32(wc0[c]); // w0, v23 - float32x4_t w1 = vdupq_n_f32(wc0[4 + c]); // w1, v24 - float32x4_t w2 = vdupq_n_f32(wc0[8 + c]); // w2, v25 - float32x4_t w3 = vdupq_n_f32(wc0[12 + c]); // w3, v26 - float32x4_t w4 = vdupq_n_f32(wc0[16 + c]); // w4, v27 - float32x4_t w5 = vdupq_n_f32(wc0[20 + c]); // w5, v28 - float32x4_t w6 = vdupq_n_f32(wc0[24 + c]); // w6, v29 - float32x4_t w7 = vdupq_n_f32(wc0[28 + c]); // w7, v30 - float32x4_t w8 = vdupq_n_f32(wc0[32 + c]); // w8, v31 - - const float* r0 = inr0; - const float* r1 = inr1; - const float* r2 = inr2; - const float* r3 = inr3; - - int cnt = w_loop; - asm volatile( - "ldr q21, [%[ptr_out0]] \n" /* load outr0, - w0~w3*/ - "ldr q22, [%[ptr_out1]] \n" /* load outr1, w0~w3*/ - "ldp q0, q1, [%[r0]], #16 \n" /* load input r0*/ - "ldp q2, q3, [%[r1]], #16 \n" /* load input r1*/ - "ldp q4, q5, [%[r2]], #16 \n" /* load input r2*/ - "ldp q6, q7, [%[r3]], #16 \n" /* load input r3*/ - "2: \n" /* main loop*/ - - "fmla v21.4s , %[w0].4s, v0.4s \n" /* outr0 = w0 * r0*/ - "fmla v22.4s , %[w0].4s, v2.4s \n" /* outr1 = w0 * r1*/ - - "ext v8.16b, v0.16b, v1.16b, #4 \n" /* shift r0 left 1*/ - "ext v10.16b, v2.16b, v3.16b, #4 \n" /* shift r1 left 1*/ - "ext v9.16b, v0.16b, v1.16b, #8 \n" /* shift r0 left 2*/ - "ext v11.16b, v2.16b, v3.16b, #8 \n" /* shift r1 left 2*/ - - "ldp q0, q1, [%[r0]], #16 \n" /* load input r0*/ - - "fmla v21.4s , %[w1].4s, 
v8.4s \n" /* outr0 = w1 * r1*/ - "fmla v22.4s , %[w1].4s, v10.4s \n" /* outr1 = w1 * r2*/ - - "fmla v21.4s , %[w2].4s, v9.4s \n" /* outr0 = w2 * r1*/ - "fmla v22.4s , %[w2].4s, v11.4s \n" /* outr1 = w2 * r2*/ - - "fmla v21.4s , %[w3].4s, v2.4s \n" /* outr0 = w3 * r1*/ - "fmla v22.4s , %[w3].4s, v4.4s \n" /* outr1 = w3 * r2*/ - - "ext v12.16b, v4.16b, v5.16b, #4\n" /* shift r2 left 1*/ - "ext v14.16b, v6.16b, v7.16b, #4\n" /* shift r3 left 1*/ - "ext v13.16b, v4.16b, v5.16b, #8\n" /* shift r2 left 2*/ - "ext v15.16b, v6.16b, v7.16b, #8\n" /* shift r3 left 2*/ - - "fmla v21.4s , %[w4].4s, v10.4s \n" /* outr0 = w4 * r1*/ - "fmla v22.4s , %[w4].4s, v12.4s \n" /* outr1 = w4 * r2*/ - - "fmla v21.4s , %[w5].4s, v11.4s \n" /* outr0 = w5 * r1*/ - "fmla v22.4s , %[w5].4s, v13.4s \n" /* outr1 = w5 * r2*/ - - "ldp q2, q3, [%[r1]], #16 \n" /* load input r0*/ - - "fmla v21.4s , %[w6].4s, v4.4s \n" /* outr0 = w6 * r2*/ - "fmla v22.4s , %[w6].4s, v6.4s \n" /* outr1 = w6 * r3*/ - - "ldp q4, q5, [%[r2]], #16 \n" /* load input r2*/ - - "fmla v21.4s , %[w7].4s, v12.4s \n" /* outr0 = w7 * r1*/ - "fmla v22.4s , %[w7].4s, v14.4s \n" /* outr1 = w7 * r2*/ - - "ldp q6, q7, [%[r3]], #16 \n" /* load input r3*/ - - "fmla v21.4s , %[w8].4s, v13.4s \n" /* outr0 = w8 * r1*/ - "fmla v22.4s , %[w8].4s, v15.4s \n" /* outr1 = w8 * r2*/ - - "str q21, [%[ptr_out0]], #16 \n" /*write output r0*/ - "str q22, [%[ptr_out1]], #16 \n" /*write output r1*/ - - "subs %w[cnt], %w[cnt], #1 \n" /*loop count -1*/ - - "ldr q21, [%[ptr_out0]] \n" /* load outr0, w0~w3*/ - "ldr q22, [%[ptr_out1]] \n" /* load outr1, w0~w3*/ - - "bne 2b \n" /* jump to main loop*/ - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [w7] "w"(w7), - [w8] "w"(w8) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v21", - "v22"); - - wc0 += 9 * hout_c_block; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - } -#else // not __aarch64__ - for (int i = 0; i < ic; ++i) { - float* ptr_out0 = pre_out0; - float* ptr_out1 = pre_out1; - - //! get valid weights of current output channel - float w_tmp[10] = {wc0[c], - wc0[c + 4], - wc0[c + 8], - wc0[c + 12], - wc0[c + 16], - wc0[c + 20], - wc0[c + 24], - wc0[c + 28], - wc0[c + 32], - 0.f}; - float32x4_t w0 = vld1q_f32(w_tmp); // w0, w1, w2, q0 - float32x4_t w1 = vld1q_f32(w_tmp + 3); // w3, w4, w5, q1 - float32x4_t w2 = vld1q_f32(w_tmp + 6); // w6, w7, w8, q2 - - const float* r0 = inr0; - const float* r1 = inr1; - const float* r2 = inr2; - const float* r3 = inr3; - int cnt = w_loop / 2; - if (cnt > 0) { - asm volatile( - "vld1.32 {d24-d27}, [%[ptr_out0]] @ " - "load or00, or01\n" - "vld1.32 {d6-d9}, [%[r0]]! 
@ load r0, 8 " - "float\n" - "vld1.32 {d10}, [%[r0]] @ load r0, 2 " - "float\n" - /* main loop */ - "0: @ main loop\n" - /* r0 * w0, w1, w2, get out r0*/ - "vld1.32 {d28-d31}, [%[ptr_out1]] @ load or10, " - "or11\n" - "vext.32 q8, q3, q4, #1 @ r0, shift " - "left 1, get 1, 2, 3, 4\n" - "vext.32 q9, q4, q5, #1 @ r0, shift " - "left 1, get 5, 6, 7, 8\n" - "vmla.f32 q12, q3, %e[w0][0] @ w00 * r0, " - "0, 1, 2, 3\n" - "vmla.f32 q13, q4, %e[w0][0] @ w00 * r0, " - "4, 5, 6, 7\n" - "vext.32 q10, q3, q4, #2 @ r0, shift " - "left 2, get 2, 3, 4, 5\n" - "vext.32 q11, q4, q5, #2 @ r0, shift " - "left 2, get 6, 7, 8, 9\n" - "vmla.f32 q12, q8, %e[w0][1] @ w01 * r0, " - "1, 2, 3, 4\n" - "vmla.f32 q13, q9, %e[w0][1] @ w01 * r0, " - "5, 6, 7, 8\n" - "vld1.32 {d6-d9}, [%[r1]]! @ load r1, 8 " - "float\n" - "vmla.f32 q12, q10, %f[w0][0] @ w02 * r0, " - "2, 3, 4, 5\n" - "vmla.f32 q13, q11, %f[w0][0] @ w02 * r0, " - "6, 7, 8, 9\n" - "vld1.32 {d10}, [%[r1]] @ load r1, 2 " - "float\n" - - /* r1 * w3, w4, w5, get out r0*/ - /* r1 * w0, w1, w2, get out r1*/ - "vmla.f32 q12, q3, %e[w1][0] @ w10 * r1, " - "0, 1, 2, 3\n" - "vmla.f32 q13, q4, %e[w1][0] @ w10 * r1, " - "4, 5, 6, 7\n" - "vext.32 q8, q3, q4, #1 @ r1, shift " - "left 1, get 1, 2, 3, 4\n" - "vext.32 q9, q4, q5, #1 @ r1, shift " - "left 1, get 5, 6, 7, 8\n" - "vmla.f32 q14, q3, %e[w0][0] @ w00 * r1, " - "0, 1, 2, 3\n" - "vmla.f32 q15, q4, %e[w0][0] @ w00 * r1, " - "4, 5, 6, 7\n" - "vext.32 q10, q3, q4, #2 @ r1, shift " - "left 2, get 2, 3, 4, 5\n" - "vext.32 q11, q4, q5, #2 @ r1, shift " - "left 2, get 6, 7, 8, 9\n" - "vmla.f32 q12, q8, %e[w1][1] @ w11 * r1, " - "1, 2, 3, 4\n" - "vmla.f32 q13, q9, %e[w1][1] @ w11 * r1, " - "5, 6, 7, 8\n" - "vmla.f32 q14, q8, %e[w0][1] @ w01 * r1, " - "1, 2, 3, 4\n" - "vmla.f32 q15, q9, %e[w0][1] @ w01 * r1, " - "5, 6, 7, 8\n" - "vld1.32 {d6-d9}, [%[r2]]! @ load r2, 8 " - "float\n" - "vmla.f32 q12, q10, %f[w1][0] @ w12 * r1, " - "2, 3, 4, 5\n" - "vmla.f32 q13, q11, %f[w1][0] @ w12 * r1, " - "6, 7, 8, 9\n" - "vmla.f32 q14, q10, %f[w0][0] @ w02 * r1, " - "2, 3, 4, 5\n" - "vmla.f32 q15, q11, %f[w0][0] @ w02 * r1, " - "6, 7, 8, 9\n" - "vld1.32 {d10}, [%[r2]] @ load r2, 2 " - "float\n" - - /* r2 * w6, w7, w8, get out r0*/ - /* r2 * w3, w4, w5, get out r1*/ - "vmla.f32 q12, q3, %e[w2][0] @ w20 * r2, " - "0, 1, 2, 3\n" - "vmla.f32 q13, q4, %e[w2][0] @ w20 * r2, " - "4, 5, 6, 7\n" - "vext.32 q8, q3, q4, #1 @ r2, shift " - "left 1, get 1, 2, 3, 4\n" - "vext.32 q9, q4, q5, #1 @ r2, shift " - "left 1, get 5, 6, 7, 8\n" - "vmla.f32 q14, q3, %e[w1][0] @ w10 * r2, " - "0, 1, 2, 3\n" - "vmla.f32 q15, q4, %e[w1][0] @ w10 * r2, " - "4, 5, 6, 7\n" - "vext.32 q10, q3, q4, #2 @ r2, shift " - "left 2, get 2, 3, 4, 5\n" - "vext.32 q11, q4, q5, #2 @ r2, shift " - "left 2, get 6, 7, 8, 9\n" - "vmla.f32 q12, q8, %e[w2][1] @ w21 * r2, " - "1, 2, 3, 4\n" - "vmla.f32 q13, q9, %e[w2][1] @ w21 * r2, " - "5, 6, 7, 8\n" - "vmla.f32 q14, q8, %e[w1][1] @ w11 * r2, " - "1, 2, 3, 4\n" - "vmla.f32 q15, q9, %e[w1][1] @ w11 * r2, " - "5, 6, 7, 8\n" - "vld1.32 {d6-d9}, [%[r3]]! 
@ load r3, 8 " - "float\n" - "vmla.f32 q12, q10, %f[w2][0] @ w22 * r2, " - "2, 3, 4, 5\n" - "vmla.f32 q13, q11, %f[w2][0] @ w22 * r2, " - "6, 7, 8, 9\n" - "vmla.f32 q14, q10, %f[w1][0] @ w12 * r2, " - "2, 3, 4, 5\n" - "vmla.f32 q15, q11, %f[w1][0] @ w12 * r2, " - "6, 7, 8, 9\n" - "vld1.32 {d10}, [%[r3]] @ load r3, 2 " - "float\n" - - /* r3 * w6, w7, w8, get out r1*/ - "vext.32 q8, q3, q4, #1 @ r3, shift " - "left 1, get 1, 2, 3, 4\n" - "vext.32 q9, q4, q5, #1 @ r3, shift " - "left 1, get 5, 6, 7, 8\n" - "vmla.f32 q14, q3, %e[w2][0] @ w20 * r3, " - "0, 1, 2, 3\n" - "vmla.f32 q15, q4, %e[w2][0] @ w20 * r3, " - "4, 5, 6, 7\n" - "vst1.32 {d24-d27}, [%[ptr_out0]]! @ save or00, " - "or01\n" - "vext.32 q10, q3, q4, #2 @ r3, shift " - "left 2, get 2, 3, 4, 5\n" - "vext.32 q11, q4, q5, #2 @ r3, shift " - "left 2, get 6, 7, 8, 9\n" - "vmla.f32 q14, q8, %e[w2][1] @ w21 * r3, " - "0, 1, 2, 3\n" - "vmla.f32 q15, q9, %e[w2][1] @ w21 * r3, " - "4, 5, 6, 7\n" - "vld1.32 {d24-d27}, [%[ptr_out0]] @ load or00, " - "or01\n" - "vld1.32 {d6-d9}, [%[r0]]! @ load r3, 8 " - "float\n" - "vmla.f32 q14, q10, %f[w2][0] @ w22 * r3, " - "2, 3, 4, 5\n" - "vmla.f32 q15, q11, %f[w2][0] @ w22 * r3, " - "6, 7, 8, 9\n" - "vld1.32 {d10}, [%[r0]] @ load r0, 2 " - "float\n" - "vst1.32 {d28-d31}, [%[ptr_out1]]! @ save or10, " - "or11\n" - - "subs %[cnt], #1 @loop count " - "-1\n" - "bne 0b @ jump to " - "main loop\n" - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : [w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - r0 -= 8; - } - //! deal with remain ow - if (w_loop & 1) { - ptr_out0[0] += - r0[0] * w_tmp[0] + r0[1] * w_tmp[1] + r0[2] * w_tmp[2] + - r1[0] * w_tmp[3] + r1[1] * w_tmp[4] + r1[2] * w_tmp[5] + - r2[0] * w_tmp[6] + r2[1] * w_tmp[7] + r2[2] * w_tmp[8]; - - ptr_out0[1] += - r0[1] * w_tmp[0] + r0[2] * w_tmp[1] + r0[3] * w_tmp[2] + - r1[1] * w_tmp[3] + r1[2] * w_tmp[4] + r1[3] * w_tmp[5] + - r2[1] * w_tmp[6] + r2[2] * w_tmp[7] + r2[3] * w_tmp[8]; - - ptr_out0[2] += - r0[2] * w_tmp[0] + r0[3] * w_tmp[1] + r0[4] * w_tmp[2] + - r1[2] * w_tmp[3] + r1[3] * w_tmp[4] + r1[4] * w_tmp[5] + - r2[2] * w_tmp[6] + r2[3] * w_tmp[7] + r2[4] * w_tmp[8]; - - ptr_out0[3] += - r0[3] * w_tmp[0] + r0[4] * w_tmp[1] + r0[5] * w_tmp[2] + - r1[3] * w_tmp[3] + r1[4] * w_tmp[4] + r1[5] * w_tmp[5] + - r2[3] * w_tmp[6] + r2[4] * w_tmp[7] + r2[5] * w_tmp[8]; - - ptr_out1[0] += - r1[0] * w_tmp[0] + r1[1] * w_tmp[1] + r1[2] * w_tmp[2] + - r2[0] * w_tmp[3] + r2[1] * w_tmp[4] + r2[2] * w_tmp[5] + - r3[0] * w_tmp[6] + r3[1] * w_tmp[7] + r3[2] * w_tmp[8]; - - ptr_out1[1] += - r1[1] * w_tmp[0] + r1[2] * w_tmp[1] + r1[3] * w_tmp[2] + - r2[1] * w_tmp[3] + r2[2] * w_tmp[4] + r2[3] * w_tmp[5] + - r3[1] * w_tmp[6] + r3[2] * w_tmp[7] + r3[3] * w_tmp[8]; - - ptr_out1[2] += - r1[2] * w_tmp[0] + r1[3] * w_tmp[1] + r1[4] * w_tmp[2] + - r2[2] * w_tmp[3] + r2[3] * w_tmp[4] + r2[4] * w_tmp[5] + - r3[2] * w_tmp[6] + r3[3] * w_tmp[7] + r3[4] * w_tmp[8]; - - ptr_out1[3] += - r1[3] * w_tmp[0] + r1[4] * w_tmp[1] + r1[5] * w_tmp[2] + - r2[3] * w_tmp[3] + r2[4] * w_tmp[4] + r2[5] * w_tmp[5] + - r3[3] * w_tmp[6] + r3[4] * w_tmp[7] + r3[5] * w_tmp[8]; - } - - wc0 += 36; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - } -#endif // __aarch64__ - block_inr0 = block_inr2; - block_inr1 = block_inr3; - block_inr2 = block_inr1 
+ in_len; - block_inr3 = block_inr2 + in_len; - } - write_to_output_c1_fp32(pre_out, - dout_batch, - c_idx, - c_idx + 1, - h, - h + h_kernel, - 0, - wout_round, - oc, - oh, - ow, - flag_relu, - ptr_write); - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_direct_3x3s2.cc b/lite/backends/arm/math/conv_direct_3x3s2.cc deleted file mode 100644 index 4bc9c5d25b..0000000000 --- a/lite/backends/arm/math/conv_direct_3x3s2.cc +++ /dev/null @@ -1,1209 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#ifdef ARM_WITH_OMP -#include -#endif - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_3x3s2_direct_fp32(const float* i_data, - float* o_data, - int bs, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx) { - //! 3x3s2 convolution, implemented by direct algorithm - //! prepack input to tmp buffer - //! write output to tmp buffer - const int threads = ctx->threads(); - int l2_size = ctx->llc_size() / sizeof(float); - const int pad_w = param.paddings[1]; - const int pad_h = param.paddings[0]; - const int hout_c_block = 4; - const int hout_r_kernel = 2; - const int wout_block = 4; - const int wout_round = ((ow + wout_block - 1) / wout_block) * wout_block; - const int win_round = wout_round * 2 /*stride_w*/ + 1; - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - // if (param.activation_param.has_active) { - // if (param.activation_param.active == Active_relu && - // fabs(param.activation_param.negative_slope) < 1e-6f) { - // flag_relu = true; - // } - // } - //! get h block - //! win_round * ic * hin_r_block + wout_round * hout_c_block * hout_r_block - //! * threads = l2_size - //! win_round = 2 * wout_round + 1 - //! hin_r_block = 2 * hout_r_block + 1 - int hout_r_block = - (l2_size - 2 * wout_round * ic - ic) / - ((4 * wout_round + 2) * ic + wout_round * hout_c_block * threads); - hout_r_block = hout_r_block > oh ? oh : hout_r_block; - hout_r_block = (hout_r_block / hout_r_kernel) * hout_r_kernel; - hout_r_block = hout_r_block < hout_r_kernel ? hout_r_kernel : hout_r_block; - - const int hin_r_block = hout_r_block * 2 /*stride_h*/ + 1; - - float* tmp_work_space = ctx->workspace_data(); - float ptr_zero[win_round]; // NOLINT - memset(ptr_zero, 0, sizeof(float) * win_round); - float ptr_write[wout_round]; // NOLINT - - int in_len = win_round * ic; - int pre_in_size = hin_r_block * in_len; - int pre_out_size = hout_c_block * hout_r_block * wout_round; - - //! 
l2_cache start - float* pre_din = tmp_work_space; - - int size_in_channel = win * ih; - int size_out_channel = ow * oh; - int w_stride = ic * 9; /*kernel_w * kernel_h*/ - int w_stride_chin = hout_c_block * 9; // kernel_w * kernel_h * - - int ws = -pad_w; - int we = ws + win_round; - int w_loop = wout_round / 4; - - int c_remain = oc - (oc / hout_c_block) * hout_c_block; - int c_round_down = (oc / hout_c_block) * hout_c_block; - - int out_row_stride = hout_c_block * wout_round; - - for (int n = 0; n < bs; ++n) { - const float* din_batch = i_data + n * ic * size_in_channel; - float* dout_batch = o_data + n * oc * size_out_channel; - for (int h = 0; h < oh; h += hout_r_block) { - int h_kernel = hout_r_block; - if (h + hout_r_block > oh) { - h_kernel = oh - h; - } - - int hs = h * 2 /*stride_h*/ - pad_h; - int he = hs + h_kernel * 2 /*stride_h*/ + 1; - - prepack_input_nxw( - din_batch, pre_din, 0, ic, hs, he, ws, we, ic, win, ih, ptr_zero); - - const float* cblock_inr0 = pre_din; - const float* cblock_inr1 = cblock_inr0 + in_len; - const float* cblock_inr2 = cblock_inr1 + in_len; - const float* cblock_inr3 = cblock_inr2 + in_len; - const float* cblock_inr4 = cblock_inr3 + in_len; - -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < c_round_down; c += hout_c_block) { -#ifdef ARM_WITH_OMP - float* pre_out = - pre_din + pre_in_size + omp_get_thread_num() * pre_out_size; -#else - float* pre_out = pre_din + pre_in_size; -#endif - const float* block_inr0 = cblock_inr0; - const float* block_inr1 = cblock_inr1; - const float* block_inr2 = cblock_inr2; - const float* block_inr3 = cblock_inr3; - const float* block_inr4 = cblock_inr4; - - const float* weight_c = weights + c * w_stride; - const float* bias_ptr = ptr_zero; - if (flag_bias) { - bias_ptr = bias + c; - } - fill_packed_biasc4( - pre_out, bias_ptr, wout_round * hout_c_block * h_kernel); - - for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) { - const float* wc0 = weight_c; - - const float* inr0 = block_inr0; - const float* inr1 = block_inr1; - const float* inr2 = block_inr2; - const float* inr3 = block_inr3; - const float* inr4 = block_inr4; - - float* pre_out0 = pre_out + hk * out_row_stride; - float* pre_out1 = pre_out0 + out_row_stride; -#ifdef __aarch64__ - for (int i = 0; i < ic; ++i) { - float* ptr_out0 = pre_out0; - float* ptr_out1 = pre_out1; - - float32x4_t w0 = vld1q_f32(wc0); // w0, v23 - float32x4_t w1 = vld1q_f32(wc0 + 4); // w1, v24 - float32x4_t w2 = vld1q_f32(wc0 + 8); // w2, v25 - float32x4_t w3 = vld1q_f32(wc0 + 12); // w3, v26 - float32x4_t w4 = vld1q_f32(wc0 + 16); // w4, v27 - float32x4_t w5 = vld1q_f32(wc0 + 20); // w5, v28 - float32x4_t w6 = vld1q_f32(wc0 + 24); // w6, v29 - float32x4_t w7 = vld1q_f32(wc0 + 28); // w7, v30 - float32x4_t w8 = vld1q_f32(wc0 + 32); // w8, v31 - - const float* r0 = inr0; - const float* r1 = inr1; - const float* r2 = inr2; - const float* r3 = inr3; - const float* r4 = inr4; - - int cnt = w_loop; - asm volatile( - "ldp q15, q16, [%[ptr_out0]] \n" /* load outr00, - outr01*/ - "ldp q17, q18, [%[ptr_out0], #32]\n" /* load outr02, outr03*/ - - "ldp q0, q1, [%[r0]], #32 \n" /* load input r0*/ - "ldr d10, [%[r0]] \n" /* load input r0, 9th - element*/ - "ldp q4, q5, [%[r2]], #32 \n" /* load input r2*/ - "ldr d12, [%[r2]] \n" /* load input r2, 9th - element*/ - "2: \n" /* main loop*/ - /* r0, r2, mul w0, get out r0, r1 */ - "ldp q19, q20, [%[ptr_out1]] \n" /* load outr10, outr11*/ - "ldp q21, q22, [%[ptr_out1], #32]\n" /* load outr12, outr13*/ - "fmla v15.4s , %[w0].4s, v0.s[0]\n" 
/* outr00 = w0 * r0[0]*/ - "fmla v16.4s , %[w0].4s, v0.s[2]\n" /* outr01 = w0 * r0[2]*/ - "fmla v17.4s , %[w0].4s, v1.s[0]\n" /* outr02 = w0 * r0[4]*/ - "fmla v18.4s , %[w0].4s, v1.s[2]\n" /* outr03 = w0 * r0[6]*/ - "fmla v19.4s , %[w0].4s, v4.s[0]\n" /* outr10 = w0 * r2[0]*/ - "fmla v20.4s , %[w0].4s, v4.s[2]\n" /* outr11 = w0 * r2[2]*/ - "fmla v21.4s , %[w0].4s, v5.s[0]\n" /* outr12 = w0 * r2[4]*/ - "fmla v22.4s , %[w0].4s, v5.s[2]\n" /* outr13 = w0 * r2[6]*/ - - "ldp q2, q3, [%[r1]], #32 \n" /* load input r1*/ - - /* r2 mul w6, get out r0*/ - "fmla v15.4s , %[w6].4s, v4.s[0]\n" /* outr00 = w6 * r2[0]*/ - "fmla v16.4s , %[w6].4s, v4.s[2]\n" /* outr01 = w6 * r2[2]*/ - "fmla v17.4s , %[w6].4s, v5.s[0]\n" /* outr02 = w6 * r2[4]*/ - "fmla v18.4s , %[w6].4s, v5.s[2]\n" /* outr03 = w6 * r2[6]*/ - - "ldr d11, [%[r1]] \n" /* load input r1, 9th - element*/ - - /* r0, r2, mul w1, get out r0, r1 */ - "fmla v15.4s , %[w1].4s, v0.s[1]\n" /* outr00 = w1 * r0[1]*/ - "fmla v16.4s , %[w1].4s, v0.s[3]\n" /* outr01 = w1 * r0[3]*/ - "fmla v17.4s , %[w1].4s, v1.s[1]\n" /* outr02 = w1 * r0[5]*/ - "fmla v18.4s , %[w1].4s, v1.s[3]\n" /* outr03 = w1 * r0[7]*/ - "fmla v19.4s , %[w1].4s, v4.s[1]\n" /* outr10 = w1 * r2[1]*/ - "fmla v20.4s , %[w1].4s, v4.s[3]\n" /* outr11 = w1 * r2[3]*/ - "fmla v21.4s , %[w1].4s, v5.s[1]\n" /* outr12 = w1 * r2[5]*/ - "fmla v22.4s , %[w1].4s, v5.s[3]\n" /* outr13 = w1 * r2[7]*/ - - "ldp q6, q7, [%[r3]], #32 \n" /* load input r3*/ - - /* r2 mul w7, get out r0 */ - "fmla v15.4s , %[w7].4s, v4.s[1]\n" /* outr00 = w7 * r2[1]*/ - "fmla v16.4s , %[w7].4s, v4.s[3]\n" /* outr01 = w7 * r2[3]*/ - "fmla v17.4s , %[w7].4s, v5.s[1]\n" /* outr02 = w7 * r2[5]*/ - "fmla v18.4s , %[w7].4s, v5.s[3]\n" /* outr03 = w7 * r2[7]*/ - - "ldr d13, [%[r3]] \n" /* load input r3, 9th - element*/ - - /* r0, r2, mul w2, get out r0, r1 */ - "fmla v15.4s , %[w2].4s, v0.s[2]\n" /* outr00 = w2 * r0[2]*/ - "fmla v16.4s , %[w2].4s, v1.s[0]\n" /* outr01 = w2 * r0[4]*/ - "fmla v17.4s , %[w2].4s, v1.s[2]\n" /* outr02 = w2 * r0[6]*/ - "fmla v18.4s , %[w2].4s, v10.s[0]\n" /* outr03 = w2 * - r0[8]*/ - "fmla v19.4s , %[w2].4s, v4.s[2]\n" /* outr10 = w2 * r2[2]*/ - "fmla v20.4s , %[w2].4s, v5.s[0]\n" /* outr11 = w2 * r2[4]*/ - "fmla v21.4s , %[w2].4s, v5.s[2]\n" /* outr12 = w2 * r2[6]*/ - "fmla v22.4s , %[w2].4s, v12.s[0]\n" /* outr13 = w2 * - r2[8]*/ - - "ldp q8, q9, [%[r4]], #32 \n" /* load input r4*/ - - /* r2, mul w8, get out r0 */ - "fmla v15.4s , %[w8].4s, v4.s[2]\n" /* outr00 = w8 * r2[2]*/ - "fmla v16.4s , %[w8].4s, v5.s[0]\n" /* outr01 = w8 * r2[4]*/ - "fmla v17.4s , %[w8].4s, v5.s[2]\n" /* outr02 = w8 * r2[6]*/ - "fmla v18.4s , %[w8].4s, v12.s[0]\n" /* outr03 = w8 * - r2[8]*/ - - "ldr d14, [%[r4]] \n" /* load input r4, 9th - element*/ - - /* r1, r3, mul w3, get out r0, r1 */ - "fmla v15.4s , %[w3].4s, v2.s[0]\n" /* outr00 = w3 * r1[0]*/ - "fmla v16.4s , %[w3].4s, v2.s[2]\n" /* outr01 = w3 * r1[2]*/ - "fmla v17.4s , %[w3].4s, v3.s[0]\n" /* outr02 = w3 * r1[4]*/ - "fmla v18.4s , %[w3].4s, v3.s[2]\n" /* outr03 = w3 * r1[6]*/ - "fmla v19.4s , %[w3].4s, v6.s[0]\n" /* outr10 = w3 * r3[0]*/ - "fmla v20.4s , %[w3].4s, v6.s[2]\n" /* outr11 = w3 * r3[2]*/ - "fmla v21.4s , %[w3].4s, v7.s[0]\n" /* outr12 = w3 * r3[4]*/ - "fmla v22.4s , %[w3].4s, v7.s[2]\n" /* outr13 = w3 * r3[6]*/ - - "ldp q0, q1, [%[r0]], #32 \n" /* load input r0*/ - - /* r1, r3, mul w4, get out r0, r1 */ - "fmla v15.4s , %[w4].4s, v2.s[1]\n" /* outr00 = w4 * r1[1]*/ - "fmla v16.4s , %[w4].4s, v2.s[3]\n" /* outr01 = w4 * r1[3]*/ - "fmla v17.4s , %[w4].4s, 
v3.s[1]\n" /* outr02 = w4 * r1[5]*/ - "fmla v18.4s , %[w4].4s, v3.s[3]\n" /* outr03 = w4 * r1[7]*/ - "fmla v19.4s , %[w4].4s, v6.s[1]\n" /* outr10 = w4 * r3[1]*/ - "fmla v20.4s , %[w4].4s, v6.s[3]\n" /* outr11 = w4 * r3[3]*/ - "fmla v21.4s , %[w4].4s, v7.s[1]\n" /* outr12 = w4 * r3[5]*/ - "fmla v22.4s , %[w4].4s, v7.s[3]\n" /* outr13 = w4 * r3[7]*/ - - "ldr d10, [%[r0]] \n" /* load input r0, 9th - element*/ - - /* r1, r3, mul w5, get out r0, r1 */ - "fmla v15.4s , %[w5].4s, v2.s[2]\n" /* outr00 = w5 * r1[2]*/ - "fmla v16.4s , %[w5].4s, v3.s[0]\n" /* outr01 = w5 * r1[4]*/ - "fmla v17.4s , %[w5].4s, v3.s[2]\n" /* outr02 = w5 * r1[6]*/ - "fmla v18.4s , %[w5].4s, v11.s[0]\n" /* outr03 = w5 * - r1[8]*/ - - "ldp q4, q5, [%[r2]], #32 \n" /* load input r2*/ - "stp q15, q16, [%[ptr_out0]], #32\n" /* save outr00, outr01*/ - - "fmla v19.4s , %[w5].4s, v6.s[2]\n" /* outr10 = w5 * r3[2]*/ - "fmla v20.4s , %[w5].4s, v7.s[0]\n" /* outr11 = w5 * r3[4]*/ - "fmla v21.4s , %[w5].4s, v7.s[2]\n" /* outr12 = w5 * r3[6]*/ - "fmla v22.4s , %[w5].4s, v13.s[0]\n" /* outr13 = w5 * - r3[8]*/ - - "ldr d12, [%[r2]] \n" /* load input r2, 9th - element*/ - "stp q17, q18, [%[ptr_out0]], #32\n" /* save outr02, outr03*/ - - /* r4, mul w6, get out r1 */ - "fmla v19.4s , %[w6].4s, v8.s[0]\n" /* outr10 = w6 * r4[0]*/ - "fmla v20.4s , %[w6].4s, v8.s[2]\n" /* outr11 = w6 * r4[2]*/ - "fmla v21.4s , %[w6].4s, v9.s[0]\n" /* outr12 = w6 * r4[4]*/ - "fmla v22.4s , %[w6].4s, v9.s[2]\n" /* outr13 = w6 * r4[6]*/ - - "ldp q15, q16, [%[ptr_out0]] \n" /* load outr00, outr01*/ - - /* r4, mul w7, get out r1 */ - "fmla v19.4s , %[w7].4s, v8.s[1]\n" /* outr10 = w7 * r4[1]*/ - "fmla v20.4s , %[w7].4s, v8.s[3]\n" /* outr11 = w7 * r4[3]*/ - "fmla v21.4s , %[w7].4s, v9.s[1]\n" /* outr12 = w7 * r4[5]*/ - "fmla v22.4s , %[w7].4s, v9.s[3]\n" /* outr13 = w7 * r4[7]*/ - - "ldp q17, q18, [%[ptr_out0], #32]\n" /* load outr02, outr03*/ - - /* r4, mul w8, get out r1 */ - "fmla v19.4s , %[w8].4s, v8.s[2]\n" /* outr10 = w8 * r4[2]*/ - "fmla v20.4s , %[w8].4s, v9.s[0]\n" /* outr11 = w8 * r4[4]*/ - "fmla v21.4s , %[w8].4s, v9.s[2]\n" /* outr12 = w8 * r4[6]*/ - "fmla v22.4s , %[w8].4s, v14.s[0]\n" /* outr13 = w8 * - r4[8]*/ - - "subs %w[cnt], %w[cnt], #1 \n" /*loop count -1*/ - - "stp q19, q20, [%[ptr_out1]], #32\n" /* save outr10, outr11*/ - "stp q21, q22, [%[ptr_out1]], #32\n" /* save outr12, outr13*/ - - "bne 2b \n" /* jump to main loop*/ - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [r4] "+r"(r4), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [w7] "w"(w7), - [w8] "w"(w8) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); - - wc0 += 9 * hout_c_block; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - inr4 += win_round; - } -#else // not __aarch64__ - for (int i = 0; i < ic; ++i) { - const float* wc0 = weight_c + i * w_stride_chin; - - float* ptr_out0 = pre_out0; - float* ptr_out1 = pre_out1; - - const float* r0 = inr0; - const float* r1 = inr1; - const float* r2 = inr2; - const float* r3 = inr3; - const float* r4 = inr4; - - int cnt = w_loop; - asm volatile( - "vld1.32 {d16-d19}, [%[ptr_out0]]! 
@ " - "load outr0, w0, w1, c0~c3\n" - "vld1.32 {d20-d23}, [%[ptr_out0]] @ load " - "outr0, w2, w3, c0~c3\n" - - /* load weights */ - "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, " - "w1, to q5, q6\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, " - "to q7\n" - - /* load r0, r2 */ - "vld1.32 {d0-d3}, [%[r0]]! @ load r0, " - "8 float\n" - "vld1.32 {d8}, [%[r0]] @ load r0, " - "9th float\n" - - "sub %[ptr_out0], %[ptr_out0], #32 @ ptr_out0 " - "- 32, to start address\n" - - /* main loop */ - "0: @ main " - "loop\n" - /* mul r0, with w0, w1, w2 */ - "vld1.32 {d24-d27}, [%[ptr_out1]]! @ load " - "outr1, w0, w1, c0~c3\n" - "vmla.f32 q8, q5, d0[0] @ w0 * " - "inr00\n" - "vld1.32 {d28-d31}, [%[ptr_out1]] @ load " - "outr1, w2, w3, c0~c3\n" - "vmla.f32 q9, q5, d1[0] @ w0 * " - "inr02\n" - "vmla.f32 q10, q5, d2[0] @ w0 * " - "inr04\n" - "vmla.f32 q11, q5, d3[0] @ w0 * " - "inr06\n" - "vld1.32 {d4-d7}, [%[r2]]! @ load r2, " - "8 float\n" - "vmla.f32 q8, q6, d0[1] @ w1 * " - "inr01\n" - "vmla.f32 q9, q6, d1[1] @ w1 * " - "inr03\n" - "vmla.f32 q10, q6, d2[1] @ w1 * " - "inr05\n" - "vmla.f32 q11, q6, d3[1] @ w1 * " - "inr07\n" - "vld1.32 {d9}, [%[r2]] @ load r2, " - "9th float\n" - "vmla.f32 q8, q7, d1[0] @ w2 * " - "inr02\n" - "vmla.f32 q9, q7, d2[0] @ w2 * " - "inr04\n" - "vmla.f32 q10, q7, d3[0] @ w2 * " - "inr06\n" - "vmla.f32 q11, q7, d8[0] @ w2 * " - "inr08\n" - - "sub %[r2], %[r2], #32 @ r2 - 32, " - "load r2 twice\n" - - /* mul r2, with w0, w1, w2 */ - "vld1.32 {d0-d3}, [%[r1]]! @ load r1, " - "8 float\n" - "vmla.f32 q12, q5, d4[0] @ w0 * " - "inr20\n" - "vmla.f32 q13, q5, d5[0] @ w0 * " - "inr22\n" - "vmla.f32 q14, q5, d6[0] @ w0 * " - "inr24\n" - "vmla.f32 q15, q5, d7[0] @ w0 * " - "inr26\n" - "vld1.32 {d8}, [%[r1]] @ load r1, " - "9th float\n" - "vmla.f32 q12, q6, d4[1] @ w1 * " - "inr21\n" - "vmla.f32 q13, q6, d5[1] @ w1 * " - "inr23\n" - "vmla.f32 q14, q6, d6[1] @ w1 * " - "inr25\n" - "vmla.f32 q15, q6, d7[1] @ w1 * " - "inr27\n" - "vld1.32 {d10-d13}, [%[wc0]]! @ load w3, " - "w4, to q5, q6\n" - "vmla.f32 q12, q7, d5[0] @ w2 * " - "inr22\n" - "vmla.f32 q13, q7, d6[0] @ w2 * " - "inr24\n" - "vmla.f32 q14, q7, d7[0] @ w2 * " - "inr26\n" - "vmla.f32 q15, q7, d9[0] @ w2 * " - "inr28\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w5, " - "to q7\n" - - /* mul r1, with w3, w4, w5 */ - "vmla.f32 q8, q5, d0[0] @ w3 * " - "inr10\n" - "vmla.f32 q9, q5, d1[0] @ w3 * " - "inr12\n" - "vmla.f32 q10, q5, d2[0] @ w3 * " - "inr14\n" - "vmla.f32 q11, q5, d3[0] @ w3 * " - "inr16\n" - "vld1.32 {d4-d7}, [%[r3]]! @ load r3, " - "8 float\n" - "vmla.f32 q8, q6, d0[1] @ w4 * " - "inr11\n" - "vmla.f32 q9, q6, d1[1] @ w4 * " - "inr13\n" - "vmla.f32 q10, q6, d2[1] @ w4 * " - "inr15\n" - "vmla.f32 q11, q6, d3[1] @ w4 * " - "inr17\n" - "vld1.32 {d9}, [%[r3]] @ load r3, " - "9th float\n" - "vmla.f32 q8, q7, d1[0] @ w5 * " - "inr12\n" - "vmla.f32 q9, q7, d2[0] @ w5 * " - "inr14\n" - "vmla.f32 q10, q7, d3[0] @ w5 * " - "inr16\n" - "vmla.f32 q11, q7, d8[0] @ w5 * " - "inr18\n" - - "sub %[ptr_out1], %[ptr_out1], #32 @ ptr_out1 " - "- 32, to start address\n" - - /* mul r3, with w3, w4, w5 */ - "vld1.32 {d0-d3}, [%[r2]]! 
@ load r2, " - "8 float\n" - "vmla.f32 q12, q5, d4[0] @ w3 * " - "inr30\n" - "vmla.f32 q13, q5, d5[0] @ w3 * " - "inr32\n" - "vmla.f32 q14, q5, d6[0] @ w3 * " - "inr34\n" - "vmla.f32 q15, q5, d7[0] @ w3 * " - "inr36\n" - "vld1.32 {d8}, [%[r2]] @ load r2, " - "9th float\n" - "vmla.f32 q12, q6, d4[1] @ w4 * " - "inr31\n" - "vmla.f32 q13, q6, d5[1] @ w4 * " - "inr33\n" - "vmla.f32 q14, q6, d6[1] @ w4 * " - "inr35\n" - "vmla.f32 q15, q6, d7[1] @ w4 * " - "inr37\n" - "vld1.32 {d10-d13}, [%[wc0]]! @ load w6, " - "w7, to q5, q6\n" - "vmla.f32 q12, q7, d5[0] @ w5 * " - "inr32\n" - "vmla.f32 q13, q7, d6[0] @ w5 * " - "inr34\n" - "vmla.f32 q14, q7, d7[0] @ w5 * " - "inr36\n" - "vmla.f32 q15, q7, d9[0] @ w5 * " - "inr38\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w8, " - "to q7\n" - - /* mul r2, with w6, w7, w8 */ - "vmla.f32 q8, q5, d0[0] @ w6 * " - "inr20\n" - "vmla.f32 q9, q5, d1[0] @ w6 * " - "inr22\n" - "vmla.f32 q10, q5, d2[0] @ w6 * " - "inr24\n" - "vmla.f32 q11, q5, d3[0] @ w6 * " - "inr26\n" - "vld1.32 {d4-d7}, [%[r4]]! @ load r4, " - "8 float\n" - "vmla.f32 q8, q6, d0[1] @ w7 * " - "inr21\n" - "vmla.f32 q9, q6, d1[1] @ w7 * " - "inr23\n" - "vmla.f32 q10, q6, d2[1] @ w7 * " - "inr25\n" - "vmla.f32 q11, q6, d3[1] @ w7 * " - "inr27\n" - "vld1.32 {d9}, [%[r4]] @ load r4, " - "9th float\n" - "vmla.f32 q8, q7, d1[0] @ w8 * " - "inr22\n" - "vmla.f32 q9, q7, d2[0] @ w8 * " - "inr24\n" - "vmla.f32 q10, q7, d3[0] @ w8 * " - "inr26\n" - "vmla.f32 q11, q7, d8[0] @ w8 * " - "inr28\n" - - "sub %[wc0], %[wc0], #144 @ wc0 - " - "144 to start address\n" - - /* mul r4, with w6, w7, w8 */ - "vld1.32 {d0-d3}, [%[r0]]! @ load r0, " - "8 float\n" - "vmla.f32 q12, q5, d4[0] @ w3 * " - "inr40\n" - "vst1.32 {d16-d19}, [%[ptr_out0]]! @ save " - "r00, r01, c0~c3\n" - "vmla.f32 q13, q5, d5[0] @ w3 * " - "inr42\n" - "vst1.32 {d20-d23}, [%[ptr_out0]]! @ save " - "r02, r03, c0~c3\n" - "vmla.f32 q14, q5, d6[0] @ w3 * " - "inr44\n" - "vmla.f32 q15, q5, d7[0] @ w3 * " - "inr46\n" - "vld1.32 {d8}, [%[r0]] @ load " - "r0, 9th float\n" - "vmla.f32 q12, q6, d4[1] @ w4 * " - "inr41\n" - "vmla.f32 q13, q6, d5[1] @ w4 * " - "inr43\n" - "vmla.f32 q14, q6, d6[1] @ w4 * " - "inr45\n" - "vmla.f32 q15, q6, d7[1] @ w4 * " - "inr47\n" - "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, " - "w1, to q5, q6\n" - "vmla.f32 q12, q7, d5[0] @ w5 * " - "inr42\n" - "vmla.f32 q13, q7, d6[0] @ w5 * " - "inr44\n" - "vmla.f32 q14, q7, d7[0] @ w5 * " - "inr46\n" - "vmla.f32 q15, q7, d9[0] @ w5 * " - "inr48\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, " - "to q7\n" - - "vst1.32 {d24-d27}, [%[ptr_out1]]! @ save " - "r10, r11, c0~c3\n" - "vst1.32 {d28-d31}, [%[ptr_out1]]! @ save " - "r12, r13, c0~c3\n" - - "vld1.32 {d16-d19}, [%[ptr_out0]]! 
@ load " - "outr0, w0, w1, c0~c3\n" - "vld1.32 {d20-d23}, [%[ptr_out0]] @ load " - "outr0, w2, w3, c0~c3\n" - - "sub %[ptr_out0], %[ptr_out0], #32 @ ptr_out0 " - "- 32, to start address\n" - - "subs %[cnt], #1 @ loop " - "count--\n" - "bne 0b @ jump to " - "main loop\n" - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [r4] "+r"(r4), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1), - [wc0] "+r"(wc0) - : - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - inr4 += win_round; - } -#endif // __aarch64__ - block_inr0 = block_inr4; - block_inr1 = block_inr0 + in_len; - block_inr2 = block_inr1 + in_len; - block_inr3 = block_inr2 + in_len; - block_inr4 = block_inr3 + in_len; - } - - write_to_output_c4_fp32(pre_out, - dout_batch, - c, - c + hout_c_block, - h, - h + h_kernel, - 0, - wout_round, - oc, - oh, - ow, - flag_relu, - ptr_write); - } - -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < c_remain; ++c) { -#ifdef ARM_WITH_OMP - float* pre_out = - pre_din + pre_in_size + omp_get_thread_num() * pre_out_size; -#else - float* pre_out = pre_din + pre_in_size; -#endif - - const float* block_inr0 = cblock_inr0; - const float* block_inr1 = cblock_inr1; - const float* block_inr2 = cblock_inr2; - const float* block_inr3 = cblock_inr3; - const float* block_inr4 = cblock_inr4; - - //! get weights ptr of remained - const float* weight_c = weights + c_round_down * w_stride; - - //! fill bias to one channel - const float* bias_ptr = ptr_zero; - if (flag_bias) { - bias_ptr = bias + c_round_down + c; - } - fill_bias(pre_out, bias_ptr, 1, wout_round * h_kernel); - - for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) { - const float* wc0 = weight_c; - - const float* inr0 = block_inr0; - const float* inr1 = block_inr1; - const float* inr2 = block_inr2; - const float* inr3 = block_inr3; - const float* inr4 = block_inr4; - - float* pre_out0 = pre_out + hk * wout_round; - float* pre_out1 = pre_out0 + wout_round; -#ifdef __aarch64__ - for (int i = 0; i < ic; ++i) { - float* ptr_out0 = pre_out0; - float* ptr_out1 = pre_out1; - - //! 
get valid weights of current output channel - float32x4_t w0 = vdupq_n_f32(wc0[c]); // w0, v23 - float32x4_t w1 = vdupq_n_f32(wc0[c + 4]); // w1, v24 - float32x4_t w2 = vdupq_n_f32(wc0[c + 8]); // w2, v25 - float32x4_t w3 = vdupq_n_f32(wc0[c + 12]); // w3, v26 - float32x4_t w4 = vdupq_n_f32(wc0[c + 16]); // w4, v27 - float32x4_t w5 = vdupq_n_f32(wc0[c + 20]); // w5, v28 - float32x4_t w6 = vdupq_n_f32(wc0[c + 24]); // w6, v29 - float32x4_t w7 = vdupq_n_f32(wc0[c + 28]); // w7, v30 - float32x4_t w8 = vdupq_n_f32(wc0[c + 32]); // w8, v31 - - const float* r0 = inr0; - const float* r1 = inr1; - const float* r2 = inr2; - const float* r3 = inr3; - const float* r4 = inr4; - - int cnt = w_loop; - asm volatile( - "ldr q21, [%[ptr_out0]] \n" /* load outr00, - outr01, - outr02, - outr03*/ - - "ld2 {v0.4s, v1.4s}, [%[r0]], #32 \n" /* load input r0*/ - "ldr d10, [%[r0]] \n" /* load input r0, 9th - element*/ - "ld2 {v4.4s, v5.4s}, [%[r2]], #32 \n" /* load input r2*/ - "ldr d12, [%[r2]] \n" /* load input r2, 9th - element*/ - "2: \n" /* main loop*/ - /* r0, r2, mul w0, get out r0, r1 */ - "ldr q22, [%[ptr_out1]] \n" /* load outr10, outr11, - outr12, outr13*/ - - "fmla v21.4s , %[w0].4s, v0.4s \n" /* outr0 = w0 * r0[0, 2, - 4, 6]*/ - "fmla v22.4s , %[w0].4s, v4.4s \n" /* outr1 = w0 * r2[0, 2, - 4, 6]*/ - - "ld2 {v2.4s, v3.4s}, [%[r1]], #32 \n" /* load input r1*/ - - /* r2 mul w6, get out r0*/ - "fmla v21.4s , %[w6].4s, v4.4s \n" /* outr0 = w6 * r2[0, 2, - 4, 6]*/ - "ldr d11, [%[r1]] \n" /* load input r1, 9th - element*/ - - /* shift left 1 */ - "ext v15.16b, v0.16b, v10.16b, #4\n" /* shift left r0 1*/ - "ext v16.16b, v4.16b, v12.16b, #4\n" /* shift left r2 1*/ - - /* r0, r2, mul w1, get out r0, r1 */ - "fmla v21.4s , %[w1].4s, v1.4s \n" /* outr0 = w1 * r0[1, 3, - 5, 7]*/ - "fmla v22.4s , %[w1].4s, v5.4s \n" /* outr1 = w1 * r2[1, 3, - 5, 7]*/ - - "ld2 {v6.4s, v7.4s}, [%[r3]], #32 \n" /* load input r3*/ - - /* r2 mul w7, get out r0 */ - "fmla v21.4s , %[w7].4s, v5.4s \n" /* outr00 = w7 * r2[1, - 3, 5, 7]*/ - - "ldr d13, [%[r3]] \n" /* load input r3, 9th - element*/ - - /* r0, r2, mul w2, get out r0, r1 */ - "fmla v21.4s , %[w2].4s, v15.4s \n" /* outr0 = w2 * r0[2, 4, - 6, 8]*/ - "fmla v22.4s , %[w2].4s, v16.4s \n" /* outr1 = w2 * r2[2, 4, - 6, 8]*/ - - "ld2 {v8.4s, v9.4s}, [%[r4]], #32 \n" /* load input r4*/ - - /* r2, mul w8, get out r0 */ - "fmla v21.4s , %[w8].4s, v16.4s \n" /* outr00 = w8 * r2[2, - 4, 6, 8]*/ - - "ldr d14, [%[r4]] \n" /* load input r4, 9th - element*/ - - /* r1, r3, mul w3, get out r0, r1 */ - "fmla v21.4s , %[w3].4s, v2.4s \n" /* outr0 = w3 * r1[0, 2, - 4, 6]*/ - "fmla v22.4s , %[w3].4s, v6.4s \n" /* outr1 = w3 * r3[0, 2, - 4, 6]*/ - - /* shift left 1 */ - "ext v15.16b, v2.16b, v11.16b, #4\n" /* shift left r1 1*/ - "ext v16.16b, v6.16b, v13.16b, #4\n" /* shift left r3 1*/ - - "ld2 {v0.4s, v1.4s}, [%[r0]], #32 \n" /* load input r0*/ - - /* r1, r3, mul w4, get out r0, r1 */ - "fmla v21.4s , %[w4].4s, v3.4s \n" /* outr0 = w4 * r1[1, 3, - 5, 7]*/ - "fmla v22.4s , %[w4].4s, v7.4s \n" /* outr1 = w4 * r3[1, 3, - 5, 7]*/ - - "ldr d10, [%[r0]] \n" /* load input r0, 9th - element*/ - - /* r1, r3, mul w5, get out r0, r1 */ - "fmla v21.4s , %[w5].4s, v15.4s \n" /* outr0 = w5 * r1[2]*/ - "fmla v22.4s , %[w5].4s, v16.4s \n" /* outr1 = w5 * r1[4]*/ - - "ld2 {v4.4s, v5.4s}, [%[r2]], #32 \n" /* load input r2*/ - "ldr d12, [%[r2]] \n" /* load input r2, 9th - element*/ - "str q21, [%[ptr_out0]], #16 \n" /* save outr00, outr01*/ - - /* r4, mul w6, get out r1 */ - "fmla v22.4s , %[w6].4s, v8.4s \n" /* outr1 
= w6 * r4[0, 2, - 4, 6]*/ - - "ext v15.16b, v8.16b, v14.16b, #4\n" /* shift left r1 1*/ - "ldr q21, [%[ptr_out0]] \n" /* load outr0*/ - - /* r4, mul w7, get out r1 */ - "fmla v22.4s , %[w7].4s, v9.4s \n" /* outr1 = w7 * r4[1, 3, - 5, 7]*/ - - /* r4, mul w8, get out r1 */ - "fmla v22.4s , %[w8].4s, v15.4s \n" /* outr1 = w8 * r4[2, 4, - 6, 8]*/ - - "subs %w[cnt], %w[cnt], #1 \n" /*loop count -1*/ - "str q22, [%[ptr_out1]], #16 \n" /* save outr1*/ - "bne 2b \n" /* jump to main loop*/ - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [r4] "+r"(r4), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [w7] "w"(w7), - [w8] "w"(w8) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v21", - "v22"); - - wc0 += 36; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - inr4 += win_round; - } -#else // not __aarch64__ - for (int i = 0; i < ic; ++i) { - float* ptr_out0 = pre_out0; - float* ptr_out1 = pre_out1; - - //! get valid weights of current output channel - float w_tmp[12] = {wc0[c], - wc0[c + 4], - wc0[c + 8], - 0.f, - wc0[c + 12], - wc0[c + 16], - wc0[c + 20], - 0.f, - wc0[c + 24], - wc0[c + 28], - wc0[c + 32], - 0.f}; - float32x4_t w0 = vld1q_f32(w_tmp); // w0, w1, w2, q0 - float32x4_t w1 = vld1q_f32(w_tmp + 4); // w3, w4, w5, q1 - float32x4_t w2 = vld1q_f32(w_tmp + 8); // w6, w7, w8, q2 - - const float* r0 = inr0; - const float* r1 = inr1; - const float* r2 = inr2; - const float* r3 = inr3; - const float* r4 = inr4; - - int cnt = w_loop / 2; - if (cnt > 0) { - asm volatile( - /* main loop */ - "0: @ " - "main loop\n" - "vld1.32 {d24-d27}, [%[ptr_out0]] @ load or00, " - "or01\n" - "vld1.32 {d28-d31}, [%[ptr_out1]] @ load or10, " - "or11\n" - "vld2.32 {d6-d9}, [%[r2]]! @ load r2, 8 " - "float, interleave\n" - "vld2.32 {d10-d13}, [%[r2]]! @ load r2, 8 " - "float, interleave\n" - "vld1.32 {d22}, [%[r2]] @ load 16th " - "float\n" - - /* r2 * w2, r2 * w0, get or0, or1 */ - "vmla.f32 q12, q4, %e[w2][1] @ w21 * r2, " - "1, 3, 5, 7\n" - "vmla.f32 q13, q6, %e[w2][1] @ w21 * r2, " - "9, 11, 13, 15\n" - "vld2.32 {d14-d17}, [%[r0]]! @ load r0, 8 " - "float, interleave\n" - "vmla.f32 q14, q4, %e[w0][1] @ w01 * r2, " - "1, 3, 5, 7\n" - "vmla.f32 q15, q6, %e[w0][1] @ w01 * r2, " - "9, 11, 13, 15\n" - - "vext.32 q4, q3, q5, #1 @ r2, shift " - "left 1, get 2, 4, 6, 8\n" - "vext.32 q6, q5, q11, #1 @ r2, shift " - "left 1, get 10, 12, 14, 16\n" - - "vmla.f32 q12, q3, %e[w2][0] @ w20 * r2, " - "0, 2, 4, 6\n" - "vmla.f32 q13, q5, %e[w2][0] @ w20 * r2, " - "8, 10, 12, 14\n" - "vld2.32 {d18-d21}, [%[r0]]! @ load r0, 8 " - "float, interleave\n" - "vmla.f32 q14, q3, %e[w0][0] @ w00 * r2, " - "0, 2, 4, 6\n" - "vmla.f32 q15, q5, %e[w0][0] @ w00 * r2, " - "8, 10, 12, 14\n" - - "vld1.32 {d22}, [%[r0]] @ load 16th " - "float\n" - - "vmla.f32 q12, q4, %f[w2][0] @ w22 * r2, " - "2, 4, 6, 8\n" - "vmla.f32 q14, q4, %f[w0][0] @ w02 * r2, " - "2, 4, 6, 8\n" - "vld2.32 {d6-d9}, [%[r3]]! @ load r3, 8 " - "float, interleave\n" - "vmla.f32 q13, q6, %f[w2][0] @ w22 * r2, " - "10, 12, 14, 16\n" - "vmla.f32 q15, q6, %f[w0][0] @ w02 * r2, " - "10, 12, 14, 16\n" - "vld2.32 {d10-d13}, [%[r3]]! 
@ load r3, 8 " - "float, interleave\n" - - /* r0 * w0, get or0, r3 * w1, get or1*/ - "vmla.f32 q12, q8, %e[w0][1] @ w01 * r0, " - "1, 3, 5, 7\n" - "vmla.f32 q13, q10, %e[w0][1] @ w01 * r0, " - "9, 11, 13, 15\n" - "vext.32 q8, q7, q9, #1 @ r0, shift " - "left 1, get 2, 4, 6, 8\n" - "vext.32 q10, q9, q11, #1 @ r0, shift " - "left 1, get 10, 12, 14, 16\n" - "vld1.32 {d22}, [%[r3]] @ load 16th " - "float\n" - "vmla.f32 q14, q4, %e[w1][1] @ w11 * r3, " - "1, 3, 5, 7\n" - "vmla.f32 q15, q6, %e[w1][1] @ w11 * r3, " - "9, 11, 13, 15\n" - - "vmla.f32 q12, q7, %e[w0][0] @ w00 * r0, " - "0, 2, 4, 6\n" - "vmla.f32 q13, q9, %e[w0][0] @ w00 * r0, " - "8, 10, 12, 14\n" - "vext.32 q4, q3, q5, #1 @ r3, shift " - "left 1, get 2, 4, 6, 8\n" - "vext.32 q6, q5, q11, #1 @ r3, shift " - "left 1, get 10, 12, 14, 16\n" - "vmla.f32 q14, q3, %e[w1][0] @ w10 * r3, " - "0, 2, 4, 6\n" - "vmla.f32 q15, q5, %e[w1][0] @ w10 * r3, " - "8, 10, 12, 14\n" - - "vmla.f32 q12, q8, %f[w0][0] @ w02 * r0, " - "2, 4, 6, 8\n" - "vld2.32 {d14-d17}, [%[r1]]! @ load r1, 8 " - "float, interleave\n" - "vmla.f32 q13, q10,%f[w0][0] @ w02 * r0, " - "10, 12, 14, 16\n" - "vld2.32 {d18-d21}, [%[r1]]! @ load r1, 8 " - "float, interleave\n" - "vmla.f32 q14, q4, %f[w1][0] @ w12 * r3, " - "2, 4, 6, 8\n" - "vld2.32 {d6-d9}, [%[r4]]! @ load r4, 8 " - "float, interleave\n" - "vmla.f32 q15, q6, %f[w1][0] @ w12 * r3, " - "10, 12, 14, 16\n" - "vld2.32 {d10-d13}, [%[r4]]! @ load r4, 8 " - "float, interleave\n" - - "vld1.32 {d22}, [%[r1]] @ load 16th " - "float\n" - - /* r1 * w1, get or0, r4 * w2, get or1 */ - "vmla.f32 q12, q8, %e[w1][1] @ w11 * r1, " - "1, 3, 5, 7\n" - "vmla.f32 q13, q10, %e[w1][1] @ w11 * r1, " - "9, 11, 13, 15\n" - "vext.32 q8, q7, q9, #1 @ r1, shift " - "left 1, get 2, 4, 6, 8\n" - "vext.32 q10, q9, q11, #1 @ r1, shift " - "left 1, get 10, 12, 14, 16\n" - "vmla.f32 q14, q4, %e[w2][1] @ w21 * r4, " - "1, 3, 5, 7\n" - "vmla.f32 q15, q6, %e[w2][1] @ w21 * r4, " - "9, 11, 13, 15\n" - "vld1.32 {d22}, [%[r4]] @ load 16th " - "float\n" - - "vmla.f32 q12, q7, %e[w1][0] @ w10 * r1, " - "0, 2, 4, 6\n" - "vmla.f32 q13, q9, %e[w1][0] @ w10 * r1, " - "8, 10, 12, 14\n" - "vext.32 q4, q3, q5, #1 @ r1, shift " - "left 1, get 2, 4, 6, 8\n" - "vext.32 q6, q5, q11, #1 @ r1, shift " - "left 1, get 10, 12, 14, 16\n" - "vmla.f32 q14, q3, %e[w2][0] @ w20 * r4, " - "0, 2, 4, 6\n" - "vmla.f32 q15, q5, %e[w2][0] @ w20 * r4, " - "8, 10, 12, 14\n" - - "vmla.f32 q12, q8, %f[w1][0] @ w12 * r1, " - "2, 4, 6, 8\n" - "vmla.f32 q13, q10, %f[w1][0] @ w12 * r1, " - "10, 12, 14, 16\n" - "vmla.f32 q14, q4, %f[w2][0] @ w22 * r4, " - "2, 4, 6, 8\n" - "vmla.f32 q15, q6, %f[w2][0] @ w22 * r4, " - "10, 12, 14, 16\n" - - "vst1.32 {d24-d27}, [%[ptr_out0]]! @ save or0\n" - "vst1.32 {d28-d31}, [%[ptr_out1]]! @ save or0\n" - - "subs %[cnt], #1 @loop count " - "-1\n" - "bne 0b @ jump to " - "main loop\n" - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [r4] "+r"(r4), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : [w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - //! 
deal with remain ow - if (w_loop & 1) { - ptr_out0[0] += - r0[0] * w_tmp[0] + r0[1] * w_tmp[1] + r0[2] * w_tmp[2] + - r1[0] * w_tmp[4] + r1[1] * w_tmp[5] + r1[2] * w_tmp[6] + - r2[0] * w_tmp[8] + r2[1] * w_tmp[9] + r2[2] * w_tmp[10]; - - ptr_out0[1] += - r0[2] * w_tmp[0] + r0[3] * w_tmp[1] + r0[4] * w_tmp[2] + - r1[2] * w_tmp[4] + r1[3] * w_tmp[5] + r1[4] * w_tmp[6] + - r2[2] * w_tmp[8] + r2[3] * w_tmp[9] + r2[4] * w_tmp[10]; - - ptr_out0[2] += - r0[4] * w_tmp[0] + r0[5] * w_tmp[1] + r0[6] * w_tmp[2] + - r1[4] * w_tmp[4] + r1[5] * w_tmp[5] + r1[6] * w_tmp[6] + - r2[4] * w_tmp[8] + r2[5] * w_tmp[9] + r2[6] * w_tmp[10]; - - ptr_out0[3] += - r0[6] * w_tmp[0] + r0[7] * w_tmp[1] + r0[8] * w_tmp[2] + - r1[6] * w_tmp[4] + r1[7] * w_tmp[5] + r1[8] * w_tmp[6] + - r2[6] * w_tmp[8] + r2[7] * w_tmp[9] + r2[8] * w_tmp[10]; - - ptr_out1[0] += - r2[0] * w_tmp[0] + r2[1] * w_tmp[1] + r2[2] * w_tmp[2] + - r3[0] * w_tmp[4] + r3[1] * w_tmp[5] + r3[2] * w_tmp[6] + - r4[0] * w_tmp[8] + r4[1] * w_tmp[9] + r4[2] * w_tmp[10]; - - ptr_out1[1] += - r2[2] * w_tmp[0] + r2[3] * w_tmp[1] + r2[4] * w_tmp[2] + - r3[2] * w_tmp[4] + r3[3] * w_tmp[5] + r3[4] * w_tmp[6] + - r4[2] * w_tmp[8] + r4[3] * w_tmp[9] + r4[4] * w_tmp[10]; - - ptr_out1[2] += - r2[4] * w_tmp[0] + r2[5] * w_tmp[1] + r2[6] * w_tmp[2] + - r3[4] * w_tmp[4] + r3[5] * w_tmp[5] + r3[6] * w_tmp[6] + - r4[4] * w_tmp[8] + r4[5] * w_tmp[9] + r4[6] * w_tmp[10]; - - ptr_out1[3] += - r2[6] * w_tmp[0] + r2[7] * w_tmp[1] + r2[8] * w_tmp[2] + - r3[6] * w_tmp[4] + r3[7] * w_tmp[5] + r3[8] * w_tmp[6] + - r4[6] * w_tmp[8] + r4[7] * w_tmp[9] + r4[8] * w_tmp[10]; - } - - wc0 += 36; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - inr4 += win_round; - } -#endif // __aarch64__ - block_inr0 = block_inr4; - block_inr1 = block_inr0 + in_len; - block_inr2 = block_inr1 + in_len; - block_inr3 = block_inr2 + in_len; - block_inr4 = block_inr3 + in_len; - } - write_to_output_c1_fp32(pre_out, - dout_batch, - c + c_round_down, - c + c_round_down + 1, - h, - h + h_kernel, - 0, - wout_round, - oc, - oh, - ow, - flag_relu, - ptr_write); - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_gemmlike.cc b/lite/backends/arm/math/conv_gemmlike.cc deleted file mode 100644 index 1dd102db1e..0000000000 --- a/lite/backends/arm/math/conv_gemmlike.cc +++ /dev/null @@ -1,285 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
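
[Editor's note] The armv7 and aarch64 blocks above close out conv_direct_3x3s2.cc: a direct 3x3, stride-2 convolution that processes four output channels per block, plus a per-channel remainder path whose odd-width tail is computed in scalar code. The sketch below is an editorial reference for the arithmetic those kernels vectorize, not part of the deleted sources; the names (r0..r2, w, out, out_w) are illustrative, and padding is omitted because the deleted code pre-pads its input rows before entering the kernels.

// One output row of a 3x3 convolution with stride 2: out[j] accumulates
// the dot product of the 3x3 window anchored at input column 2 * j over
// three consecutive input rows r0, r1, r2; w holds the nine filter taps.
// The second output row of each block reuses r2 as its top row, exactly
// as ptr_out1 does in the remainder code above.
inline void conv3x3s2_row_ref(const float* r0,
                              const float* r1,
                              const float* r2,
                              const float* w,
                              float* out,
                              int out_w) {
  for (int j = 0; j < out_w; ++j) {
    const int c = 2 * j;  // stride-2 window anchor
    out[j] += r0[c] * w[0] + r0[c + 1] * w[1] + r0[c + 2] * w[2] +
              r1[c] * w[3] + r1[c + 1] * w[4] + r1[c + 2] * w[5] +
              r2[c] * w[6] + r2[c + 1] * w[7] + r2[c + 2] * w[8];
  }
}
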
- -#include "lite/backends/arm/math/conv_gemmlike.h" -#include -#include "lite/backends/arm/math/gemm_prepacked_int8.h" -#include "lite/backends/arm/math/packed_sgemm.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -/********************* Gemmlike Conv Precision Is Float ***********************/ -template <> -bool GemmLikeConv::create(const operators::ConvParam& param, - ARMContext* ctx) { - this->ctx_ = ctx; - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ih = x_dims[2]; - int ic = x_dims[1]; - int ow = o_dims[3]; - int oh = o_dims[2]; - int oc = o_dims[1]; - int kw = w_dims[3]; - int kh = w_dims[2]; - int sw = param.strides[1]; - int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; - - int m = oc / param.groups; - int k = ic * kh * kw / param.groups; - int n = oh * ow; - bool kps_equal = (pw == ph) && (sw == sh) && (kw == kh); - bool ks_equal = (sw == sh) && (kw == kh); - //! select conv gemmlike kernel - if (kw == 1 && sw == 1 && pw == 0 && kps_equal) { - //! 1x1s1p0 gemmlike conv - impl_ = conv1x1s1_gemm; - } else { - //! otherwise case - if (kw == 3 && sw == 1 && n > 1 && ks_equal) { - idx_data_.Resize({1, 1, 1, n * kh * kw}); - int* idx_out = idx_data_.mutable_data(); - for (int i = 0; i < oh; ++i) { - for (int j = 0; j < ow; ++j) { - compute_offset(idx_out, i, j, kh, kw, ih, iw, ph, pw, dh, dw); - idx_out += kh * kw; - } - } - } - //! im2col gemmlike conv - impl_ = conv_im2col_gemm; - this->ctx_->ExtendWorkspace(k * n * sizeof(float)); - } - - if (n > 1) { - int hblock = get_hblock(this->ctx_->arch()); - int m_roundup = hblock * ((m + hblock - 1) / hblock); - int group_size_round_up = ((m_roundup * k + 15) / 16) * 16; - float* w_trans_ptr = nullptr; - weights_trans_.Resize({1, 1, 1, group_size_round_up * param.groups}); - w_trans_ptr = weights_trans_.mutable_data(); - const auto* w_data = param.filter->data(); - for (int g = 0; g < param.groups; ++g) { - const float* weights_group = w_data + g * m * k; - float* weights_trans_ptr = w_trans_ptr + g * group_size_round_up; - prepackA(weights_trans_ptr, - weights_group, - 1.f, - k, - 0, - m, - 0, - k, - false, - this->ctx_); - } - is_weights_transed_ = true; - } - return true; -} - -template <> -bool GemmLikeConv::init(const operators::ConvParam& param, - ARMContext* ctx) { - this->ctx_ = ctx; - return create(param, ctx); -} - -template <> -bool GemmLikeConv::run(const operators::ConvParam& param) { - // start timer - const auto* i_data = param.x->data(); - const auto* w_data = param.filter->data(); - const auto* b_data = param.bias ? 
param.bias->data() : nullptr; - auto* o_data = param.output->mutable_data(); - const int* idx_data = idx_data_.mutable_data(); - - if (is_weights_transed_) { - w_data = weights_trans_.data(); - } - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ih = x_dims[2]; - int ic = x_dims[1]; - int bs = x_dims[0]; - int oh = o_dims[2]; - int ow = o_dims[3]; - int oc = o_dims[1]; - - impl_(i_data, - o_data, - bs, - oc, - oh, - ow, - ic, - ih, - iw, - w_data, - b_data, - param, - this->ctx_, - idx_data); - - // timer end - return true; -} - -/********************* Gemmlike Conv Precision Is Int8 ************************/ -template -bool GemmLikeConvInt8::create(const operators::ConvParam& param, - ARMContext* ctx) { - this->ctx_ = ctx; - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ih = x_dims[2]; - int ic = x_dims[1]; - int ow = o_dims[3]; - int oh = o_dims[2]; - int oc = o_dims[1]; - int kw = w_dims[3]; - int kh = w_dims[2]; - int sw = param.strides[1]; - int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; - - int m = oc / param.groups; - int k = ic * kh * kw / param.groups; - int n = oh * ow; - w_scale_ = param.weight_scale; - //! update weights scale - if (Ptype_out == PRECISION(kInt8) || Ptype_out == PRECISION(kFloat)) { - CHECK_EQ(this->w_scale_.size(), oc) << "weights scale size must be chout"; - float input_scale = param.input_scale; - for (auto& w_s : w_scale_) { - w_s *= input_scale; - if (Ptype_out == PRECISION(kInt8)) { - w_s /= param.output_scale; - } - } - } - - bool kps_equal = (pw == ph) && (sw == sh) && (kw == kh); - bool ks_equal = (sw == sh) && (kw == kh); - //! select conv gemmlike kernel - if (kw == 1 && sw == 1 && pw == 0 && kps_equal) { - //! 1x1s1p0 gemmlike conv - impl_int8_ = conv1x1s1_gemm_int8; - } else { - //! otherwise case - if (kw == 3 && sw == 1 && n > 1 && ks_equal) { - idx_data_.Resize({1, 1, 1, n * kh * kw}); - int* idx_out = idx_data_.mutable_data(); - for (int i = 0; i < oh; ++i) { - for (int j = 0; j < ow; ++j) { - compute_offset(idx_out, i, j, kh, kw, ih, iw, ph, pw, dh, dw); - idx_out += kh * kw; - } - } - } - //! im2col gemmlike conv - impl_int8_ = conv_im2col_gemm_int8; - this->ctx_->ExtendWorkspace(k * n); - } - - if (n > 1) { - prepackA_int8(&this->weights_trans_, - *param.filter, - m, - k, - param.groups, - false, - this->ctx_); - this->is_weights_transed_ = true; - } - return true; -} - -template -bool GemmLikeConvInt8::init(const operators::ConvParam& param, - ARMContext* ctx) { - this->ctx_ = ctx; - return create(param, ctx); -} - -template -bool GemmLikeConvInt8::run(const operators::ConvParam& param) { - const auto* i_data = param.x->data(); - const auto* w_data = param.filter->data(); - const auto* b_data = param.bias ? 
param.bias->data() : nullptr; - auto* o_data = param.output->mutable_data(); - const int32_t* idx_data = idx_data_.mutable_data(); - - if (this->is_weights_transed_ == true) { - w_data = this->weights_trans_.template data(); - } - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ih = x_dims[2]; - int ic = x_dims[1]; - int bs = x_dims[0]; - int oh = o_dims[2]; - int ow = o_dims[3]; - int oc = o_dims[1]; - - impl_int8_(i_data, - o_data, - bs, - oc, - oh, - ow, - ic, - ih, - iw, - w_data, - b_data, - param, - this->ctx_, - Ptype_out, - this->w_scale_.data(), - idx_data); - - return true; -} - -template class GemmLikeConvInt8; -template class GemmLikeConvInt8; -template class GemmLikeConvInt8; - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_gemmlike.h b/lite/backends/arm/math/conv_gemmlike.h deleted file mode 100644 index 5986b5c2c8..0000000000 --- a/lite/backends/arm/math/conv_gemmlike.h +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/core/target_wrapper.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -class GemmLikeConv - : public ImplBase { - public: - typedef void (*conv_im2col_gemm_impl)(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx, - const int* idx_ptr); - - GemmLikeConv() = default; - ~GemmLikeConv() {} - - virtual bool init(const operators::ConvParam& param, ARMContext* ctx) { - LOG(FATAL) << "GemmLikeConv::init() not implemented."; - } - - virtual bool create(const operators::ConvParam& param, ARMContext* ctx) { - LOG(FATAL) << "GemmLikeConv::create() not implemented."; - } - - virtual bool run(const operators::ConvParam& param) { - LOG(FATAL) << "GemmLikeConv::run() not implemented."; - } - - protected: - bool is_weights_transed_{false}; - Tensor idx_data_; - Tensor weights_trans_; - - private: - conv_im2col_gemm_impl impl_{nullptr}; -}; - -template -class GemmLikeConvInt8 : public GemmLikeConv { - public: - typedef void (*conv_im2col_gemm_int8_impl)(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - ARMContext* ctx, - PrecisionType out_type, - const float* scale, - const int* idx_ptr); - - GemmLikeConvInt8() = default; - ~GemmLikeConvInt8() {} - - virtual bool init(const operators::ConvParam& param, ARMContext* ctx); - - virtual bool create(const operators::ConvParam& param, ARMContext* ctx); - - virtual bool 
run(const operators::ConvParam& param); - - private: - conv_im2col_gemm_int8_impl impl_int8_{nullptr}; - std::vector w_scale_; -}; - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_impl.cc b/lite/backends/arm/math/conv_impl.cc deleted file mode 100644 index dbea9d643e..0000000000 --- a/lite/backends/arm/math/conv_impl.cc +++ /dev/null @@ -1,900 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// #include "saber/funcs/impl/arm/neon/impl/conv_arm_depthwise.h" -// #include "saber/funcs/impl/arm/neon/impl/conv_arm_impl.h" -// #include "saber/funcs/impl/arm/neon/impl/gemm_prepacked_int8.h" -// #include "saber/funcs/impl/arm/neon/impl/gemv_arm_int8.h" -// #include "saber/funcs/impl/arm/neon/impl/sgemv_arm.h" - -#include "lite/backends/arm/math/conv_impl.h" -#include -#include "lite/backends/arm/math/gemm_prepacked_int8.h" -#include "lite/backends/arm/math/gemv_arm_int8.h" -#include "lite/backends/arm/math/packed_sgemm.h" -#include "lite/backends/arm/math/sgemv.h" -#include "lite/core/context.h" -#include "lite/core/target_wrapper.h" -#include "lite/operators/op_params.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -/** - * \brief neon implementation to add bias - * @param tensor - * @param bias - * @param channel - * @param channel_size - */ -void fill_bias(float* tensor, - const float* bias, - int channel, - int channel_size) { - if (tensor == nullptr) { - return; - } - float* data = tensor; - - for (int j = 0; j < channel; ++j) { - float32x4_t vdata = vdupq_n_f32(bias[j]); - int i = 0; - for (; i < channel_size - 3; i += 4) { - vst1q_f32(data + i, vdata); - } - for (; i < channel_size; i++) { - data[i] = bias[j]; - } - data += channel_size; - } -} - -void fill_bias_int8(int* tensor, - const int* bias, - int channel, - int channel_size) { - if (tensor == nullptr) { - return; - } - int* data = tensor; - for (int j = 0; j < channel; ++j) { - int32x4_t vdata = vdupq_n_s32(bias[j]); - int i = 0; - for (; i < channel_size - 3; i += 4) { - vst1q_s32(data + i, vdata); - } - for (; i < channel_size; i++) { - data[i] = bias[j]; - } - data += channel_size; - } -} - -/** - * \brief inline funcs used in im2col - * @param a - * @param b - * @return - */ -inline bool is_a_ge_zero_and_a_lt_b(int a, int b) { - return static_cast(a) < static_cast(b); -} - -/** - * \brief normal im2col function for gemm conv - * @tparam dtype - * @param data_im - * @param channels - * @param height - * @param width - * @param kernel_size - * @param pad - * @param stride - * @param data_col - */ -template -void im2col(const Dtype* data_im, - const int channels, - const int height, - const int width, - const int kernel_h, - const int kernel_w, - const int pad_h, - const int pad_w, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, - Dtype* data_col) { - const int output_h = - (height + 2 * pad_h - 
(dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; - const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - const int channel_size = height * width; - for (int channel = channels; channel--; data_im += channel_size) { - for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { - for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; - for (int output_rows = output_h; output_rows; output_rows--) { - if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { - for (int output_cols = output_w; output_cols; output_cols--) { - *(data_col++) = 0; - } - } else { - int input_col = -pad_w + kernel_col * dilation_w; - for (int output_col = output_w; output_col; output_col--) { - if (is_a_ge_zero_and_a_lt_b(input_col, width)) { - *(data_col++) = data_im[input_row * width + input_col]; - } else { - *(data_col++) = 0; - } - input_col += stride_w; - } - } - input_row += stride_h; - } - } - } - } -} -void compute_offset(int* idx_out, - int h, - int w, - int kernel_h, - int kernel_w, - int height, - int width, - int pad_h, - int pad_w, - int dilation_h, - int dilation_w) { - int idx_h[kernel_h]; // NOLINT - int idx_w[kernel_w]; // NOLINT - for (int i = 0; i < kernel_h; ++i) { - idx_h[i] = h - pad_h + i * dilation_h; - } - for (int i = 0; i < kernel_w; ++i) { - idx_w[i] = w - pad_w + i * dilation_w; - } - for (int k_h = 0; k_h < kernel_h; ++k_h) { - for (int k_w = 0; k_w < kernel_w; ++k_w) { - idx_out[k_h * kernel_w + k_w] = - (idx_h[k_h] >= 0 && idx_w[k_w] >= 0 && idx_h[k_h] < height && - idx_w[k_w] < width) - ? idx_h[k_h] * width + idx_w[k_w] - : -1; - } - } -} -template -void im2col3x3(const Dtype* data_im, - const int channels, - const int height, - const int width, - const int kernel_h, - const int kernel_w, - const int pad_h, - const int pad_w, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, - Dtype* data_col, - const int* idx) { - const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; - const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - int kernel_stride = kernel_h * kernel_w; - int in_channel_stride = height * width; - const int* idx_out = idx; - Dtype* data_col_ptr = data_col; - - bool flag_continue = false; - if (dilation_h == 1 && dilation_w == 1) { - flag_continue = true; - } - - for (int o = 0; o < output_h * output_w; o += 1) { - const Dtype* data_im_ptr = data_im; - - // int* idx_out_d = idx_out; - - int idx_out_d0 = idx_out[0]; - int idx_out_d1 = idx_out[1]; - int idx_out_d2 = idx_out[2]; - int idx_out_d3 = idx_out[3]; - int idx_out_d4 = idx_out[4]; - int idx_out_d5 = idx_out[5]; - int idx_out_d6 = idx_out[6]; - int idx_out_d7 = idx_out[7]; - int idx_out_d8 = idx_out[8]; - - for (int i = 0; i < channels; i += 1) { - if (idx_out_d0 >= 0 && idx_out_d2 >= 0 && idx_out_d6 >= 0 && - idx_out_d8 >= 0) { - if (flag_continue) { - memcpy( - data_col_ptr, data_im_ptr + idx_out_d0, kernel_w * sizeof(Dtype)); - memcpy(data_col_ptr + kernel_w, - data_im_ptr + idx_out_d3, - kernel_w * sizeof(Dtype)); - memcpy(data_col_ptr + kernel_w + kernel_w, - data_im_ptr + idx_out_d6, - kernel_w * sizeof(Dtype)); - } else { - data_col_ptr[0] = data_im_ptr[idx_out_d0]; - data_col_ptr[1] = data_im_ptr[idx_out_d1]; - data_col_ptr[2] = data_im_ptr[idx_out_d2]; - data_col_ptr[3] = data_im_ptr[idx_out_d3]; - data_col_ptr[4] = data_im_ptr[idx_out_d4]; - data_col_ptr[5] = data_im_ptr[idx_out_d5]; - 
data_col_ptr[6] = data_im_ptr[idx_out_d6]; - data_col_ptr[7] = data_im_ptr[idx_out_d7]; - data_col_ptr[8] = data_im_ptr[idx_out_d8]; - } - } else { - data_col_ptr[0] = (idx_out_d0 < 0) ? 0 : data_im_ptr[idx_out_d0]; - data_col_ptr[1] = (idx_out_d1 < 0) ? 0 : data_im_ptr[idx_out_d1]; - data_col_ptr[2] = (idx_out_d2 < 0) ? 0 : data_im_ptr[idx_out_d2]; - data_col_ptr[3] = (idx_out_d3 < 0) ? 0 : data_im_ptr[idx_out_d3]; - data_col_ptr[4] = (idx_out_d4 < 0) ? 0 : data_im_ptr[idx_out_d4]; - data_col_ptr[5] = (idx_out_d5 < 0) ? 0 : data_im_ptr[idx_out_d5]; - data_col_ptr[6] = (idx_out_d6 < 0) ? 0 : data_im_ptr[idx_out_d6]; - data_col_ptr[7] = (idx_out_d7 < 0) ? 0 : data_im_ptr[idx_out_d7]; - data_col_ptr[8] = (idx_out_d8 < 0) ? 0 : data_im_ptr[idx_out_d8]; - } - data_im_ptr += height * width; - data_col_ptr += kernel_stride; - } - // data_col_ptr += channels * kernel_stride; - // idx_out += kernel_stride * 2; - idx_out += kernel_stride; - } -} - -/** - * \brief convolution function for kernel size 1x1, stride size 1, gemm - * implementation - */ -void conv1x1s1_gemm(const float* i_data, - float* o_data, - int num, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx, - const int* idx_ptr) { - int channel_size_out = ow * oh; - int channel_size_in = win * ih; - - const int group = param.groups; - const int m = oc / group; - const int n = oh * ow; - const int k = ic / group; - - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - // if (param.activation_param.has_active) { - // if (param.activation_param.active == Active_relu && - // fabs(param.activation_param.negative_slope) < 1e-6f) { - // flag_relu = true; - // } - // } - int hblock = get_hblock(ctx->arch()); - int m_roundup = hblock * ((m + hblock - 1) / hblock); - int weights_size_per_group = m * k; - if (n > 1) { - weights_size_per_group = ((m_roundup * k + 15) / 16) * 16; - } - - // int weights_size_per_group = m_roundup * k;//oc * ic / (group * - // group); - //! 
use gemv when the output channel size = 1 - for (int b = 0; b < num; ++b) { - // dC - for (int g = 0; g < group; ++g) { - float* dout_group = - static_cast(o_data) + (b * oc + g * m) * channel_size_out; - const float* din_group = static_cast(i_data) + - (b * ic + g * k) * channel_size_in; - const float* weights_group = - static_cast(weights) + g * weights_size_per_group; - const float* bias_group = static_cast(bias) + g * m; - - if (n == 1) { - sgemv(weights_group, - din_group, - dout_group, - false, - m, - k, - flag_bias, - bias_group, - flag_relu); - } else { - sgemm_prepack(false, - m, - n, - k, - weights_group, - din_group, - n, - 0.f, - dout_group, - n, - bias_group, - flag_bias, - flag_relu, - ctx); - } - } - } -} - -void conv1x1s1_gemm_int8(const int8_t* i_data, - int32_t* o_data, - int num, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - ARMContext* ctx, - PrecisionType out_type, - const float* scale, - const int32_t* idx_ptr) { - int group = param.groups; - int channel_size_out = ow * oh; - int channel_size_in = win * ih; - const int m = oc / group; - const int n = oh * ow; - const int k = ic / group; - int hblock = get_hblock_int8(ctx); - int k_roundup = ROUNDUP(k, KBLOCK_INT8); - int m_roundup = ROUNDUP(m, hblock); - int weights_size_per_group = m * k; - if (n > 1) { - weights_size_per_group = ((m_roundup * k_roundup + 15) / 16) * 16; - } - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - //! use gemv when the output channel size = 1 - for (int b = 0; b < num; ++b) { - // dC - for (int g = 0; g < group; ++g) { - signed char* dout_group = - reinterpret_cast(o_data) + - (b * oc + g * m) * channel_size_out * PrecisionTypeLength(out_type); - const int8_t* din_group = i_data + (b * ic + g * k) * channel_size_in; - const int8_t* weights_group = weights + g * weights_size_per_group; - const int* bias_group = bias + g * m; - const float* scale_group = scale + g * m; - if (n == 1) { - if (out_type == PRECISION(kFloat)) { - gemv_int8(weights_group, - din_group, - reinterpret_cast(dout_group), - false, - m, - k, - scale_group, - flag_bias, - bias_group, - flag_relu); - } else if (out_type == PRECISION(kInt8)) { // int8 - gemv_int8(weights_group, - din_group, - dout_group, - false, - m, - k, - scale_group, - flag_bias, - bias_group, - flag_relu); - } else { - gemv_int8(weights_group, - din_group, - reinterpret_cast(dout_group), - false, - m, - k, - scale_group, - flag_bias, - bias_group, - flag_relu); - } - } else { - if (out_type == PRECISION(kFloat)) { - gemm_prepack_int8(weights_group, - din_group, - bias_group, - reinterpret_cast(dout_group), - m, - n, - k, - flag_bias, - flag_relu, - false, - scale_group, - ctx); - } else if (out_type == PRECISION(kInt8)) { // int8 - gemm_prepack_int8(weights_group, - din_group, - bias_group, - dout_group, - m, - n, - k, - flag_bias, - flag_relu, - false, - scale_group, - ctx); - } else { - gemm_prepack_int8(weights_group, - din_group, - bias_group, - reinterpret_cast(dout_group), - m, - n, - k, - flag_bias, - flag_relu, - false, - scale_group, - ctx); - } - } - } - } -} - -/** - * \brief convolution function for kernel size 3x3, stride size 2, gemm - * implementation - */ -void conv_im2col_gemm(const float* i_data, - float* o_data, - int num, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx, - const int* idx_ptr) { 
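// [Editor's note: these comments are an editorial addition, not part of
// the deleted file.] conv_im2col_gemm lowers one group of the convolution
// to a plain matrix multiply:
//
//   C[m x n] = A[m x k] * B[k x n] (+ bias),
//   m = oc / group, n = oh * ow, k = (ic / group) * kh * kw,
//
// where each column of B is one output pixel's flattened receptive field,
// produced by im2col(). Worked example: ic = 3, oc = 16, a 3x3 kernel,
// group = 1, on a 224x224 input with pad 1 and stride 1 gives m = 16,
// k = 27, n = 50176. Two fast paths appear below: when n == 1 the product
// degenerates to a matrix-vector multiply (sgemv), and for 3x3 stride-1
// kernels im2col3x3() gathers inputs through the precomputed idx_ptr
// offsets and stores one receptive field per row, which is why that path
// passes ldb = k and flag_im2col2 as the transposed-B flag to
// sgemm_prepack().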
- const int group = param.groups; - auto filter_dims = param.filter->dims(); - const int kernel_h = filter_dims[2]; - const int kernel_w = filter_dims[3]; // nchw - const int m = oc / group; - const int n = oh * ow; - const int k = ic * kernel_h * kernel_w / group; - const int chin_per_group = ic / group; - int channel_size_out = ow * oh; - int channel_size_in = win * ih; - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - // if (param.activation_param.has_active) { - // if (param.activation_param.active == Active_relu && - // fabs(param.activation_param.negative_slope) < 1e-6f) { - // flag_relu = true; - // } - // } - int hblock = get_hblock(ctx->arch()); - int m_roundup = hblock * ((m + hblock - 1) / hblock); - int weights_size_per_group = m * k; - if (n > 1) { - weights_size_per_group = ((m_roundup * k + 15) / 16) * 16; - } - - bool flag_im2col2 = (kernel_h == 3 && kernel_w == 3 && - param.strides[0] == 1 && param.strides[1] == 1 && n > 1); - - float* tmp_work_space = - ctx->workspace_data() + ctx->llc_size() / sizeof(float); - - //! use gemv when the output channel size = 1 - for (int b = 0; b < num; ++b) { - // dC - for (int g = 0; g < group; ++g) { - float* dout_group = o_data + (b * oc + g * m) * channel_size_out; - const float* din_group = - i_data + (b * ic + g * chin_per_group) * channel_size_in; - const float* weights_group = weights + g * weights_size_per_group; - const float* bias_group = bias + g * m; - float* dB = tmp_work_space; - - if (flag_im2col2) { - im2col3x3(din_group, - chin_per_group, - ih, - win, - kernel_h, - kernel_w, - param.paddings[0], - param.paddings[1], - param.strides[0], - param.strides[1], - param.dilations[0], - param.dilations[1], - dB, - idx_ptr); - } else { - im2col(din_group, - chin_per_group, - ih, - win, - kernel_h, - kernel_w, - param.paddings[0], - param.paddings[1], - param.strides[0], - param.strides[1], - param.dilations[0], - param.dilations[1], - dB); - } - if (n == 1) { - sgemv(weights_group, - dB, - dout_group, - false, - m, - k, - flag_bias, - bias_group, - flag_relu); - } else { - int ldb = n; - if (flag_im2col2) { - ldb = k; - } - sgemm_prepack(flag_im2col2, - m, - n, - k, - weights_group, - dB, - ldb, - 0.f, - dout_group, - n, - bias_group, - flag_bias, - flag_relu, - ctx); - } - } - } -} - -void conv_im2col_gemm_int8(const int8_t* i_data, - int32_t* o_data, - int num, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - ARMContext* ctx, - PrecisionType out_type, - const float* scale, - const int32_t* idx_ptr) { - int group = param.groups; - auto filter_dims = param.filter->dims(); - int kernel_h = filter_dims[2]; - int kernel_w = filter_dims[3]; - int stride_h = param.strides[0]; - int stride_w = param.strides[1]; - int dila_h = param.dilations[0]; - int dila_w = param.dilations[1]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; - const int m = oc / group; - const int n = oh * ow; - const int k = ic * kernel_h * kernel_w / group; - const int chin_per_group = ic / group; - int channel_size_out = ow * oh; - int channel_size_in = win * ih; - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - - int hblock = get_hblock_int8(ctx); - int k_roundup = ROUNDUP(k, KBLOCK_INT8); - int m_roundup = ROUNDUP(m, hblock); - int weights_size_per_group = m * k; - if (n > 1) { - weights_size_per_group = ((m_roundup * k_roundup + 15) / 16) * 16; - } - - bool flag_im2col2 = (kernel_h == 3 
&& kernel_w == 3 && stride_h == 1 && - stride_w == 1 && n > 1); - - int8_t* tmp_work_space = - ctx->workspace_data() + ctx->llc_size() / sizeof(int8_t); - - //! use gemv when the output channel size = 1 - for (int b = 0; b < num; ++b) { - // dC - for (int g = 0; g < group; ++g) { - signed char* dout_group = - reinterpret_cast(o_data) + - (b * oc + g * m) * channel_size_out * PrecisionTypeLength(out_type); - const int8_t* din_group = static_cast(i_data) + - (b * ic + g * chin_per_group) * channel_size_in; - const int8_t* weights_group = - static_cast(weights) + g * weights_size_per_group; - const int* bias_group = static_cast(bias) + g * m; - int8_t* dB = tmp_work_space; - const float* scale_group = scale + g * m; - - if (flag_im2col2) { - im2col3x3(din_group, - chin_per_group, - ih, - win, - kernel_h, - kernel_w, - pad_h, - pad_w, - stride_h, - stride_w, - dila_h, - dila_w, - dB, - idx_ptr); - - } else { - im2col(din_group, - chin_per_group, - ih, - win, - kernel_h, - kernel_w, - pad_h, - pad_w, - stride_h, - stride_w, - dila_h, - dila_w, - dB); - } - if (n == 1) { - if (out_type == PRECISION(kFloat)) { - gemv_int8(weights_group, - dB, - reinterpret_cast(dout_group), - false, - m, - k, - scale_group, - flag_bias, - bias_group, - flag_relu); - } else if (out_type == PRECISION(kInt8)) { // int8 - gemv_int8(weights_group, - dB, - dout_group, - false, - m, - k, - scale_group, - flag_bias, - bias_group, - flag_relu); - } else { - gemv_int8(weights_group, - dB, - reinterpret_cast(dout_group), - false, - m, - k, - scale_group, - flag_bias, - bias_group, - flag_relu); - } - } else { - if (out_type == PRECISION(kFloat)) { - gemm_prepack_int8(weights_group, - dB, - bias_group, - reinterpret_cast(dout_group), - m, - n, - k, - flag_bias, - flag_relu, - flag_im2col2, - scale_group, - ctx); - } else if (out_type == PRECISION(kInt8)) { // int8 - gemm_prepack_int8(weights_group, - dB, - bias_group, - dout_group, - m, - n, - k, - flag_bias, - flag_relu, - flag_im2col2, - scale_group, - ctx); - } else { - gemm_prepack_int8(weights_group, - dB, - bias_group, - reinterpret_cast(dout_group), - m, - n, - k, - flag_bias, - flag_relu, - flag_im2col2, - scale_group, - ctx); - } - } - } - } -} - -void conv_depthwise_3x3(const float* i_data, - float* o_data, - int num, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx) { - int pad = param.paddings[1]; - int stride = param.strides[1]; - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - // if (param.activation_param.has_active) { - // if (param.activation_param.active == Active_relu && - // fabs(param.activation_param.negative_slope) < 1e-6f) { - // flag_relu = true; - // } - // } - if (pad == 1) { - conv_depthwise_3x3p1(i_data, - o_data, - num, - oc, - oh, - ow, - ic, - ih, - win, - weights, - bias, - stride, - flag_bias, - flag_relu, - ctx); - } else if (pad == 0 && ih > 2) { - conv_depthwise_3x3p0(i_data, - o_data, - num, - oc, - oh, - ow, - ic, - ih, - win, - weights, - bias, - stride, - flag_bias, - flag_relu, - ctx); - } else { - LOG(FATAL) << "unsupport this type 3x3 dw conv"; - } -} - -void conv_depthwise_5x5(const float* i_data, - float* o_data, - int num, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx) { - int pad = param.paddings[1]; - int stride = param.strides[1]; - bool flag_relu = param.fuse_relu; - bool 
flag_bias = param.bias != nullptr; - // if (param.activation_param.has_active && - // fabs(param.activation_param.negative_slope) < 1e-6f) { - // if (param.activation_param.active == Active_relu) { - // flag_relu = true; - // } - // } - if (pad == 2 && stride == 2) { - conv_depthwise_5x5s2(i_data, - o_data, - num, - oc, - oh, - ow, - ic, - ih, - win, - weights, - bias, - pad, - flag_bias, - flag_relu, - ctx); - } else if (stride == 1) { - conv_depthwise_5x5s1(i_data, - o_data, - num, - oc, - oh, - ow, - ic, - ih, - win, - weights, - bias, - pad, - flag_bias, - flag_relu, - ctx); - } else { - LOG(FATAL) << "unsupport this type 5x5 dw conv"; - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_impl.h b/lite/backends/arm/math/conv_impl.h deleted file mode 100644 index 38d799bb4c..0000000000 --- a/lite/backends/arm/math/conv_impl.h +++ /dev/null @@ -1,423 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "lite/core/context.h" -#include "lite/core/target_wrapper.h" -#include "lite/operators/op_params.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -// TODO(TJ): move to somewhere else common -template -class ImplBase { - public: - ImplBase() {} - virtual ~ImplBase() {} - - virtual bool create(const Param& param, Context* ctx) { return false; } - - virtual bool init(const Param& param, Context* ctx) { return false; } - - virtual bool run(const Param& param) { return false; } - // void set_op_name(const char* name){_op_name = name;} - // const char* get_op_name() { return _op_name.c_str();} - - protected: - Param* param_; - Context* ctx_; -}; - -void conv_3x3s1_direct_fp32(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - Context* ctx); - -void conv_3x3s1_direct_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale); - -void conv_3x3s1_direct_int7(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale); - -void conv_3x3s2_direct_fp32(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - Context* ctx); - -int conv_3x3s2_direct_int8_c_num(); - -void conv_3x3s2_direct_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int 
hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale); - -void conv_1x5s1_direct(const void* din, - void* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const void* weights, - const void* bias, - int group, - int kernel_w, - int kernel_h, - int stride_w, - int stride_h, - int dila_w, - int dila_h, - int pad_w, - int pad_h, - bool flag_bias, - bool flag_relu, - Context& ctx, - void* work_space, - const void* idx_ptr); - -void conv_5x1s1_direct(const void* din, - void* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const void* weights, - const void* bias, - int group, - int kernel_w, - int kernel_h, - int stride_w, - int stride_h, - int dila_w, - int dila_h, - int pad_w, - int pad_h, - bool flag_bias, - bool flag_relu, - Context& ctx, - void* work_space, - const void* idx_ptr); - -void conv1x1s1_gemm(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - Context* ctx, - const int* idx_ptr); - -void conv1x1s1_gemm_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale, - const int32_t* idx_ptr); - -void conv_im2col_gemm(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - Context* ctx, - const int* idx_ptr); - -void conv_im2col_gemm_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale, - const int32_t* idx_ptr); - -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias - */ - -void conv_depthwise_3x3p0(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_3x3p1(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_5x5s1(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_5x5s2(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_3x3(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - Context* ctx); - -void 
conv_depthwise_3x3_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale); - -void conv_depthwise_3x3_int7(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale); - -void conv_depthwise_5x5(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - Context* ctx); - -void conv_depthwise_5x5_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale); - -void conv_winograd3x3(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - Context* ctx); - -void winograd_transform_weights( - void* dout, const void* din, int ch_out, int ch_in, void* work_space); - -void compute_offset(int* idx_out, - int h, - int w, - int kernel_h, - int kernel_w, - int height, - int width, - int pad_h, - int pad_w, - int dilation_h, - int dilation_w); - -void fill_bias(float* tensor, const float* bias, int channel, int channel_size); - -void fill_bias_int8(int* tensor, - const int* bias, - int channel, - int channel_size); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_winograd.cc b/lite/backends/arm/math/conv_winograd.cc deleted file mode 100644 index 43ad9e2cd8..0000000000 --- a/lite/backends/arm/math/conv_winograd.cc +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
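
[Editor's note] The WinogradConv implementation that follows uses the F(6x6, 3x3) scheme: each 8x8 input tile d is transformed as Bt * d * B, multiplied element-wise with the transformed weights G * g * Gt, and mapped back to a 6x6 output tile via At * (.) * A. The 64 per-position element-wise products therefore become 64 independent GEMMs of shape oc x (tile_h * tile_w) with depth ic, which is what create() prepacks weights for and run() dispatches. The helper below is an editorial sketch of the tiling arithmetic create() performs; the names are illustrative and nothing here comes from the deleted sources.

// Tile bookkeeping for F(6x6, 3x3) Winograd: ceil-divide the output into
// 6x6 tiles; every tile needs one 8x8 transformed block per channel.
struct WinoF63Tiling {
  int tile_h;
  int tile_w;
  int tiles;           // GEMM n dimension: one column per output tile
  int trans_per_chan;  // floats of transform buffer needed per channel
};

inline WinoF63Tiling wino_f63_tiling(int oh, int ow) {
  WinoF63Tiling t;
  t.tile_w = (ow + 5) / 6;  // ceil(ow / 6)
  t.tile_h = (oh + 5) / 6;  // ceil(oh / 6)
  t.tiles = t.tile_h * t.tile_w;
  t.trans_per_chan = 8 * 8 * t.tiles;
  return t;
}
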
- -#include "lite/backends/arm/math/conv_winograd.h" -#include -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/backends/arm/math/packed_sgemm.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -bool WinogradConv::create(const operators::ConvParam& param, - ARMContext* ctx) { - this->ctx_ = ctx; - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ic = x_dims[1]; - int ow = o_dims[3]; - int oh = o_dims[2]; - int oc = o_dims[1]; - int kw = w_dims[3]; - int sw = param.strides[1]; - if (kw == 3) { - is_weights_transed_ = true; - int tile_w = (ow + 5) / 6; - int tile_h = (oh + 5) / 6; - int size_tile = tile_h * tile_w; - int size_trans_channel = 8 * 8 * size_tile; - int max_ch = ic > oc ? ic : oc; - - const int m_wino = oc; - const int n_wino = size_tile; - int hblock = get_hblock(this->ctx_->arch()); - int m_round = hblock * ((m_wino + hblock - 1) / hblock); - weights_trans_.Resize({1, 1, 1, 8 * 8 * m_round * ic}); - this->ctx_->ExtendWorkspace((size_trans_channel * max_ch * 2 + n_wino) * - sizeof(float)); - auto weights_wino = - static_cast(malloc(sizeof(float) * 8 * 8 * oc * ic)); - void* trans_tmp_ptr = malloc(sizeof(float) * 8 * 8 * oc * ic); - if (weights_wino && trans_tmp_ptr) { - winograd_transform_weights( - weights_wino, param.filter->data(), oc, ic, trans_tmp_ptr); - auto weights_trans = weights_trans_.mutable_data(); - for (int i = 0; i < 64; ++i) { - float* packed_weights = weights_trans + i * m_round * ic; - const float* weights_wino_ptr = weights_wino + i * oc * ic; - prepackA(packed_weights, - weights_wino_ptr, - 1.f, - ic, - 0, - m_wino, - 0, - ic, - false, - this->ctx_); - } - impl_ = conv_winograd3x3; - free(trans_tmp_ptr); - free(weights_wino); - return true; - } - free(trans_tmp_ptr); - free(weights_wino); - } else { - LOG(ERROR) << "this type winograd conv not impl"; - } - return false; -} - -template <> -bool WinogradConv::init(const operators::ConvParam& param, - Context* ctx) { - this->ctx_ = ctx; - return create(param, ctx); -} - -template <> -bool WinogradConv::run(const operators::ConvParam& param) { - // start timer - const auto* i_data = param.x->data(); - const auto* w_data = param.filter->data(); - const auto* b_data = param.bias ? param.bias->data() : nullptr; - auto* o_data = param.output->mutable_data(); - - if (is_weights_transed_) { - w_data = weights_trans_.data(); - } - - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ih = x_dims[2]; - int ic = x_dims[1]; - int bs = x_dims[0]; - int oh = o_dims[2]; - int ow = o_dims[3]; - int oc = o_dims[1]; - - impl_(i_data, - o_data, - bs, - oc, - oh, - ow, - ic, - ih, - iw, - w_data, - b_data, - param, - this->ctx_); - - // timer end - return true; -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_winograd.h b/lite/backends/arm/math/conv_winograd.h deleted file mode 100644 index 1ae5edb0aa..0000000000 --- a/lite/backends/arm/math/conv_winograd.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/core/target_wrapper.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -class WinogradConv - : public ImplBase { - public: - typedef void (*conv_winograd_impl)(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - Context* ctx); - - WinogradConv() = default; - ~WinogradConv() {} - - virtual bool init(const operators::ConvParam& param, - Context* ctx); - - virtual bool create(const operators::ConvParam& param, - Context* ctx); - - virtual bool run(const operators::ConvParam& param); - - private: - conv_winograd_impl impl_{nullptr}; - bool is_weights_transed_{false}; - Tensor weights_trans_; -}; - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_winograd_3x3.cc b/lite/backends/arm/math/conv_winograd_3x3.cc deleted file mode 100644 index 87f51381e6..0000000000 --- a/lite/backends/arm/math/conv_winograd_3x3.cc +++ /dev/null @@ -1,479 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/backends/arm/math/packed_sgemm.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void transpose(float* data_out, const float* data_in, int w_in, int h_in); -void transform_input_f6x6(float* dout, const float* din); -void transform_output_f6x6(float* output, const float* din, float bias); -void conv_winograd3x3(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx) { - int threads = ctx->threads(); - - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; - int size_in_channel = win * hin; - int size_out_channel = wout * hout; - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - - //! transform input - int tile_w = (wout + 5) / 6; - int tile_h = (hout + 5) / 6; - int size_tile = tile_h * tile_w; - int size_trans_channel = 8 * 8 * size_tile; - int max_ch = chin > chout ? chin : chout; - - int m = chout; - int n = size_tile; - int k = chin; - - float* tmp_work_space = - ctx->workspace_data() + ctx->llc_size() / sizeof(float); - - //! 
tmp data buffer for input transform - float* tmp_data1 = tmp_work_space; - //! tmp data buffer for dot mul - float* tmp_data2 = tmp_data1 + size_trans_channel * max_ch; - - for (int i = 0; i < num; ++i) { - const float* din_batch = din + i * chin * size_in_channel; - float* dout_batch = dout + i * chout * size_out_channel; - -//! transform input Bt * data * B -#pragma omp parallel for num_threads(threads) - for (int j = 0; j < chin; ++j) { - const float* din_channel = din_batch + j * size_in_channel; - float* data_trans_channel = tmp_data1 + j * size_trans_channel; - - for (int h = 0; h < tile_h; h++) { - for (int w = 0; w < tile_w; w++) { - //! prepare data 8x8 - //! row 8 - float data_in_tmp[8][8] = {0.f}; - // memset(data_in_tmp[0], 0, sizeof(float) * 64); - for (int j = 0; j < 8; ++j) { - int start_row = h * 6 + j - pad_h; - if (start_row >= 0 && start_row < hin) { - for (int k = 0; k < 8; ++k) { - int start_col = w * 6 + k - pad_w; - if (start_col >= 0 && start_col < win) { - data_in_tmp[j][k] = din_channel[start_row * win + start_col]; - } - } - } - } - transform_input_f6x6(data_trans_channel, data_in_tmp[0]); - data_trans_channel += 64; - } - } - } - //! end of transform input - - //////////////////////////////////////////////////////////////////////////////// - //! dot mul - //! transpose input, convert from ch_in * tile_h * tile_w * 64 to - //! 64 * ch_in * tile_h * tile_w - int hblock = get_hblock(ctx->arch()); - int m_round = hblock * ((chout + hblock - 1) / hblock); - int stride_a = m_round * chin; - int stride_b = chin * size_tile; - int stride_c = chout * size_tile; - transpose(tmp_data2, tmp_data1, 64, stride_b); - - //! gemm - // #pragma omp parallel for - for (int l = 0; l < 64; ++l) { - const float* ptr_a = weights + l * stride_a; - const float* ptr_b = tmp_data2 + l * stride_b; - float* ptr_c = tmp_data1 + l * stride_c; - sgemm_prepack(false, - chout, - size_tile, - chin, - ptr_a, - ptr_b, - size_tile, - 0.f, - ptr_c, - size_tile, - nullptr, - false, - false, - ctx); - } - - //! transpose output, convert from 64 * ch_out * tile_h * tile_w to - //! ch_out * tile_h * tile_w * 64 - transpose(tmp_data2, tmp_data1, stride_c, 64); -//! end of dot mul - -/////////////////////////////////////////////////////////////////////////////// -//! transform output -#pragma omp parallel for - for (int i = 0; i < chout; ++i) { - float bias_value = flag_bias ? bias[i] : 0.f; - float* dout_tmp = tmp_data2 + i * size_trans_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - for (int h = 0; h < tile_h; ++h) { - for (int w = 0; w < tile_w; ++w) { - float out_tmp[6][6]; - - transform_output_f6x6(out_tmp[0], dout_tmp, bias_value); - dout_tmp += 64; - - for (int j = 0; j < 6; ++j) { - int end_row = h * 6 + j; - if (end_row < hout) { - for (int k = 0; k < 6; ++k) { - int end_col = w * 6 + k; - if (end_col < wout) { - if (flag_relu) { - dout_channel[end_row * wout + end_col] = - out_tmp[j][k] > 0.f ? out_tmp[j][k] : 0.f; - } else { - dout_channel[end_row * wout + end_col] = out_tmp[j][k]; - } - } - } - } - } - } - } - } - //! 
end of transform output - } -} - -/** - * \brief transpose with arm neon optimization - * @param data_out - * @param data_in - * @param w_in - * @param h_in - */ -void transpose(float* data_out, const float* data_in, int w_in, int h_in) { - int nw = w_in >> 2; - int nh = h_in >> 2; - int size_in = w_in * h_in; - - float* ptr_out = data_out; - const float* ptr_in = data_in; -#pragma omp parallel for - for (int h = 0; h < nh; h++) { - const float* ptr_din_row = ptr_in + h * 4 * w_in; - for (int w = 0; w < nw; w++) { - float* data_out_ptr = ptr_out + w * 4 * h_in + h * 4; - const float* din0 = ptr_din_row; - const float* din1 = din0 + w_in; - const float* din2 = din1 + w_in; - const float* din3 = din2 + w_in; - - float* dout0 = data_out_ptr; - float* dout1 = dout0 + h_in; - float* dout2 = dout1 + h_in; - float* dout3 = dout2 + h_in; -#ifdef __aarch64__ - asm("ldr q0, [%[in0]] \n" /*load input 0*/ - "ldr q1, [%[in1]] \n" - "ldr q2, [%[in2]] \n" - "ldr q3, [%[in3]] \n" - "trn1 v4.4s, v0.4s, v1.4s \n" - "trn2 v5.4s, v0.4s, v1.4s \n" - "trn1 v6.4s, v2.4s, v3.4s \n" - "trn2 v7.4s, v2.4s, v3.4s \n" - "trn1 v8.2d, v4.2d, v6.2d \n" - "trn1 v9.2d, v5.2d, v7.2d \n" - "trn2 v10.2d, v4.2d, v6.2d \n" - "trn2 v11.2d, v5.2d, v7.2d \n" - "str q8, [%[out0]] \n" - "str q9, [%[out1]] \n" - "str q10, [%[out2]] \n" - "str q11, [%[out3]] \n" - : - : [out0] "r"(dout0), - [out1] "r"(dout1), - [out2] "r"(dout2), - [out3] "r"(dout3), - [in0] "r"(din0), - [in1] "r"(din1), - [in2] "r"(din2), - [in3] "r"(din3) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11"); -#else - asm("vld1.32 {d0, d1}, [%[in0]] \n" - "vld1.32 {d2, d3}, [%[in1]] \n" - "vld1.32 {d4, d5}, [%[in2]] \n" - "vld1.32 {d6, d7}, [%[in3]] \n" - "vtrn.32 q0, q1 \n" - "vtrn.32 q2, q3 \n" - "vswp d1, d4 \n" - "vswp d3, d6 \n" - "vst1.32 {d0, d1}, [%[out0]] \n" - "vst1.32 {d2, d3}, [%[out1]] \n" - "vst1.32 {d4, d5}, [%[out2]] \n" - "vst1.32 {d6, d7}, [%[out3]] \n" - : - : [out0] "r"(dout0), - [out1] "r"(dout1), - [out2] "r"(dout2), - [out3] "r"(dout3), - [in0] "r"(din0), - [in1] "r"(din1), - [in2] "r"(din2), - [in3] "r"(din3) - : "q0", "q1", "q2", "q3"); -#endif - ptr_din_row += 4; - } - } - // remian - for (int h = 0; h < h_in; h++) { - for (int w = nw * 4; w < w_in; w++) { - const float* data_in_ptr = ptr_in + h * w_in + w; - float* data_out_ptr = ptr_out + w * h_in + h; - *data_out_ptr = *data_in_ptr; - } - } - for (int w = 0; w < w_in; w++) { - for (int h = nh * 4; h < h_in; h++) { - const float* data_in_ptr = ptr_in + h * w_in + w; - float* data_out_ptr = ptr_out + w * h_in + h; - *data_out_ptr = *data_in_ptr; - } - } -} - -/** - * \brief winograd transform conv3x3 weights, f63 - * this is done in op initialization or creation, only do once - * dout = G * g * GT, where G is the transform coeff, g is the input weights - * @param dout - * @param din - * @param ch_out - * @param ch_in - * @param work_space - */ -void winograd_transform_weights( - void* dout, const void* din, int ch_out, int ch_in, void* work_space) { - const float coeff[8][3] = {{1.0f, 0.0f, 0.0f}, - {-2.0f / 9, -2.0f / 9, -2.0f / 9}, - {-2.0f / 9, 2.0f / 9, -2.0f / 9}, - {1.0f / 90, 1.0f / 45, 2.0f / 45}, - {1.0f / 90, -1.0f / 45, 2.0f / 45}, - {32.0f / 45, 16.0f / 45, 8.0f / 45}, - {32.0f / 45, -16.0f / 45, 8.0f / 45}, - {0.0f, 0.0f, 1.0f}}; - - float* ptr_out = static_cast(work_space); - - for (int i = 0; i < ch_out; i++) { - for (int j = 0; j < ch_in; j++) { - const float* kernel0 = - static_cast(din) + (i * ch_in + j) * 9; - float* 
ptr_channel = ptr_out + (i * ch_in + j) * 64; - - //! transform kernel, transposed - const float* k0 = kernel0; - const float* k1 = kernel0 + 3; - const float* k2 = kernel0 + 6; - - //! h - float tmp[8][3]; - for (int i = 0; i < 8; i++) { - tmp[i][0] = - k0[0] * coeff[i][0] + k0[1] * coeff[i][1] + k0[2] * coeff[i][2]; - tmp[i][1] = - k1[0] * coeff[i][0] + k1[1] * coeff[i][1] + k1[2] * coeff[i][2]; - tmp[i][2] = - k2[0] * coeff[i][0] + k2[1] * coeff[i][1] + k2[2] * coeff[i][2]; - } - - //! v - for (int j = 0; j < 8; j++) { - float* tmpp = &tmp[j][0]; - for (int i = 0; i < 8; i++) { - ptr_channel[j * 8 + i] = tmpp[0] * coeff[i][0] + - tmpp[1] * coeff[i][1] + - tmpp[2] * coeff[i][2]; - } - } - } - } - transpose(static_cast(dout), ptr_out, 64, ch_out * ch_in); -} - -/** - * \brief winograd conv, transform input, f6x3 - * dout = BT * d * B, whrer B is the transform - * BT = 1 0 -21/4 0 21/4 0 -1 0 - * 0 1 1 -17/4 -17/4 1 1 0 - * 0 -1 1 17/4 -17/4 -1 1 0 - * 0 1/2 1/4 -5/2 -5/4 2 1 0 - * 0 -1/2 1/4 5/2 -5/4 -2 1 0 - * 0 2 4 -5/2 -5 1/2 1 0 - * 0 -2 4 5/2 -5 -1/2 1 0 - * 0 -1 0 21/4 0 -21/4 0 1 - * @param dout - * @param din - */ -void transform_input_f6x6(float* dout, const float* din) { - float tmp[8][8]; - //! BT * d - for (int m = 0; m < 8; m++) { - tmp[0][m] = din[0] - din[6] + (din[4] - din[2]) * 5.25f; - tmp[7][m] = din[7] - din[1] + (din[3] - din[5]) * 5.25f; - - float tmp12a = din[2] + din[6] - din[4] * 4.25f; - float tmp12b = din[1] + din[5] - din[3] * 4.25f; - - tmp[1][m] = tmp12a + tmp12b; - tmp[2][m] = tmp12a - tmp12b; - - float tmp34a = din[6] + din[2] * 0.25f - din[4] * 1.25f; - float tmp34b = din[1] * 0.5f - din[3] * 2.5f + din[5] * 2.f; - - tmp[3][m] = tmp34a + tmp34b; - tmp[4][m] = tmp34a - tmp34b; - - float tmp56a = din[6] + (din[2] - din[4] * 1.25f) * 4.f; - float tmp56b = din[1] * 2.f - din[3] * 2.5f + din[5] * 0.5f; - - tmp[5][m] = tmp56a + tmp56b; - tmp[6][m] = tmp56a - tmp56b; - - din += 8; - } - - for (int m = 0; m < 8; m++) { - const float* tmp0 = tmp[m]; - - dout[0] = tmp0[0] - tmp0[6] + (tmp0[4] - tmp0[2]) * 5.25f; - dout[7] = tmp0[7] - tmp0[1] + (tmp0[3] - tmp0[5]) * 5.25f; - - float tmp12a = tmp0[2] + tmp0[6] - tmp0[4] * 4.25f; - float tmp12b = tmp0[1] + tmp0[5] - tmp0[3] * 4.25f; - - dout[1] = tmp12a + tmp12b; - dout[2] = tmp12a - tmp12b; - - float tmp34a = tmp0[6] + tmp0[2] * 0.25f - tmp0[4] * 1.25f; - float tmp34b = tmp0[1] * 0.5f - tmp0[3] * 2.5f + tmp0[5] * 2.f; - - dout[3] = tmp34a + tmp34b; - dout[4] = tmp34a - tmp34b; - - float tmp56a = tmp0[6] + (tmp0[2] - tmp0[4] * 1.25f) * 4.f; - float tmp56b = tmp0[1] * 2.f - tmp0[3] * 2.5f + tmp0[5] * 0.5f; - - dout[5] = tmp56a + tmp56b; - dout[6] = tmp56a - tmp56b; - - dout += 8; - } -} - -/** - * \brief winograd conv, transform output, f63 - * out = AT * din * A - * AT = 1 1 1 1 1 1 1 0 - * 0 1 -1 2 -2 1/2 -1/2 0 - * 0 1 1 4 4 1/4 1/4 0 - * 0 1 -1 8 -8 1/8 -1/8 0 - * 0 1 1 16 16 1/16 1/16 0 - * 0 1 -1 32 -32 1/32 -1/32 1 - * @param output - * @param din - * @param bias - */ -void transform_output_f6x6(float* output, const float* din, float bias) { - float tmp[6][8]; - for (int m = 0; m < 8; m++) { - float tmp024a = din[1] + din[2]; - float tmp135a = din[1] - din[2]; - - float tmp024b = din[3] + din[4]; - float tmp135b = din[3] - din[4]; - - float tmp024c = din[5] + din[6]; - float tmp135c = din[5] - din[6]; - - tmp[0][m] = din[0] + tmp024a + tmp024b + tmp024c; - tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 0.25f; - tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c * 0.0625f; - - tmp[1][m] = tmp135a + tmp135b * 2 + tmp135c 
* 0.5f; - tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 0.125f; - tmp[5][m] = din[7] + tmp135a + tmp135b * 32 + tmp135c * 0.03125f; - - din += 8; - } - - for (int m = 0; m < 6; m++) { - const float* tmp0 = tmp[m]; - - float tmp024a = tmp0[1] + tmp0[2]; - float tmp135a = tmp0[1] - tmp0[2]; - - float tmp024b = tmp0[3] + tmp0[4]; - float tmp135b = tmp0[3] - tmp0[4]; - - float tmp024c = tmp0[5] + tmp0[6]; - float tmp135c = tmp0[5] - tmp0[6]; - - output[0] = bias + tmp0[0] + tmp024a + tmp024b + tmp024c; - output[2] = bias + tmp024a + tmp024b * 4 + tmp024c * 0.25f; - output[4] = bias + tmp024a + tmp024b * 16 + tmp024c * 0.0625f; - - output[1] = bias + tmp135a + tmp135b * 2 + tmp135c * 0.5f; - output[3] = bias + tmp135a + tmp135b * 8 + tmp135c * 0.125f; - output[5] = bias + tmp0[7] + tmp135a + tmp135b * 32 + tmp135c * 0.03125f; - - output += 6; - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/decode_bboxes.cc b/lite/backends/arm/math/decode_bboxes.cc deleted file mode 100644 index 12ee42ebb3..0000000000 --- a/lite/backends/arm/math/decode_bboxes.cc +++ /dev/null @@ -1,651 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
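Aside: the deleted decode_bboxes.cc that follows vectorizes four SSD-style box-decoding variants (corner, center_size, and corner_size, each with or without the prior-box variances applied to the regressed offsets). As a scalar reference for the NEON kernels below, here is a minimal sketch of the center_size path that applies the variances; it is not part of the patch, and the helper name is illustrative only.

#include <cmath>

// Decode one box: prior is (xmin, ymin, xmax, ymax), loc holds the
// regressed offsets (dx, dy, dw, dh), var holds the per-coordinate
// variances that scale the offsets before decoding.
static void decode_center_size_ref(const float* loc, const float* prior,
                                   const float* var, float* bbox) {
  float pw = prior[2] - prior[0];            // prior width
  float ph = prior[3] - prior[1];            // prior height
  float pcx = 0.5f * (prior[0] + prior[2]);  // prior center x
  float pcy = 0.5f * (prior[1] + prior[3]);  // prior center y
  float cx = var[0] * loc[0] * pw + pcx;     // decoded center
  float cy = var[1] * loc[1] * ph + pcy;
  float w = std::exp(var[2] * loc[2]) * pw;  // decoded size
  float h = std::exp(var[3] * loc[3]) * ph;
  bbox[0] = cx - 0.5f * w;  // back to corner form
  bbox[1] = cy - 0.5f * h;
  bbox[2] = cx + 0.5f * w;
  bbox[3] = cy + 0.5f * h;
}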
-
-#include "lite/backends/arm/math/decode_bboxes.h"
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename T>
-void decode_bbox_corner_variance_kernel(const int batch_num,
-                                        const T* loc_data,
-                                        const T* prior_data,
-                                        const T* variance,
-                                        const int num_priors,
-                                        const bool share_location,
-                                        const int num_loc_classes,
-                                        const int background_label_id,
-                                        T* bbox_data);
-
-template <typename T>
-void decode_bbox_corner_no_variance_kernel(const int batch_num,
-                                           const T* loc_data,
-                                           const T* prior_data,
-                                           const T* variance,
-                                           const int num_priors,
-                                           const bool share_location,
-                                           const int num_loc_classes,
-                                           const int background_label_id,
-                                           T* bbox_data);
-
-template <typename T>
-void decode_bbox_center_variance_kernel(const int batch_num,
-                                        const T* loc_data,
-                                        const T* prior_data,
-                                        const T* variance,
-                                        const int num_priors,
-                                        const bool share_location,
-                                        const int num_loc_classes,
-                                        const int background_label_id,
-                                        T* bbox_data);
-
-template <typename T>
-void decode_bbox_center_no_variance_kernel(const int batch_num,
-                                           const float* loc_data,
-                                           const float* prior_data,
-                                           const float* variance,
-                                           const int num_priors,
-                                           const bool share_location,
-                                           const int num_loc_classes,
-                                           const int background_label_id,
-                                           float* bbox_data);
-
-template <typename T>
-void decode_bbox_corner_size_variance_kernel(const int batch_num,
-                                             const T* loc_data,
-                                             const T* prior_data,
-                                             const T* variance,
-                                             const int num_priors,
-                                             const bool share_location,
-                                             const int num_loc_classes,
-                                             const int background_label_id,
-                                             T* bbox_data);
-
-template <typename T>
-void decode_bbox_corner_size_no_variance_kernel(const int batch_num,
-                                                const T* loc_data,
-                                                const T* prior_data,
-                                                const T* variance,
-                                                const int num_priors,
-                                                const bool share_location,
-                                                const int num_loc_classes,
-                                                const int background_label_id,
-                                                T* bbox_data);
-
-template <>
-void decode_bbox_corner_variance_kernel(const int batch_num,
-                                        const float* loc_data,
-                                        const float* prior_data,
-                                        const float* variance,
-                                        const int num_priors,
-                                        const bool share_location,
-                                        const int num_loc_classes,
-                                        const int background_label_id,
-                                        float* bbox_data) {
-  if (!share_location) {
-    CHECK_EQ(share_location, true)
-        << "ERROR: decode boxes without share_location is unimplemented\n";
-    return;
-  }
-
-  int cnt = num_priors / 4;
-  int len_batch = num_priors * 4;
-
-  for (int n = 0; n < batch_num; ++n) {
-    const float* ptr_loc_batch = loc_data + n * len_batch;
-    float* ptr_bbox_batch = bbox_data + n * len_batch;
-#pragma omp parallel for
-    for (int i = 0; i < cnt; ++i) {
-      int idx = i * 16;
-      const float* ptr_loc = ptr_loc_batch + idx;
-      const float* ptr_prior = prior_data + idx;
-      float* ptr_bbox = ptr_bbox_batch + idx;
-
-      float32x4_t vloc1 = vld1q_f32(ptr_loc);
-      float32x4_t vloc2 = vld1q_f32(ptr_loc + 4);
-      float32x4_t vloc3 = vld1q_f32(ptr_loc + 8);
-      float32x4_t vloc4 = vld1q_f32(ptr_loc + 12);
-
-      float32x4_t vprior1 = vld1q_f32(ptr_prior);
-      float32x4_t vprior2 = vld1q_f32(ptr_prior + 4);
-      float32x4_t vprior3 = vld1q_f32(ptr_prior + 8);
-      float32x4_t vprior4 = vld1q_f32(ptr_prior + 12);
-
-      vst1q_f32(ptr_bbox, vaddq_f32(vloc1, vprior1));
-      vst1q_f32(ptr_bbox + 4, vaddq_f32(vloc2, vprior2));
-      vst1q_f32(ptr_bbox + 8, vaddq_f32(vloc3, vprior3));
-      vst1q_f32(ptr_bbox + 12, vaddq_f32(vloc4, vprior4));
-    }
-#pragma omp parallel for
-    for (int i = cnt * 4; i < num_priors; i++) {
-      int idx = i * 4;
-      float32x4_t vloc = vld1q_f32(ptr_loc_batch + idx);
-      float32x4_t vprior = vld1q_f32(prior_data + idx);
-      vst1q_f32(ptr_bbox_batch + idx, vaddq_f32(vloc, vprior));
-
} - } -} - -template <> -void decode_bbox_corner_no_variance_kernel(const int batch_num, - const float* loc_data, - const float* prior_data, - const float* variance, - const int num_priors, - const bool share_location, - const int num_loc_classes, - const int background_label_id, - float* bbox_data) { - if (!share_location) { - CHECK_EQ(share_location, true) - << "ERROR: decode boxes without share_location is unimplemented\n"; - return; - } - - int cnt = num_priors / 4; - int len_batch = num_priors * 4; - - for (int n = 0; n < batch_num; ++n) { - const float* ptr_loc_batch = loc_data + n * len_batch; - float* ptr_bbox_batch = bbox_data + n * len_batch; - -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - int idx = i * 16; - const float* ptr_loc = ptr_loc_batch + idx; - const float* ptr_prior = prior_data + idx; - const float* ptr_var = variance + idx; - float* ptr_bbox = ptr_bbox_batch + idx; - - float32x4_t vloc1 = vld1q_f32(ptr_loc); - float32x4_t vprior1 = vld1q_f32(ptr_prior); - float32x4_t vvar1 = vld1q_f32(ptr_var); - float32x4_t vout1 = vmulq_f32(vloc1, vvar1); - - float32x4_t vloc2 = vld1q_f32(ptr_loc + 4); - float32x4_t vprior2 = vld1q_f32(ptr_prior + 4); - float32x4_t vvar2 = vld1q_f32(ptr_var + 4); - float32x4_t vout2 = vmulq_f32(vloc2, vvar2); - - float32x4_t vloc3 = vld1q_f32(ptr_loc + 8); - float32x4_t vprior3 = vld1q_f32(ptr_prior + 8); - float32x4_t vvar3 = vld1q_f32(ptr_var + 8); - float32x4_t vout3 = vmulq_f32(vloc3, vvar3); - - float32x4_t vloc4 = vld1q_f32(ptr_loc + 12); - float32x4_t vprior4 = vld1q_f32(ptr_prior + 12); - float32x4_t vvar4 = vld1q_f32(ptr_var + 12); - float32x4_t vout4 = vmulq_f32(vloc4, vvar4); - - vst1q_f32(ptr_bbox, vaddq_f32(vout1, vprior1)); - vst1q_f32(ptr_bbox + 4, vaddq_f32(vout2, vprior2)); - vst1q_f32(ptr_bbox + 8, vaddq_f32(vout3, vprior3)); - vst1q_f32(ptr_bbox + 12, vaddq_f32(vout4, vprior4)); - } - - for (int i = cnt * 4; i < num_priors; i++) { - int idx = i * 4; - float32x4_t vloc = vld1q_f32(ptr_loc_batch + idx); - float32x4_t vprior = vld1q_f32(prior_data + idx); - float32x4_t vvar = vld1q_f32(variance + idx); - float32x4_t vout = vmulq_f32(vloc, vvar); - vst1q_f32(ptr_bbox_batch + idx, vaddq_f32(vout, vprior)); - } - } -} - -template <> -void decode_bbox_center_variance_kernel(const int batch_num, - const float* loc_data, - const float* prior_data, - const float* variance, - const int num_priors, - const bool share_location, - const int num_loc_classes, - const int background_label_id, - float* bbox_data) { - if (!share_location) { - CHECK_EQ(share_location, true) - << "ERROR: decode boxes without share_location is unimplemented\n"; - return; - } - - int cnt = num_priors / 4; - //! vprior 0: xmin, 1: ymin, 2: xmax, 3: ymax - //! vloc 0: xmin, 1: ymin, 2: xmax, 3: ymax - //! 
vvar - float32x4_t vhalf = vdupq_n_f32(0.5f); - - int len_batch = num_priors * 4; - - for (int n = 0; n < batch_num; ++n) { - const float* ptr_loc_batch = loc_data + n * len_batch; - float* ptr_bbox_batch = bbox_data + n * len_batch; - -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - int idx = i * 16; - const float* ptr_loc = ptr_loc_batch + idx; - const float* ptr_prior = prior_data + idx; - float* ptr_bbox = ptr_bbox_batch + idx; - - float32x4x4_t vprior = vld4q_f32(ptr_prior); - float32x4x4_t vloc = vld4q_f32(ptr_loc); - float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]); - float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]); - float32x4_t vprior_cx = - vmulq_f32(vaddq_f32(vprior.val[0], vprior.val[2]), vhalf); - float32x4_t vprior_cy = - vmulq_f32(vaddq_f32(vprior.val[1], vprior.val[3]), vhalf); - - float32x4_t vdec_bbx_cx = - vaddq_f32(vmulq_f32(vloc.val[0], vprior_width), vprior_cx); - float32x4_t vdec_bbx_cy = - vaddq_f32(vmulq_f32(vloc.val[1], vprior_height), vprior_cy); - float32x4_t vdec_bbx_w = exp_ps(vloc.val[2]); - float32x4_t vdec_bbx_h = exp_ps(vloc.val[3]); - vprior_width = vmulq_f32(vprior_width, vhalf); - vprior_height = vmulq_f32(vprior_height, vhalf); - vdec_bbx_w = vmulq_f32(vdec_bbx_w, vprior_width); - vdec_bbx_h = vmulq_f32(vdec_bbx_h, vprior_height); - - vloc.val[0] = vsubq_f32(vdec_bbx_cx, vdec_bbx_w); - vloc.val[1] = vsubq_f32(vdec_bbx_cy, vdec_bbx_h); - vloc.val[2] = vaddq_f32(vdec_bbx_cx, vdec_bbx_w); - vloc.val[3] = vaddq_f32(vdec_bbx_cy, vdec_bbx_h); - - vst4q_f32(ptr_bbox, vloc); - } -#pragma omp parallel for - for (int i = cnt * 4; i < num_priors; i++) { - int idx = i * 4; - float p_xmin = prior_data[idx]; - float p_ymin = prior_data[idx + 1]; - float p_xmax = prior_data[idx + 2]; - float p_ymax = prior_data[idx + 3]; - float prior_width = p_xmax - p_xmin; - float prior_height = p_ymax - p_ymin; - float prior_center_x = (p_xmin + p_xmax) / 2.f; - float prior_center_y = (p_ymin + p_ymax) / 2.f; - - float xmin = ptr_loc_batch[idx]; - float ymin = ptr_loc_batch[idx + 1]; - float xmax = ptr_loc_batch[idx + 2]; - float ymax = ptr_loc_batch[idx + 3]; - - //! variance is encoded in target, we simply need to retore the offset - //! predictions. - float decode_bbox_center_x = xmin * prior_width + prior_center_x; - float decode_bbox_center_y = ymin * prior_height + prior_center_y; - float decode_bbox_width = expf(xmax) * prior_width; - float decode_bbox_height = expf(ymax) * prior_height; - - ptr_bbox_batch[idx] = decode_bbox_center_x - decode_bbox_width / 2.f; - ptr_bbox_batch[idx + 1] = decode_bbox_center_y - decode_bbox_height / 2.f; - ptr_bbox_batch[idx + 2] = decode_bbox_center_x + decode_bbox_width / 2.f; - ptr_bbox_batch[idx + 3] = decode_bbox_center_y + decode_bbox_height / 2.f; - } - } -} - -template <> -void decode_bbox_center_no_variance_kernel(const int batch_num, - const float* loc_data, - const float* prior_data, - const float* variance, - const int num_priors, - const bool share_location, - const int num_loc_classes, - const int background_label_id, - float* bbox_data) { - if (!share_location) { - CHECK_EQ(share_location, true) - << "ERROR: decode boxes without share_location is unimplemented\n"; - return; - } - - int cnt = num_priors / 4; - //! vprior 0: xmin, 1: ymin, 2: xmax, 3: ymax - //! vloc 0: xmin, 1: ymin, 2: xmax, 3: ymax - //! 
vvar - float32x4_t vhalf = vdupq_n_f32(0.5f); - - int len_batch = num_priors * 4; - - for (int n = 0; n < batch_num; ++n) { - const float* ptr_loc_batch = loc_data + n * len_batch; - float* ptr_bbox_batch = bbox_data + n * len_batch; - -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - int idx = i * 16; - - const float* ptr_loc = ptr_loc_batch + idx; - const float* ptr_prior = prior_data + idx; - const float* ptr_var = variance + idx; - float* ptr_bbox = ptr_bbox_batch + idx; - - float32x4x4_t vprior = vld4q_f32(ptr_prior); - float32x4x4_t vloc = vld4q_f32(ptr_loc); - float32x4x4_t vvar = vld4q_f32(ptr_var); - float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]); - float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]); - float32x4_t vprior_cx = - vmulq_f32(vaddq_f32(vprior.val[0], vprior.val[2]), vhalf); - float32x4_t vprior_cy = - vmulq_f32(vaddq_f32(vprior.val[1], vprior.val[3]), vhalf); - - vloc.val[0] = vmulq_f32(vloc.val[0], vvar.val[0]); - vloc.val[1] = vmulq_f32(vloc.val[1], vvar.val[1]); - vloc.val[2] = vmulq_f32(vloc.val[2], vvar.val[2]); - vloc.val[3] = vmulq_f32(vloc.val[3], vvar.val[3]); - - float32x4_t vdec_bbx_cx = - vaddq_f32(vmulq_f32(vloc.val[0], vprior_width), vprior_cx); - float32x4_t vdec_bbx_cy = - vaddq_f32(vmulq_f32(vloc.val[1], vprior_height), vprior_cy); - float32x4_t vdec_bbx_w = exp_ps(vloc.val[2]); - float32x4_t vdec_bbx_h = exp_ps(vloc.val[3]); - vprior_width = vmulq_f32(vprior_width, vhalf); - vprior_height = vmulq_f32(vprior_height, vhalf); - vdec_bbx_w = vmulq_f32(vdec_bbx_w, vprior_width); - vdec_bbx_h = vmulq_f32(vdec_bbx_h, vprior_height); - - vloc.val[0] = vsubq_f32(vdec_bbx_cx, vdec_bbx_w); - vloc.val[1] = vsubq_f32(vdec_bbx_cy, vdec_bbx_h); - vloc.val[2] = vaddq_f32(vdec_bbx_cx, vdec_bbx_w); - vloc.val[3] = vaddq_f32(vdec_bbx_cy, vdec_bbx_h); - - vst4q_f32(ptr_bbox, vloc); - } - -#pragma omp parallel for - for (int i = cnt * 4; i < num_priors; i++) { - int idx = i * 4; - float p_xmin = prior_data[idx]; - float p_ymin = prior_data[idx + 1]; - float p_xmax = prior_data[idx + 2]; - float p_ymax = prior_data[idx + 3]; - float prior_width = p_xmax - p_xmin; - float prior_height = p_ymax - p_ymin; - float prior_center_x = (p_xmin + p_xmax) / 2.f; - float prior_center_y = (p_ymin + p_ymax) / 2.f; - - float xmin = ptr_loc_batch[idx]; - float ymin = ptr_loc_batch[idx + 1]; - float xmax = ptr_loc_batch[idx + 2]; - float ymax = ptr_loc_batch[idx + 3]; - - //! variance is encoded in target, we simply need to retore the offset - //! predictions. 
- float decode_bbox_center_x = - variance[idx] * xmin * prior_width + prior_center_x; - float decode_bbox_center_y = - variance[idx + 1] * ymin * prior_height + prior_center_y; - float decode_bbox_width = expf(variance[idx + 2] * xmax) * prior_width; - float decode_bbox_height = expf(variance[idx + 3] * ymax) * prior_height; - - ptr_bbox_batch[idx] = decode_bbox_center_x - decode_bbox_width / 2.f; - ptr_bbox_batch[idx + 1] = decode_bbox_center_y - decode_bbox_height / 2.f; - ptr_bbox_batch[idx + 2] = decode_bbox_center_x + decode_bbox_width / 2.f; - ptr_bbox_batch[idx + 3] = decode_bbox_center_y + decode_bbox_height / 2.f; - } - } -} - -template <> -void decode_bbox_corner_size_variance_kernel( - const int batch_num, - const float* loc_data, - const float* prior_data, - const float* variance, - const int num_priors, - const bool share_location, - const int num_loc_classes, - const int background_label_id, - float* bbox_data) { - if (!share_location) { - CHECK_EQ(share_location, true) - << "ERROR: decode boxes without share_location is unimplemented\n"; - return; - } - - int cnt = num_priors / 4; - //! vprior 0: xmin, 1: ymin, 2: xmax, 3: ymax - //! bbx - - int len_batch = num_priors * 4; - - for (int n = 0; n < batch_num; ++n) { - const float* ptr_loc_batch = loc_data + n * len_batch; - float* ptr_bbox_batch = bbox_data + n * len_batch; - -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - int idx = i * 16; - - const float* ptr_loc = ptr_loc_batch + idx; - const float* ptr_prior = prior_data + idx; - const float* ptr_var = variance + idx; - float* ptr_bbox = ptr_bbox_batch + idx; - - float32x4x4_t vprior = vld4q_f32(ptr_prior); - float32x4x4_t vloc = vld4q_f32(ptr_loc); - - float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]); - float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]); - - float32x4x4_t vbbx; - vbbx.val[0] = vmulq_f32(vloc.val[0], vprior_width); - vbbx.val[1] = vmulq_f32(vloc.val[1], vprior_height); - vbbx.val[2] = vmulq_f32(vloc.val[2], vprior_width); - vbbx.val[3] = vmulq_f32(vloc.val[3], vprior_height); - - vbbx.val[0] = vaddq_f32(vprior.val[0], vbbx.val[0]); - vbbx.val[1] = vaddq_f32(vprior.val[1], vbbx.val[1]); - vbbx.val[2] = vaddq_f32(vprior.val[2], vbbx.val[2]); - vbbx.val[3] = vaddq_f32(vprior.val[3], vbbx.val[3]); - - vst4q_f32(ptr_bbox, vbbx); - } - -#pragma omp parallel for - for (int i = cnt * 4; i < num_priors; i++) { - int idx = i * 4; - float p_xmin = prior_data[idx]; - float p_ymin = prior_data[idx + 1]; - float p_xmax = prior_data[idx + 2]; - float p_ymax = prior_data[idx + 3]; - float prior_width = p_xmax - p_xmin; - float prior_height = p_ymax - p_ymin; - - ptr_bbox_batch[idx] = p_xmin + ptr_loc_batch[idx] * prior_width; - ptr_bbox_batch[idx + 1] = p_ymin + ptr_loc_batch[idx + 1] * prior_height; - ptr_bbox_batch[idx + 2] = p_xmax + ptr_loc_batch[idx + 2] * prior_width; - ptr_bbox_batch[idx + 3] = p_ymax + ptr_loc_batch[idx + 3] * prior_height; - } - } -} - -template <> -void decode_bbox_corner_size_no_variance_kernel( - const int batch_num, - const float* loc_data, - const float* prior_data, - const float* variance, - const int num_priors, - const bool share_location, - const int num_loc_classes, - const int background_label_id, - float* bbox_data) { - if (!share_location) { - CHECK_EQ(share_location, true) - << "ERROR: decode boxes without share_location is unimplemented\n"; - return; - } - - int cnt = num_priors / 4; - //! vprior 0: xmin, 1: ymin, 2: xmax, 3: ymax - //! 
bbx - - int len_batch = num_priors * 4; - - for (int n = 0; n < batch_num; ++n) { - const float* ptr_loc_batch = loc_data + n * len_batch; - float* ptr_bbox_batch = bbox_data + n * len_batch; - -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - int idx = i * 16; - - const float* ptr_loc = ptr_loc_batch + idx; - const float* ptr_prior = prior_data + idx; - const float* ptr_var = variance + idx; - float* ptr_bbox = ptr_bbox_batch + idx; - - float32x4x4_t vprior = vld4q_f32(ptr_prior); - float32x4x4_t vloc = vld4q_f32(ptr_loc); - - float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]); - float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]); - - float32x4x4_t vbbx; - vbbx.val[0] = vmulq_f32(vloc.val[0], vprior_width); - vbbx.val[1] = vmulq_f32(vloc.val[1], vprior_height); - vbbx.val[2] = vmulq_f32(vloc.val[2], vprior_width); - vbbx.val[3] = vmulq_f32(vloc.val[3], vprior_height); - - vloc = vld4q_f32(ptr_var); - vbbx.val[0] = vmulq_f32(vbbx.val[0], vloc.val[0]); - vbbx.val[1] = vmulq_f32(vbbx.val[1], vloc.val[1]); - vbbx.val[2] = vmulq_f32(vbbx.val[2], vloc.val[2]); - vbbx.val[3] = vmulq_f32(vbbx.val[3], vloc.val[3]); - - vbbx.val[0] = vaddq_f32(vprior.val[0], vbbx.val[0]); - vbbx.val[1] = vaddq_f32(vprior.val[1], vbbx.val[1]); - vbbx.val[2] = vaddq_f32(vprior.val[2], vbbx.val[2]); - vbbx.val[3] = vaddq_f32(vprior.val[3], vbbx.val[3]); - - vst4q_f32(ptr_bbox, vbbx); - } -#pragma omp parallel for - for (int i = cnt * 4; i < num_priors; i++) { - int idx = i * 4; - float p_xmin = prior_data[idx]; - float p_ymin = prior_data[idx + 1]; - float p_xmax = prior_data[idx + 2]; - float p_ymax = prior_data[idx + 3]; - float prior_width = p_xmax - p_xmin; - float prior_height = p_ymax - p_ymin; - - ptr_bbox_batch[idx] = - p_xmin + ptr_loc_batch[idx] * variance[idx] * prior_width; - ptr_bbox_batch[idx + 1] = - p_ymin + ptr_loc_batch[idx + 1] * variance[idx + 1] * prior_height; - ptr_bbox_batch[idx + 2] = - p_xmax + ptr_loc_batch[idx + 2] * variance[idx + 2] * prior_width; - ptr_bbox_batch[idx + 3] = - p_ymax + ptr_loc_batch[idx + 3] * variance[idx + 3] * prior_height; - } - } -} - -template <> -void decode_bboxes(const int batch_num, - const float* loc_data, - const float* prior_data, - const std::string code_type, - const bool variance_encoded_in_target, - const int num_priors, - const bool share_location, - const int num_loc_classes, - const int background_label_id, - float* bbox_data) { - const float* variance_data = prior_data + 4 * num_priors; - if (code_type == "corner") { - if (variance_encoded_in_target) { - decode_bbox_corner_variance_kernel(batch_num, - loc_data, - prior_data, - variance_data, - num_priors, - share_location, - num_loc_classes, - background_label_id, - bbox_data); - } else { - decode_bbox_corner_no_variance_kernel(batch_num, - loc_data, - prior_data, - variance_data, - num_priors, - share_location, - num_loc_classes, - background_label_id, - bbox_data); - } - } else if (code_type == "center_size") { - if (variance_encoded_in_target) { - decode_bbox_center_variance_kernel(batch_num, - loc_data, - prior_data, - variance_data, - num_priors, - share_location, - num_loc_classes, - background_label_id, - bbox_data); - } else { - decode_bbox_center_no_variance_kernel(batch_num, - loc_data, - prior_data, - variance_data, - num_priors, - share_location, - num_loc_classes, - background_label_id, - bbox_data); - } - } else if (code_type == "corner_size") { - if (variance_encoded_in_target) { - decode_bbox_corner_size_variance_kernel(batch_num, - 
loc_data,
-                                              prior_data,
-                                              variance_data,
-                                              num_priors,
-                                              share_location,
-                                              num_loc_classes,
-                                              background_label_id,
-                                              bbox_data);
-    } else {
-      decode_bbox_corner_size_no_variance_kernel(batch_num,
-                                                 loc_data,
-                                                 prior_data,
-                                                 variance_data,
-                                                 num_priors,
-                                                 share_location,
-                                                 num_loc_classes,
-                                                 background_label_id,
-                                                 bbox_data);
-    }
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/decode_bboxes.h b/lite/backends/arm/math/decode_bboxes.h
deleted file mode 100644
index f18bfe6420..0000000000
--- a/lite/backends/arm/math/decode_bboxes.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename T>
-void decode_bboxes(const int batch_num,
-                   const T* loc_data,
-                   const T* prior_data,
-                   const std::string code_type,
-                   const bool variance_encoded_in_target,
-                   const int num_priors,
-                   const bool share_location,
-                   const int num_loc_classes,
-                   const int background_label_id,
-                   T* bbox_data);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/dot_toolchain_support.h b/lite/backends/arm/math/dot_toolchain_support.h
deleted file mode 100644
index 8342ffee19..0000000000
--- a/lite/backends/arm/math/dot_toolchain_support.h
+++ /dev/null
@@ -1,196 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// This file is modified according to
-// https://github.com/ARM-software/ComputeLibrary
-// * Copyright (c) 2017-2018 ARM Limited.
-// *
-// * SPDX-License-Identifier: MIT
-// *
-// * Permission is hereby granted, free of charge, to any person obtaining a
-// copy
-// * of this software and associated documentation files (the "Software"), to
-// * deal in the Software without restriction, including without limitation the
-// * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-// * sell copies of the Software, and to permit persons to whom the Software is
-// * furnished to do so, subject to the following conditions:
-// *
-// * The above copyright notice and this permission notice shall be included in
-// all
-// * copies or substantial portions of the Software.
-// * -// * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, -// * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE -// * SOFTWARE. - -#pragma once - -#define _DECLARE_SDOT_ELEMENT \ - ".altmacro\n" \ - ".macro sdot opd:req, opn:req, opm:req\n" \ - "local vd, vn, vm, h, l\n" \ - ".irp " \ - "reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25," \ - "26,27,28,29,30,31\n" \ - ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n" \ - ".set vd,\\reg\n" \ - ".endif\n" \ - ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n" \ - ".set vn,\\reg\n" \ - ".endif\n" \ - ".irp idx,0,1,2,3\n" \ - ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n" \ - ".set vm,\\reg\n" \ - ".set h,\\idx / 2\n" \ - ".set l,\\idx %% 2\n" \ - ".endif\n" \ - ".endr\n" \ - ".endr\n" \ - ".ifndef vd\n" \ - ".error \"Bad operand \\opd\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef vn\n" \ - ".error \"Bad operand \\opn\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef vm\n" \ - ".error \"Bad operand \\opm\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef h\n" \ - ".error \"Bad operand \\opm\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef l\n" \ - ".error \"Bad operand \\opm\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".int 0x4f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n" \ - ".endm\n" - -#define _DECLARE_SDOT_VECTOR \ - ".altmacro\n" \ - ".macro sdot opd:req, opn:req, opm:req\n" \ - "local vd, vn, vm\n" \ - ".irp " \ - "reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25," \ - "26,27,28,29,30,31\n" \ - ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n" \ - ".set vd,\\reg\n" \ - ".endif\n" \ - ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n" \ - ".set vn,\\reg\n" \ - ".endif\n" \ - ".ifeqs \"\\opm\",\"v\\reg\\.16b\"\n" \ - ".set vm,\\reg\n" \ - ".endif\n" \ - ".endr\n" \ - ".endr\n" \ - ".ifndef vd\n" \ - ".error \"Bad operand \\opd\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef vn\n" \ - ".error \"Bad operand \\opn\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef vm\n" \ - ".error \"Bad operand \\opm\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".int 0x4e809400 | vd | (vn << 5) | (vm << 16)\n" \ - ".endm\n" - -#define _DECLARE_SDOT_VECTOR_2s \ - ".altmacro\n" \ - ".macro sdot opd:req, opn:req, opm:req\n" \ - "local vd, vn, vm\n" \ - ".irp " \ - "reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25," \ - "26,27,28,29,30,31\n" \ - ".ifeqs \"\\opd\",\"v\\reg\\.2s\"\n" \ - ".set vd,\\reg\n" \ - ".endif\n" \ - ".ifeqs \"\\opn\",\"v\\reg\\.8b\"\n" \ - ".set vn,\\reg\n" \ - ".endif\n" \ - ".ifeqs \"\\opm\",\"v\\reg\\.8b\"\n" \ - ".set vm,\\reg\n" \ - ".endif\n" \ - ".endr\n" \ - ".endr\n" \ - ".ifndef vd\n" \ - ".error \"Bad operand \\opd\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef vn\n" \ - ".error \"Bad operand \\opn\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef vm\n" \ - ".error \"Bad operand \\opm\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".int 0x0e809400 | vd | (vn << 5) | (vm << 16)\n" \ - ".endm\n" - -#define _DECLARE_SDOT_ELEMENT_2s \ - ".altmacro\n" \ - ".macro sdot opd:req, opn:req, opm:req\n" \ - "local vd, vn, vm, h, l\n" \ - ".irp " \ - "reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25," \ - "26,27,28,29,30,31\n" \ - 
".ifeqs \"\\opd\",\"v\\reg\\.2s\"\n" \ - ".set vd,\\reg\n" \ - ".endif\n" \ - ".ifeqs \"\\opn\",\"v\\reg\\.8b\"\n" \ - ".set vn,\\reg\n" \ - ".endif\n" \ - ".irp idx,0,1,2,3\n" \ - ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n" \ - ".set vm,\\reg\n" \ - ".set h,\\idx / 2\n" \ - ".set l,\\idx %% 2\n" \ - ".endif\n" \ - ".endr\n" \ - ".endr\n" \ - ".ifndef vd\n" \ - ".error \"Bad operand \\opd\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef vn\n" \ - ".error \"Bad operand \\opn\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef vm\n" \ - ".error \"Bad operand \\opm\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef h\n" \ - ".error \"Bad operand \\opm\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef l\n" \ - ".error \"Bad operand \\opm\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".int 0x0f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n" \ - ".endm\n" diff --git a/lite/backends/arm/math/dropout.cc b/lite/backends/arm/math/dropout.cc deleted file mode 100644 index 406c850ef5..0000000000 --- a/lite/backends/arm/math/dropout.cc +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/dropout.h" -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void dropout_down(const float* din, float* dout, int num, float prob) { - const float scale = 1.0f - prob; - int cnt = num >> 4; - int remain = num % 16; - float32x4_t vscale = vdupq_n_f32(scale); -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* din_ptr = din + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - float32x4_t vmul0 = vmulq_f32(din0, vscale); - float32x4_t vmul1 = vmulq_f32(din1, vscale); - float32x4_t vmul2 = vmulq_f32(din2, vscale); - float32x4_t vmul3 = vmulq_f32(din3, vscale); - - vst1q_f32(dout_ptr, vmul0); - vst1q_f32(dout_ptr + 4, vmul1); - vst1q_f32(dout_ptr + 8, vmul2); - vst1q_f32(dout_ptr + 12, vmul3); - } - if (remain > 0) { - const float* din_ptr = din + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - *dout_ptr = *din_ptr * scale; - dout_ptr++; - din_ptr++; - } - } -} - -template <> -void dropout_up(const float* din, float* dout, int num) { - int cnt = num >> 4; - int remain = num % 16; -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* din_ptr = din + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - } - if (remain > 0) { - const float* din_ptr = 
din + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - *dout_ptr = *din_ptr; - dout_ptr++; - din_ptr++; - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/dropout.h b/lite/backends/arm/math/dropout.h deleted file mode 100644 index df2be016de..0000000000 --- a/lite/backends/arm/math/dropout.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -void dropout_down(const T* din, T* dout, int num, float prob); - -template -void dropout_up(const T* din, T* dout, int num); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/elementwise.cc b/lite/backends/arm/math/elementwise.cc deleted file mode 100644 index a4c61f9a9d..0000000000 --- a/lite/backends/arm/math/elementwise.cc +++ /dev/null @@ -1,1290 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
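Aside: every kernel in the deleted elementwise.cc below uses the same blocking scheme: process 16 floats per iteration as four float32x4_t NEON vectors, then fall back to scalars for the tail (the broadcast variants add intermediate 8- and 4-wide steps). A minimal sketch of that pattern, not part of the patch, with an illustrative function name:

#include <arm_neon.h>

// Same blocking as the kernels below: 16 floats per step, scalar tail.
static void elementwise_add_sketch(const float* x, const float* y,
                                   float* out, int num) {
  int cnt = num >> 4;     // number of full 16-element blocks
  int remain = num % 16;  // leftover elements
  for (int i = 0; i < cnt; ++i) {
    const float* xp = x + (i << 4);
    const float* yp = y + (i << 4);
    float* op = out + (i << 4);
    // Four 4-lane vector adds cover one 16-element block.
    for (int k = 0; k < 16; k += 4) {
      vst1q_f32(op + k, vaddq_f32(vld1q_f32(xp + k), vld1q_f32(yp + k)));
    }
  }
  // Scalar tail for the last num % 16 elements.
  for (int i = num - remain; i < num; ++i) {
    out[i] = x[i] + y[i];
  }
}

The fused-ReLU variants in the file apply vmaxq_f32 against a zero vector right after the arithmetic, which is why each op appears twice below.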
- -#include "lite/backends/arm/math/elementwise.h" -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void elementwise_add(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - - dinx0 = vaddq_f32(dinx0, diny0); - dinx1 = vaddq_f32(dinx1, diny1); - dinx2 = vaddq_f32(dinx2, diny2); - dinx3 = vaddq_f32(dinx3, diny3); - - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - *dout_ptr = *dinx_ptr + *diny_ptr; - dout_ptr++; - dinx_ptr++; - diny_ptr++; - } - } -} - -template <> -void elementwise_add_relu(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - - dinx0 = vaddq_f32(dinx0, diny0); - dinx1 = vaddq_f32(dinx1, diny1); - dinx2 = vaddq_f32(dinx2, diny2); - dinx3 = vaddq_f32(dinx3, diny3); - - // relu - dinx0 = vmaxq_f32(dinx0, vzero); - dinx1 = vmaxq_f32(dinx1, vzero); - dinx2 = vmaxq_f32(dinx2, vzero); - dinx3 = vmaxq_f32(dinx3, vzero); - - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - float tmp = *dinx_ptr + *diny_ptr; - *dout_ptr = tmp > 0.f ? 
tmp : 0.f; - dout_ptr++; - dinx_ptr++; - diny_ptr++; - } - } -} - -template <> -void elementwise_add_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - din0 = vaddq_f32(din0, rb); - din1 = vaddq_f32(din1, rb); - din2 = vaddq_f32(din2, rb); - din3 = vaddq_f32(din3, rb); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - din0 = vaddq_f32(din0, rb); - din1 = vaddq_f32(din1, rb); - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); - din0 = vaddq_f32(din0, rb); - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for (int p = 0; p < remain; p++) { - *dout_ptr = *din_ptr + diny_data; - dout_ptr++; - din_ptr++; - } - } - } - } -} - -template <> -void elementwise_add_relu_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - din0 = vaddq_f32(din0, rb); - din1 = vaddq_f32(din1, rb); - din2 = vaddq_f32(din2, rb); - din3 = vaddq_f32(din3, rb); - - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - din2 = vmaxq_f32(din2, vzero); - din3 = vmaxq_f32(din3, vzero); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - din0 = vaddq_f32(din0, rb); - din1 = vaddq_f32(din1, rb); - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); - din0 = vaddq_f32(din0, rb); - // relu - din0 = vmaxq_f32(din0, vzero); - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for (int p = 0; p < remain; p++) { - float tmp = *din_ptr + diny_data; - *dout_ptr = tmp > 0.f ? 
tmp : 0.f; - dout_ptr++; - din_ptr++; - } - } - } - } -} - -template <> -void elementwise_sub(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - - dinx0 = vsubq_f32(dinx0, diny0); - dinx1 = vsubq_f32(dinx1, diny1); - dinx2 = vsubq_f32(dinx2, diny2); - dinx3 = vsubq_f32(dinx3, diny3); - - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - *dout_ptr = *dinx_ptr - *diny_ptr; - dout_ptr++; - dinx_ptr++; - diny_ptr++; - } - } -} - -template <> -void elementwise_sub_relu(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - - dinx0 = vsubq_f32(dinx0, diny0); - dinx1 = vsubq_f32(dinx1, diny1); - dinx2 = vsubq_f32(dinx2, diny2); - dinx3 = vsubq_f32(dinx3, diny3); - - // relu - dinx0 = vmaxq_f32(dinx0, vzero); - dinx1 = vmaxq_f32(dinx1, vzero); - dinx2 = vmaxq_f32(dinx2, vzero); - dinx3 = vmaxq_f32(dinx3, vzero); - - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - float tmp = *dinx_ptr - *diny_ptr; - *dout_ptr = tmp > 0.f ? 
tmp : 0.f; - dout_ptr++; - dinx_ptr++; - diny_ptr++; - } - } -} - -template <> -void elementwise_sub_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - din0 = vsubq_f32(din0, rb); - din1 = vsubq_f32(din1, rb); - din2 = vsubq_f32(din2, rb); - din3 = vsubq_f32(din3, rb); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - din0 = vsubq_f32(din0, rb); - din1 = vsubq_f32(din1, rb); - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); - din0 = vsubq_f32(din0, rb); - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for (int p = 0; p < remain; p++) { - *dout_ptr = *din_ptr - diny_data; - dout_ptr++; - din_ptr++; - } - } - } - } -} - -template <> -void elementwise_sub_relu_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - din0 = vsubq_f32(din0, rb); - din1 = vsubq_f32(din1, rb); - din2 = vsubq_f32(din2, rb); - din3 = vsubq_f32(din3, rb); - - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - din2 = vmaxq_f32(din2, vzero); - din3 = vmaxq_f32(din3, vzero); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - din0 = vsubq_f32(din0, rb); - din1 = vsubq_f32(din1, rb); - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); - din0 = vsubq_f32(din0, rb); - // relu - din0 = vmaxq_f32(din0, vzero); - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for (int p = 0; p < remain; p++) { - float tmp = *din_ptr - diny_data; - *dout_ptr = tmp > 0.f ? 
tmp : 0.f; - dout_ptr++; - din_ptr++; - } - } - } - } -} - -template <> -void elementwise_mul(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - - dinx0 = vmulq_f32(dinx0, diny0); - dinx1 = vmulq_f32(dinx1, diny1); - dinx2 = vmulq_f32(dinx2, diny2); - dinx3 = vmulq_f32(dinx3, diny3); - - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - *dout_ptr = *dinx_ptr * *diny_ptr; - dout_ptr++; - dinx_ptr++; - diny_ptr++; - } - } -} - -template <> -void elementwise_mul_relu(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - - dinx0 = vmulq_f32(dinx0, diny0); - dinx1 = vmulq_f32(dinx1, diny1); - dinx2 = vmulq_f32(dinx2, diny2); - dinx3 = vmulq_f32(dinx3, diny3); - - // relu - dinx0 = vmaxq_f32(dinx0, vzero); - dinx1 = vmaxq_f32(dinx1, vzero); - dinx2 = vmaxq_f32(dinx2, vzero); - dinx3 = vmaxq_f32(dinx3, vzero); - - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - float tmp = *dinx_ptr * *diny_ptr; - *dout_ptr = tmp > 0.f ? 
tmp : 0.f; - dout_ptr++; - dinx_ptr++; - diny_ptr++; - } - } -} - -template <> -void elementwise_mul_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - din0 = vmulq_f32(din0, rb); - din1 = vmulq_f32(din1, rb); - din2 = vmulq_f32(din2, rb); - din3 = vmulq_f32(din3, rb); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - din0 = vmulq_f32(din0, rb); - din1 = vmulq_f32(din1, rb); - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); - din0 = vmulq_f32(din0, rb); - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for (int p = 0; p < remain; ++p) { - *dout_ptr = *din_ptr * diny_data; - dout_ptr++; - din_ptr++; - } - } - } - } -} - -template <> -void elementwise_mul_relu_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - din0 = vmulq_f32(din0, rb); - din1 = vmulq_f32(din1, rb); - din2 = vmulq_f32(din2, rb); - din3 = vmulq_f32(din3, rb); - - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - din2 = vmaxq_f32(din2, vzero); - din3 = vmaxq_f32(din3, vzero); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - din0 = vmulq_f32(din0, rb); - din1 = vmulq_f32(din1, rb); - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); - din0 = vmulq_f32(din0, rb); - // relu - din0 = vmaxq_f32(din0, vzero); - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for (int p = 0; p < remain; ++p) { - float tmp = *din_ptr * diny_data; - *dout_ptr = tmp > 0.f ? 
tmp : 0.f; - dout_ptr++; - din_ptr++; - } - } - } - } -} - -template <> -void elementwise_max(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - - dinx0 = vmaxq_f32(dinx0, diny0); - dinx1 = vmaxq_f32(dinx1, diny1); - dinx2 = vmaxq_f32(dinx2, diny2); - dinx3 = vmaxq_f32(dinx3, diny3); - - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; ++i) { - *(dout_ptr++) = std::max(*(dinx_ptr++), *(diny_ptr++)); - } - } -} - -template <> -void elementwise_max_relu(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - - dinx0 = vmaxq_f32(dinx0, diny0); - dinx1 = vmaxq_f32(dinx1, diny1); - dinx2 = vmaxq_f32(dinx2, diny2); - dinx3 = vmaxq_f32(dinx3, diny3); - - // relu - dinx0 = vmaxq_f32(dinx0, vzero); - dinx1 = vmaxq_f32(dinx1, vzero); - dinx2 = vmaxq_f32(dinx2, vzero); - dinx3 = vmaxq_f32(dinx3, vzero); - - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; ++i) { - float tmp = std::max(*(dinx_ptr++), *(diny_ptr++)); - *(dout_ptr++) = tmp > 0.f ? 
tmp : 0.f; - } - } -} - -template <> -void elementwise_max_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - din0 = vmaxq_f32(din0, rb); - din1 = vmaxq_f32(din1, rb); - din2 = vmaxq_f32(din2, rb); - din3 = vmaxq_f32(din3, rb); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - din0 = vmaxq_f32(din0, rb); - din1 = vmaxq_f32(din1, rb); - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); - din0 = vmaxq_f32(din0, rb); - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for (int p = 0; p < remain; ++p) { - *dout_ptr = std::max(*din_ptr, diny_data); - dout_ptr++; - din_ptr++; - } - } - } - } -} - -template <> -void elementwise_max_relu_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - din0 = vmaxq_f32(din0, rb); - din1 = vmaxq_f32(din1, rb); - din2 = vmaxq_f32(din2, rb); - din3 = vmaxq_f32(din3, rb); - - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - din2 = vmaxq_f32(din2, vzero); - din3 = vmaxq_f32(din3, vzero); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - din0 = vmaxq_f32(din0, rb); - din1 = vmaxq_f32(din1, rb); - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); - din0 = vmaxq_f32(din0, rb); - // relu - din0 = vmaxq_f32(din0, vzero); - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for (int p = 0; p < remain; ++p) { - float tmp = std::max(*din_ptr, diny_data); - *dout_ptr = tmp > 0.f ? 
tmp : 0.f; - dout_ptr++; - din_ptr++; - } - } - } - } -} - -template <> -void elementwise_div(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - -#ifdef __aarch64__ - dinx0 = vdivq_f32(dinx0, diny0); - dinx1 = vdivq_f32(dinx1, diny1); - dinx2 = vdivq_f32(dinx2, diny2); - dinx3 = vdivq_f32(dinx3, diny3); -#else - dinx0 = div_ps(dinx0, diny0); - dinx1 = div_ps(dinx1, diny1); - dinx2 = div_ps(dinx2, diny2); - dinx3 = div_ps(dinx3, diny3); -#endif - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - *dout_ptr = *dinx_ptr / *diny_ptr; - dout_ptr++; - dinx_ptr++; - diny_ptr++; - } - } -} - -template <> -void elementwise_div_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - -#ifdef __aarch64__ - din0 = vdivq_f32(din0, rb); - din1 = vdivq_f32(din1, rb); - din2 = vdivq_f32(din2, rb); - din3 = vdivq_f32(din3, rb); -#else - din0 = div_ps(din0, rb); - din1 = div_ps(din1, rb); - din2 = div_ps(din2, rb); - din3 = div_ps(din3, rb); -#endif - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); -#ifdef __aarch64__ - din0 = vdivq_f32(din0, rb); - din1 = vdivq_f32(din1, rb); -#else - din0 = div_ps(din0, rb); - din1 = div_ps(din1, rb); -#endif - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); -#ifdef __aarch64__ - din0 = vdivq_f32(din0, rb); -#else - din0 = div_ps(din0, rb); -#endif - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for (int p = 0; p < remain; p++) { - *dout_ptr = *din_ptr / diny_data; - dout_ptr++; - din_ptr++; - } - } - } - } -} - -template <> -void elementwise_div_relu(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; - float32x4_t vzero = 
vdupq_n_f32(0.f); -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - -#ifdef __aarch64__ - dinx0 = vdivq_f32(dinx0, diny0); - dinx1 = vdivq_f32(dinx1, diny1); - dinx2 = vdivq_f32(dinx2, diny2); - dinx3 = vdivq_f32(dinx3, diny3); -#else - dinx0 = div_ps(dinx0, diny0); - dinx1 = div_ps(dinx1, diny1); - dinx2 = div_ps(dinx2, diny2); - dinx3 = div_ps(dinx3, diny3); -#endif - // relu - dinx0 = vmaxq_f32(dinx0, vzero); - dinx1 = vmaxq_f32(dinx1, vzero); - dinx2 = vmaxq_f32(dinx2, vzero); - dinx3 = vmaxq_f32(dinx3, vzero); - - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; ++i) { - float tmp = *dinx_ptr / *diny_ptr; - *(dout_ptr++) = tmp > 0.f ? tmp : 0.f; - dinx_ptr++; - diny_ptr++; - } - } -} - -template <> -void elementwise_div_relu_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - -#ifdef __aarch64__ - din0 = vdivq_f32(din0, rb); - din1 = vdivq_f32(din1, rb); - din2 = vdivq_f32(din2, rb); - din3 = vdivq_f32(din3, rb); -#else - din0 = div_ps(din0, rb); - din1 = div_ps(din1, rb); - din2 = div_ps(din2, rb); - din3 = div_ps(din3, rb); -#endif - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - din2 = vmaxq_f32(din2, vzero); - din3 = vmaxq_f32(din3, vzero); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); -#ifdef __aarch64__ - din0 = vdivq_f32(din0, rb); - din1 = vdivq_f32(din1, rb); -#else - din0 = div_ps(din0, rb); - din1 = div_ps(din1, rb); -#endif - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); -#ifdef __aarch64__ - din0 = vdivq_f32(din0, rb); -#else - din0 = div_ps(din0, rb); -#endif - // relu - din0 = vmaxq_f32(din0, vzero); - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for 
(int p = 0; p < remain; p++) {
-          float tmp = *din_ptr / diny_data;
-          *dout_ptr = tmp > 0.f ? tmp : 0.f;
-          dout_ptr++;
-          din_ptr++;
-        }
-      }
-    }
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/elementwise.h b/lite/backends/arm/math/elementwise.h
deleted file mode 100644
index f8273a5bb3..0000000000
--- a/lite/backends/arm/math/elementwise.h
+++ /dev/null
@@ -1,95 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename T>
-void elementwise_add(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_add_relu(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_add_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-template <typename T>
-void elementwise_add_relu_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-template <typename T>
-void elementwise_sub(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_sub_relu(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_sub_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-template <typename T>
-void elementwise_sub_relu_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-template <typename T>
-void elementwise_mul(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_mul_relu(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_mul_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-template <typename T>
-void elementwise_mul_relu_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-template <typename T>
-void elementwise_max(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_max_relu(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_max_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-template <typename T>
-void elementwise_max_relu_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-template <typename T>
-void elementwise_div(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_div_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-template <typename T>
-void elementwise_div_relu(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_div_relu_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/fill_bias_relu.cc b/lite/backends/arm/math/fill_bias_relu.cc
+++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/fill_bias_relu.h" -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void fill_bias_relu(float* tensor, - const float* bias, - int channel, - int channel_size, - bool flag_bias, - bool flag_relu) { - float* data = tensor; - if (flag_relu) { - for (int j = 0; j < channel; ++j) { - float bias_data = flag_bias ? bias[j] : 0.f; - float32x4_t vbias = vdupq_n_f32(bias_data); - float32x4_t vzero = vdupq_n_f32(0.f); - int i = 0; - for (; i < channel_size - 3; i += 4) { - float32x4_t vdata = vld1q_f32(&data[i]); - vdata = vaddq_f32(vdata, vbias); - float32x4_t vmax = vmaxq_f32(vdata, vzero); - vst1q_f32(data + i, vmax); - } - for (; i < channel_size; i++) { - data[i] += bias_data; - data[i] = data[i] > 0 ? data[i] : 0.f; - } - data += channel_size; - } - } else { - for (int j = 0; j < channel; ++j) { - float bias_data = flag_bias ? bias[j] : 0.f; - float32x4_t vbias = vdupq_n_f32(bias_data); - int i = 0; - for (; i < channel_size - 3; i += 4) { - float32x4_t vdata = vld1q_f32(&data[i]); - vdata = vaddq_f32(vdata, vbias); - vst1q_f32(data + i, vdata); - } - for (; i < channel_size; i++) { - data[i] += bias_data; - } - data += channel_size; - } - } -} - -template <> -void fill_bias_relu(int* tensor, - const int* bias, - int channel, - int channel_size, - bool flag_bias, - bool flag_relu) { - int* data = tensor; - if (flag_relu) { - for (int j = 0; j < channel; ++j) { - int bias_data = flag_bias ? bias[j] : 0; - int32x4_t vbias = vdupq_n_s32(bias_data); - int32x4_t vzero = vdupq_n_s32(0); - int i = 0; - for (; i < channel_size - 7; i += 8) { - int32x4_t vdata1 = vld1q_s32(data + i); - int32x4_t vdata2 = vld1q_s32(data + i + 4); - vdata1 = vaddq_s32(vdata1, vbias); - vdata2 = vaddq_s32(vdata2, vbias); - int32x4_t vmax1 = vmaxq_s32(vdata1, vzero); - int32x4_t vmax2 = vmaxq_s32(vdata2, vzero); - vst1q_s32(data + i, vmax1); - vst1q_s32(data + i + 4, vmax2); - } - for (; i < channel_size; i++) { - data[i] += bias_data; - data[i] = data[i] > 0 ? data[i] : 0; - } - data += channel_size; - } - } else { - for (int j = 0; j < channel; ++j) { - int bias_data = flag_bias ? 
bias[j] : 0;
-      int32x4_t vbias = vdupq_n_s32(bias_data);
-      int i = 0;
-      for (; i < channel_size - 7; i += 8) {
-        int32x4_t vdata1 = vld1q_s32(data + i);
-        int32x4_t vdata2 = vld1q_s32(data + i + 4);
-        vdata1 = vaddq_s32(vdata1, vbias);
-        vdata2 = vaddq_s32(vdata2, vbias);
-        vst1q_s32(data + i, vdata1);
-        vst1q_s32(data + i + 4, vdata2);
-      }
-      for (; i < channel_size; i++) {
-        data[i] += bias_data;
-      }
-      data += channel_size;
-    }
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/fill_bias_relu.h b/lite/backends/arm/math/fill_bias_relu.h
deleted file mode 100644
index 254d6d43be..0000000000
--- a/lite/backends/arm/math/fill_bias_relu.h
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include
-#include "lite/core/op_lite.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-/**
- * \brief NEON implementation that adds a per-channel bias and applies ReLU
- * @param tensor        output buffer of channel * channel_size elements
- * @param bias          one value per channel, read only when flag_bias is set
- * @param channel       number of channels
- * @param channel_size  number of elements per channel
- */
-template <typename Dtype>
-void fill_bias_relu(Dtype* tensor,
-                    const Dtype* bias,
-                    int channel,
-                    int channel_size,
-                    bool flag_bias,
-                    bool flag_relu);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/funcs.cc b/lite/backends/arm/math/funcs.cc
deleted file mode 100644
index e4425ade2e..0000000000
--- a/lite/backends/arm/math/funcs.cc
+++ /dev/null
@@ -1,153 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
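For reference, a minimal usage sketch of the fill_bias_relu template declared in the header above; the caller name, buffer names, and layout comments are illustrative, not taken from this patch:

  #include "lite/backends/arm/math/fill_bias_relu.h"

  // Hypothetical caller: fuse the bias-add + ReLU epilogue into a conv
  // output laid out as `channel` feature maps of `channel_size` floats each.
  void apply_conv_epilogue(float* out, const float* bias,
                           int channel, int channel_size) {
    paddle::lite::arm::math::fill_bias_relu<float>(
        out, bias, channel, channel_size,
        /*flag_bias=*/true, /*flag_relu=*/true);
  }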
- -#include "lite/backends/arm/math/funcs.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void fill_bias_fc(float *out, const float *bias, int num, int channel) { - int cnt = channel >> 4; - int remain = channel & 15; - - for (int j = 0; j < num; ++j) { - const float *ptr_bias = bias; - float *ptr_out = out + j * channel; - - float32x4_t vout1; - float32x4_t vout2; - float32x4_t vout3; - float32x4_t vout4; - - for (int i = 0; i < cnt; ++i) { - float32x4_t vin1 = vld1q_f32(ptr_out); - float32x4_t vb1 = vld1q_f32(ptr_bias); - - float32x4_t vin2 = vld1q_f32(ptr_out + 4); - float32x4_t vb2 = vld1q_f32(ptr_bias + 4); - - float32x4_t vin3 = vld1q_f32(ptr_out + 8); - float32x4_t vb3 = vld1q_f32(ptr_bias + 8); - - float32x4_t vin4 = vld1q_f32(ptr_out + 12); - float32x4_t vb4 = vld1q_f32(ptr_bias + 12); - - vout1 = vaddq_f32(vin1, vb1); - vout2 = vaddq_f32(vin2, vb2); - vout3 = vaddq_f32(vin3, vb3); - vout4 = vaddq_f32(vin4, vb4); - - vst1q_f32(ptr_out, vout1); - vst1q_f32(ptr_out + 4, vout2); - vst1q_f32(ptr_out + 8, vout3); - vst1q_f32(ptr_out + 12, vout4); - - ptr_out += 16; - ptr_bias += 16; - } -#if 0 - if (cnt > 0) { - asm( - "1: \n" - "vld1.32 {d0-d1}, [%[ptr_out]] @ load data\n" - "vld1.32 {d2-d3}, [%[ptr_bias]]! @ load data\n" - "vadd.f32 q2, q0, q1 @ add bias\n" - "vst1.32 {d4-d5}, [%[ptr_out]]! @ store result\n" - "subs %[cnt], #1 @ loop count -1\n" - "bne 1b @ jump to main loop\n" - :[ptr_out] "+r"(ptr_out), [ptr_bias] "+r"(ptr_bias), \ - [cnt] "+r"(cnt) - : - :"q0", "q1", "q2" - ); - } -#endif - for (int i = 0; i < remain; ++i) { - *(ptr_out++) += *(ptr_bias++); - } - } -} - -template <> -void fill_bias_fc(int *out, const int *bias, int num, int channel) { - int cnt = channel >> 4; - int remain = channel & 15; - - for (int j = 0; j < num; ++j) { - const int *ptr_bias = bias; - int *ptr_out = out + j * channel; - - int32x4_t vout1; - int32x4_t vout2; - int32x4_t vout3; - int32x4_t vout4; - - for (int i = 0; i < cnt; ++i) { - int32x4_t vin1 = vld1q_s32(ptr_out); - int32x4_t vb1 = vld1q_s32(ptr_bias); - - int32x4_t vin2 = vld1q_s32(ptr_out + 4); - int32x4_t vb2 = vld1q_s32(ptr_bias + 4); - - int32x4_t vin3 = vld1q_s32(ptr_out + 8); - int32x4_t vb3 = vld1q_s32(ptr_bias + 8); - - int32x4_t vin4 = vld1q_s32(ptr_out + 12); - int32x4_t vb4 = vld1q_s32(ptr_bias + 12); - - vout1 = vaddq_s32(vin1, vb1); - vout2 = vaddq_s32(vin2, vb2); - vout3 = vaddq_s32(vin3, vb3); - vout4 = vaddq_s32(vin4, vb4); - - vst1q_s32(ptr_out, vout1); - vst1q_s32(ptr_out + 4, vout2); - vst1q_s32(ptr_out + 8, vout3); - vst1q_s32(ptr_out + 12, vout4); - - ptr_out += 16; - ptr_bias += 16; - } - -#if 0 - if (cnt > 0) { - asm( - "1: \n" - "vld1.32 {d0-d1}, [%[ptr_out]] @ load data\n" - "vld1.32 {d2-d3}, [%[ptr_bias]]! @ load data\n" - "vadd.s32 q2, q0, q1 @ add bias\n" - "vst1.32 {d4-d5}, [%[ptr_out]]! @ store result\n" - "subs %[cnt], #1 @ loop count -1\n" - "bne 1b @ jump to main loop\n" - :[ptr_out] "+r"(ptr_out), [ptr_bias] "+r"(ptr_bias), \ - [cnt] "+r"(cnt) - : - :"q0", "q1", "q2" - ); - } -#endif - for (int i = 0; i < remain; ++i) { - *(ptr_out++) += *(ptr_bias++); - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/funcs.h b/lite/backends/arm/math/funcs.h deleted file mode 100644 index 9438a997b6..0000000000 --- a/lite/backends/arm/math/funcs.h +++ /dev/null @@ -1,427 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include "lite/backends/arm/math/activation.h" -#include "lite/backends/arm/math/affine_channel.h" -#include "lite/backends/arm/math/anchor_generator.h" -#include "lite/backends/arm/math/argmax.h" -#include "lite/backends/arm/math/axpy.h" -#include "lite/backends/arm/math/beam_search.h" -#include "lite/backends/arm/math/box_coder.h" -#include "lite/backends/arm/math/col_im_transform.h" -#include "lite/backends/arm/math/concat.h" -#include "lite/backends/arm/math/conv_depthwise.h" -#include "lite/backends/arm/math/conv_direct.h" -#include "lite/backends/arm/math/conv_gemmlike.h" -#include "lite/backends/arm/math/conv_winograd.h" -#include "lite/backends/arm/math/decode_bboxes.h" -#include "lite/backends/arm/math/dropout.h" -#include "lite/backends/arm/math/elementwise.h" -#include "lite/backends/arm/math/fill_bias_relu.h" -#include "lite/backends/arm/math/im2sequence.h" -#include "lite/backends/arm/math/increment.h" -#include "lite/backends/arm/math/interpolate.h" -#include "lite/backends/arm/math/lrn.h" -#include "lite/backends/arm/math/negative.h" -#include "lite/backends/arm/math/norm.h" -#include "lite/backends/arm/math/packed_sgemm.h" -#include "lite/backends/arm/math/pad2d.h" -#include "lite/backends/arm/math/pooling.h" -#include "lite/backends/arm/math/power.h" -#include "lite/backends/arm/math/prior_box.h" -#include "lite/backends/arm/math/reduce_max.h" -#include "lite/backends/arm/math/reduce_mean.h" -#include "lite/backends/arm/math/scale.h" -#include "lite/backends/arm/math/sequence_expand.h" -#include "lite/backends/arm/math/sequence_pool.h" -#include "lite/backends/arm/math/sequence_softmax.h" -#include "lite/backends/arm/math/sgemm.h" -#include "lite/backends/arm/math/sgemv.h" -#include "lite/backends/arm/math/shuffle_channel.h" -#include "lite/backends/arm/math/slice.h" -#include "lite/backends/arm/math/softmax.h" -#include "lite/backends/arm/math/split.h" -#include "lite/backends/arm/math/stack.h" -#include "lite/backends/arm/math/topk.h" -#include "lite/backends/arm/math/yolo_box.h" -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -#define c_inv_mant_mask ~0x7f800000u -#define c_cephes_SQRTHF 0.707106781186547524 -#define c_cephes_log_p0 7.0376836292E-2 -#define c_cephes_log_p1 -1.1514610310E-1 -#define c_cephes_log_p2 1.1676998740E-1 -#define c_cephes_log_p3 -1.2420140846E-1 -#define c_cephes_log_p4 +1.4249322787E-1 -#define c_cephes_log_p5 -1.6668057665E-1 -#define c_cephes_log_p6 +2.0000714765E-1 -#define c_cephes_log_p7 -2.4999993993E-1 -#define c_cephes_log_p8 +3.3333331174E-1 -#define c_cephes_log_q1 -2.12194440e-4 -#define c_cephes_log_q2 0.693359375 - -// natural logarithm computed for 4 simultaneous float -// return NaN for x <= 0 -inline float32x4_t log_ps(float32x4_t x) { - float32x4_t one = vdupq_n_f32(1); - - x = vmaxq_f32(x, vdupq_n_f32(0)); // force flush to zero on denormal values - uint32x4_t invalid_mask = vcleq_f32(x, 
vdupq_n_f32(0)); - - int32x4_t ux = vreinterpretq_s32_f32(x); - - int32x4_t emm0 = vshrq_n_s32(ux, 23); - - // keep only the fractional part - ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask)); - ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f))); - x = vreinterpretq_f32_s32(ux); - - emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f)); - float32x4_t e = vcvtq_f32_s32(emm0); - - e = vaddq_f32(e, one); - - // part2: - // if( x < SQRTHF ) { - // e -= 1; - // x = x + x - 1.0; - // } else { - // x = x - 1.0; - // } - // - uint32x4_t mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF)); - float32x4_t tmp = - vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); - x = vsubq_f32(x, one); - e = vsubq_f32( - e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask))); - x = vaddq_f32(x, tmp); - - float32x4_t z = vmulq_f32(x, x); - - float32x4_t y = vdupq_n_f32(c_cephes_log_p0); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8)); - y = vmulq_f32(y, x); - - y = vmulq_f32(y, z); - - tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1)); - y = vaddq_f32(y, tmp); - - tmp = vmulq_f32(z, vdupq_n_f32(0.5f)); - y = vsubq_f32(y, tmp); - - tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2)); - x = vaddq_f32(x, y); - x = vaddq_f32(x, tmp); - x = vreinterpretq_f32_u32(vorrq_u32( - vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN - return x; -} - -#define c_exp_hi 88.3762626647949f -#define c_exp_lo -88.3762626647949f - -#define c_cephes_LOG2EF 1.44269504088896341 -#define c_cephes_exp_C1 0.693359375 -#define c_cephes_exp_C2 -2.12194440e-4 - -#define c_cephes_exp_p0 1.9875691500E-4 -#define c_cephes_exp_p1 1.3981999507E-3 -#define c_cephes_exp_p2 8.3334519073E-3 -#define c_cephes_exp_p3 4.1665795894E-2 -#define c_cephes_exp_p4 1.6666665459E-1 -#define c_cephes_exp_p5 5.0000001201E-1 - -// exp() computed for 4 float at once -inline float32x4_t exp_ps(float32x4_t x) { - float32x4_t tmp, fx; - - float32x4_t one = vdupq_n_f32(1); - x = vminq_f32(x, vdupq_n_f32(c_exp_hi)); - x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo)); - - // express exp(x) as exp(g + n*log(2)) - fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF)); - - // perform a floorf - tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); - - // if greater, substract 1 - uint32x4_t mask = vcgtq_f32(tmp, fx); - mask = vandq_u32(mask, vreinterpretq_u32_f32(one)); - - fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); - - tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1)); - float32x4_t z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2)); - x = vsubq_f32(x, tmp); - x = vsubq_f32(x, z); - - static const float cephes_exp_p[6] = {c_cephes_exp_p0, - c_cephes_exp_p1, - c_cephes_exp_p2, - c_cephes_exp_p3, - c_cephes_exp_p4, - c_cephes_exp_p5}; - float32x4_t y = vld1q_dup_f32(cephes_exp_p + 0); - float32x4_t c1 = vld1q_dup_f32(cephes_exp_p + 1); - float32x4_t c2 = vld1q_dup_f32(cephes_exp_p + 2); - float32x4_t c3 = vld1q_dup_f32(cephes_exp_p + 3); - float32x4_t c4 = vld1q_dup_f32(cephes_exp_p + 4); - float32x4_t c5 = 
vld1q_dup_f32(cephes_exp_p + 5); - - y = vmulq_f32(y, x); - z = vmulq_f32(x, x); - - y = vaddq_f32(y, c1); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c2); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c3); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c4); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c5); - - y = vmulq_f32(y, z); - y = vaddq_f32(y, x); - y = vaddq_f32(y, one); - - // build 2^n - int32x4_t mm; - mm = vcvtq_s32_f32(fx); - mm = vaddq_s32(mm, vdupq_n_s32(0x7f)); - mm = vshlq_n_s32(mm, 23); - float32x4_t pow2n = vreinterpretq_f32_s32(mm); - - y = vmulq_f32(y, pow2n); - return y; -} - -#define c_minus_cephes_DP1 -0.78515625 -#define c_minus_cephes_DP2 -2.4187564849853515625e-4 -#define c_minus_cephes_DP3 -3.77489497744594108e-8 -#define c_sincof_p0 -1.9515295891E-4 -#define c_sincof_p1 8.3321608736E-3 -#define c_sincof_p2 -1.6666654611E-1 -#define c_coscof_p0 2.443315711809948E-005 -#define c_coscof_p1 -1.388731625493765E-003 -#define c_coscof_p2 4.166664568298827E-002 -#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI - -// evaluation of 4 sines & cosines at once. -// -// The code is the exact rewriting of the cephes sinf function. -// Precision is excellent as long as x < 8192 (I did not bother to -// take into account the special handling they have for greater values -// -- it does not return garbage for arguments over 8192, though, but -// the extra precision is missing). -// -// Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the -// surprising but correct result. -// -// Note also that when you compute sin(x), cos(x) is available at -// almost no extra price so both sin_ps and cos_ps make use of -// sincos_ps.. -// -inline void sincos_ps(float32x4_t x, float32x4_t *ysin, float32x4_t *ycos) { - // any x - float32x4_t xmm1, xmm2, xmm3, y; - - uint32x4_t emm2; - - uint32x4_t sign_mask_sin, sign_mask_cos; - sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0)); - x = vabsq_f32(x); - - // scale by 4/Pi - y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI)); - - // store the integer part of y in mm0 - emm2 = vcvtq_u32_f32(y); - // j=(j+1) & (~1) (see the cephes sources) - emm2 = vaddq_u32(emm2, vdupq_n_u32(1)); - emm2 = vandq_u32(emm2, vdupq_n_u32(~1)); - y = vcvtq_f32_u32(emm2); - - // get the polynom selection mask - // there is one polynom for 0 <= x <= Pi/4 - // and another one for Pi/4 -void fill_bias_fc(T *tensor, const T *bias, int num, int channel); - -template -inline float32x4_t vactive_f32(const float32x4_t &x) { - return x; -} - -template <> -inline float32x4_t vactive_f32( - const float32x4_t &x) { - float32x4_t __zero = vdupq_n_f32(0.f); - return vmaxq_f32(x, __zero); -} - -template <> -inline float32x4_t vactive_f32( - const float32x4_t &x) { - float32x4_t __zero = vdupq_n_f32(0.f); - float32x4_t __six = vdupq_n_f32(6.f); - return vminq_f32(vmaxq_f32(x, __zero), __six); -} - -template <> -inline float32x4_t vactive_f32( - const float32x4_t &x) { - float32x4_t __one = vdupq_n_f32(1.f); - float32x4_t __x = vnegq_f32(x); - __x = exp_ps(__x); - __x = vaddq_f32(__x, __one); - float32x4_t __out = vrecpeq_f32(__x); - return vmulq_f32(vrecpsq_f32(__x, __out), __out); -} - -template <> -inline float32x4_t vactive_f32( - const float32x4_t &x) { - float32x4_t __one = vdupq_n_f32(1.f); - float32x4_t __x = vmulq_n_f32(x, -2.f); - __x = exp_ps(__x); - __x = vaddq_f32(__x, __one); - float32x4_t __out = vrecpeq_f32(__x); - __out = vmulq_f32(vrecpsq_f32(__x, __out), __out); - __out = vmulq_n_f32(__out, 2.f); - return vsubq_f32(__out, __one); -} - -template -inline float active_f32(const float 
&x) { - return x; -} - -template <> -inline float active_f32(const float &x) { - return std::max(x, 0.f); -} - -template <> -inline float active_f32(const float &x) { - return std::min(std::max(x, 0.f), 6.f); -} - -template <> -inline float active_f32(const float &x) { - return 1.f / (1.f + exp(-x)); -} - -template <> -inline float active_f32(const float &x) { - return 2.f / (1.f + exp(-2.f * x)) - 1.f; -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/gemm_prepacked_int8.cc b/lite/backends/arm/math/gemm_prepacked_int8.cc deleted file mode 100644 index 9efae11157..0000000000 --- a/lite/backends/arm/math/gemm_prepacked_int8.cc +++ /dev/null @@ -1,3942 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/gemm_prepacked_int8.h" -#include -#include "lite/backends/arm/math/dot_toolchain_support.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void prepackA_m4k2x2_int8(int8_t* out, - const int8_t* in, - int ldin, - int m0, - int mmax, - int k0, - int kmax); - -void prepackA_m4k2x2_trans_int8(int8_t* out, - const int8_t* in, - int ldin, - int m0, - int mmax, - int k0, - int kmax); - -void packb_int8(int8_t* out, - const int8_t* in, - int ldin, - int k0, - int kmax, - int n0, - int nmax, - const int8_t* zerobuf); - -void packb_trans_int8(int8_t* out, - const int8_t* in, - int ldin, - int k0, - int kmax, - int n0, - int nmax, - const int8_t* zerobuf); - -#ifdef WITH_ARM_DOTPROD -void prepackA_m8k4_int8(int8_t* out, - const int8_t* in, - int ldin, - int m0, - int mmax, - int k0, - int kmax); - -void prepackA_m8k4_trans_int8(int8_t* out, - const int8_t* in, - int ldin, - int m0, - int mmax, - int k0, - int kmax); - -void packb_sdot_int8(int8_t* out, - const int8_t* in, - int ldin, - int k0, - int kmax, - int n0, - int nmax); - -void packb_sdot_trans_int8(int8_t* out, - const int8_t* in, - int ldin, - int k0, - int kmax, - int n0, - int nmax); -#endif - -void prepackA_int8(void* out, - const void* in, - int ldin, - int m0, - int mmax, - int k0, - int kmax, - bool is_trans, - ARMContext* ctx) { -#if defined(__aarch64__) && defined(WITH_ARM_DOTPROD) - if (is_trans) { - if (ctx->has_dot()) { - prepackA_m8k4_trans_int8(static_cast(out), - static_cast(in), - ldin, - m0, - mmax, - k0, - kmax); - } else { - prepackA_m4k2x2_trans_int8(static_cast(out), - static_cast(in), - ldin, - m0, - mmax, - k0, - kmax); - } - } else { - if (ctx->has_dot()) { - prepackA_m8k4_int8(static_cast(out), - static_cast(in), - ldin, - m0, - mmax, - k0, - kmax); - } else { - prepackA_m4k2x2_int8(static_cast(out), - static_cast(in), - ldin, - m0, - mmax, - k0, - kmax); - } - } -#else - if (is_trans) { - prepackA_m4k2x2_trans_int8(static_cast(out), - static_cast(in), - ldin, - m0, - mmax, - k0, - kmax); - } else { - prepackA_m4k2x2_int8(static_cast(out), - static_cast(in), - ldin, - m0, - mmax, - k0, - kmax); - } 
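  // The dispatch above mirrors the kernel choice: on armv8 builds where
  // ctx->has_dot() reports the sdot extension, A is packed in the m8k4
  // layout consumed by the dot-product GEMM kernels; otherwise, and on all
  // armv7 builds via the #else branch, A is packed m4k2x2 for the
  // smull/smlal2 kernels, so the packed layout always matches the GEMM
  // kernel that will read it.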
-#endif -} - -void prepackA_int8(TensorLite* tout, - const TensorLite& tin, - int m, - int k, - int group, - bool is_trans, - ARMContext* ctx) { - int hblock = get_hblock_int8(ctx); - int m_roundup = ROUNDUP(m, hblock); - // round up to 128 bits - int kup = ROUNDUP(k, KBLOCK_INT8); - int group_size_round_up = ((m_roundup * kup + 15) / 16) * 16; - - if (tout->numel() < group_size_round_up * group) { - tout->Resize({1, 1, 1, group_size_round_up * group}); - } - int lda = k; - if (is_trans) { - lda = m; - } - for (int g = 0; g < group; ++g) { - const char* weights_group = tin.data() + g * m * k; - char* weights_trans_ptr = - tout->mutable_data() + g * group_size_round_up; - prepackA_int8( - weights_trans_ptr, weights_group, lda, 0, m, 0, k, is_trans, ctx); - } -} - -template -inline void gemm_int8_kernel(const int8_t* a_ptr, - const int8_t*& b_ptr, // NOLINT - const int32_t* bias, - Dtype*& c_ptr0, // NOLINT - Dtype*& c_ptr1, // NOLINT - Dtype*& c_ptr2, // NOLINT - Dtype*& c_ptr3, // NOLINT - const float* scale, - bool is_relu, - int k, - int rem); -#ifdef __aarch64__ -#define GEMM_INT8_KERNEL \ - "ld1 {v0.16b}, [%[a_ptr]],#16\n" /* load a to q0, q1 */ \ - "ld1 {v4.16b, v5.16b}, [%[b_ptr]],#32\n" /* load b to q4, q5 */ \ - "ld1 {v6.16b, v7.16b}, [%[b_ptr]],#32\n" /* load b to q6, q7 */ \ - "ldr q8, [%[bias]]\n" /* load bias */ \ - "ext v9.16b, v8.16b, v8.16b, #4\n" /* shift left 1s */ \ - "ext v10.16b, v8.16b, v8.16b, #8\n" /* shift left 2s */ \ - "ext v11.16b, v8.16b, v8.16b, #12\n" /* shift left 3s */ \ - "and v16.16b, v8.16b, v8.16b\n" /* set bias0 to out00 */ \ - "and v17.16b, v9.16b, v9.16b\n" /* set bias0 to out01 */ \ - "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ \ - "and v18.16b, v10.16b, v10.16b\n" /* set bias0 to out02 */ \ - "and v19.16b, v11.16b, v11.16b\n" /* set bias0 to out03 */ \ - "prfm pldl1keep, [%[b_ptr], #64]\n" /* preload b*/ \ - "and v20.16b, v8.16b, v8.16b\n" /* set bias0 to out10 */ \ - "and v21.16b, v9.16b, v9.16b\n" /* set bias0 to out11 */ \ - "prfm pldl1keep, [%[a_ptr], #128]\n" /* preload a*/ \ - "and v22.16b, v10.16b, v10.16b\n" /* set bias0 to out12 */ \ - "and v23.16b, v11.16b, v11.16b\n" /* set bias0 to out13 */ \ - "prfm pldl1keep, [%[b_ptr], #128]\n" /* preload b*/ \ - "and v24.16b, v8.16b, v8.16b\n" /* set bias0 to out20 */ \ - "and v25.16b, v9.16b, v9.16b\n" /* set bias0 to out21 */ \ - "prfm pldl1keep, [%[a_ptr], #192]\n" /* preload a*/ \ - "and v26.16b, v10.16b, v10.16b\n" /* set bias0 to out22 */ \ - "and v27.16b, v11.16b, v11.16b\n" /* set bias0 to out23 */ \ - "prfm pldl1keep, [%[b_ptr], #192]\n" /* preload b*/ \ - "and v28.16b, v8.16b, v8.16b\n" /* set bias0 to out30 */ \ - "and v29.16b, v9.16b, v9.16b\n" /* set bias0 to out31 */ \ - "prfm pldl1keep, [%[b_ptr], #256]\n" /* preload b*/ \ - "and v30.16b, v10.16b, v10.16b\n" /* set bias0 to out32 */ \ - "and v31.16b, v11.16b, v11.16b\n" /* set bias0 to out33 */ \ - "ext v1.16b, v0.16b, v0.16b, #2\n" /* shift left 2bytes */ \ - "ins v1.h[3], v0.h[0]\n" /* insert element */ \ - "ins v1.h[7], v0.h[4]\n" /* insert element */ \ - "rev64 v2.4s, v0.4s\n" /* get low: 22,33,00,11; hi: 66,77,44,55 */ \ - "rev64 v3.4s, v1.4s\n" /* get low: 33,00,11,22; hi: 77,44,55,66 */ \ - "prfm pldl1keep, [%[b_ptr], #320]\n" /* preload a*/ \ - "prfm pldl1keep, [%[b_ptr], #384]\n" /* preload b*/ \ - "cbz %w[k], 3f\n" /* if k = 0, jump to remains */ /* 1st b0, b1 */ \ - "smull v8.8h, v0.8b, v4.8b\n" /* a0 * b0 = c00 */ \ - "smull v12.8h, v0.8b, v5.8b\n" /* a0 * b1 = c01 */ \ - "smull v9.8h, v1.8b, v4.8b\n" /* a1 * b0 = 
c10 */ \ - "smull v13.8h, v1.8b, v5.8b\n" /* a1 * b1 = c11 */ \ - "smull v10.8h, v2.8b, v4.8b\n" /* a2 * b0 = c20 */ \ - "smull v14.8h, v2.8b, v5.8b\n" /* a2 * b1 = c21 */ \ - "smull v11.8h, v3.8b, v4.8b\n" /* a3 * b0 = c30 */ \ - "smull v15.8h, v3.8b, v5.8b\n" /* a3 * b1 = c31 */ \ - "subs %w[k], %w[k], #1\n" /* loop count -1 */ /* 2nd b0, b1 */ \ - "smlal2 v8.8h, v0.16b, v4.16b\n" /* a0 * b0 = c00 */ \ - "smlal2 v12.8h, v0.16b, v5.16b\n" /* a0 * b1 = c01 */ \ - "smlal2 v9.8h, v1.16b, v4.16b\n" /* a1 * b0 = c10 */ \ - "smlal2 v13.8h, v1.16b, v5.16b\n" /* a1 * b1 = c11 */ \ - "smlal2 v10.8h, v2.16b, v4.16b\n" /* a2 * b0 = c20 */ \ - "smlal2 v14.8h, v2.16b, v5.16b\n" /* a2 * b1 = c21 */ \ - "smlal2 v11.8h, v3.16b, v4.16b\n" /* a3 * b0 = c30 */ \ - "smlal2 v15.8h, v3.16b, v5.16b\n" /* a3 * b1 = c31 */ \ - "beq 8f\n" /* skip main loop */ /* main loop*/ \ - "0:\n" /* main loop */ \ - "ld1 {v4.16b, v5.16b}, [%[b_ptr]],#32\n" /* load b to q4, q5 */ \ - "sadalp v16.4s, v8.8h\n" /* pairwise accumulate to int32, out00 */ \ - "smull v8.8h, v0.8b, v6.8b\n" /* a0 * b2 = c02 */ \ - "sadalp v20.4s, v12.8h\n" /* pairwise accumulate to int32, out01 */ \ - "smull v12.8h, v0.8b, v7.8b\n" /* a0 * b3 = c03 */ \ - "sadalp v17.4s, v9.8h\n" /* pairwise accumulate to int32, out10 */ \ - "smull v9.8h, v1.8b, v6.8b\n" /* a1 * b2 = c12 */ \ - "sadalp v21.4s, v13.8h\n" /* pairwise accumulate to int32, out11 */ \ - "smull v13.8h, v1.8b, v7.8b\n" /* a1 * b3 = c13 */ \ - "sadalp v18.4s, v10.8h\n" /* pairwise accumulate to int32, out20 */ \ - "smull v10.8h, v2.8b, v6.8b\n" /* a2 * b2 = c22 */ \ - "sadalp v22.4s, v14.8h\n" /* pairwise accumulate to int32, out21 */ \ - "smull v14.8h, v2.8b, v7.8b\n" /* a2 * b3 = c23 */ \ - "sadalp v19.4s, v11.8h\n" /* pairwise accumulate to int32, out30 */ \ - "smlal2 v8.8h, v0.16b, v6.16b\n" /* a0 * b2 = c02 */ \ - "smlal2 v12.8h, v0.16b, v7.16b\n" /* a0 * b3 = c03 */ \ - "ld1 {v0.16b}, [%[a_ptr]],#16\n" /* load a to q0, q1 */ \ - "smull v11.8h, v3.8b, v6.8b\n" /* a3 * b2 = c32 */ \ - "sadalp v23.4s, v15.8h\n" /* pairwise accumulate to int32, out31 */ \ - "smull v15.8h, v3.8b, v7.8b\n" /* a3 * b3 = c33 */ /* 2nd b2, b3 */ \ - "smlal2 v9.8h, v1.16b, v6.16b\n" /* a1 * b2 = c12 */ \ - "smlal2 v13.8h, v1.16b, v7.16b\n" /* a1 * b3 = c13 */ \ - "smlal2 v10.8h, v2.16b, v6.16b\n" /* a2 * b2 = c22 */ \ - "ext v1.16b, v0.16b, v0.16b, #2\n" /* shift left 2bytes*/ \ - "ins v1.h[3], v0.h[0]\n" /* insert element */ \ - "ins v1.h[7], v0.h[4]\n" /* insert element */ \ - "smlal2 v14.8h, v2.16b, v7.16b\n" /* a2 * b3 = c23 */ \ - "smlal2 v11.8h, v3.16b, v6.16b\n" /* a3 * b2 = c32 */ \ - "smlal2 v15.8h, v3.16b, v7.16b\n" /* a3 * b3 = c33 */ /* pre-process a */ \ - "rev64 v2.4s, v0.4s\n" /* get low: 22,33,00,11; hi: 66,77,44,55 */ \ - "rev64 v3.4s, v1.4s\n" /* get low: 33,00,11,22; hi: 77,44,55,66 */ \ - "ld1 {v6.16b, v7.16b}, [%[b_ptr]],#32\n" /* load b to q6, q7 */ \ - "sadalp v24.4s, v8.8h\n" /* pairwise accumulate to int32, out02 */ \ - "smull v8.8h, v0.8b, v4.8b\n" /* a0 * b0 = c00 */ \ - "sadalp v28.4s, v12.8h\n" /* pairwise accumulate to int32, out03 */ \ - "smull v12.8h, v0.8b, v5.8b\n" /* a0 * b1 = c01 */ \ - "sadalp v25.4s, v9.8h\n" /* pairwise accumulate to int32, out12 */ \ - "smull v9.8h, v1.8b, v4.8b\n" /* a1 * b0 = c00 */ \ - "sadalp v29.4s, v13.8h\n" /* pairwise accumulate to int32, out13 */ \ - "smull v13.8h, v1.8b, v5.8b\n" /* a1 * b1 = c01 */ \ - "sadalp v26.4s, v10.8h\n" /* pairwise accumulate to int32, out22 */ \ - "smull v10.8h, v2.8b, v4.8b\n" /* a2 * b0 = c00 */ \ - "sadalp v30.4s, 
v14.8h\n" /* pairwise accumulate to int32, out23 */ \ - "smull v14.8h, v2.8b, v5.8b\n" /* a2 * b1 = c01 */ \ - "sadalp v27.4s, v11.8h\n" /* pairwise accumulate to int32, out32 */ \ - "smull v11.8h, v3.8b, v4.8b\n" /* a3 * b0 = c00 */ \ - "sadalp v31.4s, v15.8h\n" /* pairwise accumulate to int32, out33 */ \ - "smull v15.8h, v3.8b, v5.8b\n" /* a3 * b1 = c01 */ \ - "subs %w[k], %w[k], #1\n" /* loop count -1 */ /* 2nd b0, b1 */ \ - "smlal2 v8.8h, v0.16b, v4.16b\n" /* a0 * b0 = c00 */ \ - "smlal2 v12.8h, v0.16b, v5.16b\n" /* a0 * b1 = c01 */ \ - "smlal2 v9.8h, v1.16b, v4.16b\n" /* a1 * b0 = c10 */ \ - "smlal2 v13.8h, v1.16b, v5.16b\n" /* a1 * b1 = c11 */ \ - "smlal2 v10.8h, v2.16b, v4.16b\n" /* a2 * b0 = c20 */ \ - "smlal2 v14.8h, v2.16b, v5.16b\n" /* a2 * b1 = c21 */ \ - "smlal2 v11.8h, v3.16b, v4.16b\n" /* a3 * b0 = c30 */ \ - "smlal2 v15.8h, v3.16b, v5.16b\n" /* a3 * b1 = c31 */ \ - "bgt 0b\n" /* jump to main loop */ \ - "8:\n" /* finish main loop */ /* 1st b2, b3 */ \ - "sadalp v16.4s, v8.8h\n" /* pairwise accumulate to int32, out00 */ \ - "smull v8.8h, v0.8b, v6.8b\n" /* a0 * b0 = c02 */ \ - "sadalp v20.4s, v12.8h\n" /* pairwise accumulate to int32, out01 */ \ - "smull v12.8h, v0.8b, v7.8b\n" /* a0 * b1 = c03 */ \ - "sadalp v17.4s, v9.8h\n" /* pairwise accumulate to int32, out10 */ \ - "smull v9.8h, v1.8b, v6.8b\n" /* a1 * b0 = c12 */ \ - "sadalp v21.4s, v13.8h\n" /* pairwise accumulate to int32, out11 */ \ - "smull v13.8h, v1.8b, v7.8b\n" /* a1 * b1 = c13 */ \ - "sadalp v18.4s, v10.8h\n" /* pairwise accumulate to int32, out20 */ \ - "smull v10.8h, v2.8b, v6.8b\n" /* a2 * b0 = c22 */ \ - "sadalp v22.4s, v14.8h\n" /* pairwise accumulate to int32, out21 */ \ - "smull v14.8h, v2.8b, v7.8b\n" /* a2 * b1 = c23 */ \ - "sadalp v19.4s, v11.8h\n" /* pairwise accumulate to int32, out30 */ \ - "smull v11.8h, v3.8b, v6.8b\n" /* a3 * b0 = c32 */ \ - "sadalp v23.4s, v15.8h\n" /* pairwise accumulate to int32, out31 */ \ - "smull v15.8h, v3.8b, v7.8b\n" /* a3 * b1 = c33 */ /* 2nd b2, b3 */ \ - "smlal2 v8.8h, v0.16b, v6.16b\n" /* a0 * b0 = c02 */ \ - "smlal2 v12.8h, v0.16b, v7.16b\n" /* a0 * b1 = c03 */ \ - "smlal2 v9.8h, v1.16b, v6.16b\n" /* a1 * b0 = c12 */ \ - "smlal2 v13.8h, v1.16b, v7.16b\n" /* a1 * b1 = c23 */ \ - "smlal2 v10.8h, v2.16b, v6.16b\n" /* a2 * b0 = c13 */ \ - "smlal2 v14.8h, v2.16b, v7.16b\n" /* a2 * b1 = c32 */ \ - "smlal2 v11.8h, v3.16b, v6.16b\n" /* a3 * b0 = c22 */ \ - "smlal2 v15.8h, v3.16b, v7.16b\n" /* a3 * b1 = c33 */ \ - "cbz %w[rem], 5f\n" /* skip remain */ \ - "ld1 {v0.8b}, [%[a_ptr]]\n" /* load a to q0, final */ \ - "ld1 {v4.16b, v5.16b}, [%[b_ptr]],#32\n" /* load b to q4, q5 */ \ - "ld1 {v6.16b, v7.16b}, [%[b_ptr]],#32\n" /* load b to q6, q7 */ \ - "5:\n" /* no remain */ \ - "sadalp v24.4s, v8.8h\n" /* pairwise accumulate to int32, out02 */ \ - "sadalp v28.4s, v12.8h\n" /* pairwise accumulate to int32, out03 */ \ - "sadalp v25.4s, v9.8h\n" /* pairwise accumulate to int32, out12 */ \ - "sadalp v29.4s, v13.8h\n" /* pairwise accumulate to int32, out13 */ \ - "sadalp v26.4s, v10.8h\n" /* pairwise accumulate to int32, out22 */ \ - "sadalp v30.4s, v14.8h\n" /* pairwise accumulate to int32, out23 */ \ - "sadalp v27.4s, v11.8h\n" /* pairwise accumulate to int32, out32 */ \ - "sadalp v31.4s, v15.8h\n" /* pairwise accumulate to int32, out33 */ \ - "3: \n" /* process remains */ \ - "cbz %w[rem], 7f\n" /* skip remain */ /* process remain k */ \ - "4: \n" /* remain = 1, 2 */ \ - "ext v1.8b, v0.8b, v0.8b, #2\n" /* shift left 2bytes */ \ - "ext v2.8b, v0.8b, v0.8b, #4\n" /* shift left 
4bytes */ \
- "ext v3.8b, v0.8b, v0.8b, #6\n" /* shift left 6bytes */ /* 1st b0, b1 */ \
- "smull v8.8h, v0.8b, v4.8b\n" /* a0 * b0 = c00 */ \
- "smull v12.8h, v0.8b, v5.8b\n" /* a0 * b1 = c01 */ \
- "smull v9.8h, v1.8b, v4.8b\n" /* a1 * b0 = c10 */ \
- "smull v13.8h, v1.8b, v5.8b\n" /* a1 * b1 = c11 */ \
- "smull v10.8h, v2.8b, v4.8b\n" /* a2 * b0 = c20 */ \
- "smull v14.8h, v2.8b, v5.8b\n" /* a2 * b1 = c21 */ \
- "smull v11.8h, v3.8b, v4.8b\n" /* a3 * b0 = c30 */ \
- "smull v15.8h, v3.8b, v5.8b\n" /* a3 * b1 = c31 */ /* 1st b2, b3 */ \
- "sadalp v16.4s, v8.8h\n" /* pairwise accumulate to int32, out00 */ \
- "smull v8.8h, v0.8b, v6.8b\n" /* a0 * b2 = c02 */ \
- "sadalp v20.4s, v12.8h\n" /* pairwise accumulate to int32, out01 */ \
- "smull v12.8h, v0.8b, v7.8b\n" /* a0 * b3 = c03 */ \
- "sadalp v17.4s, v9.8h\n" /* pairwise accumulate to int32, out10 */ \
- "smull v9.8h, v1.8b, v6.8b\n" /* a1 * b2 = c12 */ \
- "sadalp v21.4s, v13.8h\n" /* pairwise accumulate to int32, out11 */ \
- "smull v13.8h, v1.8b, v7.8b\n" /* a1 * b3 = c13 */ \
- "sadalp v18.4s, v10.8h\n" /* pairwise accumulate to int32, out20 */ \
- "smull v10.8h, v2.8b, v6.8b\n" /* a2 * b2 = c22 */ \
- "sadalp v22.4s, v14.8h\n" /* pairwise accumulate to int32, out21 */ \
- "smull v14.8h, v2.8b, v7.8b\n" /* a2 * b3 = c23 */ \
- "sadalp v19.4s, v11.8h\n" /* pairwise accumulate to int32, out30 */ \
- "smull v11.8h, v3.8b, v6.8b\n" /* a3 * b2 = c32 */ \
- "sadalp v23.4s, v15.8h\n" /* pairwise accumulate to int32, out31 */ \
- "smull v15.8h, v3.8b, v7.8b\n" /* a3 * b3 = c33 */ \
- "sadalp v24.4s, v8.8h\n" /* pairwise accumulate to int32, out02 */ \
- "sadalp v28.4s, v12.8h\n" /* pairwise accumulate to int32, out03 */ \
- "sadalp v25.4s, v9.8h\n" /* pairwise accumulate to int32, out12 */ \
- "sadalp v29.4s, v13.8h\n" /* pairwise accumulate to int32, out13 */ \
- "sadalp v26.4s, v10.8h\n" /* pairwise accumulate to int32, out22 */ \
- "sadalp v30.4s, v14.8h\n" /* pairwise accumulate to int32, out23 */ \
- "sadalp v27.4s, v11.8h\n" /* pairwise accumulate to int32, out32 */ \
- "sadalp v31.4s, v15.8h\n" /* pairwise accumulate to int32, out33 */ \
- "7: \n" /* do relu */ \
- "cbz %w[is_relu], 9f\n" /* not relu, jump to unpack */ \
- "movi v0.4s, #0\n" /* for relu */ \
- "smax v16.4s, v16.4s, v0.4s\n" /* relu */ \
- "smax v17.4s, v17.4s, v0.4s\n" /* relu */ \
- "smax v18.4s, v18.4s, v0.4s\n" /* relu */ \
- "smax v19.4s, v19.4s, v0.4s\n" /* relu */ \
- "smax v20.4s, v20.4s, v0.4s\n" /* relu */ \
- "smax v21.4s, v21.4s, v0.4s\n" /* relu */ \
- "smax v22.4s, v22.4s, v0.4s\n" /* relu */ \
- "smax v23.4s, v23.4s, v0.4s\n" /* relu */ \
- "smax v24.4s, v24.4s, v0.4s\n" /* relu */ \
- "smax v25.4s, v25.4s, v0.4s\n" /* relu */ \
- "smax v26.4s, v26.4s, v0.4s\n" /* relu */ \
- "smax v27.4s, v27.4s, v0.4s\n" /* relu */ \
- "smax v28.4s, v28.4s, v0.4s\n" /* relu */ \
- "smax v29.4s, v29.4s, v0.4s\n" /* relu */ \
- "smax v30.4s, v30.4s, v0.4s\n" /* relu */ \
- "smax v31.4s, v31.4s, v0.4s\n" /* relu */ /* unpack the result */ \
- "9:\n" /* unpack */ /* trans 1 */ \
- "trn1 v0.4s, v16.4s, v17.4s\n" /* get a0,b0, a2,b2 */ \
- "trn2 v1.4s, v16.4s, v17.4s\n" /* get a1,b1, a3,b3 */ \
- "trn1 v2.4s, v18.4s, v19.4s\n" /* get c0,d0, c2,d2 */ \
- "trn2 v3.4s, v18.4s, v19.4s\n" /* get c1,d1, c3,d3 */ \
- "trn1 v4.4s, v20.4s, v21.4s\n" \
- "trn2 v5.4s, v20.4s, v21.4s\n" \
- "trn1 v6.4s, v22.4s, v23.4s\n" \
- "trn2 v7.4s, v22.4s, v23.4s\n" \
- "trn1 v8.4s, v24.4s, v25.4s\n" \
- "trn2 v9.4s, v24.4s, v25.4s\n" \
- "trn1 v10.4s, v26.4s, v27.4s\n" \
- "trn2 
v11.4s, v26.4s, v27.4s\n" \ - "trn1 v12.4s, v28.4s, v29.4s\n" \ - "trn2 v13.4s, v28.4s, v29.4s\n" \ - "trn1 v14.4s, v30.4s, v31.4s\n" \ - "trn2 v15.4s, v30.4s, v31.4s\n" /* trans 2 */ \ - "trn1 v16.2d, v0.2d, v2.2d\n" /* get a0,b0, c0,d0 */ \ - "trn2 v18.2d, v0.2d, v2.2d\n" /* get a2,b2, c2,d2 */ \ - "trn1 v17.2d, v1.2d, v3.2d\n" /* get a1,b1, c1,d1 */ \ - "trn2 v19.2d, v1.2d, v3.2d\n" /* get a3,b3, c3,d3 */ \ - "trn1 v20.2d, v4.2d, v6.2d\n" \ - "trn2 v22.2d, v4.2d, v6.2d\n" \ - "trn1 v21.2d, v5.2d, v7.2d\n" \ - "trn2 v23.2d, v5.2d, v7.2d\n" \ - "trn1 v24.2d, v8.2d, v10.2d\n" \ - "trn2 v26.2d, v8.2d, v10.2d\n" \ - "trn1 v25.2d, v9.2d, v11.2d\n" \ - "trn2 v27.2d, v9.2d, v11.2d\n" \ - "trn1 v28.2d, v12.2d, v14.2d\n" \ - "trn2 v30.2d, v12.2d, v14.2d\n" \ - "trn1 v29.2d, v13.2d, v15.2d\n" \ - "trn2 v31.2d, v13.2d, v15.2d\n" /* shift */ \ - "ext v17.16b, v17.16b, v17.16b, #12\n" /* circular shift left 1 */ \ - "ext v18.16b, v18.16b, v18.16b, #8\n" /* circular shift left 2 */ \ - "ext v19.16b, v19.16b, v19.16b, #4\n" /* circular shift left 3 */ \ - "ext v21.16b, v21.16b, v21.16b, #12\n" /* circular shift left 1 */ \ - "ext v22.16b, v22.16b, v22.16b, #8\n" /* circular shift left 2 */ \ - "ext v23.16b, v23.16b, v23.16b, #4\n" /* circular shift left 3 */ \ - "ext v25.16b, v25.16b, v25.16b, #12\n" /* circular shift left 1 */ \ - "ext v26.16b, v26.16b, v26.16b, #8\n" /* circular shift left 2 */ \ - "ext v27.16b, v27.16b, v27.16b, #4\n" /* circular shift left 3 */ \ - "ext v29.16b, v29.16b, v29.16b, #12\n" /* circular shift left 1 */ \ - "ext v30.16b, v30.16b, v30.16b, #8\n" /* circular shift left 2 */ \ - "ext v31.16b, v31.16b, v31.16b, #4\n" /* circular shift left 3 */ \ - "trn1 v0.4s, v16.4s, v17.4s\n" /* get a0,b0, a2,b2 */ \ - "trn2 v1.4s, v16.4s, v17.4s\n" /* get a1,b1, a3,b3 */ \ - "trn1 v2.4s, v18.4s, v19.4s\n" /* get c0,d0, c2,c2 */ \ - "trn2 v3.4s, v18.4s, v19.4s\n" /* get c1,d1, c3,d3 */ \ - "trn1 v4.4s, v20.4s, v21.4s\n" \ - "trn2 v5.4s, v20.4s, v21.4s\n" \ - "trn1 v6.4s, v22.4s, v23.4s\n" \ - "trn2 v7.4s, v22.4s, v23.4s\n" \ - "trn1 v8.4s, v24.4s, v25.4s\n" \ - "trn2 v9.4s, v24.4s, v25.4s\n" \ - "trn1 v10.4s, v26.4s, v27.4s\n" \ - "trn2 v11.4s, v26.4s, v27.4s\n" \ - "trn1 v12.4s, v28.4s, v29.4s\n" \ - "trn2 v13.4s, v28.4s, v29.4s\n" \ - "trn1 v14.4s, v30.4s, v31.4s\n" \ - "trn2 v15.4s, v30.4s, v31.4s\n" /* trans 2 */ \ - "trn1 v16.2d, v0.2d, v2.2d\n" /* get a0,b0, c0,d0 */ \ - "trn2 v24.2d, v0.2d, v2.2d\n" /* get a2,b2, c2,d2 */ \ - "trn1 v20.2d, v1.2d, v3.2d\n" /* get a1,b1, c1,d1 */ \ - "trn2 v28.2d, v1.2d, v3.2d\n" /* get a3,b3, c3,d3 */ \ - "trn1 v17.2d, v4.2d, v6.2d\n" \ - "trn2 v25.2d, v4.2d, v6.2d\n" \ - "trn1 v21.2d, v5.2d, v7.2d\n" \ - "trn2 v29.2d, v5.2d, v7.2d\n" \ - "trn1 v18.2d, v8.2d, v10.2d\n" \ - "trn2 v26.2d, v8.2d, v10.2d\n" \ - "trn1 v22.2d, v9.2d, v11.2d\n" \ - "trn2 v30.2d, v9.2d, v11.2d\n" \ - "trn1 v19.2d, v12.2d, v14.2d\n" \ - "trn2 v27.2d, v12.2d, v14.2d\n" \ - "trn1 v23.2d, v13.2d, v15.2d\n" \ - "trn2 v31.2d, v13.2d, v15.2d\n" - -// clang-format off -#define GEMM_INT8_INT32_OUT \ - /* store */ \ - "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[c_ptr0]], #64\n" \ - "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%[c_ptr1]], #64\n" \ - "st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%[c_ptr2]], #64\n" \ - "st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [%[c_ptr3]], #64\n" -// clang-format on - -#define GEMM_INT8_FP32_OUT \ - /* store */ \ - "ldr q15, [%[scale]]\n" /* load scale */ \ - "scvtf v0.4s , v16.4s\n" /* 00, convert to fp32 */ \ - "scvtf v1.4s , v17.4s\n" /* 01, convert to fp32 */ \ - 
"scvtf v2.4s , v18.4s\n" /* 02, convert to fp32 */ \ - "scvtf v3.4s , v19.4s\n" /* 03, convert to fp32 */ \ - "scvtf v4.4s , v20.4s\n" /* 10, convert to fp32 */ \ - "scvtf v5.4s , v21.4s\n" /* 11, convert to fp32 */ \ - "scvtf v6.4s , v22.4s\n" /* 12, convert to fp32 */ \ - "scvtf v7.4s , v23.4s\n" /* 13, convert to fp32 */ \ - "fmul v16.4s, v0.4s, v15.s[0]\n" /* 00, mul scale to get final result */ \ - "fmul v17.4s, v1.4s, v15.s[0]\n" /* 01, mul scale to get final result */ \ - "fmul v18.4s, v2.4s, v15.s[0]\n" /* 02, mul scale to get final result */ \ - "fmul v19.4s, v3.4s, v15.s[0]\n" /* 03, mul scale to get final result */ \ - "fmul v20.4s, v4.4s, v15.s[1]\n" /* 10, mul scale to get final result */ \ - "fmul v21.4s, v5.4s, v15.s[1]\n" /* 11, mul scale to get final result */ \ - "fmul v22.4s, v6.4s, v15.s[1]\n" /* 12, mul scale to get final result */ \ - "fmul v23.4s, v7.4s, v15.s[1]\n" /* 13, mul scale to get final result */ \ - "scvtf v0.4s , v24.4s\n" /* 20, convert to fp32 */ \ - "scvtf v1.4s , v25.4s\n" /* 21, convert to fp32 */ \ - "stp q16, q17, [%[c_ptr0]], #32\n" /* write r0, 0,1 */ \ - "scvtf v2.4s , v26.4s\n" /* 22, convert to fp32 */ \ - "scvtf v3.4s , v27.4s\n" /* 23, convert to fp32 */ \ - "stp q18, q19, [%[c_ptr0]], #32\n" /* write r0, 2,3 */ \ - "scvtf v4.4s , v28.4s\n" /* 30, convert to fp32 */ \ - "scvtf v5.4s , v29.4s\n" /* 31, convert to fp32 */ \ - "stp q20, q21, [%[c_ptr1]], #32\n" /* write r1, 0,1 */ \ - "scvtf v6.4s , v30.4s\n" /* 32, convert to fp32 */ \ - "scvtf v7.4s , v31.4s\n" /* 33, convert to fp32 */ \ - "stp q22, q23, [%[c_ptr1]], #32\n" /* write r1, 2,3 */ \ - "fmul v24.4s, v0.4s, v15.s[2]\n" /* 20, mul scale to get final result */ \ - "fmul v25.4s, v1.4s, v15.s[2]\n" /* 21, mul scale to get final result */ \ - "fmul v26.4s, v2.4s, v15.s[2]\n" /* 22, mul scale to get final result */ \ - "fmul v27.4s, v3.4s, v15.s[2]\n" /* 23, mul scale to get final result */ \ - "fmul v28.4s, v4.4s, v15.s[3]\n" /* 30, mul scale to get final result */ \ - "fmul v29.4s, v5.4s, v15.s[3]\n" /* 31, mul scale to get final result */ \ - "stp q24, q25, [%[c_ptr2]], #32\n" /* write r2, 2,3 */ \ - "fmul v30.4s, v6.4s, v15.s[3]\n" /* 32, mul scale to get final result */ \ - "stp q26, q27, [%[c_ptr2]], #32\n" /* write r2, 2,3 */ \ - "fmul v31.4s, v7.4s, v15.s[3]\n" /* 33, mul scale to get final result */ \ - "stp q28, q29, [%[c_ptr3]], #32\n" /* write r3, 2,3 */ \ - "stp q30, q31, [%[c_ptr3]], #32\n" /* write r3, 2,3 */ - -#define GEMM_INT8_INT8_OUT \ - /* store */ \ - "ldr q15, [%[scale]]\n" /* load scale */ \ - "scvtf v0.4s , v16.4s\n" /* 00, convert to fp32 */ \ - "scvtf v1.4s , v17.4s\n" /* 01, convert to fp32 */ \ - "scvtf v2.4s , v18.4s\n" /* 02, convert to fp32 */ \ - "scvtf v3.4s , v19.4s\n" /* 03, convert to fp32 */ \ - "scvtf v4.4s , v20.4s\n" /* 10, convert to fp32 */ \ - "scvtf v5.4s , v21.4s\n" /* 11, convert to fp32 */ \ - "scvtf v6.4s , v22.4s\n" /* 12, convert to fp32 */ \ - "scvtf v7.4s , v23.4s\n" /* 13, convert to fp32 */ \ - "fmul v16.4s, v0.4s, v15.s[0]\n" /* 00, mul scale to get final result */ \ - "fmul v17.4s, v1.4s, v15.s[0]\n" /* 01, mul scale to get final result */ \ - "fmul v18.4s, v2.4s, v15.s[0]\n" /* 02, mul scale to get final result */ \ - "fmul v19.4s, v3.4s, v15.s[0]\n" /* 03, mul scale to get final result */ \ - "fmul v20.4s, v4.4s, v15.s[1]\n" /* 20, mul scale to get final result */ \ - "fmul v21.4s, v5.4s, v15.s[1]\n" /* 21, mul scale to get final result */ \ - "fmul v22.4s, v6.4s, v15.s[1]\n" /* 22, mul scale to get final result */ \ - "fmul 
v23.4s, v7.4s, v15.s[1]\n" /* 23, mul scale to get final result */ \ - "scvtf v0.4s , v24.4s\n" /* 20, convert to fp32 */ \ - "scvtf v1.4s , v25.4s\n" /* 21, convert to fp32 */ \ - "scvtf v2.4s , v26.4s\n" /* 22, convert to fp32 */ \ - "scvtf v3.4s , v27.4s\n" /* 23, convert to fp32 */ \ - "scvtf v4.4s , v28.4s\n" /* 30, convert to fp32 */ \ - "scvtf v5.4s , v29.4s\n" /* 31, convert to fp32 */ \ - "scvtf v6.4s , v30.4s\n" /* 32, convert to fp32 */ \ - "scvtf v7.4s , v31.4s\n" /* 33, convert to fp32 */ \ - "fmul v24.4s, v0.4s, v15.s[2]\n" /* 20, mul scale to get final result */ \ - "fmul v25.4s, v1.4s, v15.s[2]\n" /* 21, mul scale to get final result */ \ - "fmul v26.4s, v2.4s, v15.s[2]\n" /* 22, mul scale to get final result */ \ - "fmul v27.4s, v3.4s, v15.s[2]\n" /* 23, mul scale to get final result */ \ - "fmul v28.4s, v4.4s, v15.s[3]\n" /* 30, mul scale to get final result */ \ - "fmul v29.4s, v5.4s, v15.s[3]\n" /* 31, mul scale to get final result */ \ - "fmul v30.4s, v6.4s, v15.s[3]\n" /* 32, mul scale to get final result */ \ - "fmul v31.4s, v7.4s, v15.s[3]\n" /* 33, mul scale to get final result */ \ - "fcvtas v0.4s, v16.4s\n" /* 00, cvt to int */ \ - "fcvtas v1.4s, v17.4s\n" /* 01, cvt to int */ \ - "fcvtas v2.4s, v18.4s\n" /* 02, cvt to int */ \ - "fcvtas v3.4s, v19.4s\n" /* 03, cvt to int */ \ - "fcvtas v4.4s, v20.4s\n" /* 10, cvt to int */ \ - "fcvtas v5.4s, v21.4s\n" /* 11, cvt to int */ \ - "fcvtas v6.4s, v22.4s\n" /* 12, cvt to int */ \ - "fcvtas v7.4s, v23.4s\n" /* 13, cvt to int */ \ - "sqxtn v16.4h, v0.4s\n" /* 00, cvt int32 to int16 */ \ - "fcvtas v8.4s, v24.4s\n" /* 20, cvt to int */ \ - "sqxtn2 v16.8h, v1.4s\n" /* 01, cvt int32 to int16 */ \ - "fcvtas v9.4s, v25.4s\n" /* 21, cvt to int */ \ - "sqxtn v17.4h, v2.4s\n" /* 02, cvt int32 to int16 */ \ - "fcvtas v10.4s, v26.4s\n" /* 22, cvt to int */ \ - "sqxtn2 v17.8h, v3.4s\n" /* 03, cvt int32 to int16 */ \ - "fcvtas v11.4s, v27.4s\n" /* 23, cvt to int */ \ - "sqxtn v18.4h, v4.4s\n" /* 10, cvt int32 to int16 */ \ - "fcvtas v12.4s, v28.4s\n" /* 30, cvt to int */ \ - "sqxtn2 v18.8h, v5.4s\n" /* 11, cvt int32 to int16 */ \ - "fcvtas v13.4s, v29.4s\n" /* 31, cvt to int */ \ - "sqxtn v19.4h, v6.4s\n" /* 12, cvt int32 to int16 */ \ - "fcvtas v14.4s, v30.4s\n" /* 32, cvt to int */ \ - "sqxtn2 v19.8h, v7.4s\n" /* 13, cvt int32 to int16 */ \ - "fcvtas v15.4s, v31.4s\n" /* 33, cvt to int */ \ - "sqxtn v0.8b, v16.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn2 v0.16b, v17.8h\n" /* 02, 03, cvt int16 to int8 */ \ - "sqxtn v1.8b, v18.8h\n" /* 10, 11, cvt int16 to int8 */ \ - "sqxtn2 v1.16b, v19.8h\n" /* 12, 13, cvt int16 to int8 */ \ - "sqxtn v20.4h, v8.4s\n" /* 20, cvt int32 to int16 */ \ - "sqxtn2 v20.8h, v9.4s\n" /* 21, cvt int32 to int16 */ \ - "sqxtn v21.4h, v10.4s\n" /* 22, cvt int32 to int16 */ \ - "sqxtn2 v21.8h, v11.4s\n" /* 23, cvt int32 to int16 */ \ - "sqxtn v22.4h, v12.4s\n" /* 30, cvt int32 to int16 */ \ - "sqxtn2 v22.8h, v13.4s\n" /* 31, cvt int32 to int16 */ \ - "sqxtn v23.4h, v14.4s\n" /* 32, cvt int32 to int16 */ \ - "sqxtn2 v23.8h, v15.4s\n" /* 33, cvt int32 to int16 */ \ - "sqxtn v2.8b, v20.8h\n" /* 20, 21, cvt int16 to int8 */ \ - "sqxtn2 v2.16b, v21.8h\n" /* 22, 23, cvt int16 to int8 */ \ - "sqxtn v3.8b, v22.8h\n" /* 30, 31, cvt int16 to int8 */ \ - "sqxtn2 v3.16b, v23.8h\n" /* 32, 33, cvt int16 to int8 */ \ - "str q0, [%[c_ptr0]], #16\n" /* write r0 */ \ - "str q1, [%[c_ptr1]], #16\n" /* write r1 */ \ - "str q2, [%[c_ptr2]], #16\n" /* write r2 */ \ - "str q3, [%[c_ptr3]], #16\n" /* write r3 */ - -template <> 
-inline void gemm_int8_kernel(const int8_t* a_ptr, - const int8_t*& b_ptr, // NOLINT - const int32_t* bias, - int32_t*& c_ptr0, // NOLINT - int32_t*& c_ptr1, // NOLINT - int32_t*& c_ptr2, // NOLINT - int32_t*& c_ptr3, // NOLINT - const float* scale, // NOLINT - bool is_relu, // NOLINT - int k, - int rem) { - asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT32_OUT - : [a_ptr] "+r"(a_ptr), - [b_ptr] "+r"(b_ptr), - [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), - [k] "+r"(k) - : [is_relu] "r"(is_relu), [bias] "r"(bias), [rem] "r"(rem) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31", - "cc"); -} -template <> -inline void gemm_int8_kernel(const int8_t* a_ptr, - const int8_t*& b_ptr, // NOLINT - const int32_t* bias, - float*& c_ptr0, // NOLINT - float*& c_ptr1, // NOLINT - float*& c_ptr2, // NOLINT - float*& c_ptr3, // NOLINT - const float* scale, - bool is_relu, - int k, - int rem) { - asm volatile(GEMM_INT8_KERNEL GEMM_INT8_FP32_OUT - : [a_ptr] "+r"(a_ptr), - [b_ptr] "+r"(b_ptr), - [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), - [k] "+r"(k) - : [is_relu] "r"(is_relu), - [bias] "r"(bias), - [rem] "r"(rem), - [scale] "r"(scale) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31", - "cc"); -} - -template <> -inline void gemm_int8_kernel(const int8_t* a_ptr, - const int8_t*& b_ptr, // NOLINT - const int32_t* bias, - int8_t*& c_ptr0, // NOLINT - int8_t*& c_ptr1, // NOLINT - int8_t*& c_ptr2, // NOLINT - int8_t*& c_ptr3, // NOLINT - const float* scale, - bool is_relu, - int k, - int rem) { - asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT8_OUT - : [a_ptr] "+r"(a_ptr), - [b_ptr] "+r"(b_ptr), - [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), - [k] "+r"(k) - : [is_relu] "r"(is_relu), - [bias] "r"(bias), - [rem] "r"(rem), - [scale] "r"(scale) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31", - "cc"); -} - -#ifdef WITH_ARM_DOTPROD -template -inline void sgemm_sdot_int8_kernel(const int8_t* a_ptr, - const int8_t*& b_ptr, // NOLINT - const int32_t* bias, - Dtype*& c_ptr0, // NOLINT - Dtype*& c_ptr1, // NOLINT - Dtype*& c_ptr2, // NOLINT - Dtype*& c_ptr3, // NOLINT - Dtype*& c_ptr4, // NOLINT - Dtype*& c_ptr5, // NOLINT - Dtype*& c_ptr6, // NOLINT - Dtype*& c_ptr7, // NOLINT - const float32_t* scale, - bool is_relu, - int k, - int rem); - -#define GEMM_SDOT_INT8_KERNEL \ - "ldp q2, q3, [%[bias_ptr]]\n" /* load bias to q2, q3*/ \ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00,a01 to q0, q1*/ \ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ \ - "dup v8.4s, v2.s[0]\n" /* out0 = 0 */ \ - "dup v9.4s, v2.s[0]\n" /* out1 = 0*/ \ - "dup v10.4s, v2.s[0]\n" /* out2 = 0*/ \ - "dup v11.4s, v2.s[1]\n" /* out3 = 0*/ \ - "dup v12.4s, v2.s[1]\n" /* out4 = 0*/ \ - "prfm pldl1keep, [%[b_ptr], 
#64]\n" /* preload b*/ \ - "dup v13.4s, v2.s[1]\n" /* out5 = 0*/ \ - "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ \ - "dup v14.4s, v2.s[2]\n" /* out6 = 0*/ \ - "prfm pldl1keep, [%[b_ptr], #128]\n" /* preload b*/ \ - "dup v15.4s, v2.s[2]\n" /* out7 = 0*/ \ - "prfm pldl1keep, [%[a_ptr], #128]\n" /* preload a*/ \ - "dup v16.4s, v2.s[2]\n" /* out8 = 0*/ \ - "prfm pldl1keep, [%[b_ptr], #192]\n" /* preload b*/ \ - "dup v17.4s, v2.s[3]\n" /* out9 = 0*/ \ - "prfm pldl1keep, [%[b_ptr], #256]\n" /* preload b*/ \ - "dup v18.4s, v2.s[3]\n" /* out10 = 0*/ \ - "prfm pldl1keep, [%[a_ptr], #192]\n" /* preload a*/ \ - "dup v19.4s, v2.s[3]\n" /* out11 = 0*/ \ - "prfm pldl1keep, [%[b_ptr], #320]\n" /* preload b*/ \ - "dup v20.4s, v3.s[0]\n" /* out12 = 0*/ \ - "prfm pldl1keep, [%[a_ptr], #256]\n" /* preload a*/ \ - "dup v21.4s, v3.s[0]\n" /* out13 = 0*/ \ - "prfm pldl1keep, [%[b_ptr], #384]\n" /* preload b*/ \ - "dup v22.4s, v3.s[0]\n" /* out14 = 0*/ \ - "dup v23.4s, v3.s[1]\n" /* out15 = 0*/ \ - "dup v24.4s, v3.s[1]\n" /* out16 = 0*/ \ - "dup v25.4s, v3.s[1]\n" /* out17 = 0*/ \ - "dup v26.4s, v3.s[2]\n" /* out18 = 0*/ \ - "dup v27.4s, v3.s[2]\n" /* out19 = 0*/ \ - "dup v28.4s, v3.s[2]\n" /* out20 = 0*/ \ - "dup v29.4s, v3.s[3]\n" /* out21 = 0*/ \ - "dup v30.4s, v3.s[3]\n" /* out22 = 0*/ \ - "dup v31.4s, v3.s[3]\n" /* out23 = 0*/ \ - "cbz %w[k], 2f\n" /* check loop count > 0 */ \ - "1:\n" /* main loop */ \ - "sdot v8.4s , v4.16b, v0.4b[0]\n" /* out0 = b0 * a00[0], b0 = q4 */ \ - "sdot v11.4s , v4.16b, v0.4b[1]\n" /* out1 = b0 * a00[1], b0 = q4 */ \ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b0 to q6, q7 */ \ - "sdot v14.4s, v4.16b, v0.4b[2]\n" /* out2 = b0 * a00[2], b0 = q4 */ \ - "sdot v17.4s, v4.16b, v0.4b[3]\n" /* out3 = b0 * a00[3], b0 = q4 */ \ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4 */ \ - "sdot v20.4s, v4.16b, v1.4b[0]\n" /* out4 = b0 * a01[0], b0 = q4 */ \ - "sdot v23.4s, v4.16b, v1.4b[1]\n" /* out5 = b0 * a01[1], b0 = q4 */ \ - "sdot v26.4s, v4.16b, v1.4b[2]\n" /* out6 = b0 * a01[2], b0 = q4 */ \ - "sdot v29.4s, v4.16b, v1.4b[3]\n" /* out7 = b0 * a01[3], b0 = q4 */ \ - "sdot v9.4s, v5.16b, v0.4b[0]\n" /* out8 = b1 * a00[0], b1 = q5 */ \ - "sdot v12.4s, v5.16b, v0.4b[1]\n" /* out9 = b1 * a00[1], b1 = q5 */ \ - "sdot v15.4s, v5.16b, v0.4b[2]\n" /* out10 = b1 * a00[2], b1 = q5*/ \ - "sdot v18.4s, v5.16b, v0.4b[3]\n" /* out11 = b1 * a00[3], b1 = q5*/ \ - "sdot v21.4s, v5.16b, v1.4b[0]\n" /* out12 = b1 * a01[0], b1 = q5*/ \ - "sdot v24.4s, v5.16b, v1.4b[1]\n" /* out13 = b1 * a01[1], b1 = q5*/ \ - "sdot v27.4s, v5.16b, v1.4b[2]\n" /* out14 = b1 * a01[2], b1 = q5*/ \ - "sdot v30.4s, v5.16b, v1.4b[3]\n" /* out15 = b1 * a01[3], b1 = q5*/ \ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b1, b2 to q4, q5 */ \ - "sdot v10.4s, v6.16b, v0.4b[0]\n" /* out16 = b2 * a00[0], b2 = q6*/ \ - "sdot v13.4s, v6.16b, v0.4b[1]\n" /* out17 = b2 * a00[1], b2 = q6*/ \ - "prfm pldl1keep, [%[b_ptr], #384]\n" \ - "sdot v16.4s, v6.16b, v0.4b[2]\n" /* out18 = b2 * a00[2], b2 = q6*/ \ - "sdot v19.4s, v6.16b, v0.4b[3]\n" /* out19 = b2 * a00[3], b2 = q6*/ \ - "sdot v22.4s, v6.16b, v1.4b[0]\n" /* out20 = b2 * a00[0], b2 = q6*/ \ - "sdot v25.4s, v6.16b, v1.4b[1]\n" /* out21 = b2 * a00[1], b2 = q6*/ \ - "sdot v28.4s, v6.16b, v1.4b[2]\n" /* out22 = b2 * a00[2], b2 = q6*/ \ - "sdot v31.4s, v6.16b, v1.4b[3]\n" /* out23 = b2 * a00[3], b2 = q6*/ \ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1 */ \ - "sdot v8.4s , v7.16b, v2.4b[0]\n" /* out0 = b0 * a10[0], b0 = q7 */ \ - "sdot v11.4s , v7.16b, v2.4b[1]\n" 
/* out1 = b0 * a10[1], b0 = q7 */ \ - "sdot v14.4s, v7.16b, v2.4b[2]\n" /* out2 = b0 * a10[2], b0 = q7 */ \ - "prfm pldl1keep, [%[a_ptr], #256]\n" \ - "sdot v17.4s, v7.16b, v2.4b[3]\n" /* out3 = b0 * a10[3], b0 = q7 */ \ - "sdot v20.4s, v7.16b, v3.4b[0]\n" /* out4 = b0 * a11[0], b0 = q7 */ \ - "sdot v23.4s, v7.16b, v3.4b[1]\n" /* out5 = b0 * a11[1], b0 = q7 */ \ - "sdot v26.4s, v7.16b, v3.4b[2]\n" /* out6 = b0 * a11[2], b0 = q7 */ \ - "sdot v29.4s, v7.16b, v3.4b[3]\n" /* out7 = b0 * a11[3], b0 = q7 */ \ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b0, b1 to q6, q7 */ \ - "sdot v9.4s, v4.16b, v2.4b[0]\n" /* out8 = b0 * a10[0], b1 = q4 */ \ - "sdot v12.4s, v4.16b, v2.4b[1]\n" /* out9 = b0 * a10[1], b1 = q4 */ \ - "sdot v15.4s, v4.16b, v2.4b[2]\n" /* out10 = b1 * a10[2], b1 = q4*/ \ - "sdot v18.4s, v4.16b, v2.4b[3]\n" /* out11 = b1 * a10[3], b1 = q4*/ \ - "sdot v21.4s, v4.16b, v3.4b[0]\n" /* out12 = b1 * a10[0], b1 = q4*/ \ - "sdot v24.4s, v4.16b, v3.4b[1]\n" /* out13 = b1 * a10[1], b1 = q4*/ \ - "sdot v27.4s, v4.16b, v3.4b[2]\n" /* out14 = b1 * a10[2], b1 = q4*/ \ - "sdot v30.4s, v4.16b, v3.4b[3]\n" /* out15 = b1 * a10[3], b1 = q4*/ \ - "sdot v10.4s, v5.16b, v2.4b[0]\n" /* out16 = b2 * a10[0], b2 = q5*/ \ - "sdot v13.4s, v5.16b, v2.4b[1]\n" /* out17 = b2 * a10[0], b2 = q5*/ \ - "sdot v16.4s, v5.16b, v2.4b[2]\n" /* out18 = b2 * a10[0], b2 = q5*/ \ - "sdot v19.4s, v5.16b, v2.4b[3]\n" /* out19 = b2 * a10[0], b2 = q5*/ \ - "sdot v22.4s, v5.16b, v3.4b[0]\n" /* out20 = b2 * a10[0], b2 = q5*/ \ - "sdot v25.4s, v5.16b, v3.4b[1]\n" /* out21 = b2 * a10[0], b2 = q5*/ \ - "sdot v28.4s, v5.16b, v3.4b[2]\n" /* out22 = b2 * a10[0], b2 = q5*/ \ - "sdot v31.4s, v5.16b, v3.4b[3]\n" /* out23 = b2 * a10[0], b2 = q5*/ \ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b2, b0 to q4, q5 */ \ - "sdot v8.4s , v6.16b, v0.4b[0]\n" /* out0 = b0 * a00[0], b0 = q6 */ \ - "sdot v11.4s , v6.16b, v0.4b[1]\n" /* out1 = b0 * a00[1], b0 = q6 */ \ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4*/ \ - "sdot v14.4s, v6.16b, v0.4b[2]\n" /* out2 = b0 * a00[2], b0 = q6*/ \ - "sdot v17.4s, v6.16b, v0.4b[3]\n" /* out3 = b0 * a00[3], b0 = q6*/ \ - "sdot v20.4s, v6.16b, v1.4b[0]\n" /* out4 = b0 * a01[0], b0 = q6*/ \ - "sdot v23.4s, v6.16b, v1.4b[1]\n" /* out5 = b0 * a01[1], b0 = q6*/ \ - "sdot v26.4s, v6.16b, v1.4b[2]\n" /* out6 = b0 * a01[2], b0 = q6*/ \ - "sdot v29.4s, v6.16b, v1.4b[3]\n" /* out7 = b0 * a01[3], b0 = q6*/ \ - "sdot v9.4s, v7.16b, v0.4b[0]\n" /* out8 = b1 * a00[0], b1 = q7*/ \ - "sdot v12.4s, v7.16b, v0.4b[1]\n" /* out9 = b1 * a00[1], b1 = q7*/ \ - "prfm pldl1keep, [%[b_ptr], #384]\n" \ - "sdot v15.4s, v7.16b, v0.4b[2]\n" /* out10 = b1 * a00[2], b1 = q7*/ \ - "sdot v18.4s, v7.16b, v0.4b[3]\n" /* out11 = b1 * a00[3], b1 = q7*/ \ - "sdot v21.4s, v7.16b, v1.4b[0]\n" /* out12 = b1 * a01[0], b1 = q7*/ \ - "sdot v24.4s, v7.16b, v1.4b[1]\n" /* out13 = b1 * a01[1], b1 = q7*/ \ - "sdot v27.4s, v7.16b, v1.4b[2]\n" /* out14 = b1 * a01[2], b1 = q7*/ \ - "sdot v30.4s, v7.16b, v1.4b[3]\n" /* out15 = b1 * a01[3], b1 = q7*/ \ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b1, b2 to q6, q7*/ \ - "sdot v10.4s, v4.16b, v0.4b[0]\n" /* out16 = b2 * a00[0], b2 = q4*/ \ - "sdot v13.4s, v4.16b, v0.4b[1]\n" /* out17 = b2 * a00[1], b2 = q4*/ \ - "sdot v16.4s, v4.16b, v0.4b[2]\n" /* out18 = b2 * a00[2], b2 = q4*/ \ - "sdot v19.4s, v4.16b, v0.4b[3]\n" /* out19 = b2 * a00[3], b2 = q4*/ \ - "sdot v22.4s, v4.16b, v1.4b[0]\n" /* out20 = b2 * a00[0], b2 = q4*/ \ - "sdot v25.4s, v4.16b, v1.4b[1]\n" /* out21 = b2 * a00[1], b2 = q4*/ \ - "sdot v28.4s, 
v4.16b, v1.4b[2]\n" /* out22 = b2 * a00[2], b2 = q4*/ \ - "sdot v31.4s, v4.16b, v1.4b[3]\n" /* out23 = b2 * a00[3], b2 = q4*/ \ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 */ /* unrool 3*/ \ - "sdot v8.4s , v5.16b, v2.4b[0]\n" /* out0 = b0 * a10[0], b0 = q5*/ \ - "sdot v11.4s , v5.16b, v2.4b[1]\n" /* out1 = b0 * a10[1], b0 = q5*/ \ - "sdot v14.4s, v5.16b, v2.4b[2]\n" /* out2 = b0 * a10[2], b0 = q5*/ \ - "sdot v17.4s, v5.16b, v2.4b[3]\n" /* out3 = b0 * a10[3], b0 = q5*/ \ - "sdot v20.4s, v5.16b, v3.4b[0]\n" /* out4 = b0 * a11[0], b0 = q5*/ \ - "sdot v23.4s, v5.16b, v3.4b[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ \ - "sdot v26.4s, v5.16b, v3.4b[2]\n" /* out6 = b0 * a11[2], b0 = q5*/ \ - "sdot v29.4s, v5.16b, v3.4b[3]\n" /* out7 = b0 * a11[3], b0 = q5*/ \ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ \ - "sdot v9.4s, v6.16b, v2.4b[0]\n" /* out8 = b0 * a10[0], b1 = q6*/ \ - "sdot v12.4s, v6.16b, v2.4b[1]\n" /* out9 = b0 * a10[1], b1 = q6*/ \ - "prfm pldl1keep, [%[a_ptr], #256]\n" \ - "sdot v15.4s, v6.16b, v2.4b[2]\n" /* out10 = b1 * a10[2], b1 = q6*/ \ - "sdot v18.4s, v6.16b, v2.4b[3]\n" /* out11 = b1 * a10[3], b1 = q6*/ \ - "sdot v21.4s, v6.16b, v3.4b[0]\n" /* out12 = b1 * a10[0], b1 = q6*/ \ - "sdot v24.4s, v6.16b, v3.4b[1]\n" /* out13 = b1 * a10[1], b1 = q6*/ \ - "sdot v27.4s, v6.16b, v3.4b[2]\n" /* out14 = b1 * a10[2], b1 = q6*/ \ - "prfm pldl1keep, [%[b_ptr], #384]\n" \ - "sdot v30.4s, v6.16b, v3.4b[3]\n" /* out15 = b1 * a10[3], b1 = q6*/ \ - "sdot v10.4s, v7.16b, v2.4b[0]\n" /* out16 = b2 * a10[0], b2 = q7*/ \ - "sdot v13.4s, v7.16b, v2.4b[1]\n" /* out17 = b2 * a10[0], b2 = q7*/ \ - "sdot v16.4s, v7.16b, v2.4b[2]\n" /* out18 = b2 * a10[0], b2 = q7*/ \ - "sdot v19.4s, v7.16b, v2.4b[3]\n" /* out19 = b2 * a10[0], b2 = q7*/ \ - "sdot v22.4s, v7.16b, v3.4b[0]\n" /* out20 = b2 * a10[0], b2 = q7*/ \ - "sdot v25.4s, v7.16b, v3.4b[1]\n" /* out21 = b2 * a10[0], b2 = q7*/ \ - "subs %w[k], %w[k], #1\n" /* loop count - 1*/ \ - "sdot v28.4s, v7.16b, v3.4b[2]\n" /* out22 = b2 * a10[0], b2 = q7*/ \ - "sdot v31.4s, v7.16b, v3.4b[3]\n" /* out23 = b2 * a10[0], b2 = q7*/ \ - "bne 1b\n" \ - "2:\n" /* process tail*/ \ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ \ - "beq 3f\n" \ - "sdot v8.4s , v4.16b, v0.4b[0]\n" /* out0 = b0 * a00[0], b0 = q4*/ \ - "sdot v11.4s , v4.16b, v0.4b[1]\n" /* out1 = b0 * a00[1], b0 = q4*/ \ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b0 to q6, q7*/ \ - "sdot v14.4s, v4.16b, v0.4b[2]\n" /* out2 = b0 * a00[2], b0 = q4*/ \ - "sdot v17.4s, v4.16b, v0.4b[3]\n" /* out3 = b0 * a00[3], b0 = q4*/ \ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q2, q3*/ \ - "sdot v20.4s, v4.16b, v1.4b[0]\n" /* out4 = b0 * a01[0], b0 = q4*/ \ - "sdot v23.4s, v4.16b, v1.4b[1]\n" /* out5 = b0 * a01[1], b0 = q4*/ \ - "sdot v26.4s, v4.16b, v1.4b[2]\n" /* out6 = b0 * a01[2], b0 = q4*/ \ - "sdot v29.4s, v4.16b, v1.4b[3]\n" /* out7 = b0 * a01[3], b0 = q4*/ \ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ \ - "sdot v9.4s, v5.16b, v0.4b[0]\n" /* out8 = b1 * a00[0], b1 = q5*/ \ - "sdot v12.4s, v5.16b, v0.4b[1]\n" /* out9 = b1 * a00[1], b1 = q5*/ \ - "sdot v15.4s, v5.16b, v0.4b[2]\n" /* out10 = b1 * a00[2], b1 = q5*/ \ - "sdot v18.4s, v5.16b, v0.4b[3]\n" /* out11 = b1 * a00[3], b1 = q5*/ \ - "sdot v21.4s, v5.16b, v1.4b[0]\n" /* out12 = b1 * a01[0], b1 = q5*/ \ - "sdot v24.4s, v5.16b, v1.4b[1]\n" /* out13 = b1 * a01[1], b1 = q5*/ \ - "sdot v27.4s, v5.16b, v1.4b[2]\n" /* out14 = b1 * a01[2], b1 = q5*/ \ - "sdot v30.4s, v5.16b, v1.4b[3]\n" /* out15 = b1 * a01[3], b1 = q5*/ \ - "ldp q4, q5, 
[%[b_ptr]], #32\n" /* load b1, b2 to q4, q5*/ \ - "sdot v10.4s, v6.16b, v0.4b[0]\n" /* out16 = b2 * a00[0], b2 = q6*/ \ - "sdot v13.4s, v6.16b, v0.4b[1]\n" /* out17 = b2 * a00[1], b2 = q6*/ \ - "sdot v16.4s, v6.16b, v0.4b[2]\n" /* out18 = b2 * a00[2], b2 = q6*/ \ - "sdot v19.4s, v6.16b, v0.4b[3]\n" /* out19 = b2 * a00[3], b2 = q6*/ \ - "sdot v22.4s, v6.16b, v1.4b[0]\n" /* out20 = b2 * a00[0], b2 = q6*/ \ - "sdot v25.4s, v6.16b, v1.4b[1]\n" /* out21 = b2 * a00[1], b2 = q6*/ \ - "sdot v28.4s, v6.16b, v1.4b[2]\n" /* out22 = b2 * a00[2], b2 = q6*/ \ - "sdot v31.4s, v6.16b, v1.4b[3]\n" /* out23 = b2 * a00[3], b2 = q6*/ \ - "beq 4f\n" /*jump to tail = 2*/ /* unrool 1, tail > 2*/ \ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1*/ \ - "sdot v8.4s , v7.16b, v2.4b[0]\n" /* out0 = b0 * a10[0], b0 = q7*/ \ - "sdot v11.4s , v7.16b, v2.4b[1]\n" /* out1 = b0 * a10[1], b0 = q7*/ \ - "sdot v14.4s, v7.16b, v2.4b[2]\n" /* out2 = b0 * a10[2], b0 = q7*/ \ - "sdot v17.4s, v7.16b, v2.4b[3]\n" /* out3 = b0 * a10[3], b0 = q7*/ \ - "sdot v20.4s, v7.16b, v3.4b[0]\n" /* out4 = b0 * a11[0], b0 = q7*/ \ - "sdot v23.4s, v7.16b, v3.4b[1]\n" /* out5 = b0 * a11[1], b0 = q7*/ \ - "sdot v26.4s, v7.16b, v3.4b[2]\n" /* out6 = b0 * a11[2], b0 = q7*/ \ - "sdot v29.4s, v7.16b, v3.4b[3]\n" /* out7 = b0 * a11[3], b0 = q7*/ \ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b0, b1 to q6, q7*/ \ - "sdot v9.4s, v4.16b, v2.4b[0]\n" /* out8 = b0 * a10[0], b1 = q4*/ \ - "sdot v12.4s, v4.16b, v2.4b[1]\n" /* out9 = b0 * a10[1], b1 = q4*/ \ - "sdot v15.4s, v4.16b, v2.4b[2]\n" /* out10 = b1 * a10[2], b1 = q4*/ \ - "sdot v18.4s, v4.16b, v2.4b[3]\n" /* out11 = b1 * a10[3], b1 = q4*/ \ - "sdot v21.4s, v4.16b, v3.4b[0]\n" /* out12 = b1 * a10[0], b1 = q4*/ \ - "sdot v24.4s, v4.16b, v3.4b[1]\n" /* out13 = b1 * a10[1], b1 = q4*/ \ - "sdot v27.4s, v4.16b, v3.4b[2]\n" /* out14 = b1 * a10[2], b1 = q4*/ \ - "sdot v30.4s, v4.16b, v3.4b[3]\n" /* out15 = b1 * a10[3], b1 = q4*/ \ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ \ - "sdot v10.4s, v5.16b, v2.4b[0]\n" /* out16 = b2 * a10[0], b2 = q5*/ \ - "sdot v13.4s, v5.16b, v2.4b[1]\n" /* out17 = b2 * a10[0], b2 = q5*/ \ - "sdot v16.4s, v5.16b, v2.4b[2]\n" /* out18 = b2 * a10[0], b2 = q5*/ \ - "sdot v19.4s, v5.16b, v2.4b[3]\n" /* out19 = b2 * a10[0], b2 = q5*/ \ - "sdot v22.4s, v5.16b, v3.4b[0]\n" /* out20 = b2 * a10[0], b2 = q5*/ \ - "sdot v25.4s, v5.16b, v3.4b[1]\n" /* out21 = b2 * a10[0], b2 = q5*/ \ - "sdot v28.4s, v5.16b, v3.4b[2]\n" /* out22 = b2 * a10[0], b2 = q5*/ \ - "sdot v31.4s, v5.16b, v3.4b[3]\n" /* out23 = b2 * a10[0], b2 = q5*/ \ - "beq 5f\n" /*jump to tail = 3*/ /* unrool 2, tail = 4*/ \ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b2, b0 to q4, q5*/ \ - "sdot v8.4s , v6.16b, v0.4b[0]\n" /* out0 = b0 * a00[0], b0 = q6*/ \ - "sdot v11.4s , v6.16b, v0.4b[1]\n" /* out1 = b0 * a00[1], b0 = q6*/ \ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4*/ \ - "sdot v14.4s, v6.16b, v0.4b[2]\n" /* out2 = b0 * a00[2], b0 = q6*/ \ - "sdot v17.4s, v6.16b, v0.4b[3]\n" /* out3 = b0 * a00[3], b0 = q6*/ \ - "sdot v20.4s, v6.16b, v1.4b[0]\n" /* out4 = b0 * a01[0], b0 = q6*/ \ - "sdot v23.4s, v6.16b, v1.4b[1]\n" /* out5 = b0 * a01[1], b0 = q6*/ \ - "sdot v26.4s, v6.16b, v1.4b[2]\n" /* out6 = b0 * a01[2], b0 = q6*/ \ - "sdot v29.4s, v6.16b, v1.4b[3]\n" /* out7 = b0 * a01[3], b0 = q6*/ \ - "sdot v9.4s, v7.16b, v0.4b[0]\n" /* out8 = b1 * a00[0], b1 = q7*/ \ - "sdot v12.4s, v7.16b, v0.4b[1]\n" /* out9 = b1 * a00[1], b1 = q7*/ \ - "sdot v15.4s, v7.16b, v0.4b[2]\n" /* out10 = b1 * a00[2], b1 = q7*/ \ - 
"sdot v18.4s, v7.16b, v0.4b[3]\n" /* out11 = b1 * a00[3], b1 = q7*/ \ - "sdot v21.4s, v7.16b, v1.4b[0]\n" /* out12 = b1 * a01[0], b1 = q7*/ \ - "sdot v24.4s, v7.16b, v1.4b[1]\n" /* out13 = b1 * a01[1], b1 = q7*/ \ - "sdot v27.4s, v7.16b, v1.4b[2]\n" /* out14 = b1 * a01[2], b1 = q7*/ \ - "sdot v30.4s, v7.16b, v1.4b[3]\n" /* out15 = b1 * a01[3], b1 = q7*/ \ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b1, b2 to q6, q7*/ \ - "sdot v10.4s, v4.16b, v0.4b[0]\n" /* out16 = b2 * a00[0], b2 = q4*/ \ - "sdot v13.4s, v4.16b, v0.4b[1]\n" /* out17 = b2 * a00[1], b2 = q4*/ \ - "sdot v16.4s, v4.16b, v0.4b[2]\n" /* out18 = b2 * a00[2], b2 = q4*/ \ - "sdot v19.4s, v4.16b, v0.4b[3]\n" /* out19 = b2 * a00[3], b2 = q4*/ \ - "sdot v22.4s, v4.16b, v1.4b[0]\n" /* out20 = b2 * a00[0], b2 = q4*/ \ - "sdot v25.4s, v4.16b, v1.4b[1]\n" /* out21 = b2 * a00[1], b2 = q4*/ \ - "sdot v28.4s, v4.16b, v1.4b[2]\n" /* out22 = b2 * a00[2], b2 = q4*/ \ - "sdot v31.4s, v4.16b, v1.4b[3]\n" /* out23 = b2 * a00[3], b2 = q4*/ \ - "sdot v8.4s , v5.16b, v2.4b[0]\n" /* out0 = b0 * a10[0], b0 = q5*/ \ - "sdot v11.4s , v5.16b, v2.4b[1]\n" /* out1 = b0 * a10[1], b0 = q5*/ \ - "sdot v14.4s, v5.16b, v2.4b[2]\n" /* out2 = b0 * a10[2], b0 = q5*/ \ - "sdot v17.4s, v5.16b, v2.4b[3]\n" /* out3 = b0 * a10[3], b0 = q5*/ \ - "sdot v20.4s, v5.16b, v3.4b[0]\n" /* out4 = b0 * a11[0], b0 = q5*/ \ - "sdot v23.4s, v5.16b, v3.4b[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ \ - "sdot v26.4s, v5.16b, v3.4b[2]\n" /* out6 = b0 * a11[2], b0 = q5*/ \ - "sdot v29.4s, v5.16b, v3.4b[3]\n" /* out7 = b0 * a11[3], b0 = q5*/ \ - "sdot v9.4s, v6.16b, v2.4b[0]\n" /* out8 = b0 * a10[0], b1 = q6*/ \ - "sdot v12.4s, v6.16b, v2.4b[1]\n" /* out9 = b1 * a10[1], b1 = q6*/ \ - "sdot v15.4s, v6.16b, v2.4b[2]\n" /* out10 = b1 * a10[2], b1 = q6*/ \ - "sdot v18.4s, v6.16b, v2.4b[3]\n" /* out11 = b1 * a10[3], b1 = q6*/ \ - "sdot v21.4s, v6.16b, v3.4b[0]\n" /* out12 = b1 * a10[0], b1 = q6*/ \ - "sdot v24.4s, v6.16b, v3.4b[1]\n" /* out13 = b1 * a10[1], b1 = q6*/ \ - "sdot v27.4s, v6.16b, v3.4b[2]\n" /* out14 = b1 * a10[2], b1 = q6*/ \ - "sdot v30.4s, v6.16b, v3.4b[3]\n" /* out15 = b1 * a10[3], b1 = q6*/ \ - "sdot v10.4s, v7.16b, v2.4b[0]\n" /* out16 = b2 * a10[0], b2 = q7*/ \ - "sdot v13.4s, v7.16b, v2.4b[1]\n" /* out17 = b2 * a10[0], b2 = q7*/ \ - "sdot v16.4s, v7.16b, v2.4b[2]\n" /* out18 = b2 * a10[0], b2 = q7*/ \ - "sdot v19.4s, v7.16b, v2.4b[3]\n" /* out19 = b2 * a10[0], b2 = q7*/ \ - "sdot v22.4s, v7.16b, v3.4b[0]\n" /* out20 = b2 * a10[0], b2 = q7*/ \ - "sdot v25.4s, v7.16b, v3.4b[1]\n" /* out21 = b2 * a10[0], b2 = q7*/ \ - "sdot v28.4s, v7.16b, v3.4b[2]\n" /* out22 = b2 * a10[0], b2 = q7*/ \ - "sdot v31.4s, v7.16b, v3.4b[3]\n" /* out23 = b2 * a10[0], b2 = q7*/ \ - "b 11f\n" /* tails==1 final tail*/ \ - "3: \n" /* tail=1*/ \ - "ldr q6, [%[b_ptr]], #16\n" /* load b2 to q6*/ \ - "sdot v8.4s , v4.16b, v0.4b[0]\n" /* out0 = b0 * a10[0], b0 = q5*/ \ - "sdot v11.4s , v4.16b, v0.4b[1]\n" /* out1 = b0 * a10[1], b0 = q5*/ \ - "sdot v14.4s, v4.16b, v0.4b[2]\n" /* out2 = b0 * a10[2], b0 = q5*/ \ - "sdot v17.4s, v4.16b, v0.4b[3]\n" /* out3 = b0 * a10[3], b0 = q5*/ \ - "sdot v20.4s, v4.16b, v1.4b[0]\n" /* out4 = b0 * a11[0], b0 = q5*/ \ - "sdot v23.4s, v4.16b, v1.4b[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ \ - "sdot v26.4s, v4.16b, v1.4b[2]\n" /* out6 = b0 * a11[2], b0 = q5*/ \ - "sdot v29.4s, v4.16b, v1.4b[3]\n" /* out7 = b0 * a11[3], b0 = q5*/ \ - "sdot v9.4s, v5.16b, v0.4b[0]\n" /* out8 = b0 * a10[0], b1 = q6*/ \ - "sdot v12.4s, v5.16b, v0.4b[1]\n" /* out9 = b1 * a10[1], b1 = q6*/ \ - "sdot v15.4s, 
v5.16b, v0.4b[2]\n" /* out10 = b1 * a10[2], b1 = q6*/ \ - "sdot v18.4s, v5.16b, v0.4b[3]\n" /* out11 = b1 * a10[3], b1 = q6*/ \ - "sdot v21.4s, v5.16b, v1.4b[0]\n" /* out12 = b1 * a10[0], b1 = q6*/ \ - "sdot v24.4s, v5.16b, v1.4b[1]\n" /* out13 = b1 * a10[1], b1 = q6*/ \ - "sdot v27.4s, v5.16b, v1.4b[2]\n" /* out14 = b1 * a10[2], b1 = q6*/ \ - "sdot v30.4s, v5.16b, v1.4b[3]\n" /* out15 = b1 * a10[3], b1 = q6*/ \ - "sdot v10.4s, v6.16b, v0.4b[0]\n" /* out16 = b2 * a10[0], b2 = q7*/ \ - "sdot v13.4s, v6.16b, v0.4b[1]\n" /* out17 = b2 * a10[0], b2 = q7*/ \ - "sdot v16.4s, v6.16b, v0.4b[2]\n" /* out18 = b2 * a10[0], b2 = q7*/ \ - "sdot v19.4s, v6.16b, v0.4b[3]\n" /* out19 = b2 * a10[0], b2 = q7*/ \ - "sdot v22.4s, v6.16b, v1.4b[0]\n" /* out20 = b2 * a10[0], b2 = q7*/ \ - "sdot v25.4s, v6.16b, v1.4b[1]\n" /* out21 = b2 * a10[0], b2 = q7*/ \ - "sdot v28.4s, v6.16b, v1.4b[2]\n" /* out22 = b2 * a10[0], b2 = q7*/ \ - "sdot v31.4s, v6.16b, v1.4b[3]\n" /* out23 = b2 * a10[0], b2 = q7*/ \ - "b 11f\n" /* tails==2 final tail*/ \ - "4:\n" /* tail = 2*/ \ - "sdot v8.4s , v7.16b, v2.4b[0]\n" /* out0 = b0 * a10[0], b0 = q5*/ \ - "sdot v11.4s , v7.16b, v2.4b[1]\n" /* out1 = b0 * a10[1], b0 = q5*/ \ - "sdot v14.4s, v7.16b, v2.4b[2]\n" /* out2 = b0 * a10[2], b0 = q5*/ \ - "sdot v17.4s, v7.16b, v2.4b[3]\n" /* out3 = b0 * a10[3], b0 = q5*/ \ - "sdot v20.4s, v7.16b, v3.4b[0]\n" /* out4 = b0 * a11[0], b0 = q5*/ \ - "sdot v23.4s, v7.16b, v3.4b[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ \ - "sdot v26.4s, v7.16b, v3.4b[2]\n" /* out6 = b0 * a11[2], b0 = q5*/ \ - "sdot v29.4s, v7.16b, v3.4b[3]\n" /* out7 = b0 * a11[3], b0 = q5*/ \ - "sdot v9.4s, v4.16b, v2.4b[0]\n" /* out8 = b0 * a10[0], b1 = q6*/ \ - "sdot v12.4s, v4.16b, v2.4b[1]\n" /* out9 = b1 * a10[1], b1 = q6*/ \ - "sdot v15.4s, v4.16b, v2.4b[2]\n" /* out10 = b1 * a10[2], b1 = q6*/ \ - "sdot v18.4s, v4.16b, v2.4b[3]\n" /* out11 = b1 * a10[3], b1 = q6*/ \ - "sdot v21.4s, v4.16b, v3.4b[0]\n" /* out12 = b1 * a10[0], b1 = q6*/ \ - "sdot v24.4s, v4.16b, v3.4b[1]\n" /* out13 = b1 * a10[1], b1 = q6*/ \ - "sdot v27.4s, v4.16b, v3.4b[2]\n" /* out14 = b1 * a10[2], b1 = q6*/ \ - "sdot v30.4s, v4.16b, v3.4b[3]\n" /* out15 = b1 * a10[3], b1 = q6*/ \ - "sdot v10.4s, v5.16b, v2.4b[0]\n" /* out16 = b2 * a10[0], b2 = q7*/ \ - "sdot v13.4s, v5.16b, v2.4b[1]\n" /* out17 = b2 * a10[0], b2 = q7*/ \ - "sdot v16.4s, v5.16b, v2.4b[2]\n" /* out18 = b2 * a10[0], b2 = q7*/ \ - "sdot v19.4s, v5.16b, v2.4b[3]\n" /* out19 = b2 * a10[0], b2 = q7*/ \ - "sdot v22.4s, v5.16b, v3.4b[0]\n" /* out20 = b2 * a10[0], b2 = q7*/ \ - "sdot v25.4s, v5.16b, v3.4b[1]\n" /* out21 = b2 * a10[0], b2 = q7*/ \ - "sdot v28.4s, v5.16b, v3.4b[2]\n" /* out22 = b2 * a10[0], b2 = q7*/ \ - "sdot v31.4s, v5.16b, v3.4b[3]\n" /* out23 = b2 * a10[0], b2 = q7*/ \ - "b 11f\n" /* tails==3 final tail*/ \ - "5:\n" /* tail = 3*/ \ - "ldr q4, [%[b_ptr]], #16\n" /* load b2, b0 to q4*/ \ - "sdot v8.4s , v6.16b, v0.4b[0]\n" /* out0 = b0 * a10[0], b0 = q5*/ \ - "sdot v11.4s , v6.16b, v0.4b[1]\n" /* out1 = b0 * a10[1], b0 = q5*/ \ - "sdot v14.4s, v6.16b, v0.4b[2]\n" /* out2 = b0 * a10[2], b0 = q5*/ \ - "sdot v17.4s, v6.16b, v0.4b[3]\n" /* out3 = b0 * a10[3], b0 = q5*/ \ - "sdot v20.4s, v6.16b, v1.4b[0]\n" /* out4 = b0 * a11[0], b0 = q5*/ \ - "sdot v23.4s, v6.16b, v1.4b[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ \ - "sdot v26.4s, v6.16b, v1.4b[2]\n" /* out6 = b0 * a11[2], b0 = q5*/ \ - "sdot v29.4s, v6.16b, v1.4b[3]\n" /* out7 = b0 * a11[3], b0 = q5*/ \ - "sdot v9.4s, v7.16b, v0.4b[0]\n" /* out8 = b0 * a10[0], b1 = q6*/ \ - "sdot v12.4s, 
v7.16b, v0.4b[1]\n" /* out9 = b1 * a10[1], b1 = q6*/ \ - "sdot v15.4s, v7.16b, v0.4b[2]\n" /* out10 = b1 * a10[2], b1 = q6*/ \ - "sdot v18.4s, v7.16b, v0.4b[3]\n" /* out11 = b1 * a10[3], b1 = q6*/ \ - "sdot v21.4s, v7.16b, v1.4b[0]\n" /* out12 = b1 * a10[0], b1 = q6*/ \ - "sdot v24.4s, v7.16b, v1.4b[1]\n" /* out13 = b1 * a10[1], b1 = q6*/ \ - "sdot v27.4s, v7.16b, v1.4b[2]\n" /* out14 = b1 * a10[2], b1 = q6*/ \ - "sdot v30.4s, v7.16b, v1.4b[3]\n" /* out15 = b1 * a10[3], b1 = q6*/ \ - "sdot v10.4s, v4.16b, v0.4b[0]\n" /* out16 = b2 * a10[0], b2 = q7*/ \ - "sdot v13.4s, v4.16b, v0.4b[1]\n" /* out17 = b2 * a10[0], b2 = q7*/ \ - "sdot v16.4s, v4.16b, v0.4b[2]\n" /* out18 = b2 * a10[0], b2 = q7*/ \ - "sdot v19.4s, v4.16b, v0.4b[3]\n" /* out19 = b2 * a10[0], b2 = q7*/ \ - "sdot v22.4s, v4.16b, v1.4b[0]\n" /* out20 = b2 * a10[0], b2 = q7*/ \ - "sdot v25.4s, v4.16b, v1.4b[1]\n" /* out21 = b2 * a10[0], b2 = q7*/ \ - "sdot v28.4s, v4.16b, v1.4b[2]\n" /* out22 = b2 * a10[0], b2 = q7*/ \ - "sdot v31.4s, v4.16b, v1.4b[3]\n" /* out23 = b2 * a10[0], b2 = q7*/ \ - "11: \n" /* check if relu */ \ - "cbz %w[relu], 12f\n" /* skip relu */ \ - "movi v2.4s, #0\n" /* for relu*/ \ - "smax v8.4s, v8.4s, v2.4s\n" /* relu*/ \ - "smax v9.4s, v9.4s, v2.4s\n" /* relu*/ \ - "smax v10.4s, v10.4s, v2.4s\n" /* relu*/ \ - "smax v11.4s, v11.4s, v2.4s\n" /* relu*/ \ - "smax v12.4s, v12.4s, v2.4s\n" /* relu*/ \ - "smax v13.4s, v13.4s, v2.4s\n" /* relu*/ \ - "smax v14.4s, v14.4s, v2.4s\n" /* relu*/ \ - "smax v15.4s, v15.4s, v2.4s\n" /* relu*/ \ - "smax v16.4s,v16.4s,v2.4s\n" /* relu*/ \ - "smax v17.4s,v17.4s,v2.4s\n" /* relu*/ \ - "smax v18.4s, v18.4s, v2.4s\n" /* relu*/ \ - "smax v19.4s, v19.4s, v2.4s\n" /* relu*/ \ - "smax v20.4s, v20.4s, v2.4s\n" /* relu*/ \ - "smax v21.4s, v21.4s, v2.4s\n" /* relu*/ \ - "smax v22.4s, v22.4s, v2.4s\n" /* relu*/ \ - "smax v23.4s, v23.4s, v2.4s\n" /* relu*/ \ - "smax v24.4s, v24.4s, v2.4s\n" /* relu*/ \ - "smax v25.4s, v25.4s, v2.4s\n" /* relu*/ \ - "smax v26.4s, v26.4s, v2.4s\n" /* relu*/ \ - "smax v27.4s, v27.4s, v2.4s\n" /* relu*/ \ - "smax v28.4s, v28.4s, v2.4s\n" /* relu*/ \ - "smax v29.4s, v29.4s, v2.4s\n" /* relu*/ \ - "smax v30.4s, v30.4s, v2.4s\n" /* relu*/ \ - "smax v31.4s, v31.4s, v2.4s\n" /* relu*/ \ - "12: \n" - -#define GEMM_SDOT_INT32_OUT \ - "st1 {v8.4s, v9.4s, v10.4s},[%[c_ptr0]], #48\n" /* store r0 */ \ - "st1 {v11.4s, v12.4s, v13.4s},[%[c_ptr1]], #48\n" /* store r1 */ \ - "st1 {v14.4s, v15.4s, v16.4s},[%[c_ptr2]], #48\n" /* store r2 */ \ - "st1 {v17.4s, v18.4s, v19.4s},[%[c_ptr3]], #48\n" /* store r3 */ \ - "st1 {v20.4s, v21.4s, v22.4s},[%[c_ptr4]], #48\n" /* store r4 */ \ - "st1 {v23.4s, v24.4s, v25.4s},[%[c_ptr5]], #48\n" /* store r5 */ \ - "st1 {v26.4s, v27.4s, v28.4s},[%[c_ptr6]], #48\n" /* store r6 */ \ - "st1 {v29.4s, v30.4s, v31.4s},[%[c_ptr7]], #48\n" /* store r7 */ - -#define GEMM_SDOT_FP32_OUT \ - "ldp q0, q1, [%[scale]]\n" /* load scale */ \ - "scvtf v2.4s , v8.4s\n" /* 00, convert to fp32 */ \ - "scvtf v3.4s , v9.4s\n" /* 01, convert to fp32 */ \ - "scvtf v4.4s , v10.4s\n" /* 02, convert to fp32 */ \ - "scvtf v5.4s , v11.4s\n" /* 03, convert to fp32 */ \ - "scvtf v6.4s , v12.4s\n" /* 00, convert to fp32 */ \ - "scvtf v7.4s , v13.4s\n" /* 00, convert to fp32 */ \ - "fmul v8.4s, v2.4s, v0.s[0]\n" /* 00, mul scale to get final */ \ - "fmul v9.4s, v3.4s, v0.s[0]\n" /* 00, mul scale to get final */ \ - "fmul v10.4s, v4.4s, v0.s[0]\n" /* 00, mul scale to get final */ \ - "fmul v11.4s, v5.4s, v0.s[1]\n" /* 00, mul scale to get final */ \ - "fmul v12.4s, v6.4s, v0.s[1]\n" 
/* 00, mul scale to get final */ \ - "fmul v13.4s, v7.4s, v0.s[1]\n" /* 00, mul scale to get final */ \ - "scvtf v2.4s , v14.4s\n" /* 00, convert to fp32 */ \ - "scvtf v3.4s , v15.4s\n" /* 01, convert to fp32 */ \ - "scvtf v4.4s , v16.4s\n" /* 02, convert to fp32 */ \ - "scvtf v5.4s , v17.4s\n" /* 03, convert to fp32 */ \ - "scvtf v6.4s , v18.4s\n" /* 00, convert to fp32 */ \ - "scvtf v7.4s , v19.4s\n" /* 00, convert to fp32 */ \ - "st1 {v8.4s, v9.4s, v10.4s},[%[c_ptr0]], #48\n" /* store r0 */ \ - "st1 {v11.4s, v12.4s, v13.4s},[%[c_ptr1]], #48\n" /* store r1 */ \ - "fmul v14.4s, v2.4s, v0.s[2]\n" /* 00, mul scale to get final */ \ - "fmul v15.4s, v3.4s, v0.s[2]\n" /* 00, mul scale to get final */ \ - "fmul v16.4s, v4.4s, v0.s[2]\n" /* 00, mul scale to get final */ \ - "fmul v17.4s, v5.4s, v0.s[3]\n" /* 00, mul scale to get final */ \ - "fmul v18.4s, v6.4s, v0.s[3]\n" /* 00, mul scale to get final */ \ - "fmul v19.4s, v7.4s, v0.s[3]\n" /* 00, mul scale to get final */ \ - "scvtf v2.4s , v20.4s\n" /* 00, convert to fp32 */ \ - "scvtf v3.4s , v21.4s\n" /* 01, convert to fp32 */ \ - "scvtf v4.4s , v22.4s\n" /* 02, convert to fp32 */ \ - "scvtf v5.4s , v23.4s\n" /* 03, convert to fp32 */ \ - "scvtf v6.4s , v24.4s\n" /* 00, convert to fp32 */ \ - "scvtf v7.4s , v25.4s\n" /* 00, convert to fp32 */ \ - "st1 {v14.4s, v15.4s, v16.4s},[%[c_ptr2]], #48\n" /* store r2 */ \ - "st1 {v17.4s, v18.4s, v19.4s},[%[c_ptr3]], #48\n" /* store r3 */ \ - "fmul v20.4s, v2.4s, v1.s[0]\n" /* 00, mul scale to get final */ \ - "fmul v21.4s, v3.4s, v1.s[0]\n" /* 00, mul scale to get final */ \ - "fmul v22.4s, v4.4s, v1.s[0]\n" /* 00, mul scale to get final */ \ - "fmul v23.4s, v5.4s, v1.s[1]\n" /* 00, mul scale to get final */ \ - "fmul v24.4s, v6.4s, v1.s[1]\n" /* 00, mul scale to get final */ \ - "fmul v25.4s, v7.4s, v1.s[1]\n" /* 00, mul scale to get final */ \ - "scvtf v2.4s , v26.4s\n" /* 00, convert to fp32 */ \ - "scvtf v3.4s , v27.4s\n" /* 01, convert to fp32 */ \ - "scvtf v4.4s , v28.4s\n" /* 02, convert to fp32 */ \ - "scvtf v5.4s , v29.4s\n" /* 03, convert to fp32 */ \ - "scvtf v6.4s , v30.4s\n" /* 00, convert to fp32 */ \ - "scvtf v7.4s , v31.4s\n" /* 00, convert to fp32 */ \ - "st1 {v20.4s, v21.4s, v22.4s},[%[c_ptr4]], #48\n" /* store r4 */ \ - "st1 {v23.4s, v24.4s, v25.4s},[%[c_ptr5]], #48\n" /* store r5 */ \ - "fmul v26.4s, v2.4s, v1.s[2]\n" /* 00, mul scale to get final */ \ - "fmul v27.4s, v3.4s, v1.s[2]\n" /* 00, mul scale to get final */ \ - "fmul v28.4s, v4.4s, v1.s[2]\n" /* 00, mul scale to get final */ \ - "fmul v29.4s, v5.4s, v1.s[3]\n" /* 00, mul scale to get final */ \ - "fmul v30.4s, v6.4s, v1.s[3]\n" /* 00, mul scale to get final */ \ - "fmul v31.4s, v7.4s, v1.s[3]\n" /* 00, mul scale to get final */ \ - "st1 {v26.4s, v27.4s, v28.4s},[%[c_ptr6]], #48\n" /* store r6 */ \ - "st1 {v29.4s, v30.4s, v31.4s},[%[c_ptr7]], #48\n" /* store r7 */ - -#define GEMM_SDOT_INT8_OUT \ - "ldp q0, q1, [%[scale]]\n" /* load scale */ \ - "scvtf v2.4s , v8.4s\n" /* 00, convert to fp32 */ \ - "scvtf v3.4s , v9.4s\n" /* 01, convert to fp32 */ \ - "scvtf v4.4s , v10.4s\n" /* 02, convert to fp32 */ \ - "scvtf v5.4s , v11.4s\n" /* 03, convert to fp32 */ \ - "scvtf v6.4s , v12.4s\n" /* 00, convert to fp32 */ \ - "scvtf v7.4s , v13.4s\n" /* 00, convert to fp32 */ \ - "fmul v8.4s, v2.4s, v0.s[0]\n" /* 00, mul scale to get final*/ \ - "fmul v9.4s, v3.4s, v0.s[0]\n" /* 00, mul scale to get final*/ \ - "fmul v10.4s, v4.4s, v0.s[0]\n" /* 00, mul scale to get final*/ \ - "fmul v11.4s, v5.4s, v0.s[1]\n" /* 00, mul scale to get 
final*/ \ - "fmul v12.4s, v6.4s, v0.s[1]\n" /* 00, mul scale to get final*/ \ - "fmul v13.4s, v7.4s, v0.s[1]\n" /* 00, mul scale to get final*/ \ - "scvtf v2.4s , v14.4s\n" /* 00, convert to fp32 */ \ - "scvtf v3.4s , v15.4s\n" /* 01, convert to fp32 */ \ - "scvtf v4.4s , v16.4s\n" /* 02, convert to fp32 */ \ - "scvtf v5.4s , v17.4s\n" /* 03, convert to fp32 */ \ - "scvtf v6.4s , v18.4s\n" /* 00, convert to fp32 */ \ - "scvtf v7.4s , v19.4s\n" /* 00, convert to fp32 */ \ - "fmul v14.4s, v2.4s, v0.s[2]\n" /* 00, mul scale to get final*/ \ - "fmul v15.4s, v3.4s, v0.s[2]\n" /* 00, mul scale to get final*/ \ - "fmul v16.4s, v4.4s, v0.s[2]\n" /* 00, mul scale to get final*/ \ - "fmul v17.4s, v5.4s, v0.s[3]\n" /* 00, mul scale to get final*/ \ - "fmul v18.4s, v6.4s, v0.s[3]\n" /* 00, mul scale to get final*/ \ - "fmul v19.4s, v7.4s, v0.s[3]\n" /* 00, mul scale to get final*/ \ - "scvtf v2.4s , v20.4s\n" /* 00, convert to fp32 */ \ - "scvtf v3.4s , v21.4s\n" /* 01, convert to fp32 */ \ - "scvtf v4.4s , v22.4s\n" /* 02, convert to fp32 */ \ - "scvtf v5.4s , v23.4s\n" /* 03, convert to fp32 */ \ - "scvtf v6.4s , v24.4s\n" /* 00, convert to fp32 */ \ - "scvtf v7.4s , v25.4s\n" /* 00, convert to fp32 */ \ - "fmul v20.4s, v2.4s, v1.s[0]\n" /* 00, mul scale to get final*/ \ - "fmul v21.4s, v3.4s, v1.s[0]\n" /* 00, mul scale to get final*/ \ - "fmul v22.4s, v4.4s, v1.s[0]\n" /* 00, mul scale to get final*/ \ - "fmul v23.4s, v5.4s, v1.s[1]\n" /* 00, mul scale to get final*/ \ - "fmul v24.4s, v6.4s, v1.s[1]\n" /* 00, mul scale to get final*/ \ - "fmul v25.4s, v7.4s, v1.s[1]\n" /* 00, mul scale to get final*/ \ - "scvtf v2.4s , v26.4s\n" /* 00, convert to fp32 */ \ - "scvtf v3.4s , v27.4s\n" /* 01, convert to fp32 */ \ - "scvtf v4.4s , v28.4s\n" /* 02, convert to fp32 */ \ - "scvtf v5.4s , v29.4s\n" /* 03, convert to fp32 */ \ - "scvtf v6.4s , v30.4s\n" /* 00, convert to fp32 */ \ - "scvtf v7.4s , v31.4s\n" /* 00, convert to fp32 */ \ - "fmul v26.4s, v2.4s, v1.s[2]\n" /* 00, mul scale to get final*/ \ - "fmul v27.4s, v3.4s, v1.s[2]\n" /* 00, mul scale to get final*/ \ - "fmul v28.4s, v4.4s, v1.s[2]\n" /* 00, mul scale to get final*/ \ - "fmul v29.4s, v5.4s, v1.s[3]\n" /* 00, mul scale to get final*/ \ - "fmul v30.4s, v6.4s, v1.s[3]\n" /* 00, mul scale to get final*/ \ - "fmul v31.4s, v7.4s, v1.s[3]\n" /* 00, mul scale to get final*/ \ - "fcvtas v0.4s, v8.4s\n" /* 00, cvt to int */ \ - "fcvtas v1.4s, v9.4s\n" /* 00, cvt to int */ \ - "fcvtas v2.4s, v10.4s\n" /* 00, cvt to int */ \ - "fcvtas v3.4s, v11.4s\n" /* 00, cvt to int */ \ - "fcvtas v4.4s, v12.4s\n" /* 00, cvt to int */ \ - "fcvtas v5.4s, v13.4s\n" /* 00, cvt to int */ \ - "sqxtn v8.4h, v0.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn2 v8.8h, v1.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v9.4h, v2.4s\n" /* 00, cvt int32 to int16 */ \ - "fcvtas v0.4s, v14.4s\n" /* 00, cvt to int */ \ - "fcvtas v1.4s, v15.4s\n" /* 00, cvt to int */ \ - "fcvtas v2.4s, v16.4s\n" /* 00, cvt to int */ \ - "sqxtn v11.4h, v3.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn2 v11.8h, v4.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v12.4h, v5.4s\n" /* 00, cvt int32 to int16 */ \ - "fcvtas v3.4s, v17.4s\n" /* 00, cvt to int */ \ - "fcvtas v4.4s, v18.4s\n" /* 00, cvt to int */ \ - "fcvtas v5.4s, v19.4s\n" /* 00, cvt to int */ \ - "sqxtn v14.4h, v0.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn2 v14.8h, v1.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v15.4h, v2.4s\n" /* 00, cvt int32 to int16 */ \ - "fcvtas v0.4s, v20.4s\n" /* 00, cvt to int */ \ - "fcvtas v1.4s, v21.4s\n" /* 00, 
cvt to int */ \ - "fcvtas v2.4s, v22.4s\n" /* 00, cvt to int */ \ - "sqxtn v17.4h, v3.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn2 v17.8h, v4.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v18.4h, v5.4s\n" /* 00, cvt int32 to int16 */ \ - "fcvtas v3.4s, v23.4s\n" /* 00, cvt to int */ \ - "fcvtas v4.4s, v24.4s\n" /* 00, cvt to int */ \ - "fcvtas v5.4s, v25.4s\n" /* 00, cvt to int */ \ - "sqxtn v20.4h, v0.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn2 v20.8h, v1.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v21.4h, v2.4s\n" /* 00, cvt int32 to int16 */ \ - "fcvtas v0.4s, v26.4s\n" /* 00, cvt to int */ \ - "fcvtas v1.4s, v27.4s\n" /* 00, cvt to int */ \ - "fcvtas v2.4s, v28.4s\n" /* 00, cvt to int */ \ - "sqxtn v23.4h, v3.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn2 v23.8h, v4.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v24.4h, v5.4s\n" /* 00, cvt int32 to int16 */ \ - "fcvtas v3.4s, v29.4s\n" /* 00, cvt to int */ \ - "fcvtas v4.4s, v30.4s\n" /* 00, cvt to int */ \ - "fcvtas v5.4s, v31.4s\n" /* 00, cvt to int */ \ - "sqxtn v26.4h, v0.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn2 v26.8h, v1.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v27.4h, v2.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v29.4h, v3.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn2 v29.8h, v4.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v30.4h, v5.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v4.8b, v8.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v0.8b, v9.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v5.8b, v11.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v1.8b, v12.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v6.8b, v14.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v2.8b, v15.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v7.8b, v17.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v3.8b, v18.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v16.8b, v20.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v15.8b, v21.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v20.8b, v23.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v17.8b, v24.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v24.8b, v26.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v18.8b, v27.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v28.8b, v29.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v19.8b, v30.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "st1 {v4.8b},[%[c_ptr0]], #8\n" /* store r0 */ \ - "st1 {v5.8b},[%[c_ptr1]], #8\n" /* store r0 */ \ - "st1 {v6.8b},[%[c_ptr2]], #8\n" /* store r0 */ \ - "st1 {v7.8b},[%[c_ptr3]], #8\n" /* store r0 */ \ - "st1 {v16.8b},[%[c_ptr4]], #8\n" /* store r0 */ \ - "st1 {v20.8b},[%[c_ptr5]], #8\n" /* store r0 */ \ - "st1 {v24.8b},[%[c_ptr6]], #8\n" /* store r0 */ \ - "st1 {v28.8b},[%[c_ptr7]], #8\n" /* store r0 */ \ - "str s0,[%[c_ptr0]], #4\n" /* store r0 */ \ - "str s1,[%[c_ptr1]], #4\n" /* store r0 */ \ - "str s2,[%[c_ptr2]], #4\n" /* store r0 */ \ - "str s3,[%[c_ptr3]], #4\n" /* store r0 */ \ - "str s15,[%[c_ptr4]], #4\n" /* store r0 */ \ - "str s17,[%[c_ptr5]], #4\n" /* store r0 */ \ - "str s18,[%[c_ptr6]], #4\n" /* store r0 */ \ - "str s19,[%[c_ptr7]], #4\n" /* store r0 */ - -template <> -inline void sgemm_sdot_int8_kernel(const int8_t* a_ptr, - const int8_t*& b_ptr, // NOLINT - const int32_t* bias, - int32_t*& c_ptr0, // NOLINT - int32_t*& c_ptr1, // NOLINT - int32_t*& c_ptr2, // NOLINT - int32_t*& c_ptr3, // NOLINT - int32_t*& c_ptr4, // NOLINT - int32_t*& c_ptr5, // NOLINT - int32_t*& c_ptr6, // NOLINT - int32_t*& c_ptr7, // NOLINT - const float32_t* scale, - bool is_relu, - int k, - int 
tail) { - asm volatile(_DECLARE_SDOT_ELEMENT GEMM_SDOT_INT8_KERNEL GEMM_SDOT_INT32_OUT - : [a_ptr] "+r"(a_ptr), - [b_ptr] "+r"(b_ptr), - [k] "+r"(k), - [tail] "+r"(tail), - [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), - [c_ptr4] "+r"(c_ptr4), - [c_ptr5] "+r"(c_ptr5), - [c_ptr6] "+r"(c_ptr6), - [c_ptr7] "+r"(c_ptr7) - : [bias_ptr] "r"(bias), [scale] "r"(scale), [relu] "r"(is_relu) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); -} -template <> -inline void sgemm_sdot_int8_kernel(const int8_t* a_ptr, - const int8_t*& b_ptr, // NOLINT - const int32_t* bias, - float32_t*& c_ptr0, // NOLINT - float32_t*& c_ptr1, // NOLINT - float32_t*& c_ptr2, // NOLINT - float32_t*& c_ptr3, // NOLINT - float32_t*& c_ptr4, // NOLINT - float32_t*& c_ptr5, // NOLINT - float32_t*& c_ptr6, // NOLINT - float32_t*& c_ptr7, // NOLINT - const float32_t* scale, - bool is_relu, - int k, - int tail) { - asm volatile(GEMM_SDOT_INT8_KERNEL GEMM_SDOT_FP32_OUT - : [a_ptr] "+r"(a_ptr), - [b_ptr] "+r"(b_ptr), - [k] "+r"(k), - [tail] "+r"(tail), - [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), - [c_ptr4] "+r"(c_ptr4), - [c_ptr5] "+r"(c_ptr5), - [c_ptr6] "+r"(c_ptr6), - [c_ptr7] "+r"(c_ptr7) - : [bias_ptr] "r"(bias), [scale] "r"(scale), [relu] "r"(is_relu) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); -} -template <> -inline void sgemm_sdot_int8_kernel(const int8_t* a_ptr, - const int8_t*& b_ptr, // NOLINT - const int32_t* bias, - int8_t*& c_ptr0, // NOLINT - int8_t*& c_ptr1, // NOLINT - int8_t*& c_ptr2, // NOLINT - int8_t*& c_ptr3, // NOLINT - int8_t*& c_ptr4, // NOLINT - int8_t*& c_ptr5, // NOLINT - int8_t*& c_ptr6, // NOLINT - int8_t*& c_ptr7, // NOLINT - const float32_t* scale, - bool is_relu, - int k, - int tail) { - asm volatile(GEMM_SDOT_INT8_KERNEL GEMM_SDOT_INT8_OUT - : [a_ptr] "+r"(a_ptr), - [b_ptr] "+r"(b_ptr), - [k] "+r"(k), - [tail] "+r"(tail), - [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), - [c_ptr4] "+r"(c_ptr4), - [c_ptr5] "+r"(c_ptr5), - [c_ptr6] "+r"(c_ptr6), - [c_ptr7] "+r"(c_ptr7) - : [bias_ptr] "r"(bias), [scale] "r"(scale), [relu] "r"(is_relu) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); -} -#endif - -#else // armv7 -// clang-format off -#define GEMM_INT8_KERNEL \ - "vld1.8 {d0-d1}, [%[a_ptr]: 128]!\n" /* load 4x2x2 int8, A, k2x2 */ \ - "vld1.8 {d4-d7}, [%[b_ptr]: 128]!\n" /* load 8x2x2 int8, B, k2x2 */ \ - "vld1.8 {d8-d9}, [%[bias]]\n" /* load int32x4 bias */ \ - "vext.8 q5, q4, q4, #4\n" /* bias shift 1 int32 */ \ - "vext.8 q6, q4, q4, #8\n" /* bias shift 2 int32 */ \ - "vext.8 q7, q4, q4, #12\n" /* bias shift 3 int32 */ \ - "pld [%[a_ptr]]\n" /* preload A */ \ - "vand q8, q4, q4\n" /* set bias to out00 
*/ \ - "vand q9, q4, q4\n" /* set bias to out01 */ \ - "pld [%[b_ptr]]\n" /* preload B */ \ - "vand q10, q5, q5\n" /* set bias to out10 */ \ - "vand q11, q5, q5\n" /* set bias to out11 */ \ - "pld [%[b_ptr], #64]\n" /* preload B */ \ - "vand q12, q6, q6\n" /* set bias to out20 */ \ - "vand q13, q6, q6\n" /* set bias to out21 */ \ - "pld [%[b_ptr], #128]\n" /* preload B */ \ - "vand q14, q7, q7\n" /* set bias to out30 */ \ - "vand q15, q7, q7\n" /* set bias to out31 */ \ - "pld [%[a_ptr], #64]\n" /* preload A */ \ - "vext.8 d2, d0, d0, #2\n" /* shift left circular by 2byte */ \ - "vext.8 d3, d1, d1, #2\n" /* shift left circular by 2byte */ \ - "pld [%[b_ptr], #192]\n" /* preload b */ \ - "pld [%[b_ptr], #256]\n" /* preload b */ \ - "pld [%[a_ptr], #128]\n" /* preload a */ \ - "cmp %[k], #0\n" /* check main loop count */ \ - "beq 3f\n" /* if k = 0, jump to remains */ /* 1st r0, r1 */ \ - "vmull.s8 q4, d0, d4\n" /* a0 * b0 = c00 */ \ - "vmull.s8 q5, d0, d5\n" /* a0 * b1 = c01 */ \ - "vmull.s8 q6, d2, d4\n" /* a1 * b0 = c10 */ \ - "vmull.s8 q7, d2, d5\n" /* a1 * b1 = c11 */ \ - "subs %[k], %[k], #1\n" /* loop count -1 */ /* 2nd r0, r1 */ \ - "vmlal.s8 q4, d1, d6\n" /* a0 * b0 = c00 */ \ - "vmlal.s8 q5, d1, d7\n" /* a0 * b1 = c01 */ \ - "vrev64.32 q0, q0\n" /* shift left circular by 4byte */ \ - "vmlal.s8 q6, d3, d6\n" /* a1 * b0 = c10 */ \ - "vmlal.s8 q7, d3, d7\n" /* a1 * b1 = c11 */ \ - "vrev64.32 q1, q1\n" /* shift left circular by 4byte */ \ - "beq 8f\n" /* skip main loop */ /* main loop*/ \ - "0:\n" /* main loop */ /* 1st r2, r3 */ \ - "vpadal.s16 q8, q4\n" /* pair add and accumulate to int32, c00 */ \ - "vmull.s8 q4, d0, d4\n" /* a2 * b0 = c20 */ \ - "vpadal.s16 q9, q5\n" /* pair add and accumulate to int32, c01 */ \ - "vmull.s8 q5, d0, d5\n" /* a2 * b1 = c21 */ \ - "vpadal.s16 q10,q6\n" /* pair add and accumulate to int32, c10 */ \ - "vmull.s8 q6, d2, d4\n" /* a3 * b0 = c30 */ \ - "vpadal.s16 q11,q7\n" /* pair add and accumulate to int32, c11 */ \ - "vmull.s8 q7, d2, d5\n" /* a3 * b1 = c31 */ \ - "vld1.8 {d4-d5}, [%[b_ptr]: 128]!\n" /* load 4x2x2 int8, B, k2x2 */ \ - "vmlal.s8 q4, d1, d6\n" /* a0 * b0 = c00 */ \ - "vmlal.s8 q5, d1, d7\n" /* a0 * b1 = c01 */ \ - "vld1.8 {d0-d1}, [%[a_ptr]: 128]!\n" /* load 4x2x2 int8, A, k2x2 */ \ - "vmlal.s8 q6, d3, d6\n" /* a1 * b0 = c10 */ \ - "vmlal.s8 q7, d3, d7\n" /* a1 * b1 = c11 */ \ - "vld1.8 {d6-d7}, [%[b_ptr]: 128]!\n" /* load 4x2x2 int8, B, k2x2 */ \ - "vext.8 d2, d0, d0, #2\n" /* shift left circular by 2byte */ \ - "vext.8 d3, d1, d1, #2\n" /* shift left circular by 2byte */ \ - "vpadal.s16 q12,q4\n" /* pair add and accumulate to int32, c20 */ \ - "vmull.s8 q4, d0, d4\n" /* a0 * b0 = c00 */ \ - "vpadal.s16 q13,q5\n" /* pair add and accumulate to int32, c21 */ \ - "vmull.s8 q5, d0, d5\n" /* a0 * b1 = c01 */ \ - "vpadal.s16 q14,q6\n" /* pair add and accumulate to int32, c30 */ \ - "vmull.s8 q6, d2, d4\n" /* a1 * b0 = c10 */ \ - "vpadal.s16 q15,q7\n" /* pair add and accumulate to int32, c31 */ \ - "vmull.s8 q7, d2, d5\n" /* a1 * b1 = c11 */ \ - "subs %[k], %[k], #1\n" /* loop count -1 */ /* 2nd r0, r1 */ \ - "vmlal.s8 q4, d1, d6\n" /* a0 * b0 = c00 */ \ - "vmlal.s8 q5, d1, d7\n" /* a0 * b1 = c01 */ \ - "vrev64.32 q0, q0\n" /* shift left circular by 2 */ \ - "vmlal.s8 q6, d3, d6\n" /* a1 * b0 = c10 */ \ - "vmlal.s8 q7, d3, d7\n" /* a1 * b1 = c11 */ \ - "vrev64.32 q1, q1\n" /* shift left circular by 2 */ \ - "bgt 0b\n" /* jump to main loop */ \ - "8:\n" /* end of main loop */ /* 1st r2, r3 */ \ - "vpadal.s16 q8, q4\n" /* pair add and accumulate 
to int32, c00 */ \ - "vmull.s8 q4, d0, d4\n" /* a2 * b0 = c20 */ \ - "vpadal.s16 q9, q5\n" /* pair add and accumulate to int32, c01 */ \ - "vmull.s8 q5, d0, d5\n" /* a2 * b1 = c21 */ \ - "vpadal.s16 q10,q6\n" /* pair add and accumulate to int32, c10 */ \ - "vmull.s8 q6, d2, d4\n" /* a3 * b0 = c30 */ \ - "vpadal.s16 q11,q7\n" /* pair add and accumulate to int32, c11 */ \ - "vmull.s8 q7, d2, d5\n" /* a3 * b1 = c31 */ /* 2nd r2, r3 */ \ - "vmlal.s8 q4, d1, d6\n" /* a0 * b0 = c20 */ \ - "vmlal.s8 q5, d1, d7\n" /* a0 * b1 = c21 */ \ - "vmlal.s8 q6, d3, d6\n" /* a1 * b0 = c30 */ \ - "vmlal.s8 q7, d3, d7\n" /* a1 * b1 = c31 */ \ - "cmp %[rem], #0\n" /* skip remain */ \ - "beq 5f\n" \ - "mov r0, #32\n" /* address offset */ \ - "vld1.8 {d0}, [%[a_ptr]]\n" /* load a to d0, final */ \ - "vld1.8 {d4-d5}, [%[b_ptr]], r0\n" /* load b to d4, d5 */ \ - "5:\n" /* skip rem */ \ - "vpadal.s16 q12, q4\n" /* pair add and accumulate to int32, c20 */ \ - "vpadal.s16 q13, q5\n" /* pair add and accumulate to int32, c21 */ \ - "vpadal.s16 q14, q6\n" /* pair add and accumulate to int32, c30 */ \ - "vpadal.s16 q15, q7\n" /* pair add and accumulate to int32, c31 */ \ - "3:\n" /* process remain k */ \ - "cmp %[rem], #0\n" /* skip remain */ \ - "beq 7f\n" /* process remain k */ \ - "vext.8 d1, d0, d0, #2\n" /* shift left 2bytes */ \ - "vext.8 d2, d0, d0, #4\n" /* shift left 4bytes */ \ - "vext.8 d3, d0, d0, #6\n" /* shift left 6bytes */ /* 1st r0, r1 */ \ - "vmull.s8 q4, d0, d4\n" /* a0 * b0 = c00 */ \ - "vmull.s8 q5, d0, d5\n" /* a0 * b1 = c01 */ \ - "vmull.s8 q6, d1, d4\n" /* a1 * b0 = c10 */ \ - "vmull.s8 q7, d1, d5\n" /* a1 * b1 = c11 */ /* 1st r2, r3 */ \ - "vpadal.s16 q8, q4\n" /* pair add and accumulate to int32, c00 */ \ - "vmull.s8 q4, d2, d4\n" /* a2 * b0 = c20 */ \ - "vpadal.s16 q9, q5\n" /* pair add and accumulate to int32, c01 */ \ - "vmull.s8 q5, d2, d5\n" /* a2 * b1 = c21 */ \ - "vpadal.s16 q10,q6\n" /* pair add and accumulate to int32, c10 */ \ - "vmull.s8 q6, d3, d4\n" /* a3 * b0 = c30 */ \ - "vpadal.s16 q11,q7\n" /* pair add and accumulate to int32, c11 */ \ - "vmull.s8 q7, d3, d5\n" /* a3 * b1 = c31 */ \ - "vpadal.s16 q12, q4\n" /* pair add and accumulate to int32, c20 */ \ - "vpadal.s16 q13, q5\n" /* pair add and accumulate to int32, c21 */ \ - "vpadal.s16 q14, q6\n" /* pair add and accumulate to int32, c30 */ \ - "vpadal.s16 q15, q7\n" /* pair add and accumulate to int32, c31 */ \ - "7: \n" /* do relu */ /* do relu */ \ - "cmp %[is_relu], #0\n" /* skip relu */ \ - "beq 9f\n" /* skip relu */ \ - "vmov.i32 q0, #0\n" /* for relu */ \ - "vmax.s32 q8, q8, q0\n" /* relu */ \ - "vmax.s32 q9, q9, q0\n" /* relu */ \ - "vmax.s32 q10,q10, q0\n" /* relu */ \ - "vmax.s32 q11,q11, q0\n" /* relu */ \ - "vmax.s32 q12,q12, q0\n" /* relu */ \ - "vmax.s32 q13,q13, q0\n" /* relu */ \ - "vmax.s32 q14,q14, q0\n" /* relu */ \ - "vmax.s32 q15,q15, q0\n" /* relu */ /* unpack the result */ \ - "9:\n" /* unpack */ /* trans 1 */ \ - "vtrn.32 q8, q10\n" /* get q8 */ \ - "vtrn.32 q12, q14\n" /* get q12 */ \ - "vtrn.32 q9, q11\n" /* get q9 */ \ - "vtrn.32 q13, q15\n" /* get q13*/ \ - "vswp d17, d24\n" /* get q8*/ \ - "vswp d21, d28\n" /* get q10 */ \ - "vswp d19, d26\n" /* get q9 */ \ - "vswp d23, d30\n" /* get q11 */ \ - "vext.8 q0, q10, q10, #12\n" /* circular shift left 1 q0 */ \ - "vext.8 q2, q12, q12, #8\n" /* circular shift left 2 q2 */ \ - "vext.8 q4, q14, q14, #4\n" /* circular shift left 3 q4 */ \ - "vext.8 q1, q11, q11, #12\n" /* circular shift left 1 q1 */ \ - "vext.8 q3, q13, q13, #8\n" /* circular shift left 2 q3 */ 
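  /* Editor's note (illustration only, not part of the original source):  */ \
  /* the kernel above widens int8 products to int16 with vmull.s8 and     */ \
  /* vmlal.s8, then folds them into int32 accumulators via vpadal.s16,    */ \
  /* two products per lane per step. A scalar sketch of one lane of the   */ \
  /* 4x8 int32 tile this macro computes (K2 is a hypothetical name for    */ \
  /* the number of packed k2x2 pairs):                                    */ \
  /*   int32_t acc = bias[row];                                           */ \
  /*   for (int p = 0; p < 2 * K2; p += 2) {                              */ \
  /*     acc += (int16_t)a[row][p] * (int16_t)b[p][col]                   */ \
  /*          + (int16_t)a[row][p + 1] * (int16_t)b[p + 1][col];          */ \
  /*   }                                                                  */ \
  /* The vext/vrev rotations of A let all four output rows share one      */ \
  /* pair of B registers, which is why the surrounding vtrn/vswp/vext     */ \
  /* sequence must re-order the interleaved rows before writeback.        */ \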
\ - "vext.8 q5, q15, q15, #4\n" /* circular shift left 3 q5 */ \ - "vtrn.32 q8, q0\n" /* get q8 */ \ - "vtrn.32 q2, q4\n" /* get q2 */ \ - "vtrn.32 q9, q1\n" /* get q9 */ \ - "vtrn.32 q3, q5\n" /* get q3 */ /* trans 2 */ \ - "vswp d17, d4\n" /* get q8 */ \ - "vswp d1, d8\n" /* get q0: a1*/ \ - "vswp d19, d6\n" /* get q9: */ \ - "vswp d3, d10\n" /* get q1: a3b3 */ - -// clang-format off - -#define GEMM_INT8_INT32_OUT \ - /* write output */ \ - "vst1.32 {d16-d19}, [%[c_ptr0]]!\n" /* write outr0 */ \ - "vst1.32 {d0-d3}, [%[c_ptr1]]!\n" /* write outr1 */ \ - "vst1.32 {d4-d7}, [%[c_ptr2]]!\n" /* write outr2 */ \ - "vst1.32 {d8-d11}, [%[c_ptr3]]!\n" /* write outr3 */ - -#define GEMM_INT8_FP32_OUT \ - /* write output */ \ - "vld1.32 {d12-d13}, [%[scale]]\n" /* load scale */ \ - "vcvt.f32.s32 q10, q8\n" /* r00, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q11, q9\n" /* r01, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q12, q0\n" /* r10, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q13, q1\n" /* r11, cvt int32 to fp32*/ \ - "vmul.f32 q8, q10, d12[0]\n" /* r00, mul scale to get final result */ \ - "vmul.f32 q9, q11, d12[0]\n" /* r01, mul scale to get final result */ \ - "vmul.f32 q0, q12, d12[1]\n" /* r10, mul scale to get final result */ \ - "vmul.f32 q1, q13, d12[1]\n" /* r11, mul scale to get final result */ \ - "vcvt.f32.s32 q10, q2\n" /* r20, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q11, q3\n" /* r21, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q12, q4\n" /* r30, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q13, q5\n" /* r31, cvt int32 to fp32*/ \ - "vst1.32 {d16-d19}, [%[c_ptr0]]!\n" /* write r0, float32x4 x2 */ \ - "vmul.f32 q2, q10, d13[0]\n" /* r20, mul scale to get final result */ \ - "vmul.f32 q3, q11, d13[0]\n" /* r21, mul scale to get final result */ \ - "vst1.32 {d0-d3}, [%[c_ptr1]]!\n" /* write r1, float32x4 x2 */ \ - "vmul.f32 q4, q12, d13[1]\n" /* r30, mul scale to get final result */ \ - "vmul.f32 q5, q13, d13[1]\n" /* r31, mul scale to get final result */ \ - "vst1.32 {d4-d7}, [%[c_ptr2]]!\n" /* write r2, float32x4 x2 */ \ - "vst1.32 {d8-d11}, [%[c_ptr3]]!\n" /* write r3, float32x4 x2 */ - -#define GEMM_INT8_INT8_OUT \ - /* write output */ \ - "vld1.32 {d12-d13}, [%[scale]]\n" /* load scale */ \ - "vmov.f32 q7, #-0.5\n" /* neg offset */ \ - "vcvt.f32.s32 q10, q8\n" /* r00, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q11, q9\n" /* r01, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q12, q0\n" /* r10, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q13, q1\n" /* r11, cvt int32 to fp32*/ \ - "vmov.f32 q8, #0.5\n" /* pos offset */ \ - "vmov.f32 q9, #0.5\n" /* pos offset */ \ - "vmov.f32 q0, #0.5\n" /* pos offset */ \ - "vmov.f32 q1, #0.5\n" /* pos offset */ \ - "vcgt.f32 q14, q10, #0\n" /* get pos mask */ \ - "vcgt.f32 q15, q11, #0\n" /* get pos mask */ \ - "vbif.f32 q8, q7, q14\n" /* get right offset */ \ - "vbif.f32 q9, q7, q15\n" /* get right offset */ \ - "vcgt.f32 q14, q12, #0\n" /* get pos mask */ \ - "vcgt.f32 q15, q13, #0\n" /* get pos mask */ \ - "vbif.f32 q0, q7, q14\n" /* get right offset */ \ - "vbif.f32 q1, q7, q15\n" /* get right offset */ \ - "vmla.f32 q8, q10, d12[0]\n" /* r00, mul scale to get final result */ \ - "vmla.f32 q9, q11, d12[0]\n" /* r01, mul scale to get final result */ \ - "vmla.f32 q0, q12, d12[1]\n" /* r10, mul scale to get final result */ \ - "vmla.f32 q1, q13, d12[1]\n" /* r11, mul scale to get final result */ \ - "vcvt.f32.s32 q10, q2\n" /* r20, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q11, q3\n" /* r21, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q12, q4\n" /* r30, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q13, q5\n" /* 
r31, cvt int32 to fp32*/ \ - "vmov.f32 q2, #0.5\n" /* pos offset */ \ - "vmov.f32 q3, #0.5\n" /* pos offset */ \ - "vmov.f32 q4, #0.5\n" /* pos offset */ \ - "vmov.f32 q5, #0.5\n" /* pos offset */ \ - "vcgt.f32 q14, q10, #0\n" /* get pos mask */ \ - "vcgt.f32 q15, q11, #0\n" /* get pos mask */ \ - "vbif.f32 q2, q7, q14\n" /* get right offset */ \ - "vbif.f32 q3, q7, q15\n" /* get right offset */ \ - "vcgt.f32 q14, q12, #0\n" /* get pos mask */ \ - "vcgt.f32 q15, q13, #0\n" /* get pos mask */ \ - "vbif.f32 q4, q7, q14\n" /* get right offset */ \ - "vbif.f32 q5, q7, q15\n" /* get right offset */ \ - "vmla.f32 q2, q10, d13[0]\n" /* r20, mul scale to get final result */ \ - "vmla.f32 q3, q11, d13[0]\n" /* r21, mul scale to get final result */ \ - "vmla.f32 q4, q12, d13[1]\n" /* r30, mul scale to get final result */ \ - "vmla.f32 q5, q13, d13[1]\n" /* r31, mul scale to get final result */ \ - "vcvt.s32.f32 q6, q8\n" /* r00, fp32->int32 */ \ - "vcvt.s32.f32 q7, q9\n" /* r01, fp32->int32 */ \ - "vcvt.s32.f32 q10, q0\n" /* r10, fp32->int32 */ \ - "vcvt.s32.f32 q11, q1\n" /* r11, fp32->int32 */ \ - "vcvt.s32.f32 q12, q2\n" /* r20, fp32->int32 */ \ - "vcvt.s32.f32 q13, q3\n" /* r21, fp32->int32 */ \ - "vcvt.s32.f32 q14, q4\n" /* r30, fp32->int32 */ \ - "vcvt.s32.f32 q15, q5\n" /* r31, fp32->int32 */ \ - "vqmovn.s32 d0, q6\n" /* r00, int32 -> int16 */ \ - "vqmovn.s32 d1, q7\n" /* r01, int32 -> int16 */ \ - "vqmovn.s32 d2, q10\n" /* r10, int32 -> int16 */ \ - "vqmovn.s32 d3, q11\n" /* r11, int32 -> int16 */ \ - "vqmovn.s32 d4, q12\n" /* r00, int32 -> int16 */ \ - "vqmovn.s32 d5, q13\n" /* r01, int32 -> int16 */ \ - "vqmovn.s32 d6, q14\n" /* r10, int32 -> int16 */ \ - "vqmovn.s32 d7, q15\n" /* r11, int32 -> int16 */ \ - "vqmovn.s16 d8, q0\n" /* 0, int16 -> int8 */ \ - "vqmovn.s16 d9, q1\n" /* 1, int16 -> int8 */ \ - "vqmovn.s16 d10, q2\n" /* 2, int16 -> int8 */ \ - "vqmovn.s16 d11, q3\n" /* 3, int16 -> int8 */ \ - "vst1.32 {d8}, [%[c_ptr0]]!\n" /* write r0*/ \ - "vst1.32 {d9}, [%[c_ptr1]]!\n" /* write r1*/ \ - "vst1.32 {d10}, [%[c_ptr2]]!\n" /* write r2*/ \ - "vst1.32 {d11}, [%[c_ptr3]]!\n" /* write r3*/ - -template <> -inline void gemm_int8_kernel(const int8_t* a_ptr, const int8_t*& b_ptr, // NOLINT - const int32_t* bias, int32_t*& c_ptr0, // NOLINT - int32_t*& c_ptr1, int32_t*& c_ptr2, // NOLINT - int32_t*& c_ptr3, const float* scale, bool is_relu, // NOLINT - int k, int rem) { - asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT32_OUT - : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), - [c_ptr0] "+r"(c_ptr0), [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), [c_ptr3] "+r"(c_ptr3), [k] "+r"(k) - : [is_relu] "r"(is_relu), [bias] "r"(bias), [rem] "r"(rem) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15", "r0", "cc"); -} - -template <> -inline void gemm_int8_kernel(const int8_t* a_ptr, const int8_t*& b_ptr, // NOLINT - const int32_t* bias, float*& c_ptr0, // NOLINT - float*& c_ptr1, float*& c_ptr2, float*& c_ptr3, // NOLINT - const float* scale, bool is_relu, int k, int rem) { - asm volatile(GEMM_INT8_KERNEL GEMM_INT8_FP32_OUT - : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), - [c_ptr0] "+r"(c_ptr0), [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), [c_ptr3] "+r"(c_ptr3), [k] "+r"(k) - : [is_relu] "r"(is_relu), [bias] "r"(bias), [rem] "r"(rem), - [scale] "r"(scale) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15", "r0", "cc"); -} - -template <> -inline void gemm_int8_kernel(const int8_t* a_ptr, const int8_t*& 
b_ptr,  // NOLINT
-                             const int32_t* bias, int8_t*& c_ptr0,  // NOLINT
-                             int8_t*& c_ptr1, int8_t*& c_ptr2, int8_t*& c_ptr3,  // NOLINT
-                             const float* scale, bool is_relu, int k, int rem) {
-  asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT8_OUT
-               : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr),
-                 [c_ptr0] "+r"(c_ptr0), [c_ptr1] "+r"(c_ptr1),
-                 [c_ptr2] "+r"(c_ptr2), [c_ptr3] "+r"(c_ptr3), [k] "+r"(k)
-               : [is_relu] "r"(is_relu), [bias] "r"(bias), [rem] "r"(rem),
-                 [scale] "r"(scale)
-               : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
-                 "q10", "q11", "q12", "q13", "q14", "q15", "r0", "cc");
-}
-#endif  //__aarch64__  // NOLINT
-
-// gemm wrapper
-template <typename Dtype>
-void gemm_prepack_oth_int8(const int8_t* A_packed,
-                           const int8_t* B,
-                           const int* bias,
-                           Dtype* C,
-                           int M,
-                           int N,
-                           int K,
-                           bool is_bias,
-                           bool is_relu,
-                           bool is_transB,
-                           const float* scale,
-                           ARMContext* ctx) {
-  const int KUP = ROUNDUP(K, KBLOCK_INT8);
-  size_t llc_size = ctx->llc_size() / 4;
-  auto workspace = ctx->workspace_data<int8_t>();
-  int threads = ctx->threads();
-  int x_block = llc_size / (sizeof(int8_t) * (KUP + MBLOCK_INT8_OTH));
-  x_block /= NBLOCK_INT8_OTH;
-  x_block *= NBLOCK_INT8_OTH;
-  int x_num = (N + (x_block - 1)) / x_block;
-  x_block = (N + x_num - 1) / x_num;
-  x_block = (x_block + NBLOCK_INT8_OTH - 1) / NBLOCK_INT8_OTH;
-  x_block *= NBLOCK_INT8_OTH;
-  int k = K / KBLOCK_INT8;
-  int k_rem = K & (KBLOCK_INT8 - 1);
-  if (k_rem > KBLOCK_INT8 / 2) {
-    k_rem = 0;
-    k += 1;
-  }
-  int n_rem = N & (NBLOCK_INT8_OTH - 1);
-
-  auto* b_tmp = static_cast<int8_t*>(workspace);
-
-  auto* zerobuf = static_cast<int8_t*>(malloc(x_block * \
-      (sizeof(int8_t) + sizeof(Dtype))));
-  memset(zerobuf, 0, x_block * sizeof(int8_t));
-  auto* trash_ptr = reinterpret_cast<Dtype*>(zerobuf + \
-      x_block * sizeof(int8_t));
-
-  //! apanel is pre-computed outside gemm
-
-  for (unsigned int x0 = 0; x0 < N; x0 += x_block) {
-    unsigned int xmax = x0 + x_block;
-    bool flag_rem = false;
-    if (xmax >= N) {
-      xmax = N;
-      flag_rem = n_rem > 0;
-    }
-    int bblocks = (xmax - x0 + NBLOCK_INT8_OTH - 1) / NBLOCK_INT8_OTH;
-    //! load bpanel
-    int8_t* b_pannel = b_tmp;
-    if (is_transB) {
-      packb_trans_int8(b_pannel, B, K, 0, K, x0, xmax, zerobuf);
-    } else {
-      packb_int8(b_pannel, B, N, 0, K, x0, xmax, zerobuf);
-    }
-
-#pragma omp parallel for num_threads(threads)
-    for (unsigned int y = 0; y < M; y += MBLOCK_INT8_OTH) {
-      Dtype out0[NBLOCK_INT8_OTH] = {0};
-      Dtype out1[NBLOCK_INT8_OTH] = {0};
-      Dtype out2[NBLOCK_INT8_OTH] = {0};
-      Dtype out3[NBLOCK_INT8_OTH] = {0};
-      Dtype* c_ptr0 = C + y * N + x0;
-      Dtype* c_ptr1 = c_ptr0 + N;
-      Dtype* c_ptr2 = c_ptr1 + N;
-      Dtype* c_ptr3 = c_ptr2 + N;
-      Dtype* tmp0 = nullptr;
-      Dtype* tmp1 = nullptr;
-      Dtype* tmp2 = nullptr;
-      Dtype* tmp3 = nullptr;
-      float32_t scale_local[4];
-      int32_t bias_local[4] = {0, 0, 0, 0};
-      if (is_bias) {
-        bias_local[0] = bias[y];
-        bias_local[1] = bias[y + 1];
-        bias_local[2] = bias[y + 2];
-        bias_local[3] = bias[y + 3];
-      }
-      if (scale) {
-        scale_local[0] = scale[y];
-        scale_local[1] = scale[y + 1];
-        scale_local[2] = scale[y + 2];
-        scale_local[3] = scale[y + 3];
-      }
-      if (y + MBLOCK_INT8_OTH > M) {
-        switch (y + MBLOCK_INT8_OTH - M) {
-          case 3:
-            c_ptr1 = trash_ptr;
-          case 2:
-            c_ptr2 = trash_ptr;
-          case 1:
-            c_ptr3 = trash_ptr;
-          default:
-            break;
-        }
-      }
-      const int8_t* a_ptr_l = A_packed + y * KUP;
-      const int8_t* b_ptr = b_pannel;
-      for (int xb = 0; xb < bblocks; xb++) {
-        if (flag_rem && (xb == bblocks - 1)) {
-          tmp0 = c_ptr0;
-          tmp1 = c_ptr1;
-          tmp2 = c_ptr2;
-          tmp3 = c_ptr3;
-          c_ptr0 = out0;
-          c_ptr1 = out1;
-          c_ptr2 = out2;
-          c_ptr3 = out3;
-        }
-        gemm_int8_kernel(a_ptr_l, b_ptr, bias_local,
-                         c_ptr0, c_ptr1, c_ptr2, c_ptr3,
-                         scale_local, is_relu, k, k_rem);
-        if (flag_rem && (xb == bblocks - 1)) {
-          for (int i = 0; i < n_rem; ++i) {
-            *(tmp0++) = out0[i];
-            *(tmp1++) = out1[i];
-            *(tmp2++) = out2[i];
-            *(tmp3++) = out3[i];
-          }
-        }
-      }
-    }
-  }
-  free(zerobuf);
-}
-
-/***********************************************************************/
-// prepack A according to gemm kernel
-// A block size: (<4x2>x1) x2, with unroll=2 can be described as below:
-// origin A data:
-// A_origin(no trans, m x k):
-// r0: ==> a0, b0, c0, d0, e0, f0, g0, h0
-// r1: ==> a1, b1, c1, d1, e1, f1, g1, h1
-// r2: ==> a2, b2, c2, d2, e2, f2, g2, h2
-// r3: ==> a3, b3, c3, d3, e3, f3, g3, h3
-// packed A
-// a0,b0, a1,b1, a2,b2, a3,b3;
-// c0,d0, c1,d1, c2,d2, c3,d3;
-// e0,f0, e1,f1, e2,f2, e3,f3;
-// g0,h0, g1,h1, g2,h2, g3,h3;
-/***********************************************************************/
-void prepackA_m4k2x2_int8(int8_t* out, const int8_t* in, const int ldin,
-                          const int m0, const int mmax, const int k0,
-                          const int kmax) {
-  int y_len = mmax - m0;
-  int x_len = kmax - k0;
-  int x_len_roundup = ROUNDUP(x_len, KBLOCK_INT8);
-  auto zerobuff = static_cast<int8_t*>(malloc(x_len_roundup * sizeof(char)));
-  memset(zerobuff, 0, sizeof(char) * x_len_roundup);
-
-  const int8_t* inptr = in + m0 * ldin + k0;
-  uint8_t remain = static_cast<uint8_t>(x_len & (KBLOCK_INT8 - 1));
-
-#pragma omp parallel for
-  for (int y = 0; y < y_len; y += MBLOCK_INT8_OTH) {
-    const int8_t* ptr0 = inptr + y * ldin;
-    const int8_t* ptr1 = ptr0 + ldin;
-    const int8_t* ptr2 = ptr1 + ldin;
-    const int8_t* ptr3 = ptr2 + ldin;
-    //!
cope with row index exceed real size, set to zero buffer - if ((y + MBLOCK_INT8_OTH) > y_len) { - switch ((y + MBLOCK_INT8_OTH) - y_len) { - case 3: - ptr1 = zerobuff; - case 2: - ptr2 = zerobuff; - case 1: - ptr3 = zerobuff; - default: - break; - } - } - int8_t* ptr_out = out + y * x_len_roundup; - int i = 0; - for (; i < x_len + 1 - 2 * KBLOCK_INT8; i += 2 * KBLOCK_INT8) { -#ifdef __aarch64__ - asm volatile( - "ld1 {v0.8b}, [%[ptr0]], #8\n" /* load r0, 8 int8 */ - "ld1 {v1.8b}, [%[ptr1]], #8\n" /* load r1, 8 int8 */ - "ld1 {v2.8b}, [%[ptr2]], #8\n" /* load r2, 8 int8 */ - "ld1 {v3.8b}, [%[ptr3]], #8\n" /* load r3, 8 int8 */ - "trn1 v4.4h, v0.4h, v1.4h\n" /* get a0,b0, a2,b2 */ - "trn2 v5.4h, v0.4h, v1.4h\n" /* get a1,b1, a3,b3 */ - "trn1 v6.4h, v2.4h, v3.4h\n" /* get c0,d0, c2,d2 */ - "trn2 v7.4h, v2.4h, v3.4h\n" /* get c1,d1, c3,d3 */ - "trn1 v0.2s, v4.2s, v6.2s\n" /* get a0,b0, c0,d0 */ - "trn2 v2.2s, v4.2s, v6.2s\n" /* get a2,b2, c2,d2 */ - "trn1 v1.2s, v5.2s, v7.2s\n" /* get a1,b1, c1,d1 */ - "trn2 v3.2s, v5.2s, v7.2s\n" /* get a3,b3, c3,d3 */ - "st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[ptr_out]], #32\n" /* write - out*/ - : [ptr_out] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "cc", "memory"); -#else // armv7 - asm volatile( - "vld1.8 {d0}, [%[ptr0]]!\n" /* load r0, 8 int8, - a0,b0,c0,d0,e0,f0,g0,h0 */ - "vld1.8 {d1}, [%[ptr1]]!\n" /* load r1, 8 int8, - a1,b1,c1,d1,e1,f1,g1,h1 */ - "vld1.8 {d2}, [%[ptr2]]!\n" /* load r2, 8 int8, - a2,b2,c2,d2,e2,f2,g2,h2 */ - "vld1.8 {d3}, [%[ptr3]]!\n" /* load r3, 8 int8, - a3,b3,c3,d3,e3,f3,g3,h3 */ - "vtrn.16 d0, d1\n" /* trans, d0: a0,b0,a1,b1, e0,f0,e1,f1; d1: - c0,d0,c1,d1, g0,h0,g1,h1 */ - "vtrn.16 d2, d3\n" /* trans, d2: a2,b2,a3,b3, e2,f2,e3,f3; d3: - c2,d2,c3,d3, g2,h2,g3,h3 */ - "vtrn.32 d0, d2\n" /* trans, d0: a0,b0,a1,b1, a2,b2,a3,b3; d2: - e0,f0,e1,f1, e2,f2,e3,f3 */ - "vtrn.32 d1, d3\n" /* trans, d1: c0,d0,c1,d1, e2,f2,e3,f3; d3: - g0,h0,g1,h1, g2,h2,g3,h3 */ - "vst1.32 {d0-d3}, [%[outptr]]!\n" /* write to output ptr */ - : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) - : - : "q0", "q1", "cc", "memory"); -#endif //__aarch64 // NOLINT - } - if (i + KBLOCK_INT8 <= x_len) { - ptr_out[0] = ptr0[0]; - ptr_out[1] = ptr0[1]; - ptr_out[2] = ptr1[0]; - ptr_out[3] = ptr1[1]; - ptr_out[4] = ptr2[0]; - ptr_out[5] = ptr2[1]; - ptr_out[6] = ptr3[0]; - ptr_out[7] = ptr3[1]; - // unroll - ptr_out[8] = ptr0[2]; - ptr_out[9] = ptr0[3]; - ptr_out[10] = ptr1[2]; - ptr_out[11] = ptr1[3]; - ptr_out[12] = ptr2[2]; - ptr_out[13] = ptr2[3]; - ptr_out[14] = ptr3[2]; - ptr_out[15] = ptr3[3]; - ptr_out += 16; - ptr0 += 4; - ptr1 += 4; - ptr2 += 4; - ptr3 += 4; - } - switch (remain) { - case 0: - break; - case 1: - ptr_out[0] = ptr0[0]; - ptr_out[1] = 0; - ptr_out[2] = ptr1[0]; - ptr_out[3] = 0; - ptr_out[4] = ptr2[0]; - ptr_out[5] = 0; - ptr_out[6] = ptr3[0]; - ptr_out[7] = 0; - // unroll - ptr_out[8] = 0; - ptr_out[9] = 0; - ptr_out[10] = 0; - ptr_out[11] = 0; - ptr_out[12] = 0; - ptr_out[13] = 0; - ptr_out[14] = 0; - ptr_out[15] = 0; - ptr_out += 16; - break; - case 2: - ptr_out[0] = ptr0[0]; - ptr_out[1] = ptr0[1]; - ptr_out[2] = ptr1[0]; - ptr_out[3] = ptr1[1]; - ptr_out[4] = ptr2[0]; - ptr_out[5] = ptr2[1]; - ptr_out[6] = ptr3[0]; - ptr_out[7] = ptr3[1]; - // unroll - ptr_out[8] = 0; - ptr_out[9] = 0; - ptr_out[10] = 0; - ptr_out[11] = 0; - ptr_out[12] = 0; - ptr_out[13] = 0; - ptr_out[14] = 0; - 
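      // Editor's note (illustration only, not from the original source):
      // the zero stores in these remainder cases pad the final k-block out
      // to KBLOCK_INT8 so the kernel always consumes complete <4x2>x2
      // packets; the padded zeros contribute nothing to the int8 dot
      // product, so the result is unchanged. A scalar view of the packed-A
      // index, assuming row-major input a[row][k]:
      //   packed[(kpair * 4 + row) * 2 + lane] = a[row][2 * kpair + lane];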
ptr_out[15] = 0; - ptr_out += 16; - break; - case 3: - ptr_out[0] = ptr0[0]; - ptr_out[1] = ptr0[1]; - ptr_out[2] = ptr1[0]; - ptr_out[3] = ptr1[1]; - ptr_out[4] = ptr2[0]; - ptr_out[5] = ptr2[1]; - ptr_out[6] = ptr3[0]; - ptr_out[7] = ptr3[1]; - // unroll - ptr_out[8] = ptr0[2]; - ptr_out[9] = 0; - ptr_out[10] = ptr1[2]; - ptr_out[11] = 0; - ptr_out[12] = ptr2[2]; - ptr_out[13] = 0; - ptr_out[14] = ptr3[2]; - ptr_out[15] = 0; - ptr_out += 16; - break; - default: - break; - } - } - free(zerobuff); -} - -/***************************************************************************/ -// prepack A according to gemm kernel -// A block size: <4x2>x2, unroll x4, can be described as below: -// origin A data: -// A_origin(no trans, k x m): -// r0: ==> a0, a1, a2, a3 .... a12, a13, a14, a15 -// r1: ==> b0, b1, b2, b3 .... b12, b13, b14, b15 -// r2: ==> c0, c1, c2, c3 .... c12, c13, c14, c15 -// r3: ==> d0, d1, d2, d3 .... d12, d13, d14, d15 -// packed A: -// a0,b0, a1,b1, a2,b2, a3,b3; -// c0,d0, c1,d1, c2,d2, c3,d3;----block0 -// a4,b4, a5,b5, a6,b6, a7,b7; -// c4,d4, c5,d5, c6,d6, c7,d7;----block1 -// a8,b8, a9,b9, a10,b10, a11,b11; -// c8,d8, c9,d9, c10,d10, c11,d11;----block2 -// a12,b12, a13,b13, a14,b14, a15,b15; -// c12,d12, c13,d13, c14,d14, c15,d15;----block3 -/***************************************************************************/ -void prepackA_m4k2x2_trans_int8(int8_t* out, const int8_t* in, const int ldin, - const int m0, const int mmax, const int k0, - const int kmax) { - int xlen = mmax - m0; - int ylen = kmax - k0; - int ylen_roundup = ROUNDUP(ylen, KBLOCK_INT8); - int xlen_roundup = ROUNDUP(xlen, MBLOCK_INT8_OTH); - - const int MUNROLL = 4; - int mcnt = xlen / (MUNROLL * MBLOCK_INT8_OTH); - int x_rem = xlen & (MUNROLL * MBLOCK_INT8_OTH - 1); - int m_rem = (x_rem + MBLOCK_INT8_OTH - 1) / MBLOCK_INT8_OTH; - - const uint8_t mask_buffer[16] = {0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15}; - int8x16_t vzero = vdupq_n_s8(0); - uint8x16_t vmask = vcltq_u8(vld1q_u8(mask_buffer), vdupq_n_u8(x_rem)); - - int stride_out = ylen_roundup * MBLOCK_INT8_OTH; - - int8_t* zerobuf = static_cast(malloc(xlen_roundup)); - memset(zerobuf, 0, xlen_roundup); - - const int8_t* inr = in + ldin * k0 + m0; -#pragma omp parallel for - for (int y = 0; y < ylen; y += KBLOCK_INT8) { - const int8_t* ptr0 = inr + y * ldin; - const int8_t* ptr1 = ptr0 + ldin; - const int8_t* ptr2 = ptr1 + ldin; - const int8_t* ptr3 = ptr2 + ldin; - int8_t* ptr_out = out + MBLOCK_INT8_OTH * y; - if (y + KBLOCK_INT8 > ylen) { - switch (y + KBLOCK_INT8 - ylen) { - case 3: - ptr1 = zerobuf; - case 2: - ptr2 = zerobuf; - case 1: - ptr3 = zerobuf; - default: - break; - } - } - int k = mcnt; - int rem = m_rem; -#ifdef __aarch64__ - asm volatile( - "ld1 {v0.16b}, [%[ptr0]], #16\n" /* load r0 */ - "ld1 {v1.16b}, [%[ptr1]], #16\n" /* load r1 */ - "ld1 {v2.16b}, [%[ptr2]], #16\n" /* load r2 */ - "ld1 {v3.16b}, [%[ptr3]], #16\n" /* load r3 */ - "cbz %w[k], 1f\n" /* jump to remain */ - "0:\n" /* main loop */ - /* trans 16b */ - "trn1 v4.16b, v0.16b, v1.16b\n" /* get a0,b0, a2,b2, a4,b4, a6,b6, - a8,b8, a10,b10, a12,b12, a14,b14 */ - "trn2 v5.16b, v0.16b, v1.16b\n" /* get a1,b1, a3,b3, a5,b5, a7,b7, - a9,b9, a11,b11, a13,b13, a15,b15 */ - "trn1 v6.16b, v2.16b, v3.16b\n" /* get c0,d0, c2,d2, c4,d4, c6,d6, - c8,d8, c10,d10, c12,d12, c14,d14 */ - "trn2 v7.16b, v2.16b, v3.16b\n" /* get c1,d1, c3,d3, c5,d5, c7,d7, - c9,d9, c11,d11, c13,d13, c15,d15 */ - "ld1 {v0.16b}, [%[ptr0]], #16\n" /* load r0 */ - "ld1 {v1.16b}, [%[ptr1]], #16\n" /* 
load r1 */ - "subs %w[k], %w[k], #1\n" /* loop cnt -1 */ - /* trans 8h */ - "trn1 v8.8h, v4.8h, v5.8h\n" /* get a0,b0, a1,b1, a4,b4, a5,b5, a8,b8, - a9,b9, a12,b12, a13,b13 */ - "trn2 v9.8h, v4.8h, v5.8h\n" /* get a2,b2, a3,b3, a6,b6, a7,b7, - a10,b10, a11,b11, a14,b14, a15,b15 */ - "trn1 v10.8h, v6.8h, v7.8h\n" /* get c0,d0, c1,d1, c4,d4, c5,d5, - c8,d8, c9,d9, c12,d12, c13,d13 */ - "trn2 v11.8h, v6.8h, v7.8h\n" /* get c2,d2, c3,d3, c6,d6, c7,d7, - c10,d10, c11,d11, c14,d14, c15,d15 */ - /* trans 4s */ - "ld1 {v2.16b}, [%[ptr2]], #16\n" /* load r2 */ - "trn1 v4.4s, v8.4s, v9.4s\n" /* get a0,b0, a1,b1, a2,b2, a3,b3, a8,b8, - a9,b9, a10,b10, a11,b11 */ - "trn2 v5.4s, v8.4s, v9.4s\n" /* get a4,b4, a5,b5, a6,b6, a7,b7, - a12,b12, a13,b13, a14,b14, a15,b15 */ - "trn1 v6.4s, v10.4s, v11.4s\n" /* get c0,d0, c1,d1, c2,d2, c3,d3, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "trn2 v7.4s, v10.4s, v11.4s\n" /* get c4,d4, c5,d5, c6,d6, c7,d7, - c12,d12, c13,d13, c14,d14, c15,d15 - */ - /* trans 2d */ - "ld1 {v3.16b}, [%[ptr3]], #16\n" /* load r3 */ - "trn1 v8.2d, v4.2d, v6.2d\n" /* get a0,b0, a1,b1, a2,b2, a3,b3, c0,d0, - c1,d1, c2,d2, c3,d3 */ - "trn1 v9.2d, v5.2d, v7.2d\n" /* get a4,b4, a5,b5, a6,b6, a7,b7, c4,d4, - c5,d5, c6,d6, c7,d7 */ - "trn2 v10.2d, v4.2d, v6.2d\n" /* get a8,b8, a9,b9, a10,b10, a11,b11, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "trn2 v11.2d, v5.2d, v7.2d\n" /* get a12,b12, a13,b13, a14,b14, - a15,b15, c12,d12, c13,d13, c14,d14, - c15,d15 */ - "st1 {v8.16b}, [%[ptr_out]], %[stride]\n" /* write block0, address + - stride */ - "st1 {v9.16b}, [%[ptr_out]], %[stride]\n" /* write block1, address + - stride */ - "st1 {v10.16b}, [%[ptr_out]], %[stride]\n" /* write block2, address + - stride */ - "st1 {v11.16b}, [%[ptr_out]], %[stride]\n" /* write block3, address + - stride */ - "bgt 0b\n" /* jump to main loop */ - "1:\n" /* process remain */ - "cbz %w[rem], 2f\n" /* skip to remain */ - /* bit select */ - "bif v0.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v1.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v2.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v3.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - /* trans 16b */ - "trn1 v4.16b, v0.16b, v1.16b\n" /* get a0,b0, a2,b2, a4,b4, a6,b6, - a8,b8, a10,b10, a12,b12, a14,b14 */ - "trn2 v5.16b, v0.16b, v1.16b\n" /* get a1,b1, a3,b3, a5,b5, a7,b7, - a9,b9, a11,b11, a13,b13, a15,b15 */ - "trn1 v6.16b, v2.16b, v3.16b\n" /* get c0,d0, c2,d2, c4,d4, c6,d6, - c8,d8, c10,d10, c12,d12, c14,d14 */ - "trn2 v7.16b, v2.16b, v3.16b\n" /* get c1,d1, c3,d3, c5,d5, c7,d7, - c9,d9, c11,d11, c13,d13, c15,d15 */ - /* trans 8h */ - "trn1 v8.8h, v4.8h, v5.8h\n" /* get a0,b0, a1,b1, a4,b4, a5,b5, a8,b8, - a9,b9, a12,b12, a13,b13 */ - "trn2 v9.8h, v4.8h, v5.8h\n" /* get a2,b2, a3,b3, a6,b6, a7,b7, - a10,b10, a11,b11, a14,b14, a15,b15 */ - "trn1 v10.8h, v6.8h, v7.8h\n" /* get c0,d0, c1,d1, c4,d4, c5,d5, - c8,d8, c9,d9, c12,d12, c13,d13 */ - "trn2 v11.8h, v6.8h, v7.8h\n" /* get c2,d2, c3,d3, c6,d6, c7,d7, - c10,d10, c11,d11, c14,d14, c15,d15 */ - /* trans 4s */ - "trn1 v4.4s, v8.4s, v9.4s\n" /* get a0,b0, a1,b1, a2,b2, a3,b3, a8,b8, - a9,b9, a10,b10, a11,b11 */ - "trn2 v5.4s, v8.4s, v9.4s\n" /* get a4,b4, a5,b5, a6,b6, a7,b7, - a12,b12, a13,b13, a14,b14, a15,b15 */ - "trn1 v6.4s, v10.4s, v11.4s\n" /* get c0,d0, c1,d1, c2,d2, c3,d3, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "trn2 v7.4s, v10.4s, v11.4s\n" /* get c4,d4, c5,d5, c6,d6, c7,d7, - c12,d12, c13,d13, c14,d14, c15,d15 - */ - /* trans 2d */ - "trn1 v8.2d, v4.2d, v6.2d\n" /* get a0,b0, a1,b1, a2,b2, 
a3,b3, c0,d0, - c1,d1, c2,d2, c3,d3 */ - "trn1 v9.2d, v5.2d, v7.2d\n" /* get a4,b4, a5,b5, a6,b6, a7,b7, c4,d4, - c5,d5, c6,d6, c7,d7 */ - "trn2 v10.2d, v4.2d, v6.2d\n" /* get a8,b8, a9,b9, a10,b10, a11,b11, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "trn2 v11.2d, v5.2d, v7.2d\n" /* get a12,b12, a13,b13, a14,b14, - a15,b15, c12,d12, c13,d13, c14,d14, - c15,d15 */ - /* check remain size */ - "subs %w[rem], %w[rem], #1\n" /* check remain num */ - "st1 {v8.16b}, [%[ptr_out]], %[stride]\n" /* write 0 */ - "beq 2f\n" /* remain = 1 */ - "subs %w[rem], %w[rem], #1\n" /* check remain num */ - "st1 {v9.16b}, [%[ptr_out]], %[stride]\n" /* write 1 */ - "beq 2f\n" /* remain = 2 */ - "subs %w[rem], %w[rem], #1\n" /* check remain num */ - "st1 {v10.16b}, [%[ptr_out]], %[stride]\n" /* write 2 */ - "beq 2f\n" /* remain = 3 */ - "st1 {v11.16b}, [%[ptr_out]]\n" /* write 3 */ - /* end */ - "2:\n" /* end */ - : [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3), [k] "+r"(k), [rem] "+r"(rem), - [ptr_out] "+r"(ptr_out) - : [mask] "w"(vmask), [vzero] "w"(vzero), [stride] "r"(stride_out) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "cc"); -#else // armv7 - asm volatile( - "vld1.8 {d0-d1}, [%[ptr0]]!\n" /* load r0 */ - "vld1.8 {d2-d3}, [%[ptr1]]!\n" /* load r1 */ - "vld1.8 {d4-d5}, [%[ptr2]]!\n" /* load r2 */ - "vld1.8 {d6-d7}, [%[ptr3]]!\n" /* load r3 */ - "cmp %[k], #0\n" /* check main loop */ - "beq 1f\n" /* jump to remain */ - "0:\n" /* main loop */ - /* trans 16b */ - "vtrn.8 q0, q1\n" /* get q0: a0,b0, a2,b2, a4,b4, a6,b6, a8,b8, a10,b10, - a12,b12, a14,b14; q1: a1,b1, a3,b3, a5,b5, a7,b7, - a9,b9, a11,b11, a13,b13, a15,b15 */ - "vtrn.8 q2, q3\n" /* get q2: c0,d0, c2,d2, c4,d4, c6,d6, c8,d8, c10,d10, - c12,d12, c14,d14; q3: c0,d0, c2,d2, c4,d4, c6,d6, - c8,d8, c10,d10, c12,d12, c14,d14 */ - "subs %[k], %[k], #1\n" /* loop cnt -1 */ - /* trans 8h */ - "vtrn.16 q0, q1\n" /* get q0: a0,b0, a1,b1, a4,b4, a5,b5, a8,b8, - a9,b9, a12,b12, a13,b13; q1: a2,b2, a3,b3, - a6,b6, a7,b7, a10,b10, a11,b11, a14,b14, - a15,b15 */ - "vtrn.16 q2, q3\n" /* get q2: c0,d0, c1,d1, c4,d4, c5,d5, c8,d8, - c9,d9, c12,d12, c13,d13; q3: c2,d2, c3,d3, - c6,d6, c7,d7, c10,d10, c11,d11, c14,d14, - c15,d15 */ - /* trans 4s */ - "vtrn.32 q0, q1\n" /* get q0: a0,b0, a1,b1, a2,b2, a3,b3, a8,b8, - a9,b9, a10,b10, a11,b11; q1: a4,b4, a5,b5, - a6,b6, a7,b7, a12,b12, a13,b13, a14,b14, - a15,b15 */ - "vtrn.32 q2, q3\n" /* get q2: c0,d0, c1,d1, c2,d2, c3,d3, c8,d8, - c9,d9, c10,d10, c11,d11; q3: c4,d4, c5,d5, - c6,d6, c7,d7, c12,d12, c13,d13, c14,d14, - c15,d15 */ - /* trans 2d */ - "vswp d1, d4\n" /* get q0: a0,b0, a1,b1, a2,b2, a3,b3, c0,d0, c1,d1, - c2,d2, c3,d3; q2: a8,b8, a9,b9, a10,b10, a11,b11, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "vswp d3, d6\n" /* get q1: a4,b4, a5,b5, a6,b6, a7,b7, c4,d4, c5,d5, - c6,d6, c7,d7; q3: a12,b12, a13,b13, a14,b14, - a15,b15, c12,d12, c13,d13, c14,d14, c15,d15 */ - "vst1.8 {d0-d1}, [%[ptr_out]], %[stride]\n" /* write block0, address + - stride */ - "vst1.8 {d2-d3}, [%[ptr_out]], %[stride]\n" /* write block1, address + - stride */ - "vst1.8 {d4-d5}, [%[ptr_out]], %[stride]\n" /* write block2, address + - stride */ - "vst1.8 {d6-d7}, [%[ptr_out]], %[stride]\n" /* write block3, address + - stride */ - "vld1.8 {d0-d1}, [%[ptr0]]!\n" /* load r0 */ - "vld1.8 {d2-d3}, [%[ptr1]]!\n" /* load r1 */ - "vld1.8 {d4-d5}, [%[ptr2]]!\n" /* load r2 */ - "vld1.8 {d6-d7}, [%[ptr3]]!\n" /* load r3 */ - "bgt 0b\n" /* jump to main loop */ - "1:\n" /* process remain */ - "cmp 
%[rem], #0\n" /* check remain */ - "beq 2f\n" /* skip to remain */ - /* bit select */ - "vbif q0, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q1, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q2, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q3, %q[vzero], %q[mask]\n" /* pad 0 */ - /* trans 16b */ - "vtrn.8 q0, q1\n" /* get q0: a0,b0, a2,b2, a4,b4, a6,b6, a8,b8, a10,b10, - a12,b12, a14,b14; q1: a1,b1, a3,b3, a5,b5, a7,b7, - a9,b9, a11,b11, a13,b13, a15,b15 */ - "vtrn.8 q2, q3\n" /* get q2: c0,d0, c2,d2, c4,d4, c6,d6, c8,d8, c10,d10, - c12,d12, c14,d14; q3: c0,d0, c2,d2, c4,d4, c6,d6, - c8,d8, c10,d10, c12,d12, c14,d14 */ - /* trans 8h */ - "vtrn.16 q0, q1\n" /* get q0: a0,b0, a1,b1, a4,b4, a5,b5, a8,b8, - a9,b9, a12,b12, a13,b13; q1: a2,b2, a3,b3, - a6,b6, a7,b7, a10,b10, a11,b11, a14,b14, - a15,b15 */ - "vtrn.16 q2, q3\n" /* get q2: c0,d0, c1,d1, c4,d4, c5,d5, c8,d8, - c9,d9, c12,d12, c13,d13; q3: c2,d2, c3,d3, - c6,d6, c7,d7, c10,d10, c11,d11, c14,d14, - c15,d15 */ - /* trans 4s */ - "vtrn.32 q0, q1\n" /* get q0: a0,b0, a1,b1, a2,b2, a3,b3, a8,b8, - a9,b9, a10,b10, a11,b11; q1: a4,b4, a5,b5, - a6,b6, a7,b7, a12,b12, a13,b13, a14,b14, - a15,b15 */ - "vtrn.32 q2, q3\n" /* get q2: c0,d0, c1,d1, c2,d2, c3,d3, c8,d8, - c9,d9, c10,d10, c11,d11; q3: c4,d4, c5,d5, - c6,d6, c7,d7, c12,d12, c13,d13, c14,d14, - c15,d15 */ - /* trans 2d */ - "vswp d1, d4\n" /* get q0: a0,b0, a1,b1, a2,b2, a3,b3, c0,d0, c1,d1, - c2,d2, c3,d3; q2: a8,b8, a9,b9, a10,b10, a11,b11, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "vswp d3, d6\n" /* get q1: a4,b4, a5,b5, a6,b6, a7,b7, c4,d4, c5,d5, - c6,d6, c7,d7; q3: a12,b12, a13,b13, a14,b14, - a15,b15, c12,d12, c13,d13, c14,d14, c15,d15 */ - /* check remain size */ - "subs %[rem], %[rem], #1\n" /* check remain num */ - "vst1.8 {d0-d1}, [%[ptr_out]], %[stride]\n" /* write 0 */ - "beq 2f\n" /* remain = 1 */ - "subs %[rem], %[rem], #1\n" /* check remain num */ - "vst1.8 {d2-d3}, [%[ptr_out]], %[stride]\n" /* write 1 */ - "beq 2f\n" /* remain = 2 */ - "subs %[rem], %[rem], #1\n" /* check remain num */ - "vst1.8 {d4-d5}, [%[ptr_out]], %[stride]\n" /* write 2 */ - "beq 2f\n" /* remain = 3 */ - "vst1.8 {d6-d7}, [%[ptr_out]], %[stride]\n" /* write 3 */ - /* end */ - "2:\n" /* end */ - : [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3), [k] "+r"(k), [rem] "+r"(rem), - [ptr_out] "+r"(ptr_out) - : [mask] "w"(vmask), [vzero] "w"(vzero), [stride] "r"(stride_out) - : "q0", "q1", "q2", "q3", "cc"); -#endif //__aarch64__ // NOLINT - } - free(zerobuf); -} - -/**************************************************************************/ -// for armv8 -// prepack B according to gemm kernel -// B block size: (<4x2>x4) x2, can be described as below: -// origin B data: -// B_origin(no trans, k x n): -// r0: ==> a0, a1, a2, a3 .... a12, a13, a14, a15 -// r1: ==> b0, b1, b2, b3 .... b12, b13, b14, b15 -// r2: ==> c0, c1, c2, c3 .... c12, c13, c14, c15 -// r3: ==> d0, d1, d2, d3 .... d12, d13, d14, d15 -// packed B: -// a0,b0, a1,b1, a2,b2, a3,b3; -// c0,d0, c1,d1, c2,d2, c3,d3; -// . -// . -// . 
-// a12,b12, a13,b13, a14,b14, a15,b15; -// c12,d12, c13,d13, c14,d14, c15,d15; -// for armv7 -// prepack B according to gemm kernel -// B block size: (<4x2>x4) x2, can be described as below: -// origin B data: -// B_origin(no trans, k x n): -// r0: ==> a0, a1, a2, a3, a4, a5, a6, a7 -// r1: ==> b0, b1, b2, b3, b4, b5, b6, b7 -// r2: ==> c0, c1, c2, c3, c4, c5, c6, c7 -// r3: ==> d0, d1, d2, d3, d4, d5, d6, d7 -// packed B: -// a0,b0, a1,b1, a2,b2, a3,b3; -// a4,b4, a5,b5, a6,b6, a7,b7; -// c0,d0, c1,d1, c2,d2, c3,d3; -// c4,d4, c5,d5, c6,d6, c7,d7; -/***************************************************************************/ -void packb_int8(int8_t* out, const int8_t* in, const int ldin, const int k0, - const int kmax, const int n0, const int nmax, - const int8_t* zerobuf) { - const int8_t* inptr = in + k0 * ldin + n0; - const uint8_t mask_buffer[16] = {0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15}; - int x_len = nmax - n0; - int y_len = kmax - k0; - int kup = ROUNDUP(y_len, KBLOCK_INT8); - int kcnt = x_len / NBLOCK_INT8_OTH; - int rem = x_len & (NBLOCK_INT8_OTH - 1); - int stride_out = NBLOCK_INT8_OTH * kup; - - int8x16_t vzero = vdupq_n_s8(0); - uint8x16_t vmask = vcltq_u8(vld1q_u8(mask_buffer), vdupq_n_u8(rem)); -#pragma omp parallel for - for (int y = 0; y < y_len; y += KBLOCK_INT8) { - const int8_t* ptr0 = inptr + y * ldin; - const int8_t* ptr1 = ptr0 + ldin; - const int8_t* ptr2 = ptr1 + ldin; - const int8_t* ptr3 = ptr2 + ldin; - if (y + KBLOCK_INT8 > y_len) { - switch (y + KBLOCK_INT8 - y_len) { - case 3: - ptr1 = zerobuf; - case 2: - ptr2 = zerobuf; - case 1: - ptr3 = zerobuf; - default: - break; - } - } - int8_t* outptr_row_col = out + y * NBLOCK_INT8_OTH; - int k = kcnt; -#ifdef __aarch64__ - asm volatile( - "ld1 {v0.16b}, [%[ptr0]], #16\n" /* load r0 */ - "ld1 {v1.16b}, [%[ptr1]], #16\n" /* load r1 */ - "ld1 {v2.16b}, [%[ptr2]], #16\n" /* load r2 */ - "ld1 {v3.16b}, [%[ptr3]], #16\n" /* load r3 */ - "cbz %w[k], 1f\n" /* jump to remain */ - "0:\n" /* main loop */ - /* trans 16b */ - "trn1 v4.16b, v0.16b, v1.16b\n" /* get a0,b0, a2,b2, a4,b4, a6,b6, - a8,b8, a10,b10, a12,b12, a14,b14 */ - "trn2 v5.16b, v0.16b, v1.16b\n" /* get a1,b1, a3,b3, a5,b5, a7,b7, - a9,b9, a11,b11, a13,b13, a15,b15 */ - "trn1 v6.16b, v2.16b, v3.16b\n" /* get c0,d0, c2,d2, c4,d4, c6,d6, - c8,d8, c10,d10, c12,d12, c14,d14 */ - "trn2 v7.16b, v2.16b, v3.16b\n" /* get c1,d1, c3,d3, c5,d5, c7,d7, - c9,d9, c11,d11, c13,d13, c15,d15 */ - "ld1 {v0.16b}, [%[ptr0]], #16\n" /* load r0 */ - "ld1 {v1.16b}, [%[ptr1]], #16\n" /* load r1 */ - "subs %w[k], %w[k], #1\n" /* loop cnt -1 */ - /* trans 8h */ - "trn1 v8.8h, v4.8h, v5.8h\n" /* get a0,b0, a1,b1, a4,b4, a5,b5, a8,b8, - a9,b9, a12,b12, a13,b13 */ - "trn2 v9.8h, v4.8h, v5.8h\n" /* get a2,b2, a3,b3, a6,b6, a7,b7, - a10,b10, a11,b11, a14,b14, a15,b15 */ - "trn1 v10.8h, v6.8h, v7.8h\n" /* get c0,d0, c1,d1, c4,d4, c5,d5, - c8,d8, c9,d9, c12,d12, c13,d13 */ - "trn2 v11.8h, v6.8h, v7.8h\n" /* get c2,d2, c3,d3, c6,d6, c7,d7, - c10,d10, c11,d11, c14,d14, c15,d15 */ - /* trans 4s */ - "ld1 {v2.16b}, [%[ptr2]], #16\n" /* load r2 */ - "trn1 v4.4s, v8.4s, v9.4s\n" /* get a0,b0, a1,b1, a2,b2, a3,b3, a8,b8, - a9,b9, a10,b10, a11,b11 */ - "trn2 v5.4s, v8.4s, v9.4s\n" /* get a4,b4, a5,b5, a6,b6, a7,b7, - a12,b12, a13,b13, a14,b14, a15,b15 */ - "trn1 v6.4s, v10.4s, v11.4s\n" /* get c0,d0, c1,d1, c2,d2, c3,d3, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "trn2 v7.4s, v10.4s, v11.4s\n" /* get c4,d4, c5,d5, c6,d6, c7,d7, - c12,d12, c13,d13, c14,d14, c15,d15 - */ - /* trans 2d */ - 
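        /* Editor's note (illustration only, not part of the original
           source): this trn1/trn2 ladder at 16b/8h/4s/2d granularity is a
           4x16 byte transpose: four k-rows of sixteen consecutive n values
           become four 16-byte packets, each holding two k-pairs for four
           consecutive n, e.g. v8 = {k0k1 x n0..n3, k2k3 x n0..n3} and
           v9 the same for n4..n7, so the kernel can stream B with one
           contiguous load per step. */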
"ld1 {v3.16b}, [%[ptr3]], #16\n" /* load r3 */ - "trn1 v8.2d, v4.2d, v6.2d\n" /* get a0,b0, a1,b1, a2,b2, a3,b3, c0,d0, - c1,d1, c2,d2, c3,d3 */ - "trn2 v10.2d, v4.2d, v6.2d\n" /* get a8,b8, a9,b9, a10,b10, a11,b11, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "trn1 v9.2d, v5.2d, v7.2d\n" /* get a4,b4, a5,b5, a6,b6, a7,b7, c4,d4, - c5,d5, c6,d6, c7,d7 */ - "trn2 v11.2d, v5.2d, v7.2d\n" /* get a12,b12, a13,b13, a14,b14, - a15,b15, c12,d12, c13,d13, c14,d14, - c15,d15 */ - "st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [%[ptr_out]], %[stride]\n" - "bgt 0b\n" /* jump to main loop */ - "1:\n" /* process remain */ - "cbz %w[rem], 2f\n" /* jump to remain */ - /* bit select */ - "bif v0.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v1.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v2.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v3.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - /* trans 16b */ - "trn1 v4.16b, v0.16b, v1.16b\n" /* get a0,b0, a2,b2, a4,b4, a6,b6, - a8,b8, a10,b10, a12,b12, a14,b14 */ - "trn2 v5.16b, v0.16b, v1.16b\n" /* get a1,b1, a3,b3, a5,b5, a7,b7, - a9,b9, a11,b11, a13,b13, a15,b15 */ - "trn1 v6.16b, v2.16b, v3.16b\n" /* get c0,d0, c2,d2, c4,d4, c6,d6, - c8,d8, c10,d10, c12,d12, c14,d14 */ - "trn2 v7.16b, v2.16b, v3.16b\n" /* get c1,d1, c3,d3, c5,d5, c7,d7, - c9,d9, c11,d11, c13,d13, c15,d15 */ - /* trans 8h */ - "trn1 v8.8h, v4.8h, v5.8h\n" /* get a0,b0, a1,b1, a4,b4, a5,b5, a8,b8, - a9,b9, a12,b12, a13,b13 */ - "trn2 v9.8h, v4.8h, v5.8h\n" /* get a2,b2, a3,b3, a6,b6, a7,b7, - a10,b10, a11,b11, a14,b14, a15,b15 */ - "trn1 v10.8h, v6.8h, v7.8h\n" /* get c0,d0, c1,d1, c4,d4, c5,d5, - c8,d8, c9,d9, c12,d12, c13,d13 */ - "trn2 v11.8h, v6.8h, v7.8h\n" /* get c2,d2, c3,d3, c6,d6, c7,d7, - c10,d10, c11,d11, c14,d14, c15,d15 */ - /* trans 4s */ - "trn1 v4.4s, v8.4s, v9.4s\n" /* get a0,b0, a1,b1, a2,b2, a3,b3, a8,b8, - a9,b9, a10,b10, a11,b11 */ - "trn2 v5.4s, v8.4s, v9.4s\n" /* get a4,b4, a5,b5, a6,b6, a7,b7, - a12,b12, a13,b13, a14,b14, a15,b15 */ - "trn1 v6.4s, v10.4s, v11.4s\n" /* get c0,d0, c1,d1, c2,d2, c3,d3, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "trn2 v7.4s, v10.4s, v11.4s\n" /* get c4,d4, c5,d5, c6,d6, c7,d7, - c12,d12, c13,d13, c14,d14, c15,d15 - */ - /* trans 2d */ - "trn1 v8.2d, v4.2d, v6.2d\n" /* get a0,b0, a1,b1, a2,b2, a3,b3, c0,d0, - c1,d1, c2,d2, c3,d3 */ - "trn2 v10.2d, v4.2d, v6.2d\n" /* get a8,b8, a9,b9, a10,b10, a11,b11, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "trn1 v9.2d, v5.2d, v7.2d\n" /* get a4,b4, a5,b5, a6,b6, a7,b7, c4,d4, - c5,d5, c6,d6, c7,d7 */ - "trn2 v11.2d, v5.2d, v7.2d\n" /* get a12,b12, a13,b13, a14,b14, - a15,b15, c12,d12, c13,d13, c14,d14, - c15,d15 */ - "st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [%[ptr_out]]\n" /* save to - memory - */ - /* end */ - "2:\n" /* end */ - : [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3), [k] "+r"(k), [ptr_out] "+r"(outptr_row_col) - : [rem] "r"(rem), [mask] "w"(vmask), [vzero] "w"(vzero), - [stride] "r"(stride_out) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "cc"); -#else // armv7 - asm volatile( - "vld1.8 {d0}, [%[ptr0]]!\n" /* load r0, a0,a1,a2,a3,a4,a5,a6,a7 */ - "vld1.8 {d1}, [%[ptr1]]!\n" /* load r1, b0,b1,b2,b3,b4,b5,b6,b7 */ - "vld1.8 {d2}, [%[ptr2]]!\n" /* load r2, c0,c1,c2,c3,c4,c5,c6,c7 */ - "vld1.8 {d3}, [%[ptr3]]!\n" /* load r3, d0,d1,d2,d3,d4,d5,d6,d7 */ - "cmp %[k], #0\n" /* check main loop count */ - "beq 1f\n" /* jump to remain */ - "0:\n" /* main loop */ - /* trans 8b */ - "vtrn.8 d0, d1\n" /* get d0: a0,b0, a2,b2, a4,b4, a6,b6; d1: 
a1,b1, - a3,b3, a5,b5, a7,b7 */ - "vtrn.8 d2, d3\n" /* get d2: c0,d0, c2,d2, c4,d4, c6,d6; d3: c1,d1, - c3,d3, c5,d5, c7,d7 */ - /* trans 4h */ - "vtrn.16 d0, d1\n" /* get d0: a0,b0, a1,b1, a4,b4, a5,b5; d1: a2,b2, - a3,b3, a6,b6, a7,b7 */ - "vtrn.16 d2, d3\n" /* get d2: c0,d0, c1,d1, c4,d4, c5,d5; d3: c2,d2, - c3,d3, c6,d6, c7,d7 */ - "subs %[k], %[k], #1\n" /* loop - 1 */ - /* trans 2s */ - "vtrn.32 d0, d1\n" /* get d0: a0,b0, a1,b1, a2,b2, a3,b3; d1: a4,b4, - a5,b5, a6,b6, a7,b7 */ - "vtrn.32 d2, d3\n" /* get d2: c0,d0, c1,d1, c2,d2, c3,d3; d3: c4,d4, - c5,d5, c6,d6, c7,d7 */ - "vst1.8 {d0-d3}, [%[ptr_out]], %[stride]\n" /* save to memory */ - "vld1.8 {d0}, [%[ptr0]]!\n" /* load r0, a0,a1,a2,a3,a4,a5,a6,a7 */ - "vld1.8 {d1}, [%[ptr1]]!\n" /* load r1, b0,b1,b2,b3,b4,b5,b6,b7 */ - "vld1.8 {d2}, [%[ptr2]]!\n" /* load r2, c0,c1,c2,c3,c4,c5,c6,c7 */ - "vld1.8 {d3}, [%[ptr3]]!\n" /* load r3, d0,d1,d2,d3,d4,d5,d6,d7 */ - "bgt 0b\n" /* jump to main loop */ - "1:\n" /* process remain */ - "cmp %[rem], #0\n" /* check remain size */ - "beq 2f\n" /* jump to end */ - /* bit select */ - "vbif d0, %e[vzero], %e[mask]\n" /* pad 0 */ - "vbif d1, %e[vzero], %e[mask]\n" /* pad 0 */ - "vbif d2, %e[vzero], %e[mask]\n" /* pad 0 */ - "vbif d3, %e[vzero], %e[mask]\n" /* pad 0 */ - /* trans 8b */ - "vtrn.8 d0, d1\n" /* get d0: a0,b0, a2,b2, a4,b4, a6,b6; d1: a1,b1, - a3,b3, a5,b5, a7,b7 */ - "vtrn.8 d2, d3\n" /* get d2: c0,d0, c2,d2, c4,d4, c6,d6; d3: c1,d1, - c3,d3, c5,d5, c7,d7 */ - /* trans 4h */ - "vtrn.16 d0, d1\n" /* get d0: a0,b0, a1,b1, a4,b4, a5,b5; d1: a2,b2, - a3,b3, a6,b6, a7,b7 */ - "vtrn.16 d2, d3\n" /* get d2: c0,d0, c1,d1, c4,d4, c5,d5; d3: c2,d2, - c3,d3, c6,d6, c7,d7 */ - /* trans 2s */ - "vtrn.32 d0, d1\n" /* get d0: a0,b0, a1,b1, a2,b2, a3,b3; d1: a4,b4, - a5,b5, a6,b6, a7,b7 */ - "vtrn.32 d2, d3\n" /* get d2: c0,d0, c1,d1, c2,d2, c3,d3; d3: c4,d4, - c5,d5, c6,d6, c7,d7 */ - "vst1.8 {d0-d3}, [%[ptr_out]]\n" /* save to memory */ - /* end */ - "2:\n" /* end */ - : [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3), [k] "+r"(k), [ptr_out] "+r"(outptr_row_col) - : [rem] "r"(rem), [mask] "w"(vmask), [vzero] "w"(vzero), - [stride] "r"(stride_out) - : "q0", "q1", "cc"); -#endif //__aarch64__ // NOLINT - } -} - -/************************************************************************/ -// prepack B according to gemm kernel -// origin B data: -// B_origin(transpose, n x k: -// k unroll 2, a0=k0,k1 -// r0: ==> a0, a1, a2, a3, a4, a5, a6, a7 -// r1: ==> b0, b1, b2, b3, b4, b5, b6, b7 -// r2: ==> c0, c1, c2, c3, c4, c5, c6, c7 -// r3: ==> d0, d1, d2, d3, d4, d5, d6, d7 -// r4: ==> e0, e1, e2, e3, e4, e5, e6, e7 -// r5: ==> f0, f1, f2, f3, f4, f5, f6, f7 -// r6: ==> g0, g1, g2, g3, g4, g5, g6, g7 -// r7: ==> h0, h1, h2, h3, h4, h5, h6, h7 -// for armv8: -// B block size: (<4x2>x4) x2, can be described as below: -// packed B: -// a0,b0, c0,d0, a1,b1, c1,d1; -// e0,f0, g0,h0, e1,f1, g1,h1;--block0, address+64 -// . -// . -// . -// a6,b6, c6,d6, a7,b7, c7,d7; -// e6,f6, g6,h6, e7,f7, g7,h7;--block3, address+64 -// for armv7: -// B block size: (<8x2>x1) x2, can be described as below: -// packed B: -// a0,b0, c0,d0, e0,f0, g0,h0; -// a1,b1, c1,d1, e1,f1, g1,h1;--block0, address+32 -// . -// . -// . 
-// a6,b6, c6,d6, e6,f6, g6,h6; -// a7,b7, c7,d7, e7,f7, g7,h7;--block3, address+32 -/*******************************************************************/ -void packb_trans_int8(int8_t* out, const int8_t* in, const int ldin, - const int k0, const int kmax, const int n0, - const int nmax, const int8_t* zerobuf) { - const int KUNROLL = 4; - const int NUNROLL = 8; - const int RATIO = NBLOCK_INT8_OTH / NUNROLL; - const int8_t* inptr = in + n0 * ldin + k0; - const uint8_t mask_buffer[16] = {0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15}; - int y_len = nmax - n0; - int x_len = kmax - k0; - int yup = ROUNDUP(y_len, NBLOCK_INT8_OTH); - const int kup = ROUNDUP(x_len, KBLOCK_INT8); - const int KSTRIDE = KBLOCK_INT8 * KUNROLL; - int kcnt = x_len / KSTRIDE; - int x_rem = (x_len & (KSTRIDE - 1)); - int k_rem = (x_rem + KBLOCK_INT8 - 1) / KBLOCK_INT8; - const int stride_inner = KBLOCK_INT8 * NUNROLL; - const int stride_outer = kup * NBLOCK_INT8_OTH; - const int ncnt = yup / NUNROLL; - - int8x16_t vzero = vdupq_n_s8(0); - uint8x16_t vmask = vcltq_u8(vld1q_u8(mask_buffer), vdupq_n_u8(x_rem)); - -#pragma omp parallel for - for (int y = 0; y < ncnt; y++) { - int idx = y * NUNROLL; - const int8_t* ptr0 = inptr + idx * ldin; - const int8_t* ptr1 = ptr0 + ldin; - const int8_t* ptr2 = ptr1 + ldin; - const int8_t* ptr3 = ptr2 + ldin; - const int8_t* ptr4 = ptr3 + ldin; - const int8_t* ptr5 = ptr4 + ldin; - const int8_t* ptr6 = ptr5 + ldin; - const int8_t* ptr7 = ptr6 + ldin; - // only for ratio = 0 or 1 - int8_t* ptr_out = - out + (y & (RATIO - 1)) * stride_inner + (y / RATIO) * stride_outer; - if (idx + NUNROLL > y_len) { - switch (idx + NUNROLL - y_len) { - case 8: - ptr0 = zerobuf; - case 7: - ptr1 = zerobuf; - case 6: - ptr2 = zerobuf; - case 5: - ptr3 = zerobuf; - case 4: - ptr4 = zerobuf; - case 3: - ptr5 = zerobuf; - case 2: - ptr6 = zerobuf; - case 1: - ptr7 = zerobuf; - default: - break; - } - } - int k = kcnt; - int rem = k_rem; -#ifdef __aarch64__ - asm volatile( - "cbz %w[k], 1f\n" /* skip main loop */ - /* main loop */ - "0:\n" /* main loop */ - "ld1 {v0.16b}, [%[ptr0]], #16\n" /* load n0, k0~k15 */ - "ld1 {v1.16b}, [%[ptr1]], #16\n" /* load n1, k0~k15 */ - "ld1 {v2.16b}, [%[ptr2]], #16\n" /* load n2, k0~k15 */ - "ld1 {v3.16b}, [%[ptr3]], #16\n" /* load n3, k0~k15 */ - "ld1 {v4.16b}, [%[ptr4]], #16\n" /* load n4, k0~k15 */ - "ld1 {v5.16b}, [%[ptr5]], #16\n" /* load n5, k0~k15 */ - "ld1 {v6.16b}, [%[ptr6]], #16\n" /* load n6, k0~k15 */ - "ld1 {v7.16b}, [%[ptr7]], #16\n" /* load n7, k0~k15 */ - /* trans, 8h */ - "trn1 v8.8h, v0.8h, v1.8h\n" /* trans, zip n0,n1 */ - "trn2 v9.8h, v0.8h, v1.8h\n" /* trans, zip n0,n1 */ - "trn1 v10.8h, v2.8h, v3.8h\n" /* trans, zip n2,n3 */ - "trn2 v11.8h, v2.8h, v3.8h\n" /* trans, zip n2,n3 */ - "trn1 v12.8h, v4.8h, v5.8h\n" /* trans, zip n4,n5 */ - "trn2 v13.8h, v4.8h, v5.8h\n" /* trans, zip n4,n5 */ - "trn1 v14.8h, v6.8h, v7.8h\n" /* trans, zip n6,n7 */ - "trn2 v15.8h, v6.8h, v7.8h\n" /* trans, zip n6,n7 */ - /* trans, 4s */ - "trn1 v16.4s, v8.4s, v10.4s\n" /* trans, block 0 */ - "trn2 v17.4s, v8.4s, v10.4s\n" /* trans, block 0 */ - "trn1 v18.4s, v9.4s, v11.4s\n" /* trans, block 0 */ - "trn2 v19.4s, v9.4s, v11.4s\n" /* trans, block 0 */ - "trn1 v20.4s, v12.4s, v14.4s\n" /* trans, block 1 */ - "trn2 v21.4s, v12.4s, v14.4s\n" /* trans, block 1 */ - "trn1 v22.4s, v13.4s, v15.4s\n" /* trans, block 1 */ - "trn2 v23.4s, v13.4s, v15.4s\n" /* trans, block 1 */ - "subs %w[k], %w[k], #1\n" /* loop count -1 */ - /* trans, 2d */ - "trn1 v8.2d, v16.2d, v18.2d\n" /* trans, 
block 0, out0 */ - "trn1 v9.2d, v20.2d, v22.2d\n" /* trans, block 1, out0 */ - "trn1 v10.2d, v17.2d, v19.2d\n" /* trans, block 0, out1 */ - "trn1 v11.2d, v21.2d, v23.2d\n" /* trans, block 1, out1 */ - "trn2 v12.2d, v16.2d, v18.2d\n" /* trans, block 0, out2 */ - "trn2 v13.2d, v20.2d, v22.2d\n" /* trans, block 1, out2 */ - "trn2 v14.2d, v17.2d, v19.2d\n" /* trans, block 0, out3 */ - "trn2 v15.2d, v21.2d, v23.2d\n" /* trans, block 1, out3 */ - /* store result */ - "stp q8, q9, [%[ptr_out]],#64\n" /* write 0 */ - "stp q10, q11, [%[ptr_out]],#64\n" /* write 1 */ - "stp q12, q13, [%[ptr_out]],#64\n" /* write 2 */ - "stp q14, q15, [%[ptr_out]],#64\n" /* write 3 */ - "bgt 0b\n" /* jump to main loop */ - /* process remain */ - "1:\n" /* process remains */ - "cbz %w[rem], 2f\n" /* no remain, jump to end */ - "ld1 {v0.16b}, [%[ptr0]]\n" /* load n0, k0~k15 */ - "ld1 {v1.16b}, [%[ptr1]]\n" /* load n1, k0~k15 */ - "ld1 {v2.16b}, [%[ptr2]]\n" /* load n2, k0~k15 */ - "ld1 {v3.16b}, [%[ptr3]]\n" /* load n3, k0~k15 */ - "ld1 {v4.16b}, [%[ptr4]]\n" /* load n4, k0~k15 */ - "ld1 {v5.16b}, [%[ptr5]]\n" /* load n5, k0~k15 */ - "ld1 {v6.16b}, [%[ptr6]]\n" /* load n6, k0~k15 */ - "ld1 {v7.16b}, [%[ptr7]]\n" /* load n7, k0~k15 */ - /* bit select */ - "bif v0.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v1.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v2.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v3.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v4.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v5.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v6.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v7.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - /* trans, 8h */ - "trn1 v8.8h, v0.8h, v1.8h\n" /* trans, zip n0,n1 */ - "trn2 v9.8h, v0.8h, v1.8h\n" /* trans, zip n0,n1 */ - "trn1 v10.8h, v2.8h, v3.8h\n" /* trans, zip n2,n3 */ - "trn2 v11.8h, v2.8h, v3.8h\n" /* trans, zip n2,n3 */ - "trn1 v12.8h, v4.8h, v5.8h\n" /* trans, zip n4,n5 */ - "trn2 v13.8h, v4.8h, v5.8h\n" /* trans, zip n4,n5 */ - "trn1 v14.8h, v6.8h, v7.8h\n" /* trans, zip n6,n7 */ - "trn2 v15.8h, v6.8h, v7.8h\n" /* trans, zip n6,n7 */ - /* trans, 4s */ - "trn1 v16.4s, v8.4s, v10.4s\n" /* trans, block 0 */ - "trn2 v17.4s, v8.4s, v10.4s\n" /* trans, block 0 */ - "trn1 v18.4s, v9.4s, v11.4s\n" /* trans, block 0 */ - "trn2 v19.4s, v9.4s, v11.4s\n" /* trans, block 0 */ - "trn1 v20.4s, v12.4s, v14.4s\n" /* trans, block 1 */ - "trn2 v21.4s, v12.4s, v14.4s\n" /* trans, block 1 */ - "trn1 v22.4s, v13.4s, v15.4s\n" /* trans, block 1 */ - "trn2 v23.4s, v13.4s, v15.4s\n" /* trans, block 1 */ - /* trans, 2d */ - "trn1 v8.2d, v16.2d, v18.2d\n" /* trans, block 0, out0 */ - "trn1 v9.2d, v20.2d, v22.2d\n" /* trans, block 1, out0 */ - "trn1 v10.2d, v17.2d, v19.2d\n" /* trans, block 0, out1 */ - "trn1 v11.2d, v21.2d, v23.2d\n" /* trans, block 1, out1 */ - "trn2 v12.2d, v16.2d, v18.2d\n" /* trans, block 0, out2 */ - "trn2 v13.2d, v20.2d, v22.2d\n" /* trans, block 1, out2 */ - "trn2 v14.2d, v17.2d, v19.2d\n" /* trans, block 0, out3 */ - "trn2 v15.2d, v21.2d, v23.2d\n" /* trans, block 1, out3 */ - /* check remain size */ - "subs %w[rem], %w[rem], #1\n" /* check remain num */ - "stp q8, q9, [%[ptr_out]],#64\n" /* write 0 */ - "beq 2f\n" /* remain = 1 */ - "subs %w[rem], %w[rem], #1\n" /* check remain num */ - "stp q10, q11, [%[ptr_out]],#64\n" /* write 1 */ - "beq 2f\n" /* remain = 2 */ - "subs %w[rem], %w[rem], #1\n" /* check remain num */ - "stp q12, q13, [%[ptr_out]],#64\n" /* write 2 */ - "beq 2f\n" /* remain = 3 */ - "stp q14, q15, 
[%[ptr_out]]\n" /* write 3 */ - /* end */ - "2:\n" /* end */ - : [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3), [ptr4] "+r"(ptr4), [ptr5] "+r"(ptr5), - [ptr6] "+r"(ptr6), [ptr7] "+r"(ptr7), [ptr_out] "+r"(ptr_out), - [k] "+r"(k), [rem] "+r"(rem) - : [mask] "w"(vmask), [vzero] "w"(vzero) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "cc"); -#else // armv7 - asm volatile( - "cmp %[k], #0\n" /* check main loop */ - "beq 1f\n" /* skip main loop */ - /* main loop */ - "0:\n" /* main loop */ - "vld1.8 {d0-d1}, [%[ptr0]]!\n" /* load n0, a0~a7 */ - "vld1.8 {d2-d3}, [%[ptr1]]!\n" /* load n1, b0~b7 */ - "vld1.8 {d4-d5}, [%[ptr2]]!\n" /* load n2, c0~c7 */ - "vld1.8 {d6-d7}, [%[ptr3]]!\n" /* load n3, d0~d7 */ - "vld1.8 {d8-d9}, [%[ptr4]]!\n" /* load n4, e0~e7 */ - "vld1.8 {d10-d11}, [%[ptr5]]!\n" /* load n5, f0~f7 */ - "vld1.8 {d12-d13}, [%[ptr6]]!\n" /* load n6, g0~g7 */ - "vld1.8 {d14-d15}, [%[ptr7]]!\n" /* load n7, h0~h7 */ - /* trans, 8h */ - "vtrn.16 q0, q1\n" /* trans, zip n0,n1, q0: a0b0,a2b2, a4b4,a6b6, q1: - a1b1,a3b3, a5b5,a7b7 */ - "vtrn.16 q2, q3\n" /* trans, zip n2,n3, q2: c0d0,c2d2, c4d4,c6d6, q3: - c1d1,c3d3, c5d5,c7d7 */ - "vtrn.16 q4, q5\n" /* trans, zip n4,n5, q4: e0f0,e2f2, e4f4,e6f6, q5: - e1f1,e3f3, e5f5,e7f7 */ - "vtrn.16 q6, q7\n" /* trans, zip n6,n7, q6: g0h0,g2h2, g4h4,g6h6, q7: - g1h1,g3h3, g5h5,g7h7 */ - /* trans, 4s */ - "vtrn.32 q0, q2\n" /* trans, q0: a0b0,c0d0, a4b4,c4d4, q2: a2b2,c2d2, - a6b6,c6d6 */ - "vtrn.32 q1, q3\n" /* trans, q1: a1b1,c1d1, a5b5,c5d5, q3: a3b3,c3d3, - a7b7,c7d7 */ - "vtrn.32 q4, q6\n" /* trans, q4: e0f0,g0h0, e4f4,g4h4, q6: e2f2,g2h2, - e6f6,g6h6 */ - "vtrn.32 q5, q7\n" /* trans, q5: e1f1,g1h1, e5f5,g5h5, q7: e3f3,g3h3, - e7f7,g7h7 */ - "subs %[k], %[k], #1\n" /* loop count -1 */ - /* trans, 2d */ - "vswp d1, d8\n" /* q0: a0b0,c0d0, e0f0,g0h0, q4: a4b4,c4d4, e4f4,g4h4 - */ - "vswp d3, d10\n" /* q1: a1b1,c1d1, e1f1,g1h1, q5: a5b5,c5d5, e5f5,g5h5 - */ - "vswp d5, d12\n" /* q2: a2b2,c2d2, e2f2,g2h2, q6: a6b6,c6d6, e6f6,g6h6 - */ - "vswp d7, d14\n" /* q3: a3b3,c3d3, e3f3,g3h3, q7: a7b7,c7d7, e7f7,g7h7 - */ - /* store result */ - "vst1.8 {d0-d3}, [%[ptr_out]]!\n" /* write 0 */ - "vst1.8 {d4-d7}, [%[ptr_out]]!\n" /* write 1 */ - "vst1.8 {d8-d11}, [%[ptr_out]]!\n" /* write 2 */ - "vst1.8 {d12-d15}, [%[ptr_out]]!\n" /* write 3 */ - "bgt 0b\n" /* jump to main loop */ - /* process remain */ - "1:\n" /* process remains */ - "cmp %[rem], #0\n" /* check remain */ - "beq 2f\n" /* no remain, jump to end */ - "vld1.8 {d0-d1}, [%[ptr0]]!\n" /* load n0, a0~a7 */ - "vld1.8 {d2-d3}, [%[ptr1]]!\n" /* load n1, b0~b7 */ - "vld1.8 {d4-d5}, [%[ptr2]]!\n" /* load n2, c0~c7 */ - "vld1.8 {d6-d7}, [%[ptr3]]!\n" /* load n3, d0~d7 */ - "vld1.8 {d8-d9}, [%[ptr4]]!\n" /* load n4, e0~e7 */ - "vld1.8 {d10-d11}, [%[ptr5]]!\n" /* load n5, f0~f7 */ - "vld1.8 {d12-d13}, [%[ptr6]]!\n" /* load n6, g0~g7 */ - "vld1.8 {d14-d15}, [%[ptr7]]!\n" /* load n7, h0~h7 */ - /* bit select */ - "vbif q0, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q1, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q2, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q3, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q4, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q5, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q6, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q7, %q[vzero], %q[mask]\n" /* pad 0 */ - /* trans, 8h */ - "vtrn.16 q0, q1\n" /* trans, zip n0,n1, q0: a0b0,a2b2, a4b4,a6b6, q1: - 
a1b1,a3b3, a5b5,a7b7 */ - "vtrn.16 q2, q3\n" /* trans, zip n2,n3, q2: c0d0,c2d2, c4d4,c6d6, q3: - c1d1,c3d3, c5d5,c7d7 */ - "vtrn.16 q4, q5\n" /* trans, zip n4,n5, q4: e0f0,e2f2, e4f4,e6f6, q5: - e1f1,e3f3, e5f5,e7f7 */ - "vtrn.16 q6, q7\n" /* trans, zip n6,n7, q6: g0h0,g2h2, g4h4,g6h6, q7: - g1h1,g3h3, g5h5,g7h7 */ - /* trans, 4s */ - "vtrn.32 q0, q2\n" /* trans, q0: a0b0,c0d0, a4b4,c4d4, q2: a2b2,c2d2, - a6b6,c6d6 */ - "vtrn.32 q1, q3\n" /* trans, q1: a1b1,c1d1, a5b5,c5d5, q3: a3b3,c3d3, - a7b7,c7d7 */ - "vtrn.32 q4, q6\n" /* trans, q4: e0f0,g0h0, e4f4,g4h4, q6: e2f2,g2h2, - e6f6,g6h6 */ - "vtrn.32 q5, q7\n" /* trans, q5: e1f1,g1h1, e5f5,g5h5, q7: e3f3,g3h3, - e7f7,g7h7 */ - /* trans, 2d */ - "vswp d1, d8\n" /* q0: a0b0,c0d0, e0f0,g0h0, q4: a4b4,c4d4, e4f4,g4h4 - */ - "vswp d3, d10\n" /* q1: a1b1,c1d1, e1f1,g1h1, q5: a5b5,c5d5, e5f5,g5h5 - */ - "vswp d5, d12\n" /* q2: a2b2,c2d2, e2f2,g2h2, q6: a6b6,c6d6, e6f6,g6h6 - */ - "vswp d7, d14\n" /* q3: a3b3,c3d3, e3f3,g3h3, q7: a7b7,c7d7, e7f7,g7h7 - */ - /* check remain size */ - "subs %[rem], %[rem], #1\n" /* check remain num */ - "vst1.8 {d0-d3}, [%[ptr_out]]!\n" /* write 0 */ - "beq 2f\n" /* remain = 1 */ - "subs %[rem], %[rem], #1\n" /* check remain num */ - "vst1.8 {d4-d7}, [%[ptr_out]]!\n" /* write 1 */ - "beq 2f\n" /* remain = 2 */ - "subs %[rem], %[rem], #1\n" /* check remain num */ - "vst1.8 {d8-d11}, [%[ptr_out]]!\n" /* write 2 */ - "beq 2f\n" /* remain = 3 */ - "vst1.8 {d12-d15}, [%[ptr_out]]!\n" /* write 3 */ - /* end */ - "2:\n" /* end */ - : [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3), [ptr4] "+r"(ptr4), [ptr5] "+r"(ptr5), - [ptr6] "+r"(ptr6), [ptr7] "+r"(ptr7), [ptr_out] "+r"(ptr_out), - [k] "+r"(k), [rem] "+r"(rem) - : [mask] "w"(vmask), [vzero] "w"(vzero) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "cc"); -#endif //__aarch64__ // NOLINT - } -} - -#if defined(__aarch64__) && defined(WITH_ARM_DOTPROD) - -template -void gemm_prepack_sdot_int8(const int8_t* A_packed, - const int8_t* B, - const int* bias, - Dtype* C, - int M, - int N, - int K, - bool is_bias, - bool is_relu, - bool is_transB, - const float* scale, - ARMContext* ctx) { - size_t llc_size = ctx->llc_size() / 4; - auto workspace = ctx->workspace_data(); - //! MBLOCK_INT8_DOT * x (result) + MBLOCK_INT8_DOT * k (A) + x * k (B) = l2 - int x_block = (llc_size - (MBLOCK_INT8_DOT * K)) / \ - (sizeof(int8_t) * (K + MBLOCK_INT8_DOT)); - x_block /= NBLOCK_INT8_DOT; - x_block *= NBLOCK_INT8_DOT; - int x_num = (N + (x_block - 1)) / x_block; - x_block = (N + x_num - 1) / x_num; - x_block = (x_block + NBLOCK_INT8_DOT - 1) / NBLOCK_INT8_DOT; - x_block *= NBLOCK_INT8_DOT; - x_block = x_block < NBLOCK_INT8_DOT ? NBLOCK_INT8_DOT : x_block; - - int kup = ROUNDUP(K, KBLOCK_INT8); - // unroll 2 loop - int tail_pre = ((kup / 4) & (KBLOCK_INT8 - 1)); - int k_pre = (((kup / 4) + KBLOCK_INT8 - 1) / KBLOCK_INT8) - 1; - - bool flag_p_remain = false; - int remain = 0; - - //! apanel is pre_compute outside gemm - for (unsigned int x0 = 0; x0 < N; x0 += x_block) { - unsigned int xmax = x0 + x_block; - if (xmax > N) { - xmax = N; - } - int bblocks = (xmax - x0 + NBLOCK_INT8_DOT - 1) / NBLOCK_INT8_DOT; - remain = xmax - x0 - (bblocks - 1) * NBLOCK_INT8_DOT; - if (remain > 0) { - flag_p_remain = true; - } - //! 
load bpanel - auto b_pannel = static_cast(workspace); - if (!is_transB) { - // K * N - packb_sdot_int8(b_pannel, B, N, 0, K, x0, xmax); - } else { - // N X K - packb_sdot_trans_int8(b_pannel, B, K, 0, K, x0, xmax); - } -#pragma omp parallel for - for (unsigned int y = 0; y < M; y += MBLOCK_INT8_DOT) { - unsigned int ymax = y + MBLOCK_INT8_DOT; - if (ymax > M) { - ymax = M; - } - - int32_t bias_local[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - if (is_bias) { - bias_local[0] = bias[y]; - bias_local[1] = bias[y + 1]; - bias_local[2] = bias[y + 2]; - bias_local[3] = bias[y + 3]; - bias_local[4] = bias[y + 4]; - bias_local[5] = bias[y + 5]; - bias_local[6] = bias[y + 6]; - bias_local[7] = bias[y + 7]; - } - float32_t scale_local[8]; - if (scale) { - scale_local[0] = scale[y]; - scale_local[1] = scale[y + 1]; - scale_local[2] = scale[y + 2]; - scale_local[3] = scale[y + 3]; - scale_local[4] = scale[y + 4]; - scale_local[5] = scale[y + 5]; - scale_local[6] = scale[y + 6]; - scale_local[7] = scale[y + 7]; - } - - Dtype cout0[NBLOCK_INT8_DOT]; - Dtype cout1[NBLOCK_INT8_DOT]; - Dtype cout2[NBLOCK_INT8_DOT]; - Dtype cout3[NBLOCK_INT8_DOT]; - Dtype cout4[NBLOCK_INT8_DOT]; - Dtype cout5[NBLOCK_INT8_DOT]; - Dtype cout6[NBLOCK_INT8_DOT]; - Dtype cout7[NBLOCK_INT8_DOT]; - - Dtype *c_ptr0 = C + y * N + x0; - Dtype *c_ptr1 = c_ptr0 + N; - Dtype *c_ptr2 = c_ptr1 + N; - Dtype *c_ptr3 = c_ptr2 + N; - Dtype *c_ptr4 = c_ptr3 + N; - Dtype *c_ptr5 = c_ptr4 + N; - Dtype *c_ptr6 = c_ptr5 + N; - Dtype *c_ptr7 = c_ptr6 + N; - - Dtype *pout0 = c_ptr0; - Dtype *pout1 = c_ptr1; - Dtype *pout2 = c_ptr2; - Dtype *pout3 = c_ptr3; - Dtype *pout4 = c_ptr4; - Dtype *pout5 = c_ptr5; - Dtype *pout6 = c_ptr6; - Dtype *pout7 = c_ptr7; - - // const int8_t *a_ptr_l = A_packed + y * K; - const int8_t *a_ptr_l = A_packed + y * kup; - const int8_t *b_ptr = b_pannel; - for (int xb = 0; xb < bblocks; xb++) { - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - case 6: - c_ptr1 = cout1; - case 5: - c_ptr2 = cout2; - case 4: - c_ptr3 = cout3; - case 3: - c_ptr4 = cout4; - case 2: - c_ptr5 = cout5; - case 1: - c_ptr6 = cout6; - case 0: - c_ptr7 = cout7; - default: - break; - } - } - if (flag_p_remain && (xb == bblocks - 1)) { - pout0 = c_ptr0; - pout1 = c_ptr1; - pout2 = c_ptr2; - pout3 = c_ptr3; - pout4 = c_ptr4; - pout5 = c_ptr5; - pout6 = c_ptr6; - pout7 = c_ptr7; - - c_ptr0 = cout0; - c_ptr1 = cout1; - c_ptr2 = cout2; - c_ptr3 = cout3; - c_ptr4 = cout4; - c_ptr5 = cout5; - c_ptr6 = cout6; - c_ptr7 = cout7; - } - const int8_t *a_ptr = a_ptr_l; - int tail = tail_pre; - int k = k_pre; - sgemm_sdot_int8_kernel(a_ptr, b_ptr, - bias_local, c_ptr0, c_ptr1, c_ptr2, c_ptr3, \ - c_ptr4, c_ptr5, c_ptr6, c_ptr7, scale_local, \ - is_relu, k, tail); - if (flag_p_remain && (xb == bblocks - 1)) { - for (int i = 0; i < remain; ++i) { - *pout0++ = cout0[i]; - *pout1++ = cout1[i]; - *pout2++ = cout2[i]; - *pout3++ = cout3[i]; - *pout4++ = cout4[i]; - *pout5++ = cout5[i]; - *pout6++ = cout6[i]; - *pout7++ = cout7[i]; - } - } - } - } - } -} - -void prepackA_m8k4_int8(int8_t* out, - const int8_t* in, - const int ldin, - const int m0, - const int mmax, - const int k0, - const int kmax) { - int x_len = (kmax - k0); - int8_t zerobuff[x_len]; //NOLINT - memset(zerobuff, 0, sizeof(int8_t) * x_len); - - int8_t *dout = out; - const int8_t *inptr = in; - int kup = ROUNDUP(x_len, KBLOCK_INT8); - int stride = kup * 8; - int remain = x_len % 4; -#pragma omp parallel for - for (int y = m0; y < mmax; y += 8) { - int8_t* outptr = dout + stride * (y - m0) / 8; - const int8_t * 
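/* Panel layout this routine emits, in scalar form (a sketch, not library
   code): A is packed in 8-row stripes (MBLOCK_INT8_DOT); inside a stripe,
   each KBLOCK_INT8 = 4 slice of k is written row by row, zero-padded past
   mmax/kmax, i.e.
     for (int kb = 0; kb < kup; kb += 4)
       for (int r = 0; r < 8; ++r)
         for (int k = kb; k < kb + 4; ++k)
           *out++ = valid(y + r, k) ? in[(y + r) * ldin + k0 + k] : 0;
   so the sdot kernel streams one 8x4 tile per 32 consecutive bytes. */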
inptr_row[8]; - inptr_row[0] = inptr + y * ldin + k0; - for (int i = 1; i < 8; i++) { - inptr_row[i] = inptr_row[i - 1] + ldin; - } - //! cope with row index exceed real size, set to zero buffer - if ((y + 7) >= mmax) { - switch ((y + 7) - mmax) { - case 6: - inptr_row[1] = zerobuff; - case 5: - inptr_row[2] = zerobuff; - case 4: - inptr_row[3] = zerobuff; - case 3: - inptr_row[4] = zerobuff; - case 2: - inptr_row[5] = zerobuff; - case 1: - inptr_row[6] = zerobuff; - case 0: - inptr_row[7] = zerobuff; - default: - break; - } - } - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n" - "prfm pldl1keep, [%[ptr0], #64] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr1], #64] \n" - "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr2], #64] \n" - "prfm pldl1keep, [%[ptr3]] \n" - "prfm pldl1keep, [%[ptr3], #64] \n" - "prfm pldl1keep, [%[ptr4]] \n" - "prfm pldl1keep, [%[ptr4], #64] \n" - "prfm pldl1keep, [%[ptr5]] \n" - "prfm pldl1keep, [%[ptr5], #64] \n" - "prfm pldl1keep, [%[ptr6]] \n" - "prfm pldl1keep, [%[ptr6], #64] \n" - "prfm pldl1keep, [%[ptr7]] \n" - "prfm pldl1keep, [%[ptr7], #64] \n" - : - :[ptr0] "r"(inptr_row[0]),[ptr1] "r"(inptr_row[1]),[ptr2] "r"(inptr_row[2]),[ptr3] "r"(inptr_row[3]),\ - [ptr4] "r"(inptr_row[4]),[ptr5] "r"(inptr_row[5]),[ptr6] "r"(inptr_row[6]),[ptr7] "r"(inptr_row[7]) - :"memory" - ); - - int x = x_len; - - for (; x > 7; x -= 8) { - asm volatile( - "ld1 {v0.8b}, [%[inptr0]], #8 \n" // v0=a0a1a2a3a4a5a6a7 - "ld1 {v1.8b}, [%[inptr1]], #8 \n" // v1=b0b1b2b3b4b5b6b7 - "ld1 {v2.8b}, [%[inptr2]], #8 \n" // v2=c0c1c2c3c4c5c6c7 - "ld1 {v3.8b}, [%[inptr3]], #8 \n" // v3=d0d1d2d3d4d5d6d7 - - "ld1 {v4.8b}, [%[inptr4]], #8 \n" // v0=e0e1a2a3a4a5a6a7 - "ld1 {v5.8b}, [%[inptr5]], #8 \n" // v1=f0f1b2b3b4b5b6b7 - "ld1 {v6.8b}, [%[inptr6]], #8 \n" // v2=g0g1c2c3c4c5c6c7 - "ld1 {v7.8b}, [%[inptr7]], #8 \n" // v3=h0h1d2d3d4d5d6d7 - - "trn1 v8.2s, v0.2s, v1.2s \n" // v0=a0a1a2a3b0b1b2b3 - "trn2 v9.2s, v0.2s, v1.2s \n" // v0=a4a5a6a7b4b5b6b7 - "trn1 v10.2s, v2.2s, v3.2s \n" // v0=c0c1c2c3d0d1d2d3 - "trn2 v11.2s, v2.2s, v3.2s \n" // v0=c4c5c6c7d4d5d6d7 - - "trn1 v12.2s, v4.2s, v5.2s \n" // v0=e0e1e2e3f0f1f2f3 - "trn2 v13.2s, v4.2s, v5.2s \n" // v0=e4e5e6e7f4f5f6f7 - "trn1 v14.2s, v6.2s, v7.2s \n" // v0=g0g1g2g3h0h1h2h3 - "trn2 v15.2s, v6.2s, v7.2s \n" // v0=g4g5g6g7h4h5h6h7 - - "st1 {v8.2s}, [%[outptr]], #8\n" - "st1 {v10.2s}, [%[outptr]], #8\n" - "st1 {v12.2s}, [%[outptr]], #8\n" - "st1 {v14.2s}, [%[outptr]], #8\n" - - "st1 {v9.2s}, [%[outptr]], #8\n" - "st1 {v11.2s}, [%[outptr]], #8\n" - "st1 {v13.2s}, [%[outptr]], #8\n" - "st1 {v15.2s}, [%[outptr]], #8\n" - - :[inptr0] "+r"(inptr_row[0]), [inptr1] "+r"(inptr_row[1]), - [inptr2] "+r"(inptr_row[2]), [inptr3] "+r"(inptr_row[3]), - [inptr4] "+r"(inptr_row[4]), [inptr5] "+r"(inptr_row[5]), - [inptr6] "+r"(inptr_row[6]), [inptr7] "+r"(inptr_row[7]), - [outptr] "+r"(outptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "cc", "memory" - ); - } - if (x >= 4) { - asm volatile( - "mov x1, #4 \n" - "ld1 {v0.8b}, [%[inptr0]], x1 \n" // v0=a0a1a2a3a4a5a6a7 - "ld1 {v1.8b}, [%[inptr1]], x1 \n" // v1=b0b1b2b3b4b5b6b7 - "ld1 {v2.8b}, [%[inptr2]], x1 \n" // v2=c0c1c2c3c4c5c6c7 - "ld1 {v3.8b}, [%[inptr3]], x1 \n" // v3=d0d1d2d3d4d5d6d7 - - "ld1 {v4.8b}, [%[inptr4]], x1 \n" // v0=e0e1a2a3a4a5a6a7 - "ld1 {v5.8b}, [%[inptr5]], x1 \n" // v1=f0f1b2b3b4b5b6b7 - "ld1 {v6.8b}, [%[inptr6]], x1 \n" // v2=g0g1c2c3c4c5c6c7 - "ld1 {v7.8b}, [%[inptr7]], x1 \n" // 
v3=h0h1d2d3d4d5d6d7 - - "trn1 v8.2s, v0.2s, v1.2s \n" // v0=a0a1a2a3b0b1b2b3
- "trn1 v10.2s, v2.2s, v3.2s \n" // v0=c0c1c2c3d0d1d2d3 - - "trn1 v12.2s, v4.2s, v5.2s \n" // v0=e0e1e2e3f0f1f2f3
- "trn1 v14.2s, v6.2s, v7.2s \n" // v0=g0g1g2g3h0h1h2h3 - - "st1 {v8.2s}, [%[outptr]], #8\n"
- "st1 {v10.2s}, [%[outptr]], #8\n" - - "st1 {v12.2s}, [%[outptr]], #8\n"
- "st1 {v14.2s}, [%[outptr]], #8\n" - - :[inptr0] "+r"(inptr_row[0]), [inptr1] "+r"(inptr_row[1]),
- [inptr2] "+r"(inptr_row[2]), [inptr3] "+r"(inptr_row[3]), - [inptr4] "+r"(inptr_row[4]), [inptr5] "+r"(inptr_row[5]),
- [inptr6] "+r"(inptr_row[6]), [inptr7] "+r"(inptr_row[7]), - [outptr] "+r"(outptr) - :
- : "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
- "v13", "v14", "v15", "v16", "cc", "memory" - ); - x -= 4; - } - if (x > 0) { - for (int i = 0; i < 8; i++) {
- for (int j = x; j > 0; j--) { - *outptr++ = *inptr_row[i]++; - } - for (int j = 0; j < 4 - remain; j++) {
- *outptr++ = 0; - } - } - } - } -} - -void prepackA_m8k4_trans_int8(int8_t* out, - const int8_t* in,
- const int ldin, - const int m0, - const int mmax, - const int k0, - const int kmax) { - int8_t *outptr = out;
- const int8_t *inptr = in + k0 * ldin + m0; - int x_len = mmax - m0; - int y_len = kmax - k0;
- int right_remain = x_len % 8; - int kup = ROUNDUP(y_len, KBLOCK_INT8); - - int stride_out = 8 * kup;
- int8_t zerobuff[x_len]; //NOLINT - memset(zerobuff, 0, sizeof(int8_t) * x_len);
- -#pragma omp parallel for - for (int y = 0; y < y_len; y += 4) { - const int8_t* inptr0 = inptr + y * ldin;
- const int8_t* inptr1 = inptr0 + ldin; - const int8_t* inptr2 = inptr1 + ldin; - const int8_t* inptr3 = inptr2 + ldin;
- - if (y + 4 > y_len) { - switch (y + 4 - y_len) { - case 3: - inptr1 = zerobuff; - case 2: - inptr2 = zerobuff;
- case 1: - inptr3 = zerobuff; - default: - break; - } - } - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n"
- "prfm pldl1keep, [%[ptr0], #64] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr1], #64] \n"
- "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr2], #64] \n" - "prfm pldl1keep, [%[ptr3]] \n"
- "prfm pldl1keep, [%[ptr3], #64] \n" - : - :[ptr0] "r"(inptr0),[ptr1] "r"(inptr1),[ptr2] "r"(inptr2),
- [ptr3] "r"(inptr3) - :"memory" - ); - - int8_t *outptr_row = outptr + y * 8; - int x = 0;
- for (; x < x_len - 7; x += 8) { - int8_t *out0 = outptr_row; - asm volatile (
- "ld1 {v0.8b}, [%[inptr0]], #8 \n" // v0 = a0a1a2a3a4a5a6a7 - "ld1 {v1.8b}, [%[inptr1]], #8 \n" // v0 = b0b1b2b3b4b5b6b7
- "ld1 {v2.8b}, [%[inptr2]], #8 \n" // v0 = c0c1c2c3c4c5c6c7 - "ld1 {v3.8b}, [%[inptr3]], #8 \n" // v0 = d0d1d2d3d4d5d6d7
- - "trn1 v4.8b, v0.8b, v1.8b \n" // v4 = a0b0a2b2a4b4a6b6 - "trn2 v5.8b, v0.8b, v1.8b \n" // v4 = a1b1a3b3a5b5a7b7
- "trn1 v6.8b, v2.8b, v3.8b \n" // v4 = c0d0c2d2a4b4a6b6 - "trn2 v7.8b, v2.8b, v3.8b \n" // v4 = c1d1c3d3a5b5a7b7
- - "trn1 v0.4h, v4.4h, v6.4h \n" // v4 = a0b0c0d0a4b4c4d4 - "trn2 v1.4h, v4.4h, v6.4h \n" // v4 = a2b2c2d2a6b6c6d6
- "trn1 v2.4h, v5.4h, v7.4h \n" // v4 = a1b1c1d1a5b5c5d5 - "trn2 v3.4h, v5.4h, v7.4h \n" // v4 = a3b3c3d3a7b7c7d7
- - "trn1 v4.2s, v0.2s, v2.2s \n" //v4 =a0b0c0d0a1b1c1d1 - "trn2 v5.2s, v0.2s, v2.2s \n" //v4 =a4b4c4d4a5b5c5d5
- "trn1 v6.2s, v1.2s, v3.2s \n" //v4 =a2b2c2d2a3b3c3d3 - "trn2 v7.2s, v1.2s, v3.2s \n" //v4 =a6b6c6d6a7b7c7d7
- - "st1 {v4.2s}, [%[outr]], #8\n" - "st1 {v6.2s}, [%[outr]], #8\n" - "st1 {v5.2s}, [%[outr]], #8\n"
- "st1 {v7.2s}, [%[outr]], #8\n" - : [inptr0] "+r"(inptr0), [inptr1] 
"+r"(inptr1), - [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3), - [outr] "+r"(out0) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", - "cc", "memory" - ); - outptr_row += stride_out; - } - if (right_remain > 0) { - int8_t *out0 = outptr_row; - for (; x < x_len; x++) { - *out0++ = *inptr0++; - *out0++ = *inptr1++; - *out0++ = *inptr2++; - *out0++ = *inptr3++; - } - for (int i = 0; i < 8 - right_remain; i++) { - *out0++ = 0; - *out0++ = 0; - *out0++ = 0; - *out0++ = 0; - } - } - } -} - -void packb_sdot_int8(int8_t* out, - const int8_t* in, - const int ldin, - const int k0, - const int kmax, - const int n0, - const int nmax) { - int y_len = kmax - k0; - int x_len = nmax - n0; - int kup = ROUNDUP(y_len, KBLOCK_INT8); // 4k - int8_t zerobuff[x_len]; //NOLINT - memset(zerobuff, 0, sizeof(int8_t) * x_len); - int8_t *outptr = out; - const int8_t *inptr = in + k0 * ldin + n0; - - int stride_out = 12 * kup; - // int stride_y = 48; - int remain = x_len % 12; - - // data B is not transposed, transpose B to k * 12 -#pragma omp parallel for - for (int y = 0; y < y_len; y += 4) { - // cope with row index exceed real size, set to zero - const int8_t *inptr0 = inptr + y * ldin; - const int8_t *inptr1 = inptr0 + ldin; - const int8_t *inptr2 = inptr1 + ldin; - const int8_t *inptr3 = inptr2 + ldin; - if (y + 4 > y_len) { - switch (y + 4 - y_len) { - case 3: - inptr1 = zerobuff; - case 2: - inptr2 = zerobuff; - case 1: - inptr3 = zerobuff; - default: - break; - } - } - asm volatile( - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr0], #64] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr1], #64] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr2], #64] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr3], #64] \n" - : - :[inptr0] "r"(inptr0), [inptr1] "r"(inptr1), - [inptr2] "r"(inptr2), [inptr3] "r"(inptr3) - :"memory" - ); - int8_t* outptr_row = outptr + y * 12; - int x = 0; - for (; x < x_len - 11; x += 12) { - int8_t *out0 = outptr_row; - asm volatile ( - "mov x1, #4 \n" - "ld1 {v0.8b}, [%[inptr0]], #8 \n" // v0 = a0a1a2a3a4a5a6a7 - "ld1 {v1.8b}, [%[inptr1]], #8 \n" // v0 = b0b1b2b3b4b5b6b7 - "ld1 {v2.8b}, [%[inptr2]], #8 \n" // v0 = c0c1c2c3c4c5c6c7 - "ld1 {v3.8b}, [%[inptr3]], #8 \n" // v0 = d0d1d2d3d4d5d6d7 - - "ld1 {v8.8b}, [%[inptr0]] \n" // v0 = a8a9a10a11 - "ld1 {v9.8b}, [%[inptr1]] \n" // v0 = b8b9b10b11 - "ld1 {v10.8b}, [%[inptr2]] \n" // v0 = c8c9c10c11 - "ld1 {v11.8b}, [%[inptr3]] \n" // v0 = d8d9d10d11 - - "trn1 v4.8b, v0.8b, v1.8b \n" // v4 = a0b0a2b2a4b4a6b6 - "trn2 v5.8b, v0.8b, v1.8b \n" // v4 = a1b1a3b3a5b5a7b7 - "trn1 v6.8b, v2.8b, v3.8b \n" // v4 = c0d0c2d2a4b4a6b6 - "trn2 v7.8b, v2.8b, v3.8b \n" // v4 = c1d1c3d3a5b5a7b7 - - "trn1 v12.8b, v8.8b, v9.8b \n" // v4 = a8b8a10b10a4b4a6b6 - "trn2 v13.8b, v8.8b, v9.8b \n" // v4 = a9b9a11b11a5b5a7b7 - "trn1 v14.8b, v10.8b, v11.8b \n" // v4 = c8d8c10d10a4b4a6b6 - "trn2 v15.8b, v10.8b, v11.8b \n" // v4 = c9d9c11d11a5b5a7b7 - - "trn1 v0.4h, v4.4h, v6.4h \n" // v4 = a0b0c0d0a4b4c4d4 - "trn2 v1.4h, v4.4h, v6.4h \n" // v4 = a2b2c2d2a6b6c6d6 - "trn1 v2.4h, v5.4h, v7.4h \n" // v4 = a1b1c1d1a5b5c5d5 - "trn2 v3.4h, v5.4h, v7.4h \n" // v4 = a3b3c3d3a7b7c7d7 - - "trn1 v8.4h, v12.4h, v14.4h \n" // v4 = a8b8c8d8 - "trn2 v9.4h, v12.4h, v14.4h \n" // v4 = a10b10c10d10 - "trn1 v10.4h, v13.4h, v15.4h \n" // v4 = a9b9c9d9 - "trn2 v11.4h, v13.4h, v15.4h \n" // v4 = a11b11c11d11 - - "trn1 v4.2s, v0.2s, v2.2s \n" //v4 =a0b0c0d0a1b1c1d1 - 
"trn2 v5.2s, v0.2s, v2.2s \n" //v4 =a4b4c4d4a5b5c5d5 - "trn1 v6.2s, v1.2s, v3.2s \n" //v4 =a2b2c2d2a3b3c3d3 - "trn2 v7.2s, v1.2s, v3.2s \n" //v4 =a6b6c6d6a7b7c7d7 - - "trn1 v0.2s, v8.2s, v10.2s \n" //v4 =a8b8c8d8a9b9c9d9 - "trn1 v1.2s, v9.2s, v11.2s \n" //v4 =a10b10c10d10a11b11c11d11 - - "st1 {v4.2s}, [%[outr]], #8\n" - "st1 {v6.2s}, [%[outr]], #8\n" - "add %[inptr0], %[inptr0], #4\n" - "add %[inptr1], %[inptr1], #4\n" - "st1 {v5.2s}, [%[outr]], #8\n" - "st1 {v7.2s}, [%[outr]], #8\n" - "add %[inptr2], %[inptr2], #4\n" - "add %[inptr3], %[inptr3], #4\n" - "st1 {v0.2s}, [%[outr]], #8\n" - "st1 {v1.2s}, [%[outr]], #8\n" - : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), - [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3), - [outr] "+r"(out0) - : - : "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", - "v16", "cc", "memory" - ); - outptr_row += stride_out; - } - int8_t* out0 = outptr_row; // outptr + stride_out + y * remain; - for (; x < x_len; x++) { - *out0++ = *inptr0++; - *out0++ = *inptr1++; - *out0++ = *inptr2++; - *out0++ = *inptr3++; - } - for (int i = 0; i < 12 - remain; i++) { - *out0++ = 0; - *out0++ = 0; - *out0++ = 0; - *out0++ = 0; - } - } -} - -void packb_sdot_trans_int8(int8_t* out, - const int8_t* in, - const int ldin, - const int k0, - const int kmax, - const int n0, - const int nmax) { - int8_t *outptr = out; - const int8_t *inptr = in + n0 * ldin + k0; - int y_len = nmax - n0; - int x_len = kmax - k0; - - int kup = ROUNDUP(x_len, KBLOCK_INT8); // 4 - - int8_t zerobuff[kup]; //NOLINT - memset(zerobuff, 0, sizeof(int8_t) * kup); - - int stride_y = 48; - int stride_out = kup; - - int remain = x_len % 8; - -#pragma omp parallel for - for (int y = 0; y < y_len; y += 12) { - const int8_t *inptr_row[12]; - inptr_row[0] = inptr + y * ldin; - for (int i = 1; i < 12; i++) { - inptr_row[i] = inptr_row[i - 1] + ldin; - } - if (y + 12 > y_len) { - for (int i = y + 12 - y_len; i > 0; i--) { - // inptr_row[12 - i] = zero_ptr[12 - i - 1]; - inptr_row[12 - i] = zerobuff; - } - } - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr3]] \n" - "prfm pldl1keep, [%[ptr4]] \n" - "prfm pldl1keep, [%[ptr5]] \n" - "prfm pldl1keep, [%[ptr6]] \n" - "prfm pldl1keep, [%[ptr7]] \n" - "prfm pldl1keep, [%[ptr8]] \n" - "prfm pldl1keep, [%[ptr9]] \n" - "prfm pldl1keep, [%[ptr10]] \n" - "prfm pldl1keep, [%[ptr11]] \n" - : - :[ptr0] "r"(inptr_row[0]), [ptr1] "r"(inptr_row[1]), - [ptr2] "r"(inptr_row[2]), [ptr3] "r"(inptr_row[3]), - [ptr4] "r"(inptr_row[4]), [ptr5] "r"(inptr_row[5]), - [ptr6] "r"(inptr_row[6]), [ptr7] "r"(inptr_row[7]), - [ptr8] "r"(inptr_row[8]), [ptr9] "r"(inptr_row[9]), - [ptr10] "r"(inptr_row[10]), [ptr11] "r"(inptr_row[11]) - :"memory" - ); - int right_remain = remain; - int8_t *outptr_row = outptr + y * stride_out; - for (int x = 0; x < x_len - 7; x += 8) { - int8_t *out0 = outptr_row; - int8_t *out1 = out0 + stride_y; - asm volatile( - "ld1 {v0.8b}, [%[inptr0]], #8 \n" // q0=A0A1A2A3A4A5A6A7 - "ld1 {v1.8b}, [%[inptr1]], #8 \n" // q0=B0b1b2b3A4A5A6A7 - "ld1 {v2.8b}, [%[inptr2]], #8 \n" // q0=c0c1c2c3A4A5A6A7 - "ld1 {v3.8b}, [%[inptr3]], #8 \n" // q0=d0d1d2d3A4A5A6A7 - - "ld1 {v4.8b}, [%[inptr4]], #8 \n" // q0=A0A1A2A3A4A5A6A7 - "ld1 {v5.8b}, [%[inptr5]], #8 \n" // q0=B0b1b2b3A4A5A6A7 - "ld1 {v6.8b}, [%[inptr6]], #8 \n" // q0=c0c1c2c3A4A5A6A7 - "ld1 {v7.8b}, [%[inptr7]], #8 \n" // q0=d0d1d2d3A4A5A6A7 - - "trn1 v8.2s, v0.2s, v1.2s \n" 
//v0=a0a1a2a3'b0b1b2b3 -00 01 - "trn2 v12.2s, v0.2s, v1.2s \n" //v0=a4a5a6a7'b4b5b6b7 - 10 11 - "trn1 v9.2s, v2.2s, v3.2s \n" //v0=c0c1a2a3'd0b1b2b3 -02 03 - "trn2 v13.2s, v2.2s, v3.2s \n" //v0=c4a5a6a7'c4b5b6b7 - 12 13 - - "ld1 {v0.8b}, [%[inptr8]], #8 \n" // q0=A0A1A2A3A4A5A6A7 - "ld1 {v1.8b}, [%[inptr9]], #8 \n" // q0=B0b1b2b3A4A5A6A7 - "ld1 {v2.8b}, [%[inptr10]], #8 \n" // q0=c0c1c2c3A4A5A6A7 - "ld1 {v3.8b}, [%[inptr11]], #8 \n" // q0=d0d1d2d3A4A5A6A7 - - "st1 {v8.8b}, [%[outptr_row0]], #8 \n" - "st1 {v12.8b}, [%[outptr_row1]], #8 \n" - "st1 {v9.8b}, [%[outptr_row0]], #8 \n" - "st1 {v13.8b}, [%[outptr_row1]], #8 \n" - - "trn1 v10.2s, v4.2s, v5.2s \n" //v0=a0b0a0b0'a4b4a4b4 -04 05 - "trn2 v14.2s, v4.2s, v5.2s \n" //v0=a2b2a2b2'a6b6a6b6 -14 15 - "trn1 v11.2s, v6.2s, v7.2s \n" //v0=a0b0a0b0'a4b4a4b4 -06 07 - "trn2 v15.2s, v6.2s, v7.2s \n" //v0=a2b2a2b2'a6b6a6b6 -16 17 - - "trn1 v4.2s, v0.2s, v1.2s \n" //v0=a0b0a0b0'a4b4a4b4 -08 09 - "trn2 v5.2s, v0.2s, v1.2s \n" //v0=a2b2a2b2'a6b6a6b6 -18 19 - "trn1 v6.2s, v2.2s, v3.2s \n" //v0=a0b0a0b0'a4b4a4b4 -010 011 - "trn2 v7.2s, v2.2s, v3.2s \n" //v0=a2b2a2b2'a6b6a6b6 -110 111 - - "st1 {v10.8b}, [%[outptr_row0]], #8 \n" - "st1 {v14.8b}, [%[outptr_row1]], #8 \n" - "st1 {v11.8b}, [%[outptr_row0]], #8 \n" - "st1 {v15.8b}, [%[outptr_row1]], #8 \n" - - "st1 {v4.8b}, [%[outptr_row0]], #8 \n" - "st1 {v5.8b}, [%[outptr_row1]], #8 \n" - "st1 {v6.8b}, [%[outptr_row0]], #8 \n" - "st1 {v7.8b}, [%[outptr_row1]], #8 \n" - : [inptr0] "+r"(inptr_row[0]), [inptr1] "+r"(inptr_row[1]), - [inptr2] "+r"(inptr_row[2]), [inptr3] "+r"(inptr_row[3]), - [inptr4] "+r"(inptr_row[4]), [inptr5] "+r"(inptr_row[5]), - [inptr6] "+r"(inptr_row[6]), [inptr7] "+r"(inptr_row[7]), - [inptr8] "+r"(inptr_row[8]), [inptr9] "+r"(inptr_row[9]), - [inptr10] "+r"(inptr_row[10]), [inptr11] "+r"(inptr_row[11]), - [outptr_row0] "+r"(out0), [outptr_row1] "+r"(out1) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", - "v10", "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory" - ); - outptr_row += 96; - } - int8_t *out0 = outptr_row; - if (right_remain >= 4) { - asm volatile( - "mov x1, #4 \n" - "ld1 {v0.8b}, [%[inptr0]], x1 \n" // q0=A0A1A2A3A4A5A6A7 - "ld1 {v1.8b}, [%[inptr1]], x1 \n" // q0=B0b1b2b3A4A5A6A7 - "ld1 {v2.8b}, [%[inptr2]], x1 \n" // q0=c0c1c2c3A4A5A6A7 - "ld1 {v3.8b}, [%[inptr3]], x1 \n" // q0=d0d1d2d3A4A5A6A7 - - "ld1 {v4.8b}, [%[inptr4]], x1 \n" // q0=A0A1A2A3A4A5A6A7 - "ld1 {v5.8b}, [%[inptr5]], x1 \n" // q0=B0b1b2b3A4A5A6A7 - "ld1 {v6.8b}, [%[inptr6]], x1 \n" // q0=c0c1c2c3A4A5A6A7 - "ld1 {v7.8b}, [%[inptr7]], x1 \n" // q0=d0d1d2d3A4A5A6A7 - - "trn1 v8.2s, v0.2s, v1.2s \n" //v0=a0a1a2a3'b0b1b2b3 -00 01 - "trn1 v9.2s, v2.2s, v3.2s \n" //v0=c0c1a2a3'd0b1b2b3 -02 03 - - "ld1 {v12.8b}, [%[inptr8]], x1 \n" // q0=A0A1A2A3A4A5A6A7 - "ld1 {v13.8b}, [%[inptr9]], x1 \n" // q0=B0b1b2b3A4A5A6A7 - "ld1 {v14.8b}, [%[inptr10]], x1 \n" // q0=c0c1c2c3A4A5A6A7 - "ld1 {v15.8b}, [%[inptr11]], x1 \n" // q0=d0d1d2d3A4A5A6A7 - - "trn1 v10.2s, v4.2s, v5.2s \n" //v0=a0b0a0b0'a4b4a4b4 -04 05 - "trn1 v11.2s, v6.2s, v7.2s \n" //v0=a0b0a0b0'a4b4a4b4 -06 07 - - "trn1 v4.2s, v12.2s, v13.2s \n" //v0=a0b0a0b0'a4b4a4b4 -08 09 - "trn1 v6.2s, v14.2s, v15.2s \n" //v0=a0b0a0b0'a4b4a4b4 -010 011 - - "st1 {v8.8b}, [%[outptr_row0]], #8 \n" - "st1 {v9.8b}, [%[outptr_row0]], #8 \n" - "st1 {v10.8b}, [%[outptr_row0]], #8 \n" - "st1 {v11.8b}, [%[outptr_row0]], #8 \n" - "st1 {v4.8b}, [%[outptr_row0]], #8 \n" - "st1 {v6.8b}, [%[outptr_row0]], #8 \n" - : [inptr0] "+r"(inptr_row[0]), [inptr1] 
"+r"(inptr_row[1]), - [inptr2] "+r"(inptr_row[2]), [inptr3] "+r"(inptr_row[3]), - [inptr4] "+r"(inptr_row[4]), [inptr5] "+r"(inptr_row[5]), - [inptr6] "+r"(inptr_row[6]), [inptr7] "+r"(inptr_row[7]), - [inptr8] "+r"(inptr_row[8]), [inptr9] "+r"(inptr_row[9]), - [inptr10] "+r"(inptr_row[10]), [inptr11] "+r"(inptr_row[11]), \ - [outptr_row0] "+r"(out0) - : - : "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory" - ); - right_remain -= 4; - } - if (right_remain > 0) { - for (int i = 0; i < 12; i++) { - for (int x = 0; x < right_remain; x++) { - *out0++ = *inptr_row[i]++; - } - for (int x = 0; x < 4 - right_remain; x++) { - *out0++ = 0; - } - } - } - } -} -#endif //dotprod //NOLINT - -template <> -void gemm_prepack_int8(const int8_t* A_packed, - const int8_t* B, - const int* bias, - float32_t* C, - int M, - int N, - int K, - bool is_bias, - bool is_relu, - bool is_transB, - const float* scale, - ARMContext* ctx) { -#if defined(__aarch64__) && defined(WITH_ARM_DOTPROD) - if (ctx->has_dot()) { - gemm_prepack_sdot_int8(A_packed, - B, bias, C, M, N, K, is_bias, is_relu, - is_transB, scale, ctx); - } else { - gemm_prepack_oth_int8(A_packed, B, - bias, C, M, N, K, is_bias, is_relu, - is_transB, scale, ctx); - } -#else - gemm_prepack_oth_int8(A_packed, B, - bias, C, M, N, K, is_bias, is_relu, - is_transB, scale, ctx); -#endif -} - -template <> -void gemm_prepack_int8(const int8_t* A_packed, - const int8_t* B, - const int* bias, - int8_t* C, - int M, - int N, - int K, - bool is_bias, - bool is_relu, - bool is_transB, - const float* scale, - ARMContext* ctx) { -#if defined(__aarch64__) && defined(WITH_ARM_DOTPROD) - if (ctx->has_dot()) { - gemm_prepack_sdot_int8(A_packed, B, bias, - C, M, N, K, is_bias, is_relu, - is_transB, scale, ctx); - } else { - gemm_prepack_oth_int8(A_packed, B, bias, - C, M, N, K, is_bias, is_relu, - is_transB, scale, ctx); - } -#else - gemm_prepack_oth_int8(A_packed, B, bias, - C, M, N, K, is_bias, is_relu, - is_transB, scale, ctx); -#endif -} - -template <> -void gemm_prepack_int8(const int8_t* A_packed, - const int8_t* B, - const int* bias, - int32_t* C, - int M, - int N, - int K, - bool is_bias, - bool is_relu, - bool is_transB, - const float* scale, - ARMContext* ctx) { -#if defined(__aarch64__) && defined(WITH_ARM_DOTPROD) - if (ctx->has_dot()) { - gemm_prepack_sdot_int8(A_packed, B, - bias, C, M, N, K, is_bias, is_relu, - is_transB, scale, ctx); - } else { - gemm_prepack_oth_int8(A_packed, B, - bias, C, M, N, K, is_bias, is_relu, - is_transB, scale, ctx); - } -#else - gemm_prepack_oth_int8(A_packed, B, bias, - C, M, N, K, is_bias, is_relu, is_transB, scale, ctx); -#endif -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/gemm_prepacked_int8.h b/lite/backends/arm/math/gemm_prepacked_int8.h deleted file mode 100644 index 7f54eea398..0000000000 --- a/lite/backends/arm/math/gemm_prepacked_int8.h +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// 
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License. - -#pragma once -#include <cmath> -#include "lite/core/context.h"
-#include "lite/core/device_info.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite {
-namespace arm { -namespace math { - -const int KBLOCK_INT8 = 4; -#ifdef __aarch64__ -// for int8 gemm
-// const int HBLOCK = 4; -// const int NBLOCK = 16; -const int MBLOCK_INT8_OTH = 4; -const int NBLOCK_INT8_OTH = 16;
- -const int MBLOCK_INT8_DOT = 8; -const int NBLOCK_INT8_DOT = 12;
- -inline int get_hblock_int8(const ARMContext* ctx) { -#ifdef WITH_ARM_DOTPROD - if (ctx->has_dot()) {
- return MBLOCK_INT8_DOT; - } else { - return MBLOCK_INT8_OTH; - } -#else - return MBLOCK_INT8_OTH; -#endif -}
-#else -// const int HBLOCK = 4; -// const int WBLOCK = 8; -const int MBLOCK_INT8_OTH = 4; -const int NBLOCK_INT8_OTH = 8;
- -inline int get_hblock_int8(const ARMContext* ctx) { return 4; } -#endif // __aarch64__
- -void prepackA_int8(void* out, - const void* in, - int ldin, - int m0, - int mmax, - int k0, - int kmax,
- bool is_trans, - ARMContext* ctx); - -void prepackA_int8(TensorLite* tout, - const TensorLite& tin,
- int m, - int k, - int group, - bool is_trans, - ARMContext* ctx);
- -template <typename dtype> -void gemm_prepack_int8(const int8_t* A_packed, - const int8_t* B, - const int* bias,
- dtype* C, - int M, - int N, - int K, - bool is_bias, - bool is_relu, - bool is_transB, - const float* scale,
- ARMContext* ctx); - -#define ROUNDUP(a, b) ((((a) + (b)-1) / (b)) * (b))
- -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle
diff --git a/lite/backends/arm/math/gemv_arm_int8.cc b/lite/backends/arm/math/gemv_arm_int8.cc deleted file mode 100644 index dff3024ba4..0000000000 --- a/lite/backends/arm/math/gemv_arm_int8.cc +++ /dev/null @@ -1,480 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// 
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License. 
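// Scalar reference for the int8 gemv kernels in this file (a sketch under
// the same conventions: row-major A, per-row scale; gemv_int8_ref is a
// hypothetical name). The NEON code below computes the same dot products by
// widening int8 x int8 to int16 (smull/smlal2) and pair-accumulating into
// int32 lanes (sadalp) before reducing.
#include <algorithm>
#include <cmath>
#include <cstdint>
static void gemv_int8_ref(const int8_t* A, const int8_t* x, int8_t* y, int M,
                          int N, const float* scale, const int* bias,
                          bool is_relu) {
  for (int m = 0; m < M; ++m) {
    int32_t acc = bias ? bias[m] : 0;  // bias folded into the accumulator
    for (int n = 0; n < N; ++n) {
      acc += static_cast<int32_t>(A[m * N + n]) * x[n];
    }
    if (is_relu && acc < 0) acc = 0;   // ReLU on the int32 accumulator
    float f = roundf(acc * scale[m]);  // rescale, round to nearest
    y[m] = static_cast<int8_t>(std::max(-128.f, std::min(127.f, f)));
  }
}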
- -#include "lite/backends/arm/math/gemv_arm_int8.h" -#include -#include "lite/backends/arm/math/saturate.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -inline void write_gemv_out(const int* in, dtype* out, const float* scale); - -template <> -inline void write_gemv_out(const int* in, int* out, const float* scale) { - out[0] = in[0]; -} - -template <> -inline void write_gemv_out(const int* in, float* out, const float* scale) { - out[0] = in[0] * scale[0]; -} - -template <> -inline void write_gemv_out(const int* in, - signed char* out, - const float* scale) { - out[0] = saturate_cast(roundf(in[0] * scale[0])); -} - -template -bool gemv_int8(const int8_t* A, - const int8_t* x, - dtype* y, - bool transA, - int M, - int N, - const float* scale, - bool is_bias, - const int* bias, - bool is_relu) { - if (transA) { - LOG(ERROR) << "ERROR: sgemv, transA is not supported now"; - return false; - } - dtype* data_out = y; - const int8_t* data_in = x; - const int8_t* weights_ptr = A; - int cnt = N >> 4; - int tail = N & 15; - int flag_bias = is_bias ? 1 : 0; - -#ifdef __aarch64__ - int out_cnt = M >> 3; -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 8; - dtype* out_ptr = data_out + out_idx; - const float* scale_ptr = scale + out_idx; - int ptr_out[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - const int8_t* ptr_in = data_in; - const int8_t* ptr_w0 = weights_ptr + (N * out_idx); - const int8_t* ptr_w1 = ptr_w0 + N; - const int8_t* ptr_w2 = ptr_w1 + N; - const int8_t* ptr_w3 = ptr_w2 + N; - const int8_t* ptr_w4 = ptr_w3 + N; - const int8_t* ptr_w5 = ptr_w4 + N; - const int8_t* ptr_w6 = ptr_w5 + N; - const int8_t* ptr_w7 = ptr_w6 + N; - const int* bias_ptr = is_bias ? (bias + out_idx) : nullptr; - int cnt_loop = cnt; - asm volatile( - "prfm pldl1keep, [%[in]] \n" /* preload din */ - "prfm pldl1keep, [%[w0]] \n" /* preload w0 */ - "prfm pldl1keep, [%[w1]] \n" /* preload w1 */ - "prfm pldl1keep, [%[w2]] \n" /* preload w2 */ - "prfm pldl1keep, [%[w3]] \n" /* preload w3 */ - "prfm pldl1keep, [%[w4]] \n" /* preload w4 */ - "prfm pldl1keep, [%[w5]] \n" /* preload w5 */ - "prfm pldl1keep, [%[w6]] \n" /* preload w6 */ - "prfm pldl1keep, [%[w7]] \n" /* preload w7 */ - "movi v0.4s, #0 \n" /* set out0 to 0 */ - "movi v1.4s, #0 \n" /* set out1 to 0 */ - "movi v2.4s, #0 \n" /* set out2 to 0 */ - "movi v3.4s, #0 \n" /* set out3 to 0 */ - "movi v4.4s, #0 \n" /* set out4 to 0 */ - "movi v5.4s, #0 \n" /* set out5 to 0 */ - "movi v6.4s, #0 \n" /* set out6 to 0 */ - "movi v7.4s, #0 \n" /* set out7 to 0 */ - /* check main loop */ - "cmp %w[cnt], #1 \n" /* check whether has main loop */ - "blt 2f \n" /* jump to tail */ - /* main loop */ - "1: \n" /* main loop */ - "ldr q8, [%[in]], #16 \n" /* load input, 16 int8 */ - "ldr q9, [%[w0]], #16 \n" /* load w0, 16 int8 */ - "ldr q10, [%[w1]], #16 \n" /* load w1, 16 int8 */ - "ldr q11, [%[w2]], #16 \n" /* load w2, 16 int8 */ - "ldr q12, [%[w3]], #16 \n" /* load w3, 16 int8 */ - "ldr q13, [%[w4]], #16 \n" /* load w4, 16 int8 */ - "ldr q14, [%[w5]], #16 \n" /* load w5, 16 int8 */ - "ldr q15, [%[w6]], #16 \n" /* load w6, 16 int8 */ - "ldr q16, [%[w7]], #16 \n" /* load w7, 16 int8 */ - /* mul, lower 8 int8 * int8 = int16 */ - "smull v18.8h, v8.8b, v9.8b \n" /* mul in * w0, low, 8 int8 */ - "smull v19.8h, v8.8b, v10.8b\n" /* mul in * w1, low, 8 int8 */ - "smull v20.8h, v8.8b, v11.8b\n" /* mul in * w2, low, 8 int8 */ - "smull v21.8h, v8.8b, v12.8b\n" /* mul in * w3, low, 8 int8 */ - "smull v22.8h, v8.8b, v13.8b\n" /* mul in 
* w4, low, 8 int8 */ - "smull v23.8h, v8.8b, v14.8b\n" /* mul in * w5, low, 8 int8 */ - "smull v24.8h, v8.8b, v15.8b\n" /* mul in * w6, low, 8 int8 */ - "smull v25.8h, v8.8b, v16.8b\n" /* mul in * w7, low, 8 int8 */ - /* mul, higher 8 int8 * int8 + int16 = int16 */ - "smlal2 v18.8h,v8.16b,v9.16b \n" /* mul in * w0, high, 8 int8 */ - "smlal2 v19.8h,v8.16b,v10.16b\n" /* mul in * w1, high, 8 int8 */ - "smlal2 v20.8h,v8.16b,v11.16b\n" /* mul in * w2, high, 8 int8 */ - "smlal2 v21.8h,v8.16b,v12.16b\n" /* mul in * w2, high, 8 int8 */ - "smlal2 v22.8h,v8.16b,v13.16b\n" /* mul in * w2, high, 8 int8 */ - "smlal2 v23.8h,v8.16b,v14.16b\n" /* mul in * w2, high, 8 int8 */ - "smlal2 v24.8h,v8.16b,v15.16b\n" /* mul in * w2, high, 8 int8 */ - "smlal2 v25.8h,v8.16b,v16.16b\n" /* mul in * w2, high, 8 int8 */ - "subs %w[cnt], %w[cnt], #1 \n" /* sub main loop count */ - /* add int16 to int32 */ - "sadalp v0.4s, v18.8h \n" /* pair acc, 8 int16 -> 4 int32 */ - "sadalp v1.4s, v19.8h \n" /* pair acc, 8 int16 -> 4 int32 */ - "sadalp v2.4s, v20.8h \n" /* pair acc, 8 int16 -> 4 int32 */ - "sadalp v3.4s, v21.8h \n" /* pair acc, 8 int16 -> 4 int32 */ - "sadalp v4.4s, v22.8h \n" /* pair acc, 8 int16 -> 4 int32 */ - "sadalp v5.4s, v23.8h \n" /* pair acc, 8 int16 -> 4 int32 */ - "sadalp v6.4s, v24.8h \n" /* pair acc, 8 int16 -> 4 int32 */ - "sadalp v7.4s, v25.8h \n" /* pair acc, 8 int16 -> 4 int32 */ - "bne 1b \n" /* jump to main loop */ - /* pair add to final result */ - "2: \n" /* reduce to scale */ - "addp v8.4s , v0.4s , v1.4s \n" /* pair add to 4 int32*/ - "addp v9.4s , v2.4s , v3.4s \n" /* pair add to 4 int32*/ - "addp v10.4s, v4.4s , v5.4s \n" /* pair add to 4 int32*/ - "addp v11.4s, v6.4s , v7.4s \n" /* pair add to 4 int32*/ - - "addp v12.4s, v8.4s , v9.4s \n" /* pair add to 4 int32*/ - "addp v13.4s, v10.4s, v11.4s \n" /* pair add to 4 int32*/ - - "cmp %w[bias], #1 \n" /* check whether has bias */ - "blt 0f \n" /* jump to tail */ - "ldp q8, q9, [%[bias_ptr]]\n" /* load bias to q8, q9*/ - "add v12.4s, v12.4s, v8.4s \n" /* add bias */ - "add v13.4s, v13.4s, v9.4s \n" /* add bias */ - "0: \n" /* end of add bias */ - - /* write to output */ - "stp q12, q13, [%[out]] \n" /* save result */ - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [w4] "+r"(ptr_w4), - [w5] "+r"(ptr_w5), - [w6] "+r"(ptr_w6), - [w7] "+r"(ptr_w7), - [cnt] "+r"(cnt_loop) - : [out] "r"(ptr_out), [bias_ptr] "r"(bias_ptr), [bias] "r"(flag_bias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - for (int i = 0; i < tail; ++i) { - ptr_out[0] += ptr_in[i] * ptr_w0[i]; - ptr_out[1] += ptr_in[i] * ptr_w1[i]; - ptr_out[2] += ptr_in[i] * ptr_w2[i]; - ptr_out[3] += ptr_in[i] * ptr_w3[i]; - ptr_out[4] += ptr_in[i] * ptr_w4[i]; - ptr_out[5] += ptr_in[i] * ptr_w5[i]; - ptr_out[6] += ptr_in[i] * ptr_w6[i]; - ptr_out[7] += ptr_in[i] * ptr_w7[i]; - } - if (is_relu) { - ptr_out[0] = ptr_out[0] > 0 ? ptr_out[0] : 0; - ptr_out[1] = ptr_out[1] > 0 ? ptr_out[1] : 0; - ptr_out[2] = ptr_out[2] > 0 ? ptr_out[2] : 0; - ptr_out[3] = ptr_out[3] > 0 ? ptr_out[3] : 0; - ptr_out[4] = ptr_out[4] > 0 ? ptr_out[4] : 0; - ptr_out[5] = ptr_out[5] > 0 ? ptr_out[5] : 0; - ptr_out[6] = ptr_out[6] > 0 ? ptr_out[6] : 0; - ptr_out[7] = ptr_out[7] > 0 ? 
ptr_out[7] : 0; - } - - write_gemv_out(ptr_out, out_ptr, scale_ptr); - write_gemv_out(ptr_out + 1, out_ptr + 1, scale_ptr + 1); - write_gemv_out(ptr_out + 2, out_ptr + 2, scale_ptr + 2); - write_gemv_out(ptr_out + 3, out_ptr + 3, scale_ptr + 3); - write_gemv_out(ptr_out + 4, out_ptr + 4, scale_ptr + 4); - write_gemv_out(ptr_out + 5, out_ptr + 5, scale_ptr + 5); - write_gemv_out(ptr_out + 6, out_ptr + 6, scale_ptr + 6); - write_gemv_out(ptr_out + 7, out_ptr + 7, scale_ptr + 7); - } - -//! deal with remains -#pragma omp parallel for - for (int j = out_cnt * 8; j < M; j++) { - // int *ptr_out = data_out + j; - dtype* out_ptr = data_out + j; - const float* scale_ptr = scale + j; - int ptr_out[1] = {0}; - const int8_t* ptr_in = data_in; - const int8_t* ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int bias0 = is_bias ? bias[j] : 0; - asm volatile( - "prfm pldl1keep, [%[in]] \n" /* preload din */ - "prfm pldl1keep, [%[w0]] \n" /* preload w0 */ - "movi v0.4s, #0 \n" /* set out0 to 0 */ - "fmov s0, %w[bias0] \n" /* set bias */ - /* check main loop */ - "cmp %w[cnt], #1 \n" /* check whether has main loop */ - "blt 2f \n" /* jump to tail */ - /* main loop */ - "1: \n" /* main loop */ - "ldr q8, [%[in]], #16 \n" /* load input, 16 int8 */ - "ldr q9, [%[w0]], #16 \n" /* load w0, 16 int8 */ - /* mul, lower 8 int8 * int8 = int16 */ - "smull v18.8h, v8.8b, v9.8b \n" /* mul in * w0, low, 8 int8 */ - "subs %w[cnt], %w[cnt], #1 \n" /* sub main loop count */ - /* mul, higher 8 int8 * int8 + int16 = int16 */ - "smlal2 v18.8h,v8.16b,v9.16b \n" /* mul in * w0, high, 8 int8 */ - /* add int16 to int32 */ - "sadalp v0.4s, v18.8h \n" /* pair acc, 8 int16 -> 4 int32 */ - "bne 1b \n" /* jump to main loop */ - /* pair add to final result */ - "2: \n" /* reduce to scale */ - "addv s8, v0.4s \n" /* reduction to out0 */ - /* write to output */ - "str s8, [%[out]] \n" /* save result */ - : [in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [cnt] "+r"(cnt_loop) - : [out] "r"(ptr_out), [bias0] "r"(bias0) - : "cc", "memory", "v0", "v8", "v9", "v18"); - for (int i = 0; i < tail; ++i) { - ptr_out[0] += ptr_in[i] * ptr_w0[i]; - } - if (is_relu) { - ptr_out[0] = ptr_out[0] > 0 ? ptr_out[0] : 0; - } - write_gemv_out(ptr_out, out_ptr, scale_ptr); - } -#else //__aarch64__ // NOLINT - int out_cnt = M >> 2; -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 4; - dtype* out_ptr = data_out + out_idx; - const float* scale_ptr = scale + out_idx; - int ptr_out[4] = {0, 0, 0, 0}; - const int8_t* ptr_in = data_in; - const int8_t* ptr_w0 = weights_ptr + (N * out_idx); - const int8_t* ptr_w1 = ptr_w0 + N; - const int8_t* ptr_w2 = ptr_w1 + N; - const int8_t* ptr_w3 = ptr_w2 + N; - int cnt_loop = cnt; - int bias0 = is_bias ? bias[out_idx] : 0; - int bias1 = is_bias ? bias[out_idx + 1] : 0; - int bias2 = is_bias ? bias[out_idx + 2] : 0; - int bias3 = is_bias ? 
bias[out_idx + 3] : 0; - asm volatile( - "pld [%[in]] @ preload cache line, input\n" - "pld [%[w0]] @ preload cache line, weights r0\n" - "pld [%[w1]] @ preload cache line, weights r1\n" - "pld [%[w2]] @ preload cache line, weights r2\n" - "pld [%[w3]] @ preload cache line, weights r3\n" - "vmov.u32 q0, #0 @ set q0 to 0\n" - "vmov.u32 q1, #0 @ set q1 to 0\n" - "vmov.u32 q2, #0 @ set q2 to 0\n" - "vmov.u32 q3, #0 @ set q3 to 0\n" - "vmov s0, %[bias0] @ set q0 to bias0\n" - "vmov s4, %[bias1] @ set q1 to bias1\n" - "vmov s8, %[bias2] @ set q2 to bias2\n" - "vmov s12,%[bias3] @ set q3 to bias3\n" - // "vld1.32 {d20-d21}, %[bias] @ load bias data" - "cmp %[cnt], #1 @ check whether has main loop\n" - "blt 2f @ jump to pair add\n" - /* main loop */ - "1: @ main loop\n" - "vld1.8 {d8-d9}, [%[in]]! @ load input, q4\n" - "vld1.8 {d12-d13}, [%[w0]]! @ load weights r0, q6\n" - "vld1.8 {d14-d15}, [%[w1]]! @ load weights r1, q7\n" - "vld1.8 {d16-d17}, [%[w2]]! @ load weights r2, q8\n" - "vld1.8 {d18-d19}, [%[w3]]! @ load weights r3, q9\n" - /* mul, int8 * int8 = int16 */ - "vmull.s8 q12, d8, d12 @ mul add\n" - "vmull.s8 q13, d8, d14 @ mul add\n" - "vmull.s8 q14, d8, d16 @ mul add\n" - "vmull.s8 q15, d8, d18 @ mul add\n" - /* mla, int8 * int8 + int16 = int16 */ - "vmlal.s8 q12, d9, d13 @ mul add\n" - "vmlal.s8 q13, d9, d15 @ mul add\n" - "vmlal.s8 q14, d9, d17 @ mul add\n" - "vmlal.s8 q15, d9, d19 @ mul add\n" - /* pacc, int16 + int32 = int32 */ - "vpadal.s16 q0, q12 @ pair acc\n" - "vpadal.s16 q1, q13 @ pair acc\n" - "vpadal.s16 q2, q14 @ pair acc\n" - "vpadal.s16 q3, q15 @ pair acc\n" - "subs %[cnt], #1 @ sub loop count \n" - /* check loop end */ - "bne 1b @ jump to main loop\n" - /* pair add to final result */ - "2: @ pair add \n" - "vpadd.s32 d8, d0, d1 @ pair add, first step\n" - "vpadd.s32 d9, d2, d3 @ pair add, first step\n" - "vpadd.s32 d10, d4, d5 @ pair add, first step\n" - "vpadd.s32 d11, d6, d7 @ pair add, first step\n" - "vpadd.s32 d0, d8, d9 @ pair add, second step\n" - "vpadd.s32 d1, d10, d11 @ pair add, second step\n" - /* write output */ - "vst1.32 {d0-d1}, [%[out]] @ save result\n" - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [cnt] "+r"(cnt_loop) - : [bias0] "r"(bias0), - [bias1] "r"(bias1), - [bias2] "r"(bias2), - [bias3] "r"(bias3), - [out] "r"(ptr_out) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q12", - "q13", - "q14", - "q15"); - for (int i = 0; i < tail; ++i) { - ptr_out[0] += ptr_in[i] * ptr_w0[i]; - ptr_out[1] += ptr_in[i] * ptr_w1[i]; - ptr_out[2] += ptr_in[i] * ptr_w2[i]; - ptr_out[3] += ptr_in[i] * ptr_w3[i]; - } - if (is_relu) { - ptr_out[0] = ptr_out[0] > 0 ? ptr_out[0] : 0; - ptr_out[1] = ptr_out[1] > 0 ? ptr_out[1] : 0; - ptr_out[2] = ptr_out[2] > 0 ? ptr_out[2] : 0; - ptr_out[3] = ptr_out[3] > 0 ? ptr_out[3] : 0; - } - write_gemv_out(ptr_out, out_ptr, scale_ptr); - write_gemv_out(ptr_out + 1, out_ptr + 1, scale_ptr + 1); - write_gemv_out(ptr_out + 2, out_ptr + 2, scale_ptr + 2); - write_gemv_out(ptr_out + 3, out_ptr + 3, scale_ptr + 3); - } -//! deal with remains -#pragma omp parallel for - for (int j = out_cnt * 4; j < M; j++) { - dtype* out_ptr = data_out + j; - const float* scale_ptr = scale + j; - int ptr_out[1] = {0}; - const int8_t* ptr_in = data_in; - const int8_t* ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int bias0 = is_bias ? 
bias[j] : 0; - asm volatile( - "pld [%[in]] @ preload cache line, " - "input\n" - "pld [%[w0]] @ preload cache line, weights r0\n" - "vmov.u32 q0, #0 @ set q0 to 0\n" - "vmov s0, %[bias0] @ set q0 to bias0\n" - "cmp %[cnt], #1 @ check whether has main loop\n" - "blt 2f @ jump to tail\n" - /* main loop */ - "1: @ main loop\n" - "vld1.8 {d24-d25}, [%[in]]! @ load input, q12\n" - "vld1.8 {d28-d29}, [%[w0]]! @ load weights q14\n" - /* mull int8 * int8 = int16*/ - "vmull.s8 q1, d24, d28 @ mul add\n" - "vmlal.s8 q1, d25, d29 @ mul add\n" - "subs %[cnt] , #1 @ sub loop count \n" - /* pacc int16 + int32 = int32*/ - "vpadal.s16 q0, q1 @ pair acc\n" - "bne 1b @ jump to main loop\n" - /* pair add to final result */ - "2: @ end processing\n" - "vpadd.s32 d2, d0, d1 @ pair add, first step\n" - "vpadd.s32 d0, d2, d2 @ pair add, final step\n" - /* write output */ - "vst1.32 {d0[0]}, [%[out]] @ save result\n" - : [in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [cnt] "+r"(cnt_loop) - : [bias0] "r"(bias0), [out] "r"(ptr_out) - : "cc", "memory", "q0", "q1", "q12", "q13"); - for (int i = 0; i < tail; ++i) { - ptr_out[0] += ptr_in[i] * ptr_w0[i]; - } - if (is_relu) { - ptr_out[0] = ptr_out[0] > 0 ? ptr_out[0] : 0; - } - write_gemv_out(ptr_out, out_ptr, scale_ptr); - } -#endif //__aarch64__ // NOLINT - return true; -} - -template bool gemv_int8(const int8_t* A, - const int8_t* x, - float* y, - bool transA, - int M, - int N, - const float* scale, - bool is_bias, - const int* bias, - bool is_relu); -template bool gemv_int8(const int8_t* A, - const int8_t* x, - int* y, - bool transA, - int M, - int N, - const float* scale, - bool is_bias, - const int* bias, - bool is_relu); -template bool gemv_int8(const int8_t* A, - const int8_t* x, - signed char* y, - bool transA, - int M, - int N, - const float* scale, - bool is_bias, - const int* bias, - bool is_relu); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/gemv_arm_int8.h b/lite/backends/arm/math/gemv_arm_int8.h deleted file mode 100644 index 3021120695..0000000000 --- a/lite/backends/arm/math/gemv_arm_int8.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "lite/core/device_info.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -// fixme now only support transA = false -template -bool gemv_int8(const int8_t* A, - const int8_t* x, - dtype* y, - bool transA, - int M, - int N, - const float* scale, - bool is_bias = false, - const int* bias = nullptr, - bool is_relu = false); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/gru_utils.h b/lite/backends/arm/math/gru_utils.h deleted file mode 100644 index 9bef1889b8..0000000000 --- a/lite/backends/arm/math/gru_utils.h +++ /dev/null @@ -1,434 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
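// Scalar sketch of the GRU step these helpers implement (sigmoid gates and
// tanh candidate chosen for concreteness; the real code dispatches on
// lite_api::ActivationType, runs the two matrix products with sgemm, and
// vectorizes the elementwise parts with NEON). gru_step_ref is a
// hypothetical name for illustration only.
#include <cmath>
static void gru_step_ref(const float* h_prev,  // previous hidden, [frame]
                         float* gate,    // [3 * frame]: update | reset | cell
                         float* reset_h,  // scratch for r * h_prev, [frame]
                         float* h,        // output hidden, [frame]
                         int frame) {
  // Phase 1 -- after gate[0 : 2f] += h_prev x gate_weight (first sgemm):
  for (int i = 0; i < frame; ++i) {
    gate[i] = 1.f / (1.f + expf(-gate[i]));                  // update gate u
    gate[frame + i] = 1.f / (1.f + expf(-gate[frame + i]));  // reset gate r
    reset_h[i] = gate[frame + i] * (h_prev ? h_prev[i] : 0.f);
  }
  // Phase 2 -- after gate[2f : 3f] += reset_h x state_weight (second sgemm):
  for (int i = 0; i < frame; ++i) {
    float c = tanhf(gate[2 * frame + i]);  // candidate state
    float prev = h_prev ? h_prev[i] : 0.f;
    h[i] = prev * (1.f - gate[i]) + gate[i] * c;  // blend by update gate
    // (origin_mode instead blends h = c * (1 - u) + u * prev)
  }
}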
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "lite/backends/arm/math/sgemm.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -struct GRUMetaValue { - T* gate_weight; - T* state_weight; - T* gate_value; - T* reset_output_value; - T* output_value; - T* prev_out_value; -}; - -template -inline void gru_add_with_bias( - const Dtype* din, const Dtype* bias, Dtype* dout, int batch, int size); - -template <> -inline void gru_add_with_bias( - const float* din, const float* bias, float* dout, int batch, int size) { -#pragma omp parallel for - for (int i = 0; i < batch; ++i) { - int j = 0; - auto din_batch = din + i * size; - auto dout_batch = dout + i * size; - float32x4_t vb0 = vld1q_f32(bias); - float32x4_t vin0 = vld1q_f32(din_batch); - float32x4_t vout0; - float32x4_t vout1; - float32x4_t vin1; - float32x4_t vb1; - for (; j < size - 7; j += 8) { - vin1 = vld1q_f32(din_batch + j + 4); - vb1 = vld1q_f32(bias + j + 4); - vout0 = vaddq_f32(vb0, vin0); - vout1 = vaddq_f32(vb1, vin1); - vb0 = vld1q_f32(bias + j + 8); - vin0 = vld1q_f32(din_batch + j + 8); - vst1q_f32(dout_batch + j, vout0); - vst1q_f32(dout_batch + j + 4, vout1); - } - for (; j < size; ++j) { - dout_batch[j] = din_batch[j] + bias[j]; - } - } -} - -template -static void gru_unit_reset_act_impl(float* updata_gate, - int stride_update, - float* reset_gate, - int stride_reset, - const float* hidden_prev, - int stride_hidden_prev, - float* reset_hidden_prev, - int stride_reset_hidden_prev, - int frame_size, - int batch_size) { -#pragma omp parallel for - for (int b = 0; b < batch_size; ++b) { - float32x4_t vpre0 = vdupq_n_f32(0.f); - float32x4_t vpre1 = vdupq_n_f32(0.f); - float prev = 0.f; - int i = 0; - for (; i < frame_size - 7; i += 8) { - float32x4_t vu0 = vld1q_f32(updata_gate + i); - float32x4_t vu1 = vld1q_f32(updata_gate + i + 4); - float32x4_t vr0 = vld1q_f32(reset_gate + i); - float32x4_t vr1 = vld1q_f32(reset_gate + i + 4); - - float32x4_t vau0 = lite::arm::math::vactive_f32(vu0); - float32x4_t vau1 = lite::arm::math::vactive_f32(vu1); - - if (hidden_prev) { - vpre0 = vld1q_f32(hidden_prev + i); - vpre1 = vld1q_f32(hidden_prev + i + 4); - } - - float32x4_t var0 = lite::arm::math::vactive_f32(vr0); - float32x4_t var1 = lite::arm::math::vactive_f32(vr1); - - vst1q_f32(updata_gate + i, vau0); - vst1q_f32(updata_gate + i + 4, vau1); - - float32x4_t vres0 = vmulq_f32(vpre0, var0); - float32x4_t vres1 = vmulq_f32(vpre1, var1); - - vst1q_f32(reset_gate + i, var0); - vst1q_f32(reset_gate + i + 4, var1); - vst1q_f32(reset_hidden_prev + i, vres0); - vst1q_f32(reset_hidden_prev + i + 4, vres1); - } - - for (; i < frame_size; ++i) { - updata_gate[i] = lite::arm::math::active_f32(updata_gate[i]); - reset_gate[i] = lite::arm::math::active_f32(reset_gate[i]); - if (hidden_prev) { - prev = hidden_prev[i]; - } - reset_hidden_prev[i] = reset_gate[i] * prev; - } - - updata_gate += stride_update; - reset_gate += stride_reset; - if (hidden_prev) { - hidden_prev += 
stride_hidden_prev; - } - reset_hidden_prev += stride_reset_hidden_prev; - } -} - -template -static void gru_unit_out_act_impl(bool origin_mode, - float* updata_gate, - int stride_update, - float* cell_state, - int stride_cell_state, - const float* hidden_prev, - int stride_hidden_prev, - float* hidden, - int stride_hidden, - int frame_size, - int batch_size) { -#pragma omp parallel for - for (int b = 0; b < batch_size; ++b) { - float32x4_t vpre0 = vdupq_n_f32(0.f); - float32x4_t vpre1 = vdupq_n_f32(0.f); - float prev = 0.f; - int i = 0; - if (origin_mode) { - for (; i < frame_size - 7; i += 8) { - float32x4_t vc0 = vld1q_f32(cell_state + i); - float32x4_t vc1 = vld1q_f32(cell_state + i + 4); - float32x4_t vu0 = vld1q_f32(updata_gate + i); - float32x4_t vu1 = vld1q_f32(updata_gate + i + 4); - - float32x4_t vac0 = lite::arm::math::vactive_f32(vc0); - float32x4_t vac1 = lite::arm::math::vactive_f32(vc1); - if (hidden_prev) { - vpre0 = vld1q_f32(hidden_prev + i); - vpre1 = vld1q_f32(hidden_prev + i + 4); - } - - float32x4_t vh0 = vmlsq_f32(vac0, vu0, vac0); - float32x4_t vh1 = vmlsq_f32(vac1, vu1, vac1); - - vst1q_f32(cell_state + i, vac0); - vst1q_f32(cell_state + i + 4, vac1); - - vh0 = vmlaq_f32(vh0, vu0, vpre0); - vh1 = vmlaq_f32(vh1, vu1, vpre1); - - vst1q_f32(hidden + i, vh0); - vst1q_f32(hidden + i + 4, vh1); - } - - for (; i < frame_size; ++i) { - if (hidden_prev) { - prev = hidden_prev[i]; - } - cell_state[i] = lite::arm::math::active_f32(cell_state[i]); - hidden[i] = - cell_state[i] * (1.f - updata_gate[i]) + updata_gate[i] * prev; - } - } else { - for (; i < frame_size - 7; i += 8) { - float32x4_t vc0 = vld1q_f32(cell_state + i); - float32x4_t vc1 = vld1q_f32(cell_state + i + 4); - float32x4_t vu0 = vld1q_f32(updata_gate + i); - float32x4_t vu1 = vld1q_f32(updata_gate + i + 4); - - float32x4_t vac0 = lite::arm::math::vactive_f32(vc0); - float32x4_t vac1 = lite::arm::math::vactive_f32(vc1); - - if (hidden_prev) { - vpre0 = vld1q_f32(hidden_prev + i); - vpre1 = vld1q_f32(hidden_prev + i + 4); - } - - float32x4_t vh0 = vmlsq_f32(vpre0, vpre0, vu0); - float32x4_t vh1 = vmlsq_f32(vpre1, vpre1, vu1); - - vst1q_f32(cell_state + i, vac0); - vst1q_f32(cell_state + i + 4, vac1); - - vh0 = vmlaq_f32(vh0, vu0, vac0); - vh1 = vmlaq_f32(vh1, vu1, vac1); - - vst1q_f32(hidden + i, vh0); - vst1q_f32(hidden + i + 4, vh1); - } - - for (; i < frame_size; ++i) { - cell_state[i] = lite::arm::math::active_f32(cell_state[i]); - if (hidden_prev) { - prev = hidden_prev[i]; - } - hidden[i] = - prev * (1.f - updata_gate[i]) + updata_gate[i] * cell_state[i]; - } - } - updata_gate += stride_update; - cell_state += stride_cell_state; - if (hidden_prev) { - hidden_prev += stride_hidden_prev; - } - hidden += stride_hidden; - } -} - -inline void gru_unit_reset_act(lite_api::ActivationType act_type, - GRUMetaValue value, - int frame_size, - int batch_size) { - auto updata_gate = value.gate_value; - auto reset_gate = value.gate_value + frame_size; - auto hidden_prev = value.prev_out_value; - auto reset_hidden_prev = value.reset_output_value; - int stride_update = 3 * frame_size; - int stride_reset = 3 * frame_size; - int stride_hidden_prev = frame_size; - int stride_reset_hidden_prev = frame_size; - - switch (act_type) { - case lite_api::ActivationType::kIndentity: - gru_unit_reset_act_impl( - updata_gate, - stride_update, - reset_gate, - stride_reset, - hidden_prev, - stride_hidden_prev, - reset_hidden_prev, - stride_reset_hidden_prev, - frame_size, - batch_size); - break; - case lite_api::ActivationType::kTanh: - 
gru_unit_reset_act_impl( - updata_gate, - stride_update, - reset_gate, - stride_reset, - hidden_prev, - stride_hidden_prev, - reset_hidden_prev, - stride_reset_hidden_prev, - frame_size, - batch_size); - break; - case lite_api::ActivationType::kSigmoid: - gru_unit_reset_act_impl( - updata_gate, - stride_update, - reset_gate, - stride_reset, - hidden_prev, - stride_hidden_prev, - reset_hidden_prev, - stride_reset_hidden_prev, - frame_size, - batch_size); - break; - case lite_api::ActivationType::kRelu: - gru_unit_reset_act_impl( - updata_gate, - stride_update, - reset_gate, - stride_reset, - hidden_prev, - stride_hidden_prev, - reset_hidden_prev, - stride_reset_hidden_prev, - frame_size, - batch_size); - break; - default: - break; - } -} - -inline void gru_unit_out_act(lite_api::ActivationType act_type, - bool origin_mode, - GRUMetaValue value, - int frame_size, - int batch_size) { - auto updata_gate = value.gate_value; - auto cell_state = value.gate_value + 2 * frame_size; - auto hidden_prev = value.prev_out_value; - auto hidden = value.output_value; - - int stride_update = 3 * frame_size; - int stride_cell_state = 3 * frame_size; - int stride_hidden_prev = frame_size; - int stride_hidden = frame_size; - - switch (act_type) { - case lite_api::ActivationType::kIndentity: - gru_unit_out_act_impl( - origin_mode, - updata_gate, - stride_update, - cell_state, - stride_cell_state, - hidden_prev, - stride_hidden_prev, - hidden, - stride_hidden, - frame_size, - batch_size); - break; - case lite_api::ActivationType::kTanh: - gru_unit_out_act_impl(origin_mode, - updata_gate, - stride_update, - cell_state, - stride_cell_state, - hidden_prev, - stride_hidden_prev, - hidden, - stride_hidden, - frame_size, - batch_size); - break; - case lite_api::ActivationType::kSigmoid: - gru_unit_out_act_impl( - origin_mode, - updata_gate, - stride_update, - cell_state, - stride_cell_state, - hidden_prev, - stride_hidden_prev, - hidden, - stride_hidden, - frame_size, - batch_size); - break; - case lite_api::ActivationType::kRelu: - gru_unit_out_act_impl(origin_mode, - updata_gate, - stride_update, - cell_state, - stride_cell_state, - hidden_prev, - stride_hidden_prev, - hidden, - stride_hidden, - frame_size, - batch_size); - break; - default: - break; - } -} - -template -struct GRUUnitFunctor { - static void compute(GRUMetaValue value, - int frame_size, - int batch_size, - const lite_api::ActivationType active_node, - const lite_api::ActivationType active_gate, - bool origin_mode, - ARMContext* ctx) { - if (value.prev_out_value) { - sgemm(false, - false, - batch_size, - frame_size * 2, - frame_size, - 1.f, - value.prev_out_value, - frame_size, - value.gate_weight, - frame_size * 2, - 1.f, - value.gate_value, - frame_size * 3, - nullptr, - false, - false, - ctx); - } - gru_unit_reset_act(active_gate, value, frame_size, batch_size); - - if (value.prev_out_value) { - sgemm(false, - false, - batch_size, - frame_size, - frame_size, - 1.f, - value.reset_output_value, - frame_size, - value.state_weight, - frame_size, - 1.f, - value.gate_value + frame_size * 2, - frame_size * 3, - nullptr, - false, - false, - ctx); - } - - gru_unit_out_act(active_node, origin_mode, value, frame_size, batch_size); - } -}; - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/im2sequence.cc b/lite/backends/arm/math/im2sequence.cc deleted file mode 100644 index 39fb9b477e..0000000000 --- a/lite/backends/arm/math/im2sequence.cc +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 
2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/im2sequence.h" -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void im2sequence(const float* input, - const int input_c, - const int input_h, - const int input_w, - const int kernel_h, - const int kernel_w, - const int pad_top, - const int pad_bottom, - const int pad_left, - const int pad_right, - const int stride_h, - const int stride_w, - const int out_h, - const int out_w, - float* out, - Context* ctx) { - int window_size = kernel_h * kernel_w; - int out_rows = out_h * out_w; - int out_cols = input_c * window_size; - int H_pad = input_h + pad_top + pad_bottom; - int W_pad = input_w + pad_left + pad_right; - for (int h_id = 0; h_id < out_h; h_id++) { - for (int w_id = 0; w_id < out_w; w_id++) { - // consider dilation. - int start_h = h_id * stride_h - pad_top; - int start_w = w_id * stride_w - pad_left; - for (int c_id = 0; c_id < input_c; c_id++) { - for (int k_h_id = 0; k_h_id < kernel_h; k_h_id++) { - int in_h_id = start_h + k_h_id; - bool exceed_flag = (in_h_id < 0) || (in_h_id >= H_pad); - int out_start_id = - (h_id * out_w + w_id) * out_cols + c_id * window_size; - for (int k_w_id = 0; k_w_id < kernel_w; k_w_id++) { - int in_w_id = start_w + k_w_id; - exceed_flag = exceed_flag || (in_w_id < 0) || (in_w_id >= W_pad); - int input_id = (c_id * input_h + in_h_id) * input_w + in_w_id; - int out_id = out_start_id + k_h_id * kernel_w + k_w_id; - out[out_id] = exceed_flag ? 0.f : input[input_id]; - } - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/im2sequence.h b/lite/backends/arm/math/im2sequence.h deleted file mode 100644 index 5fd06c2608..0000000000 --- a/lite/backends/arm/math/im2sequence.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
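// Geometry of the im2sequence output above, for reference: each output row
// is one kernel_h x kernel_w window over the padded input, flattened
// channel-major, so `out` has out_h * out_w rows of input_c * kernel_h *
// kernel_w columns. A sketch of the dimension arithmetic a caller would use
// (the standard sliding-window size formula; im2seq_out_dim is a
// hypothetical helper, not part of this file):
inline int im2seq_out_dim(int in_dim, int pad0, int pad1, int kernel,
                          int stride) {
  return (in_dim + pad0 + pad1 - kernel) / stride + 1;
}
// e.g. out_h = im2seq_out_dim(input_h, pad_top, pad_bottom, kernel_h, stride_h);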
- -#pragma once - -#include -#include "lite/core/context.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { -void im2sequence(const float* input, - const int input_c, - const int input_h, - const int input_w, - const int kernel_h, - const int kernel_w, - const int pad_top, - const int pad_bottom, - const int pad_left, - const int pad_right, - const int stride_h, - const int stride_w, - const int out_h, - const int out_w, - float* out, - Context* ctx); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/increment.cc b/lite/backends/arm/math/increment.cc deleted file mode 100644 index 094fe78de9..0000000000 --- a/lite/backends/arm/math/increment.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/increment.h" -#include -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { -void increment(const int* input, - const int n, - const float step, - int* out, - Context* ctx) { - for (int i = 0; i < n; i++) { - out[i] = input[i] + step; - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/increment.h b/lite/backends/arm/math/increment.h deleted file mode 100644 index 80aec62885..0000000000 --- a/lite/backends/arm/math/increment.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "lite/core/context.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { -void increment(const int* input, - const int n, - const float step, - int* out, - Context* ctx); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/interpolate.cc b/lite/backends/arm/math/interpolate.cc deleted file mode 100644 index c32494c2ba..0000000000 --- a/lite/backends/arm/math/interpolate.cc +++ /dev/null @@ -1,534 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
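// Note on the increment kernel deleted above: its input and output buffers
// are int while step is a float, so each per-element sum is truncated back
// to int on the store. A minimal sketch of that semantics (the values below
// are hypothetical, not from the patch):

#include <cstdio>

int main() {
  const int in[3] = {1, 2, 3};
  const float step = 1.5f;
  int out[3];
  for (int i = 0; i < 3; ++i) out[i] = in[i] + step;  // float sum truncated to int
  std::printf("%d %d %d\n", out[0], out[1], out[2]);  // prints: 2 3 4
  return 0;
}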
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/interpolate.h" -#include -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -// The following function bilinear_interp is partially base on -// https://github.com/Tencent/ncnn/blob/master/src/layer/arm/interp_arm.cpp -// Tencent is pleased to support the open source community by making ncnn -// available. -// -// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this -// file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. -void bilinear_interp(const float* src, - int w_in, - int h_in, - float* dst, - int w_out, - int h_out, - float scale_x, - float scale_y, - bool with_align) { - int* buf = new int[w_out + h_out + w_out * 2 + h_out * 2]; - - int* xofs = buf; - int* yofs = buf + w_out; - - float* alpha = reinterpret_cast(buf + w_out + h_out); - float* beta = reinterpret_cast(buf + w_out + h_out + w_out * 2); - - float fx = 0.0f; - float fy = 0.0f; - int sx = 0; - int sy = 0; - if (with_align) { - scale_x = static_cast(w_in - 1) / (w_out - 1); - scale_y = static_cast(h_in - 1) / (h_out - 1); - // calculate x axis coordinate - for (int dx = 0; dx < w_out; dx++) { - fx = dx * scale_x; - sx = static_cast(fx); - fx -= sx; - xofs[dx] = sx; - alpha[dx * 2] = 1.f - fx; - alpha[dx * 2 + 1] = fx; - } - // calculate y axis coordinate - for (int dy = 0; dy < h_out; dy++) { - fy = dy * scale_y; - sy = static_cast(fy); - fy -= sy; - yofs[dy] = sy; - beta[dy * 2] = 1.f - fy; - beta[dy * 2 + 1] = fy; - } - } else { - scale_x = static_cast(w_in / w_out); - scale_y = static_cast(h_in / h_out); - // calculate x axis coordinate - for (int dx = 0; dx < w_out; dx++) { - fx = scale_x * (dx + 0.5f) - 0.5f; - fx = fx < 0 ? 0.f : fx; - sx = static_cast(fx); - fx -= sx; - xofs[dx] = sx; - alpha[dx * 2] = 1.f - fx; - alpha[dx * 2 + 1] = fx; - } - // calculate y axis coordinate - for (int dy = 0; dy < h_out; dy++) { - fy = scale_y * (dy + 0.5f) - 0.5f; - fy = fy < 0 ? 
0.f : fy; - sy = static_cast(fy); - fy -= sy; - yofs[dy] = sy; - beta[dy * 2] = 1.f - fy; - beta[dy * 2 + 1] = fy; - } - } - float* rowsbuf0 = new float[w_out]; - float* rowsbuf1 = new float[w_out]; - float* rows0 = rowsbuf0; - float* rows1 = rowsbuf1; - // output w , h boundary - int w_bound = w_out; - int h_bound = h_out; - if (with_align) { - w_bound = ceil((w_in - 1) / scale_x); - h_bound = ceil((h_in - 1) / scale_y); - } else { - w_bound = ceil((w_in - 0.5f) / scale_x - 0.5f); - h_bound = ceil((h_in - 0.5f) / scale_y - 0.5f); - } - // h_bound loop - for (int dy = 0; dy < h_bound; dy++) { - int sy = yofs[dy]; - - const float* s0 = src + sy * w_in; - const float* s1 = src + (sy + 1) * w_in; - - const float* alphap = alpha; - float* rows0p = rows0; - float* rows1p = rows1; - - int dx = 0; - // w_bound loop - for (; dx + 1 < w_bound; dx += 2) { - int sx = xofs[dx]; - int sxn = xofs[dx + 1]; - const float* s0p = s0 + sx; - const float* s1p = s1 + sx; - const float* s0np = s0 + sxn; - const float* s1np = s1 + sxn; - - float32x4_t _a = vld1q_f32(alphap); - float32x2_t _s0 = vld1_f32(s0p); - float32x2_t _s1 = vld1_f32(s1p); - float32x2_t _s0n = vld1_f32(s0np); - float32x2_t _s1n = vld1_f32(s1np); - - float32x4_t _s0s0n = vcombine_f32(_s0, _s0n); - float32x4_t _ms0 = vmulq_f32(_s0s0n, _a); - float32x4_t _s1s1n = vcombine_f32(_s1, _s1n); - float32x4_t _ms1 = vmulq_f32(_s1s1n, _a); - - float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0)); - vst1_f32(rows0p + dx, _rows0); - float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); - vst1_f32(rows1p + dx, _rows1); - - alphap += 4; - } - // w_bound remain loop - for (; dx < w_bound; dx++) { - int sx = xofs[dx]; - const float* s0p = s0 + sx; - const float* s1p = s1 + sx; - - float a0 = alphap[0]; - float a1 = alphap[1]; - rows0p[dx] = s0p[0] * a0 + s0p[1] * a1; - rows1p[dx] = s1p[0] * a0 + s1p[1] * a1; - - alphap += 2; - } - - const float buffer1[2] = {*(src + sy * w_in + w_in - 1), - *(src + sy * w_in + w_in - 1)}; - const float buffer2[2] = {*(src + (sy + 1) * w_in + w_in - 1), - *(src + (sy + 1) * w_in + w_in - 1)}; - // w_bound - w_out loop - for (; dx + 1 < w_out; dx += 2) { - const float* s0p = buffer1; - const float* s1p = buffer2; - const float* s0np = buffer1; - const float* s1np = buffer2; - - float32x4_t _a = vld1q_f32(alphap); - float32x2_t _s0 = vld1_f32(s0p); - float32x2_t _s1 = vld1_f32(s1p); - float32x2_t _s0n = vld1_f32(s0np); - float32x2_t _s1n = vld1_f32(s1np); - - float32x4_t _s0s0n = vcombine_f32(_s0, _s0n); - float32x4_t _ms0 = vmulq_f32(_s0s0n, _a); - float32x4_t _s1s1n = vcombine_f32(_s1, _s1n); - float32x4_t _ms1 = vmulq_f32(_s1s1n, _a); - - float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0)); - vst1_f32(rows0p + dx, _rows0); - float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); - vst1_f32(rows1p + dx, _rows1); - - alphap += 4; - } - // w_bound - w_out remain loop - for (; dx < w_out; dx++) { - const float* s0p = buffer1; - const float* s1p = buffer2; - - float a0 = alphap[0]; - float a1 = alphap[1]; - rows0p[dx] = s0p[0] * a0 + s0p[1] * a1; - rows1p[dx] = s1p[0] * a0 + s1p[1] * a1; - - alphap += 2; - } - - float b0 = beta[0]; - float b1 = beta[1]; - - float* dp = dst + dy * w_out; - - int nn = w_out >> 3; - int remain = w_out - (nn << 3); - -#ifdef __aarch64__ - float32x4_t _b0 = vdupq_n_f32(b0); - float32x4_t _b1 = vdupq_n_f32(b1); - // calculate and store results - for (; nn > 0; nn--) { - float32x4_t _rows0 = vld1q_f32(rows0p); - float32x4_t _d = 
vmulq_f32(_rows0, _b0); - float32x4_t _rows1 = vld1q_f32(rows1p); - _d = vmlaq_f32(_d, _rows1, _b1); - - float32x4_t _rows0n = vld1q_f32(rows0p + 4); - float32x4_t _rows1n = vld1q_f32(rows1p + 4); - - float32x4_t _dn = vmulq_f32(_rows0n, _b0); - vst1q_f32(dp, _d); - _dn = vmlaq_f32(_dn, _rows1n, _b1); - vst1q_f32(dp + 4, _dn); - - dp += 8; - rows0p += 8; - rows1p += 8; - } - -#else - if (nn > 0) { - asm volatile( - "vdup.32 q0, %[b0] @dup b0 to q1\n" - "vdup.32 q1, %[b1] @dup b1 to q0\n" - "1: \n" - "vld1.32 {d4-d5}, [%[rows0p]]! @loads rows0p to q2\n" - "vld1.32 {d6-d7}, [%[rows1p]]! @loads rows0p to q3\n" - "vmul.f32 q2, q2, q0 @mul\n" - "vmla.f32 q2, q3, q1 @mul add\n" - "vst1.32 {d4-d5}, [%[out]]! @store out to q2 \n" - "pld [%[rows0p]] @preload rows0p\n" - - "vld1.32 {d4-d5}, [%[rows0p]]! @loads rows0p to q2\n" - "vld1.32 {d6-d7}, [%[rows1p]]! @load rows1p to q3\n" - "vmul.f32 q2, q2, q0 @mul\n" - "vmla.f32 q2, q3, q1 @mul add\n" - "vst1.32 {d4-d5}, [%[out]]! @store out to q2 \n" - "pld [%[rows1p]] @preload rows1p\n" - "subs %[loopc], #1 @loop count minus #1\n" - "bne 1b @jump to 1\n" - : [rows0p] "+r"(rows0p), - [rows1p] "+r"(rows1p), - [out] "+r"(dp), - [loopc] "+r"(nn) - : [b0] "r"(b0), [b1] "r"(b1) - : "cc", "memory", "q0", "q1", "q2", "q3"); - } -#endif - // calculate and store remain resluts - for (; remain; --remain) { - *dp++ = *rows0p++ * b0 + *rows1p++ * b1; - } - beta += 2; - } - - // h_bound - h_out loop - for (int dy = h_bound; dy < h_out; dy++) { - int sy = h_in - 1; - const float* s0 = src + sy * w_in; - const float* s1 = s0; - const float* alphap = alpha; - float* rows0p = rows0; - float* rows1p = rows1; - - int dx = 0; - // w_bound loop - for (; dx + 1 < w_bound; dx += 2) { - int sx = xofs[dx]; - int sxn = xofs[dx + 1]; - const float* s0p = s0 + sx; - const float* s1p = s1 + sx; - const float* s0np = s0 + sxn; - const float* s1np = s1 + sxn; - - float32x4_t _a = vld1q_f32(alphap); - float32x2_t _s0 = vld1_f32(s0p); - float32x2_t _s1 = vld1_f32(s1p); - float32x2_t _s0n = vld1_f32(s0np); - float32x2_t _s1n = vld1_f32(s1np); - - float32x4_t _s0s0n = vcombine_f32(_s0, _s0n); - float32x4_t _ms0 = vmulq_f32(_s0s0n, _a); - float32x4_t _s1s1n = vcombine_f32(_s1, _s1n); - float32x4_t _ms1 = vmulq_f32(_s1s1n, _a); - - float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0)); - vst1_f32(rows0p + dx, _rows0); - float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); - vst1_f32(rows1p + dx, _rows1); - - alphap += 4; - } - // w_bound remain loop - for (; dx < w_bound; dx++) { - int sx = xofs[dx]; - const float* s0p = s0 + sx; - float a0 = alphap[0]; - float a1 = alphap[1]; - rows0p[dx] = s0p[0] * a0 + s0p[1] * a1; - rows1p[dx] = rows0p[dx]; - - alphap += 2; - } - - const float buffer1[2] = {*(src + sy * w_in + w_in - 1), - *(src + sy * w_in + w_in - 1)}; - // w_bound - w_out loop - for (; dx + 1 < w_out; dx += 2) { - const float* s0p = buffer1; - const float* s1p = buffer1; - const float* s0np = buffer1; - const float* s1np = buffer1; - - float32x4_t _a = vld1q_f32(alphap); - float32x2_t _s0 = vld1_f32(s0p); - float32x2_t _s1 = vld1_f32(s1p); - float32x2_t _s0n = vld1_f32(s0np); - float32x2_t _s1n = vld1_f32(s1np); - - float32x4_t _s0s0n = vcombine_f32(_s0, _s0n); - float32x4_t _ms0 = vmulq_f32(_s0s0n, _a); - float32x4_t _s1s1n = vcombine_f32(_s1, _s1n); - float32x4_t _ms1 = vmulq_f32(_s1s1n, _a); - - float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0)); - vst1_f32(rows0p + dx, _rows0); - float32x2_t _rows1 = 
vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); - vst1_f32(rows1p + dx, _rows1); - - alphap += 4; - } - // w_bound - wout remain loop - for (; dx < w_out; dx++) { - const float* s0p = buffer1; - float a0 = alphap[0]; - float a1 = alphap[1]; - rows0p[dx] = s0p[0] * a0 + s0p[1] * a1; - rows1p[dx] = rows0p[dx]; - alphap += 2; - } - - float b0 = beta[0]; - float b1 = beta[1]; - - float* dp = dst + dy * w_out; - - int nn = w_out >> 3; - int remain = w_out - (nn << 3); - -#ifdef __aarch64__ - float32x4_t _b0 = vdupq_n_f32(b0); - float32x4_t _b1 = vdupq_n_f32(b1); - // calculate and store results - for (; nn > 0; nn--) { - float32x4_t _rows0 = vld1q_f32(rows0p); - float32x4_t _d = vmulq_f32(_rows0, _b0); - float32x4_t _rows1 = vld1q_f32(rows1p); - _d = vmlaq_f32(_d, _rows1, _b1); - - float32x4_t _rows0n = vld1q_f32(rows0p + 4); - float32x4_t _rows1n = vld1q_f32(rows1p + 4); - - float32x4_t _dn = vmulq_f32(_rows0n, _b0); - vst1q_f32(dp, _d); - _dn = vmlaq_f32(_dn, _rows1n, _b1); - vst1q_f32(dp + 4, _dn); - - dp += 8; - rows0p += 8; - rows1p += 8; - } - -#else - if (nn > 0) { - asm volatile( - "vdup.32 q0, %[b0] @dup b0 to q1\n" - "vdup.32 q1, %[b1] @dup b1 to q0\n" - "1: \n" - "vld1.32 {d4-d5}, [%[rows0p]]! @loads rows0p to q2\n" - "vld1.32 {d6-d7}, [%[rows1p]]! @loads rows0p to q3\n" - "vmul.f32 q2, q2, q0 @mul\n" - "vmla.f32 q2, q3, q1 @mul add\n" - "vst1.32 {d4-d5}, [%[out]]! @store out to q2 \n" - "pld [%[rows0p]] @preload rows0p\n" - - "vld1.32 {d4-d5}, [%[rows0p]]! @loads rows0p to q2\n" - "vld1.32 {d6-d7}, [%[rows1p]]! @load rows1p to q3\n" - "vmul.f32 q2, q2, q0 @mul\n" - "vmla.f32 q2, q3, q1 @mul add\n" - "vst1.32 {d4-d5}, [%[out]]! @store out to q2 \n" - "pld [%[rows1p]] @preload rows1p\n" - "subs %[loopc], #1 @loop count minus #1\n" - "bne 1b @jump to 1\n" - : [rows0p] "+r"(rows0p), - [rows1p] "+r"(rows1p), - [out] "+r"(dp), - [loopc] "+r"(nn) - : [b0] "r"(b0), [b1] "r"(b1) - : "cc", "memory", "q0", "q1", "q2", "q3"); - } -#endif - // calculate and store remain results - for (; remain; --remain) { - *dp++ = *rows0p++ * b0 + *rows1p++ * b1; - } - - beta += 2; - } - delete[] buf; - delete[] rowsbuf0; - delete[] rowsbuf1; -} - -void nearest_interp(const float* src, - int w_in, - int h_in, - float* dst, - int w_out, - int h_out, - float scale_x, - float scale_y, - bool with_align) { - float scale_w_new = (with_align) - ? (static_cast(w_in - 1) / (w_out - 1)) - : (static_cast(w_in) / (w_out)); - float scale_h_new = (with_align) - ? (static_cast(h_in - 1) / (h_out - 1)) - : (static_cast(h_in) / (h_out)); - -#pragma omp parallel for collapse(2) schedule(static) - for (int h = 0; h < h_out; ++h) { - for (int w = 0; w < w_out; ++w) { - int near_x = (with_align) ? static_cast(scale_w_new * w + 0.5) - : static_cast(scale_w_new * w); - int near_y = (with_align) ? static_cast(scale_h_new * h + 0.5) - : static_cast(scale_h_new * h); - near_x = near_x < 0 ? 0 : near_x; - near_y = near_y < 0 ? 
0 : near_y; - dst[h * w_out + w] = src[near_y * w_in + near_x]; - } - } -} - -void interpolate(lite::Tensor* X, - lite::Tensor* OutSize, - lite::Tensor* Out, - int out_height, - int out_width, - float height_scale, - float width_scale, - bool with_align, - std::string interpolate_type) { - if (out_width > 0 && out_height > 0) { - height_scale = static_cast(out_height / X->dims()[2]); - width_scale = static_cast(out_width / X->dims()[3]); - } - if (OutSize != nullptr) { - auto OutSize_data = OutSize->data(); - int h_out = OutSize_data[0]; // HW - int w_out = OutSize_data[1]; // HW - int num_cout = Out->dims()[0]; - int c_cout = Out->dims()[1]; - Out->Resize({num_cout, c_cout, h_out, w_out}); - } - - float* dout = Out->mutable_data(); - const float* din = X->data(); - int out_num = Out->dims()[0]; - int out_c = Out->dims()[1]; - int count = out_num * out_c; - int in_h = X->dims()[2]; - int in_w = X->dims()[3]; - int out_h = Out->dims()[2]; - int out_w = Out->dims()[3]; - int spatial_in = in_h * in_w; - int spatial_out = out_h * out_w; - for (int i = 0; i < count; ++i) { - if ("Bilinear" == interpolate_type) { - bilinear_interp(din + spatial_in * i, - in_w, - in_h, - dout + spatial_out * i, - out_w, - out_h, - 1.f / width_scale, - 1.f / height_scale, - with_align); - } else if ("Nearest" == interpolate_type) { - nearest_interp(din + spatial_in * i, - in_w, - in_h, - dout + spatial_out * i, - out_w, - out_h, - 1.f / width_scale, - 1.f / height_scale, - with_align); - } - } -} - -} /* namespace math */ -} /* namespace arm */ -} /* namespace lite */ -} /* namespace paddle */ diff --git a/lite/backends/arm/math/interpolate.h b/lite/backends/arm/math/interpolate.h deleted file mode 100644 index be250f6a5e..0000000000 --- a/lite/backends/arm/math/interpolate.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void bilinear_interp(const float* src, - int w_in, - int h_in, - float* dst, - int w_out, - int h_out, - float scale_x, - float scale_y, - bool with_align); - -void nearest_interp(const float* src, - int w_in, - int h_in, - float* dst, - int w_out, - int h_out, - float scale_x, - float scale_y, - bool with_align); - -void interpolate(lite::Tensor* X, - lite::Tensor* OutSize, - lite::Tensor* Out, - int out_height, - int out_width, - float height_scale, - float width_scale, - bool with_align, - std::string interpolate_type); - -} /* namespace math */ -} /* namespace arm */ -} /* namespace lite */ -} /* namespace paddle */ diff --git a/lite/backends/arm/math/lrn.cc b/lite/backends/arm/math/lrn.cc deleted file mode 100644 index 7c89e9fed3..0000000000 --- a/lite/backends/arm/math/lrn.cc +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
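// The bilinear and nearest kernels deleted above differ mainly in how a
// destination index maps to a source coordinate, controlled by with_align.
// A minimal sketch of the two mappings as read from the code above
// (illustrative sizes, not part of the patch):

#include <cstdio>

int main() {
  const int in = 4, out = 8;
  for (int d = 0; d < out; ++d) {
    // with_align == true: the first and last samples of both axes coincide
    float aligned = d * static_cast<float>(in - 1) / (out - 1);
    // with_align == false: pixel centers are mapped, then clamped at 0
    float centered = (d + 0.5f) * static_cast<float>(in) / out - 0.5f;
    if (centered < 0.f) centered = 0.f;
    std::printf("dst %d -> src %.3f (aligned) / %.3f (centered)\n",
                d, aligned, centered);
  }
  return 0;
}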
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/lrn.h" -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void compute_across_channels(const float* din, - float* dout, - int num, - int channel, - int h, - int w, - int local_size, - float alpha, - float beta, - float k) { - int channel_size = h * w; - int cnt = channel_size / 4; - int remain = channel_size % 4; - int pre_pad = (local_size - 1) / 2; - int post_pad = local_size - pre_pad - 1; - float32x4_t k_val = vdupq_n_f32(k); - float32x4_t alpha_val = vdupq_n_f32(alpha); - float32x4_t beta_val = vdupq_n_f32(-beta); - for (int n = 0; n < num; ++n) { - const float* din_ptr = din + n * channel * channel_size; - float* dout_ptr = dout + n * channel * channel_size; - for (int c = 0; c < channel; ++c) { - const float* din_ch_ptr = din_ptr + c * channel_size; - float* dout_ch_ptr = dout_ptr + c * channel_size; - int cs = (c - pre_pad) < 0 ? 0 : (c - pre_pad); - int ce = (c + post_pad) >= channel ? channel : (c + pre_pad + 1); - for (int i = 0; i < cnt; ++i) { - int idx = i * 4; - float32x4_t sum = vdupq_n_f32(0.f); - float32x4_t din = vld1q_f32(din_ch_ptr); - for (int k = cs; k < ce; ++k) { - float32x4_t v0 = vld1q_f32(&din_ptr[k * channel_size + idx]); - sum = vmlaq_f32(sum, v0, v0); - } - sum = vmulq_f32(sum, alpha_val); - sum = vaddq_f32(sum, k_val); - float32x4_t res0 = pow_ps(sum, beta_val); - float32x4_t res1 = vmulq_f32(din, res0); - vst1q_f32(dout_ch_ptr, res1); - dout_ch_ptr += 4; - din_ch_ptr += 4; - } - int idx = cnt * 4; - for (int i = 0; i < remain; ++i) { - float sum = 0.0; - for (int k = cs; k < ce; ++k) { - sum += - din_ptr[k * channel_size + idx] * din_ptr[k * channel_size + idx]; - } - sum = k + sum * alpha; - dout_ch_ptr[0] = din_ch_ptr[0] * pow(sum, -beta); - dout_ch_ptr++; - din_ch_ptr++; - idx++; - } - } - } -} - -template <> -void compute_within_channels(const float* din, - float* dout, - int num, - int channel, - int h, - int w, - int local_size, - float alpha, - float beta, - float k) { - LOG(ERROR) << "unsupported method!!"; - return; -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/lrn.h b/lite/backends/arm/math/lrn.h deleted file mode 100644 index 0355123189..0000000000 --- a/lite/backends/arm/math/lrn.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
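// The across-channels LRN kernel deleted above computes, per element,
//   out = in * (k + alpha * sum over the channel window of in^2) ^ (-beta)
// with a window of local_size channels centered on c. A scalar reference
// sketch of that definition, which the NEON path vectorizes over the spatial
// dimension (function name and shapes are illustrative, not from the patch):

#include <cmath>

// din/dout: one batch in CHW layout with spatial size hw.
void lrn_across_channels_ref(const float* din, float* dout, int channel,
                             int hw, int local_size, float alpha, float beta,
                             float k) {
  int pre = (local_size - 1) / 2;
  int post = local_size - pre - 1;
  for (int c = 0; c < channel; ++c) {
    int cs = (c - pre) < 0 ? 0 : (c - pre);
    int ce = (c + post + 1) > channel ? channel : (c + post + 1);
    for (int i = 0; i < hw; ++i) {
      float sum = 0.f;
      for (int cc = cs; cc < ce; ++cc) {
        float v = din[cc * hw + i];
        sum += v * v;  // accumulate squares across the channel window
      }
      dout[c * hw + i] = din[c * hw + i] * std::pow(k + alpha * sum, -beta);
    }
  }
}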
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -void compute_across_channels(const T* din, - T* dout, - int num, - int channel, - int h, - int w, - int local_size, - float alpha, - float beta, - float k); - -template -void compute_within_channels(const T* din, - T* dout, - int num, - int channel, - int h, - int w, - int local_size, - float alpha, - float beta, - float k); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/negative.cc b/lite/backends/arm/math/negative.cc deleted file mode 100644 index 30eba11e35..0000000000 --- a/lite/backends/arm/math/negative.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/negative.h" -#include -#include -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void negative_func(const float* din, float* dout, int num) { - for (int i = 0; i < num; i++) { - dout[i] = -din[i]; - LOG(INFO) << "arm i:" << i; - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/negative.h b/lite/backends/arm/math/negative.h deleted file mode 100644 index 9a5648743d..0000000000 --- a/lite/backends/arm/math/negative.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include "lite/operators/op_params.h" -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -void negative_func(const T* din, T* dout, int num); -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/norm.cc b/lite/backends/arm/math/norm.cc deleted file mode 100644 index 4780ef68c1..0000000000 --- a/lite/backends/arm/math/norm.cc +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
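// The scalar negative kernel deleted above also logs every element from the
// inner loop, which would dominate its runtime. One NEON variant that could
// be substituted, consistent with the intrinsics used elsewhere in these
// files (a sketch, not code from this patch):

#include <arm_neon.h>

void negative_neon(const float* din, float* dout, int num) {
  int i = 0;
  for (; i + 4 <= num; i += 4) {
    vst1q_f32(dout + i, vnegq_f32(vld1q_f32(din + i)));  // negate 4 lanes
  }
  for (; i < num; ++i) {
    dout[i] = -din[i];  // scalar tail
  }
}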
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/norm.h" -#include -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void norm(const float* input, - const int pre_n, - const int n, - const int post_n, - const float epsilon, - float* out, - Context* ctx) { - for (int i = 0; i < pre_n; i++) { - for (int k = 0; k < post_n; k++) { - float sum = epsilon; - const float* in_tmp = input + i * n * post_n + k; - for (int j = 0; j < n; j++) { - sum += in_tmp[j * post_n] * in_tmp[j * post_n]; - } - sum = std::sqrt(sum); - float* out_tmp = out + i * n * post_n + k; - for (int j = 0; j < n; j++) { - out_tmp[j * post_n] = in_tmp[j * post_n] / sum; - } - } - } - LOG(INFO) << "norm math finished"; -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/norm.h b/lite/backends/arm/math/norm.h deleted file mode 100644 index 503d2c5af4..0000000000 --- a/lite/backends/arm/math/norm.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "lite/core/context.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { -void norm(const float* input, - const int pre_n, - const int n, - const int post_n, - const float epsilon, - float* out, - Context* ctx); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/packed_sgemm.cc b/lite/backends/arm/math/packed_sgemm.cc deleted file mode 100644 index 77b3beae80..0000000000 --- a/lite/backends/arm/math/packed_sgemm.cc +++ /dev/null @@ -1,3481 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
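// The norm kernel deleted above L2-normalizes the middle axis of a tensor
// viewed as (pre_n, n, post_n): each length-n vector x becomes
// x / sqrt(epsilon + sum(x^2)), read with stride post_n. A scalar usage
// sketch with illustrative sizes (not part of the patch):

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int pre_n = 1, n = 3, post_n = 2;
  const float epsilon = 1e-10f;
  std::vector<float> in = {3.f, 1.f, 4.f, 0.f, 0.f, 0.f};
  std::vector<float> out(in.size());
  for (int i = 0; i < pre_n; ++i) {
    for (int k = 0; k < post_n; ++k) {
      float sum = epsilon;
      for (int j = 0; j < n; ++j) {
        float v = in[(i * n + j) * post_n + k];
        sum += v * v;
      }
      float inv = 1.f / std::sqrt(sum);
      for (int j = 0; j < n; ++j) {
        out[(i * n + j) * post_n + k] = in[(i * n + j) * post_n + k] * inv;
      }
    }
  }
  // the strided vector at k=0 is {3, 4, 0} -> {0.6, 0.8, 0};
  // the one at k=1 is {1, 0, 0} -> {1, 0, 0}
  for (float v : out) std::printf("%.2f ", v);
  std::printf("\n");
  return 0;
}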
- -#include "lite/backends/arm/math/packed_sgemm.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -#ifdef __aarch64__ -void prepackA_8x12(float *out, - const float *in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax); - -void prepackA_trans_8x12(float *out, - const float *in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax); - -void sgemm_prepacked_8x12(bool is_transB, - int M, - int N, - int K, - const float *A_packed, - const float *B, - int ldb, - float beta, - float *C, - int ldc, - const float *bias, - bool has_bias, - bool has_relu, - ARMContext *ctx); -#else -// for kA72 -void prepackA_6x8(float *out, - const float *in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax); - -void prepackA_trans_6x8(float *out, - const float *in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax); -// for kA73 -void prepackA_4x8(float *out, - const float *in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax); - -void prepackA_trans_4x8(float *out, - const float *in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax); - -// for kA72, 6x8 -void sgemm_prepacked_6x8(bool is_transB, - int M, - int N, - int K, - const float *A_packed, - const float *B, - int ldb, - float beta, - float *C, - int ldc, - const float *bias, - bool has_bias, - bool has_relu, - ARMContext *ctx); -// for kA73, 4x8 -void sgemm_prepacked_4x8(bool is_transB, - int M, - int N, - int K, - const float *A_packed, - const float *B, - int ldb, - float beta, - float *C, - int ldc, - const float *bias, - bool has_bias, - bool has_relu, - ARMContext *ctx); -#endif // __aarch64__ - -/** - * \brief input data is not transpose - * for arm-v7a, transform data to block x k x 6 layout - * for arm-v8a, transform data to block x k x 8 layout - */ -void prepackA(float *out, - const float *in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax, - bool is_trans, - ARMContext *ctx) { -#ifdef __aarch64__ - if (is_trans) { - prepackA_trans_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); - } else { - prepackA_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); - } -#else - if (ctx->arch() == kA73) { - if (is_trans) { - prepackA_trans_4x8(out, in, alpha, ldin, m0, mmax, k0, kmax); - } else { - prepackA_4x8(out, in, alpha, ldin, m0, mmax, k0, kmax); - } - } else { - if (is_trans) { - prepackA_trans_6x8(out, in, alpha, ldin, m0, mmax, k0, kmax); - } else { - prepackA_6x8(out, in, alpha, ldin, m0, mmax, k0, kmax); - } - } -#endif -} - -void prepackA(TensorLite *tout, - const TensorLite &tin, - float alpha, - int m, - int k, - int group, - bool is_trans, - ARMContext *ctx) { - int hblock = get_hblock(ctx->arch()); - int m_roundup = hblock * ((m + hblock - 1) / hblock); - int group_size_round_up = ((m_roundup * k + 15) / 16) * 16; - if (tout->numel() < group_size_round_up * group) { - tout->Resize({group_size_round_up * group}); - } - int lda = k; - if (is_trans) { - lda = m; - } - for (int g = 0; g < group; ++g) { - const float *weights_group = tin.data() + g * m * k; - float *weights_trans_ptr = - tout->mutable_data() + g * group_size_round_up; - prepackA(weights_trans_ptr, - weights_group, - alpha, - lda, - 0, - m, - 0, - k, - is_trans, - ctx); - } -} - -/// a: m*k b: k*n c: m*n -void sgemm_prepack(bool is_transB, - int M, - int N, - int K, - const float *A_packed, - const float *B, - int ldb, - float beta, - float *C, - int ldc, - const float *bias, - bool has_bias, - bool 
has_relu, - ARMContext *ctx) { -#ifdef __aarch64__ - sgemm_prepacked_8x12(is_transB, - M, - N, - K, - A_packed, - B, - ldb, - beta, - C, - ldc, - bias, - has_bias, - has_relu, - ctx); -#else // armv7 - if (ctx->arch() == kA73) { - sgemm_prepacked_4x8(is_transB, - M, - N, - K, - A_packed, - B, - ldb, - beta, - C, - ldc, - bias, - has_bias, - has_relu, - ctx); - } else { - sgemm_prepacked_6x8(is_transB, - M, - N, - K, - A_packed, - B, - ldb, - beta, - C, - ldc, - bias, - has_bias, - has_relu, - ctx); - } -#endif // arm64 -} - -#ifdef __aarch64__ -/* - * The following function prepackA_8x12 is base on - * https://github.com/ARM-software/ComputeLibrary/ - * - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -void prepackA_8x12(float *dout, - const float *inptr, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax) { - int x_len = kmax - k0; - int stride = x_len * 8; - float zerobuff[x_len]; // NOLINT - memset(zerobuff, 0, sizeof(float) * x_len); - bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; - -#pragma omp parallel for - for (int y = m0; y < mmax; y += 8) { - float *outptr = dout + stride * (y - m0) / 8; - - const float *inptr0 = inptr + y * ldin + k0; - const float *inptr1 = inptr0 + ldin; - const float *inptr2 = inptr1 + ldin; - const float *inptr3 = inptr2 + ldin; - const float *inptr4 = inptr3 + ldin; - const float *inptr5 = inptr4 + ldin; - const float *inptr6 = inptr5 + ldin; - const float *inptr7 = inptr6 + ldin; - - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n" - "prfm pldl1keep, [%[ptr0], #64] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr1], #64] \n" - "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr2], #64] \n" - "prfm pldl1keep, [%[ptr3]] \n" - "prfm pldl1keep, [%[ptr3], #64] \n" - "prfm pldl1keep, [%[ptr4]] \n" - "prfm pldl1keep, [%[ptr4], #64] \n" - "prfm pldl1keep, [%[ptr5]] \n" - "prfm pldl1keep, [%[ptr5], #64] \n" - "prfm pldl1keep, [%[ptr6]] \n" - "prfm pldl1keep, [%[ptr6], #64] \n" - "prfm pldl1keep, [%[ptr7]] \n" - "prfm pldl1keep, [%[ptr7], #64] \n" - : - : [ptr0] "r"(inptr0), - [ptr1] "r"(inptr1), - [ptr2] "r"(inptr2), - [ptr3] "r"(inptr3), - [ptr4] "r"(inptr4), - [ptr5] "r"(inptr5), - [ptr6] "r"(inptr6), - [ptr7] "r"(inptr7) - : "memory"); - - int x = x_len; - //! 
cope with row index exceed real size, set to zero buffer - if ((y + 7) >= mmax) { - switch ((y + 7) - mmax) { - case 6: - inptr1 = zerobuff; - case 5: - inptr2 = zerobuff; - case 4: - inptr3 = zerobuff; - case 3: - inptr4 = zerobuff; - case 2: - inptr5 = zerobuff; - case 1: - inptr6 = zerobuff; - case 0: - inptr7 = zerobuff; - default: - break; - } - } - for (; x > 7; x -= 8) { - asm volatile( - "cbz %w[has_alpha], 0f\n" /* check alpha == 1.f? */ - "dup v31.4s, %w[alpha]\n" /* alpha to vector */ - "ldp q0, q1, [%[inptr0]], #32\n" /* load r0, a0~a7 */ - "ldp q2, q3, [%[inptr1]], #32\n" /* load r1, b0~b7 */ - "fmul v0.4s, v31.4s, v0.4s\n" /* mul alpha */ - "fmul v1.4s, v31.4s, v1.4s\n" /* mul alpha */ - "ldp q4, q5, [%[inptr2]], #32\n" /* load r2, c0~c7 */ - "fmul v2.4s, v31.4s, v2.4s\n" /* mul alpha */ - "fmul v3.4s, v31.4s, v3.4s\n" /* mul alpha */ - "ldp q6, q7, [%[inptr3]], #32\n" /* load r3, d0~d7 */ - "fmul v4.4s, v31.4s, v4.4s\n" /* mul alpha */ - "fmul v5.4s, v31.4s, v5.4s\n" /* mul alpha */ - "ldp q8, q9, [%[inptr4]], #32\n" /* load r4, e0~e7 */ - "fmul v6.4s, v31.4s, v6.4s\n" /* mul alpha */ - "fmul v7.4s, v31.4s, v7.4s\n" /* mul alpha */ - "ldp q10, q11, [%[inptr5]], #32\n" /* load r5, f0~f7 */ - "fmul v8.4s, v31.4s, v8.4s\n" /* mul alpha */ - "fmul v9.4s, v31.4s, v9.4s\n" /* mul alpha */ - "ldp q12, q13, [%[inptr6]], #32\n" /* load r6, g0~g7 */ - "fmul v10.4s, v31.4s, v10.4s\n" /* mul alpha */ - "fmul v11.4s, v31.4s, v11.4s\n" /* mul alpha */ - "ldp q14, q15, [%[inptr7]], #32\n" /* load r7, h0~h7 */ - "fmul v12.4s, v31.4s, v12.4s\n" /* mul alpha */ - "fmul v13.4s, v31.4s, v13.4s\n" /* mul alpha */ - "fmul v14.4s, v31.4s, v14.4s\n" /* mul alpha */ - "fmul v15.4s, v31.4s, v15.4s\n" /* mul alpha */ - "b 1f\n" /* to main process */ - "0: \n" /* alpha == 1 */ - "ldp q0, q1, [%[inptr0]], #32\n" /* load r0, a0~a7 */ - "ldp q2, q3, [%[inptr1]], #32\n" /* load r1, b0~b7 */ - "ldp q4, q5, [%[inptr2]], #32\n" /* load r2, c0~c7 */ - "ldp q6, q7, [%[inptr3]], #32\n" /* load r3, d0~d7 */ - "ldp q8, q9, [%[inptr4]], #32\n" /* load r4, e0~e7 */ - "ldp q10, q11, [%[inptr5]], #32\n" /* load r5, f0~f7 */ - "ldp q12, q13, [%[inptr6]], #32\n" /* load r6, g0~g7 */ - "ldp q14, q15, [%[inptr7]], #32\n" /* load r7, h0~h7 */ - "1: \n" /* main process */ - "trn1 v16.4s, v0.4s, v2.4s\n" /* a0b0a2b2*/ - "trn2 v17.4s, v0.4s, v2.4s\n" /* a1b1a3b3*/ - "trn1 v18.4s, v1.4s, v3.4s\n" /* a4b4a6b6*/ - "trn2 v19.4s, v1.4s, v3.4s\n" /* a5b5a7b7*/ - - "trn1 v20.4s, v4.4s, v6.4s\n" /* c0d0c2d2*/ - "trn2 v21.4s, v4.4s, v6.4s\n" /* c1d1c3d3*/ - "trn1 v22.4s, v5.4s, v7.4s\n" /* c4d4c6d6*/ - "trn2 v23.4s, v5.4s, v7.4s\n" /* c5d5c7d7*/ - - "trn1 v24.4s, v8.4s, v10.4s\n" /* e0f0e2f2*/ - "trn2 v25.4s, v8.4s, v10.4s\n" /* e1f1e3f3*/ - "trn1 v26.4s, v9.4s, v11.4s\n" /* e4f4e6f6*/ - "trn2 v27.4s, v9.4s, v11.4s\n" /* e5f5e7f7*/ - - "trn1 v28.4s, v12.4s, v14.4s\n" /* g0h0g2h2*/ - "trn2 v29.4s, v12.4s, v14.4s\n" /* g1h1g3h3*/ - "trn1 v30.4s, v13.4s, v15.4s\n" /* g4h4g6h6*/ - "trn2 v31.4s, v13.4s, v15.4s\n" /* g5h5g7h7*/ - - "trn1 v0.2d, v16.2d, v20.2d\n" /* a0b0c0d0 */ - "trn1 v1.2d, v24.2d, v28.2d\n" /* e0f0g0h0 */ - "trn1 v2.2d, v17.2d, v21.2d\n" /* a1b1c1d1 */ - "trn1 v3.2d, v25.2d, v29.2d\n" /* e1b1c1d1 */ - - "trn2 v4.2d, v16.2d, v20.2d\n" /* a2b2c2d2 */ - "trn2 v5.2d, v24.2d, v28.2d\n" /* e2f2g2h2 */ - "stp q0, q1, [%[outptr]], #32\n" /* save q0, q1, a0~h0*/ - "trn2 v6.2d, v17.2d, v21.2d\n" /* a3b3c3d3 */ - "trn2 v7.2d, v25.2d, v29.2d\n" /* e3f3g3h3 */ - "stp q2, q3, [%[outptr]], #32\n" /* save q2, q3, a1~h1*/ - - "trn1 v8.2d, 
v18.2d, v22.2d\n" /* a4b4c4d4 */ - "trn1 v9.2d, v26.2d, v30.2d\n" /* e4f4g4h4 */ - "stp q4, q5, [%[outptr]], #32\n" /* save q4, q5, a2~h2*/ - "trn1 v10.2d, v19.2d, v23.2d\n" /* a5b5c5d5 */ - "trn1 v11.2d, v27.2d, v31.2d\n" /* e5f5g5h5 */ - "stp q6, q7, [%[outptr]], #32\n" /* save q6, q7, a3~h3*/ - - "trn2 v12.2d, v18.2d, v22.2d\n" /* a6b6c6d6 */ - "trn2 v13.2d, v26.2d, v30.2d\n" /* e6f6g6h6 */ - "stp q8, q9, [%[outptr]], #32\n" /* save q8, q9, a4~h4*/ - "trn2 v14.2d, v19.2d, v23.2d\n" /* a7b7c7d7 */ - "trn2 v15.2d, v27.2d, v31.2d\n" /* e7f7g7h7 */ - "stp q10, q11, [%[outptr]], #32\n" /* save q10, q11, a5~h5*/ - - "stp q12, q13, [%[outptr]], #32\n" /* save q12, q13, a6~h6*/ - "stp q14, q15, [%[outptr]], #32\n" /* save q14, q15, a7~h7*/ - : [inptr0] "+r"(inptr0), - [inptr1] "+r"(inptr1), - [inptr2] "+r"(inptr2), - [inptr3] "+r"(inptr3), - [inptr4] "+r"(inptr4), - [inptr5] "+r"(inptr5), - [inptr6] "+r"(inptr6), - [inptr7] "+r"(inptr7), - [outptr] "+r"(outptr) - : [alpha] "r"(alpha), [has_alpha] "r"(has_alpha) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31", - "cc", - "memory"); - } - - for (; x > 0; x--) { - if (has_alpha) { - *outptr++ = *inptr0++ * alpha; - *outptr++ = *inptr1++ * alpha; - *outptr++ = *inptr2++ * alpha; - *outptr++ = *inptr3++ * alpha; - *outptr++ = *inptr4++ * alpha; - *outptr++ = *inptr5++ * alpha; - *outptr++ = *inptr6++ * alpha; - *outptr++ = *inptr7++ * alpha; - } else { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } - } -} - -void prepackA_trans_8x12(float *outptr, - const float *in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax) { - auto inptr = in + k0 * ldin + m0; - uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - int x_len = mmax - m0; - int y_len = kmax - k0; - int right_remain = x_len - 8 * (x_len / 8); - int stride_out = 8 * y_len; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask1 = - vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); - uint32x4_t vmask2 = - vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); - - bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; - float32x4_t valpha = vdupq_n_f32(alpha); - -#pragma omp parallel for - for (int y = 0; y < y_len - 3; y += 4) { - const float *ptr0 = inptr + y * ldin; - const float *ptr1 = ptr0 + ldin; - const float *ptr2 = ptr1 + ldin; - const float *ptr3 = ptr2 + ldin; - - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n" - "prfm pldl1keep, [%[ptr0], #64] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr1], #64] \n" - "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr2], #64] \n" - "prfm pldl1keep, [%[ptr3]] \n" - "prfm pldl1keep, [%[ptr3], #64] \n" - : - : [ptr0] "r"(ptr0), [ptr1] "r"(ptr1), [ptr2] "r"(ptr2), [ptr3] "r"(ptr3) - : "memory"); - - float *outptr_row_col = outptr + y * 8; - int i = 0; - for (; i < x_len - 7; i += 8) { - float32x4_t vr00 = vld1q_f32(ptr0); - float32x4_t vr01 = vld1q_f32(ptr0 + 4); - float32x4_t vr10 = vld1q_f32(ptr1); - float32x4_t vr11 = vld1q_f32(ptr1 + 4); - float32x4_t vr20 = vld1q_f32(ptr2); - float32x4_t vr21 = vld1q_f32(ptr2 + 4); - float32x4_t vr30 = vld1q_f32(ptr3); - float32x4_t vr31 = vld1q_f32(ptr3 + 4); - if 
(has_alpha) { - vr00 = vmulq_f32(vr00, valpha); - vr01 = vmulq_f32(vr01, valpha); - vr10 = vmulq_f32(vr10, valpha); - vr11 = vmulq_f32(vr11, valpha); - vr20 = vmulq_f32(vr20, valpha); - vr21 = vmulq_f32(vr21, valpha); - vr30 = vmulq_f32(vr30, valpha); - vr31 = vmulq_f32(vr31, valpha); - } - - vst1q_f32(outptr_row_col, vr00); - vst1q_f32(outptr_row_col + 4, vr01); - vst1q_f32(outptr_row_col + 8, vr10); - vst1q_f32(outptr_row_col + 12, vr11); - vst1q_f32(outptr_row_col + 16, vr20); - vst1q_f32(outptr_row_col + 20, vr21); - vst1q_f32(outptr_row_col + 24, vr30); - vst1q_f32(outptr_row_col + 28, vr31); - - ptr0 += 8; - ptr1 += 8; - ptr2 += 8; - ptr3 += 8; - - outptr_row_col += stride_out; - } - if (right_remain > 0) { - float32x4_t vr00 = vld1q_f32(ptr0); - float32x4_t vr01 = vld1q_f32(ptr0 + 4); - float32x4_t vr10 = vld1q_f32(ptr1); - float32x4_t vr11 = vld1q_f32(ptr1 + 4); - float32x4_t vr20 = vld1q_f32(ptr2); - float32x4_t vr21 = vld1q_f32(ptr2 + 4); - float32x4_t vr30 = vld1q_f32(ptr3); - float32x4_t vr31 = vld1q_f32(ptr3 + 4); - - if (has_alpha) { - vr00 = vmulq_f32(vr00, valpha); - vr01 = vmulq_f32(vr01, valpha); - vr10 = vmulq_f32(vr10, valpha); - vr11 = vmulq_f32(vr11, valpha); - vr20 = vmulq_f32(vr20, valpha); - vr21 = vmulq_f32(vr21, valpha); - vr30 = vmulq_f32(vr30, valpha); - vr31 = vmulq_f32(vr31, valpha); - } - - float32x4_t vr00_1 = vbslq_f32(vmask1, vr00, vzero); - float32x4_t vr01_1 = vbslq_f32(vmask2, vr01, vzero); - float32x4_t vr10_1 = vbslq_f32(vmask1, vr10, vzero); - float32x4_t vr11_1 = vbslq_f32(vmask2, vr11, vzero); - float32x4_t vr20_1 = vbslq_f32(vmask1, vr20, vzero); - float32x4_t vr21_1 = vbslq_f32(vmask2, vr21, vzero); - float32x4_t vr30_1 = vbslq_f32(vmask1, vr30, vzero); - float32x4_t vr31_1 = vbslq_f32(vmask2, vr31, vzero); - - vst1q_f32(outptr_row_col, vr00_1); - vst1q_f32(outptr_row_col + 4, vr01_1); - vst1q_f32(outptr_row_col + 8, vr10_1); - vst1q_f32(outptr_row_col + 12, vr11_1); - vst1q_f32(outptr_row_col + 16, vr20_1); - vst1q_f32(outptr_row_col + 20, vr21_1); - vst1q_f32(outptr_row_col + 24, vr30_1); - vst1q_f32(outptr_row_col + 28, vr31_1); - } - } - -#pragma omp parallel for - for (int y = 4 * (y_len / 4); y < y_len; ++y) { - const float *ptr0 = inptr + y * ldin; - float *outptr_row_col = outptr + y * 8; - int i = 0; - for (; i < x_len - 7; i += 8) { - float32x4_t vr0 = vld1q_f32(ptr0); - float32x4_t vr1 = vld1q_f32(ptr0 + 4); - if (has_alpha) { - vr0 = vmulq_f32(vr0, valpha); - vr1 = vmulq_f32(vr1, valpha); - } - vst1q_f32(outptr_row_col, vr0); - vst1q_f32(outptr_row_col + 4, vr1); - - ptr0 += 8; - - outptr_row_col += stride_out; - } - if (right_remain > 0) { - float32x4_t vr0 = vld1q_f32(ptr0); - float32x4_t vr1 = vld1q_f32(ptr0 + 4); - - if (has_alpha) { - vr0 = vmulq_f32(vr0, valpha); - vr1 = vmulq_f32(vr1, valpha); - } - - float32x4_t vr0_1 = vbslq_f32(vmask1, vr0, vzero); - float32x4_t vr1_1 = vbslq_f32(vmask2, vr1, vzero); - - vst1q_f32(outptr_row_col, vr0_1); - vst1q_f32(outptr_row_col + 4, vr1_1); - } - } -} - -#else // __aarch64__ -void prepackA_6x8(float* outptr, - const float* inptr, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax) { - int x_len = kmax - k0; - float zerobuff[x_len]; // NOLINT - memset(zerobuff, 0, sizeof(float) * x_len); - - bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; - float32x4_t valpha = vdupq_n_f32(alpha); - - for (int y = m0; y < mmax; y += 6) { - const float* inptr0 = inptr + y * ldin + k0; - const float* inptr1 = inptr0 + ldin; - const float* inptr2 = inptr1 + ldin; - const float* inptr3 = 
inptr2 + ldin; - const float* inptr4 = inptr3 + ldin; - const float* inptr5 = inptr4 + ldin; - - int x = x_len; - if ((y + 5) >= mmax) { - switch ((y + 5) - mmax) { - case 4: - inptr1 = zerobuff; - case 3: - inptr2 = zerobuff; - case 2: - inptr3 = zerobuff; - case 1: - inptr4 = zerobuff; - case 0: - inptr5 = zerobuff; - default: - break; - } - } - - for (; x > 7; x -= 8) { - asm volatile( - "vld4.32 {d0-d3}, [%[inptr0]]! @ zip load r0, " - "q0,q1=r00,r04,r01,r05,r02,r06,r03,r07\n" - "vld4.32 {d4-d7}, [%[inptr1]]! @ zip load r1, " - "q2,q3=r10,r14,r11,r15,r12,r16,r13,r17\n" - "vld4.32 {d8-d11}, [%[inptr2]]! @ zip load r2, " - "q4,q5=r20,r24,r21,r25,r22,r26,r23,r27\n" - "vld4.32 {d12-d15}, [%[inptr3]]! @ zip load r3, " - "q6,q7=r30,r34,r31,r35,r32,r36,r33,r37\n" - "vld4.32 {d16-d19}, [%[inptr4]]! @ zip load r4, " - "q8,q9=r40,r44,r41,r45,r42,r46,r43,r47\n" - "vld4.32 {d20-d23}, [%[inptr5]]! @ zip load r5, " - "q10,q11=r50,r54,r51,r55,r52,r56,r53,r57\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? */ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q1, q1, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q2, q2, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q3, q3, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q4, q4, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q5, q5, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q6, q6, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q7, q7, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q8, q8, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q9, q9, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q10, q10, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q11, q11, %q[alpha]\n" /* mul alpha */ - "0: \n" - "vtrn.32 q0, q2 @ trans data: q0=r00,r10,r01,r11; " - "q2=r04,r14,r05,r15\n" - "vtrn.32 q4, q6 @ trans data: q4=r20,r30,r21,r31; " - "q6=r24,r34,r25,r35\n" - "vtrn.32 q8, q10 @ trans data: q8=r40,r50,r41,r51; " - "q10=r44,r54,r45,r55\n" - - "vswp d1, d8 @ swap d1, d8, q0=r00,r10,r20,r30; " - "q4=r01,r11,r21,r31\n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write q0:r00,r10,r20,r30\n" - "vst1.32 {d16}, [%[outptr]]! @ write d16(q8,low),r40,r50\n" - "vst1.32 {d8-d9}, [%[outptr]]! @ write q4:r01,r11,r21,r31\n" - "vst1.32 {d17}, [%[outptr]]! @ write d16(q8,high),r41,r51\n" - - "vtrn.32 q1, q3 @ trans data: q1=r02,r12,r03,r13; " - "q3=r06,r16,r07,r17\n" - "vtrn.32 q5, q7 @ trans data: q5=r22,r32,r23,r33; " - "q7=r26,r36,r27,r37\n" - "vtrn.32 q9, q11 @ trans data: q9=r42,r52,r43,r53; " - "q11=r46,r56,r47,r57\n" - - "vswp d3, d10 @ swap d3, d10, " - "q1=r02,r12,r22,r32; q5=r03,r13,r23,r33\n" - "vst1.32 {d2-d3}, [%[outptr]]! @ write q1:r02,r12,r22,r32\n" - "vst1.32 {d18}, [%[outptr]]! @ write d18(q9,low),r42,r52\n" - "vst1.32 {d10-d11},[%[outptr]]! @ write q5:r03,r13,r23,r33\n" - "vst1.32 {d19}, [%[outptr]]! @ write d19(q9,high),r43,r53\n" - - "vswp d5, d12 @ swap d5, d12,q2=r04,r14,r24,r34; " - "q6=r05,r15,r25,r35\n" - "vst1.32 {d4-d5}, [%[outptr]]! @ write q2:r04,r14,r24,r34\n" - "vst1.32 {d20}, [%[outptr]]! @ write d20(q10,low),r44,r54\n" - "vst1.32 {d12-d13},[%[outptr]]! @ write q6:r05,r15,r25,r35\n" - "vst1.32 {d21}, [%[outptr]]! @ write d21(q10,high),r45,r55\n" - - "vswp d7, d14 @ swap d7, d14, " - "q3=r06,r16,r26,r36; q7=r07,r17,r27,r37\n" - "vst1.32 {d6-d7}, [%[outptr]]! @ write q3:r06,r16,r26,r36\n" - "vst1.32 {d22}, [%[outptr]]! @ write d22(q11,low),r46,r56\n" - "vst1.32 {d14-d15},[%[outptr]]! @ write q7:r07,r17,r27,r37\n" - "vst1.32 {d23}, [%[outptr]]! 
@ write d23(q11,high),r47,r57\n" - : [inptr0] "+r"(inptr0), - [inptr1] "+r"(inptr1), - [inptr2] "+r"(inptr2), - [inptr3] "+r"(inptr3), - [inptr4] "+r"(inptr4), - [inptr5] "+r"(inptr5), - [outptr] "+r"(outptr) - : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q15", - "cc", - "memory"); - } - - for (; x > 0; x--) { - if (has_alpha) { - *outptr++ = *inptr0++ * alpha; - *outptr++ = *inptr1++ * alpha; - *outptr++ = *inptr2++ * alpha; - *outptr++ = *inptr3++ * alpha; - *outptr++ = *inptr4++ * alpha; - *outptr++ = *inptr5++ * alpha; - } else { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - } - } - } -} - -void prepackA_trans_6x8(float* outptr, - const float* in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax) { - auto inptr = in + k0 * ldin + m0; - - bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; - float32x4_t valpha = vdupq_n_f32(alpha); - - uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - int x_len = mmax - m0; - int y_len = kmax - k0; - int right_remain = x_len - 6 * (x_len / 6); - int right_pad = 6 - right_remain; - if (right_remain == 0) { - right_pad = 0; - } - - float* outptr_row = outptr; - int stride_out = 6 * y_len; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask1 = - vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); - uint32x4_t vmask2 = - vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); - -#pragma omp parallel for - for (int y = 0; y < y_len - 3; y += 4) { - const float* ptr0 = inptr + y * ldin; - const float* ptr1 = ptr0 + ldin; - const float* ptr2 = ptr1 + ldin; - const float* ptr3 = ptr2 + ldin; - - float* outptr_row_col = outptr_row + y * 6; - int i = 0; - for (; i < x_len - 5; i += 6) { - float* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" - "vld1.32 {d4-d6}, [%[ptr1]]! @ load r1, 6 elements\n" - "vld1.32 {d8-d10}, [%[ptr2]]! @ load r2, 6 elements\n" - "vld1.32 {d12-d14}, [%[ptr3]]! @ load r3, 6 elements\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? */ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d2, d2, %e[alpha]\n" /* mul alpha */ - "vmul.f32 q2, q2, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d6, d6, %e[alpha]\n" /* mul alpha */ - "vmul.f32 q4, q4, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d10, d10, %e[alpha]\n" /* mul alpha */ - "vmul.f32 q6, q6, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d14, d14, %e[alpha]\n" /* mul alpha */ - "0: \n" - "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d6}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d8-d10}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d12-d14}, [%[outptr]]! @ write to output ptr\n" - : [outptr] "+r"(ptr_out), - [ptr0] "+r"(ptr0), - [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3) - : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_pad > 0) { - float* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" - "vld1.32 {d4-d6}, [%[ptr1]]! @ load r1, 6 elements\n" - "vld1.32 {d8-d10}, [%[ptr2]]! @ load r2, 8 elements\n" - "vld1.32 {d12-d14}, [%[ptr3]]! @ load r3, 8 elements\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? 
*/ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d2, d2, %e[alpha]\n" /* mul alpha */ - "vmul.f32 q2, q2, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d6, d6, %e[alpha]\n" /* mul alpha */ - "vmul.f32 q4, q4, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d10, d10, %e[alpha]\n" /* mul alpha */ - "vmul.f32 q6, q6, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d14, d14, %e[alpha]\n" /* mul alpha */ - "0: \n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif d2, %e[vzero], %e[vmask2] @ bit select, pad zero\n" - "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif d6, %e[vzero], %e[vmask2] @ bit select, pad zero\n" - "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d6}, [%[outptr]]! @ write to output ptr\n" - "vbif q4, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif d10, %e[vzero], %e[vmask2] @ bit select, pad zero\n" - "vbif q6, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif d14, %e[vzero], %e[vmask2] @ bit select, pad zero\n" - "vst1.32 {d8-d10}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d12-d14}, [%[outptr]]! @ write to output ptr\n" - : [outptr] "+r"(ptr_out), - [ptr0] "+r"(ptr0), - [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3) - : [vmask1] "w"(vmask1), - [vmask2] "w"(vmask2), - [vzero] "w"(vzero), - [has_alpha] "r"(has_alpha), - [alpha] "w"(valpha) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "cc", "memory"); - } - } - -#pragma omp parallel for - for (int y = 4 * (y_len / 4); y < y_len; ++y) { - const float* ptr0 = inptr + y * ldin; - float* outptr_row_col = outptr_row + y * 6; - int i = 0; - for (; i < x_len - 5; i += 6) { - float* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? */ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d2, d2, %e[alpha]\n" /* mul alpha */ - "0: \n" - "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) - : "q0", "q1", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_pad > 0) { - float* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? */ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d2, d2, %e[alpha]\n" /* mul alpha */ - "0: \n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif d2, %e[vzero], %e[vmask2] @ bit select, pad zero\n" - "vst1.32 {d0-d2}, [%[outptr]]! 
@ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : [vmask1] "w"(vmask1), - [vmask2] "w"(vmask2), - [vzero] "w"(vzero), - [has_alpha] "r"(has_alpha), - [alpha] "w"(valpha) - : "q0", "q1", "cc", "memory"); - } - } -} - -void prepackA_4x8(float* outptr, - const float* inptr, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax) { - int x_len = kmax - k0; - float zerobuff[x_len]; // NOLINT - memset(zerobuff, 0, sizeof(float) * x_len); - - bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; - float32x4_t valpha = vdupq_n_f32(alpha); - - for (int y = m0; y < mmax; y += 4) { - const float* inptr0 = inptr + y * ldin + k0; - const float* inptr1 = inptr0 + ldin; - const float* inptr2 = inptr1 + ldin; - const float* inptr3 = inptr2 + ldin; - - int x = x_len; - if ((y + 3) >= mmax) { - switch ((y + 3) - mmax) { - case 2: - inptr1 = zerobuff; - case 1: - inptr2 = zerobuff; - case 0: - inptr3 = zerobuff; - default: - break; - } - } - - for (; x > 7; x -= 8) { - asm volatile( - "vld4.32 {d0-d3}, [%[inptr0]]! @ zip load r0, " - "q0,q1=r00,r04,r01,r05,r02,r06,r03,r07\n" - "vld4.32 {d4-d7}, [%[inptr1]]! @ zip load r1, " - "q2,q3=r10,r14,r11,r15,r12,r16,r13,r17\n" - "vld4.32 {d8-d11}, [%[inptr2]]! @ zip load r2, " - "q4,q5=r20,r24,r21,r25,r22,r26,r23,r27\n" - "vld4.32 {d12-d15}, [%[inptr3]]! @ zip load r3, " - "q6,q7=r30,r34,r31,r35,r32,r36,r33,r37\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? */ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q1, q1, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q2, q2, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q3, q3, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q4, q4, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q5, q5, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q6, q6, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q7, q7, %q[alpha]\n" /* mul alpha */ - "0: \n" - "vtrn.32 q0, q2 @ trans data: q0=r00,r10,r01,r11; " - "q2=r04,r14,r05,r15\n" - "vtrn.32 q4, q6 @ trans data: q4=r20,r30,r21,r31; " - "q6=r24,r34,r25,r35\n" - - "vswp d1, d8 @ swap d1, d8, q0=r00,r10,r20,r30; " - "q4=r01,r11,r21,r31\n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write q0:r00,r10,r20,r30\n" - "vst1.32 {d8-d9}, [%[outptr]]! @ write q4:r01,r11,r21,r31\n" - - "vtrn.32 q1, q3 @ trans data: q1=r02,r12,r03,r13; " - "q3=r06,r16,r07,r17\n" - "vtrn.32 q5, q7 @ trans data: q5=r22,r32,r23,r33; " - "q7=r26,r36,r27,r37\n" - - "vswp d3, d10 @ swap d3, d10, " - "q1=r02,r12,r22,r32; q5=r03,r13,r23,r33\n" - "vst1.32 {d2-d3}, [%[outptr]]! @ write q1:r02,r12,r22,r32\n" - "vst1.32 {d10-d11},[%[outptr]]! @ write q5:r03,r13,r23,r33\n" - - "vswp d5, d12 @ swap d5, d12,q2=r04,r14,r24,r34; " - "q6=r05,r15,r25,r35\n" - "vst1.32 {d4-d5}, [%[outptr]]! @ write q2:r04,r14,r24,r34\n" - "vst1.32 {d12-d13},[%[outptr]]! @ write q6:r05,r15,r25,r35\n" - - "vswp d7, d14 @ swap d7, d14, " - "q3=r06,r16,r26,r36; q7=r07,r17,r27,r37\n" - "vst1.32 {d6-d7}, [%[outptr]]! @ write q3:r06,r16,r26,r36\n" - "vst1.32 {d14-d15},[%[outptr]]! 
@ write q7:r07,r17,r27,r37\n" - : [inptr0] "+r"(inptr0), - [inptr1] "+r"(inptr1), - [inptr2] "+r"(inptr2), - [inptr3] "+r"(inptr3), - [outptr] "+r"(outptr) - : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "cc", "memory"); - } - - for (; x > 0; x--) { - if (has_alpha) { - *outptr++ = *inptr0++ * alpha; - *outptr++ = *inptr1++ * alpha; - *outptr++ = *inptr2++ * alpha; - *outptr++ = *inptr3++ * alpha; - } else { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - } - } - } -} - -void prepackA_trans_4x8(float* outptr, - const float* in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax) { - auto inptr = in + k0 * ldin + m0; - bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; - float32x4_t valpha = vdupq_n_f32(alpha); - - uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - int x_len = mmax - m0; - int y_len = kmax - k0; - int right_remain = x_len - 4 * (x_len / 4); - int right_pad = 4 - right_remain; - if (right_remain == 0) { - right_pad = 0; - } - - int stride_out = 4 * y_len; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask1 = - vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); - -#pragma omp parallel for - for (int y = 0; y < y_len - 3; y += 4) { - const float* ptr0 = inptr + y * ldin; - const float* ptr1 = ptr0 + ldin; - const float* ptr2 = ptr1 + ldin; - const float* ptr3 = ptr2 + ldin; - - float* outptr_row_col = outptr + y * 4; - int i = 0; - for (; i < x_len - 3; i += 4) { - float* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d1}, [%[ptr0]]! @ load r0, 4 elements\n" - "vld1.32 {d2-d3}, [%[ptr1]]! @ load r1, 4 elements\n" - "vld1.32 {d4-d5}, [%[ptr2]]! @ load r2, 4 elements\n" - "vld1.32 {d6-d7}, [%[ptr3]]! @ load r3, 4 elements\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? */ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q1, q1, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q2, q2, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q3, q3, %q[alpha]\n" /* mul alpha */ - "0: \n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d2-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d5}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d6-d7}, [%[outptr]]! @ write to output ptr\n" - : [outptr] "+r"(ptr_out), - [ptr0] "+r"(ptr0), - [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3) - : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) - : "q0", "q1", "q2", "q3", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_pad > 0) { - float* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d1}, [%[ptr0]]! @ load r0, 4 elements\n" - "vld1.32 {d2-d3}, [%[ptr1]]! @ load r1, 4 elements\n" - "vld1.32 {d4-d5}, [%[ptr2]]! @ load r2, 4 elements\n" - "vld1.32 {d6-d7}, [%[ptr3]]! @ load r3, 4 elements\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? */ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q1, q1, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q2, q2, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q3, q3, %q[alpha]\n" /* mul alpha */ - "0: \n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q1, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q3, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d2-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d5}, [%[outptr]]! 
@ write to output ptr\n" - "vst1.32 {d6-d7}, [%[outptr]]! @ write to output ptr\n" - : [outptr] "+r"(ptr_out), - [ptr0] "+r"(ptr0), - [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3) - : [vmask1] "w"(vmask1), - [vzero] "w"(vzero), - [has_alpha] "r"(has_alpha), - [alpha] "w"(valpha) - : "q0", "q1", "q2", "q3", "cc", "memory"); - } - } - -#pragma omp parallel for - for (int y = 4 * (y_len / 4); y < y_len; ++y) { - const float* ptr0 = inptr + y * ldin; - float* outptr_row_col = outptr + y * 4; - int i = 0; - for (; i < x_len - 3; i += 4) { - float* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d1}, [%[ptr0]]! @ load r0, 4 elements\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? */ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "0: \n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) - : "q0", "q1", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_pad > 0) { - float* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d1}, [%[ptr0]]! @ load r0, 4 elements\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? */ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "0: \n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : [vmask1] "w"(vmask1), - [vzero] "w"(vzero), - [has_alpha] "r"(has_alpha), - [alpha] "w"(valpha) - : "q0", "q1", "cc", "memory"); - } - } -} - -#endif // __aarch64__ - -/** -* \brief input data is transpose -* for arm-v7a, transform data to block x k x 8 layout -* for arm-v8a, transform data to block x k x 12 layout -*/ -#ifdef __aarch64__ -void loadb( - float *out, const float *in, int ldin, int k0, int kmax, int n0, int nmax) { - auto outptr = reinterpret_cast(out); - auto inptr = reinterpret_cast(in) + k0 * ldin + n0; - uint32_t mask_buffer[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - int x_len = nmax - n0; - int y_len = kmax - k0; - int right_remain = x_len - 12 * (x_len / 12); - int right_pad = 12 - right_remain; - - uint32_t *outptr_row = outptr; - int stride_out = 12 * y_len; - - uint32x4_t vzero = vdupq_n_u32(0); - uint32x4_t vmask1 = - vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); - uint32x4_t vmask2 = - vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); - uint32x4_t vmask3 = - vcltq_u32(vld1q_u32(mask_buffer + 8), vdupq_n_u32(right_remain)); - -#pragma omp parallel for - for (int y = 0; y < y_len - 3; y += 4) { - const uint32_t *ptr0 = inptr + y * ldin; - const uint32_t *ptr1 = ptr0 + ldin; - const uint32_t *ptr2 = ptr1 + ldin; - const uint32_t *ptr3 = ptr2 + ldin; - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n" - "prfm pldl1keep, [%[ptr0], #64] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr1], #64] \n" - "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr2], #64] \n" - "prfm pldl1keep, [%[ptr3]] \n" - "prfm pldl1keep, [%[ptr3], #64] \n" - : - : [ptr0] "r"(ptr0), [ptr1] "r"(ptr1), [ptr2] "r"(ptr2), [ptr3] "r"(ptr3) - : "memory"); - - uint32_t *outptr_row_col = outptr_row + y * 12; - - int i = 0; - for (; i < x_len - 11; i += 12) { - uint32x4_t vr00 = vld1q_u32(ptr0); - uint32x4_t vr01 = vld1q_u32(ptr0 + 4); - uint32x4_t vr02 = vld1q_u32(ptr0 + 8); - - uint32x4_t vr10 = vld1q_u32(ptr1); - uint32x4_t vr11 = vld1q_u32(ptr1 + 4); - uint32x4_t vr12 = vld1q_u32(ptr1 + 8); - - vst1q_u32(outptr_row_col, vr00); - 
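// Note the scheduling in this loop: the loads for each following row
// (ptr1..ptr3) are issued in between the stores of the rows already held in
// registers, so an out-of-order core can overlap load and store latency.
// Each iteration copies a 4-row x 12-column tile of B into the packed
// block x k x 12 layout that the aarch64 sgemm kernel expects.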
vst1q_u32(outptr_row_col + 4, vr01); - vst1q_u32(outptr_row_col + 8, vr02); - - uint32x4_t vr20 = vld1q_u32(ptr2); - uint32x4_t vr21 = vld1q_u32(ptr2 + 4); - uint32x4_t vr22 = vld1q_u32(ptr2 + 8); - - vst1q_u32(outptr_row_col + 12, vr10); - vst1q_u32(outptr_row_col + 16, vr11); - vst1q_u32(outptr_row_col + 20, vr12); - - uint32x4_t vr30 = vld1q_u32(ptr3); - uint32x4_t vr31 = vld1q_u32(ptr3 + 4); - uint32x4_t vr32 = vld1q_u32(ptr3 + 8); - - vst1q_u32(outptr_row_col + 24, vr20); - vst1q_u32(outptr_row_col + 28, vr21); - vst1q_u32(outptr_row_col + 32, vr22); - - vst1q_u32(outptr_row_col + 36, vr30); - vst1q_u32(outptr_row_col + 40, vr31); - vst1q_u32(outptr_row_col + 44, vr32); - - ptr0 += 12; - ptr1 += 12; - ptr2 += 12; - ptr3 += 12; - - outptr_row_col += stride_out; - } - if (right_remain > 0) { - uint32x4_t vr00 = vld1q_u32(ptr0); - uint32x4_t vr01 = vld1q_u32(ptr0 + 4); - uint32x4_t vr02 = vld1q_u32(ptr0 + 8); - - uint32x4_t vr10 = vld1q_u32(ptr1); - uint32x4_t vr11 = vld1q_u32(ptr1 + 4); - uint32x4_t vr12 = vld1q_u32(ptr1 + 8); - - uint32x4_t vr00_1 = vbslq_u32(vmask1, vr00, vzero); - uint32x4_t vr01_1 = vbslq_u32(vmask2, vr01, vzero); - uint32x4_t vr02_1 = vbslq_u32(vmask3, vr02, vzero); - - uint32x4_t vr20 = vld1q_u32(ptr2); - uint32x4_t vr21 = vld1q_u32(ptr2 + 4); - uint32x4_t vr22 = vld1q_u32(ptr2 + 8); - - vst1q_u32(outptr_row_col, vr00_1); - vst1q_u32(outptr_row_col + 4, vr01_1); - vst1q_u32(outptr_row_col + 8, vr02_1); - - uint32x4_t vr10_1 = vbslq_u32(vmask1, vr10, vzero); - uint32x4_t vr11_1 = vbslq_u32(vmask2, vr11, vzero); - uint32x4_t vr12_1 = vbslq_u32(vmask3, vr12, vzero); - - uint32x4_t vr30 = vld1q_u32(ptr3); - uint32x4_t vr31 = vld1q_u32(ptr3 + 4); - uint32x4_t vr32 = vld1q_u32(ptr3 + 8); - - vst1q_u32(outptr_row_col + 12, vr10_1); - vst1q_u32(outptr_row_col + 16, vr11_1); - vst1q_u32(outptr_row_col + 20, vr12_1); - - uint32x4_t vr20_1 = vbslq_u32(vmask1, vr20, vzero); - uint32x4_t vr21_1 = vbslq_u32(vmask2, vr21, vzero); - uint32x4_t vr22_1 = vbslq_u32(vmask3, vr22, vzero); - - uint32x4_t vr30_1 = vbslq_u32(vmask1, vr30, vzero); - uint32x4_t vr31_1 = vbslq_u32(vmask2, vr31, vzero); - uint32x4_t vr32_1 = vbslq_u32(vmask3, vr32, vzero); - - vst1q_u32(outptr_row_col + 24, vr20_1); - vst1q_u32(outptr_row_col + 28, vr21_1); - vst1q_u32(outptr_row_col + 32, vr22_1); - - vst1q_u32(outptr_row_col + 36, vr30_1); - vst1q_u32(outptr_row_col + 40, vr31_1); - vst1q_u32(outptr_row_col + 44, vr32_1); - } - } - -#pragma omp parallel for - for (int y = 4 * (y_len / 4); y < y_len; ++y) { - const uint32_t *ptr0 = inptr + y * ldin; - uint32_t *outptr_row_col = outptr_row + y * 12; - - int i = 0; - for (; i < x_len - 11; i += 12) { - uint32x4_t vr0 = vld1q_u32(ptr0); - uint32x4_t vr1 = vld1q_u32(ptr0 + 4); - uint32x4_t vr2 = vld1q_u32(ptr0 + 8); - vst1q_u32(outptr_row_col, vr0); - vst1q_u32(outptr_row_col + 4, vr1); - vst1q_u32(outptr_row_col + 8, vr2); - - ptr0 += 12; - - outptr_row_col += stride_out; - } - if (right_remain > 0) { - uint32x4_t vr0 = vld1q_u32(ptr0); - uint32x4_t vr1 = vld1q_u32(ptr0 + 4); - uint32x4_t vr2 = vld1q_u32(ptr0 + 8); - - uint32x4_t vr0_1 = vbslq_u32(vmask1, vr0, vzero); - uint32x4_t vr1_1 = vbslq_u32(vmask2, vr1, vzero); - uint32x4_t vr2_1 = vbslq_u32(vmask3, vr2, vzero); - - vst1q_u32(outptr_row_col, vr0_1); - vst1q_u32(outptr_row_col + 4, vr1_1); - vst1q_u32(outptr_row_col + 8, vr2_1); - } - } -} - -void loadb_trans( - float *out, const float *in, int ldin, int k0, int kmax, int n0, int nmax) { - int x_len = kmax - k0; - uint32_t zerobuff[x_len]; // NOLINT - 
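// zerobuff is a zero-filled stand-in row: when the last 12-column block of B
// runs past nmax, the fall-through switch below redirects the out-of-range
// row pointers to this buffer, so the transpose kernel can always read from
// twelve row pointers without per-element bounds checks.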
memset(zerobuff, 0, sizeof(uint32_t) * x_len); - auto outptr = reinterpret_cast(out); - auto inptr = reinterpret_cast(in); - - //! data B is not transposed, transpose B to k * 12 - for (int y = n0; y < nmax; y += 12) { - const uint32_t *inptr0 = inptr + y * ldin + k0; - const uint32_t *inptr1 = inptr0 + ldin; - const uint32_t *inptr2 = inptr1 + ldin; - const uint32_t *inptr3 = inptr2 + ldin; - const uint32_t *inptr4 = inptr3 + ldin; - const uint32_t *inptr5 = inptr4 + ldin; - const uint32_t *inptr6 = inptr5 + ldin; - const uint32_t *inptr7 = inptr6 + ldin; - const uint32_t *inptr8 = inptr7 + ldin; - const uint32_t *inptr9 = inptr8 + ldin; - const uint32_t *inptr10 = inptr9 + ldin; - const uint32_t *inptr11 = inptr10 + ldin; - - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n" - "prfm pldl1keep, [%[ptr0], #64] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr1], #64] \n" - "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr2], #64] \n" - "prfm pldl1keep, [%[ptr3]] \n" - "prfm pldl1keep, [%[ptr3], #64] \n" - "prfm pldl1keep, [%[ptr4]] \n" - "prfm pldl1keep, [%[ptr4], #64] \n" - "prfm pldl1keep, [%[ptr5]] \n" - "prfm pldl1keep, [%[ptr5], #64] \n" - "prfm pldl1keep, [%[ptr6]] \n" - "prfm pldl1keep, [%[ptr6], #64] \n" - "prfm pldl1keep, [%[ptr7]] \n" - "prfm pldl1keep, [%[ptr7], #64] \n" - "prfm pldl1keep, [%[ptr8]] \n" - "prfm pldl1keep, [%[ptr8], #64] \n" - "prfm pldl1keep, [%[ptr9]] \n" - "prfm pldl1keep, [%[ptr9], #64] \n" - "prfm pldl1keep, [%[ptr10]] \n" - "prfm pldl1keep, [%[ptr10], #64] \n" - "prfm pldl1keep, [%[ptr11]] \n" - "prfm pldl1keep, [%[ptr11], #64] \n" - : - : [ptr0] "r"(inptr0), - [ptr1] "r"(inptr1), - [ptr2] "r"(inptr2), - [ptr3] "r"(inptr3), - [ptr4] "r"(inptr4), - [ptr5] "r"(inptr5), - [ptr6] "r"(inptr6), - [ptr7] "r"(inptr7), - [ptr8] "r"(inptr8), - [ptr9] "r"(inptr9), - [ptr10] "r"(inptr10), - [ptr11] "r"(inptr11) - : "memory"); - - int x = x_len; - - //! 
cope with row index exceed real size, set to zero buffer - if ((y + 11) >= nmax) { - switch ((y + 11) - nmax) { - case 10: - inptr1 = zerobuff; - case 9: - inptr2 = zerobuff; - case 8: - inptr3 = zerobuff; - case 7: - inptr4 = zerobuff; - case 6: - inptr5 = zerobuff; - case 5: - inptr6 = zerobuff; - case 4: - inptr7 = zerobuff; - case 3: - inptr8 = zerobuff; - case 2: - inptr9 = zerobuff; - case 1: - inptr10 = zerobuff; - case 0: - inptr11 = zerobuff; - default: - break; - } - } - for (; x > 7; x -= 8) { - asm volatile( - "ldp q0, q1, [%[inptr0]], #32\n" /* r0, a0~a7 */ - "ldp q2, q3, [%[inptr1]], #32\n" /* r1, b0~b7 */ - "ldp q4, q5, [%[inptr2]], #32\n" /* r2, c0~c7 */ - "ldp q6, q7, [%[inptr3]], #32\n" /* r3, d0~d7 */ - - "zip1 v16.4s, v0.4s, v4.4s\n" /* a0c0a1c1 */ - "zip1 v17.4s, v2.4s, v6.4s\n" /* b0d0b1d1 */ - "prfm pldl1keep, [%[inptr0], #128] \n" - - "ldp q8, q9, [%[inptr4]], #32\n" /* r4, e0~e7 */ - "ldp q10, q11, [%[inptr5]], #32\n" /* r5, f0~f7 */ - "ldp q12, q13, [%[inptr6]], #32\n" /* r6, g0~g7 */ - "ldp q14, q15, [%[inptr7]], #32\n" /* r7, h0~h7 */ - - "zip1 v18.4s, v8.4s, v12.4s\n" /* e0g0e1g1 */ - "zip1 v19.4s, v10.4s, v14.4s\n" /* f0h0f1h1 */ - "prfm pldl1keep, [%[inptr1], #128]\n" - "zip1 v20.4s, v16.4s, v17.4s\n" /* a0b0c0d0 */ - "zip1 v21.4s, v18.4s, v19.4s\n" /* e0f0g0h0 */ - "prfm pldl1keep, [%[inptr2], #128]\n" - "zip2 v22.4s, v16.4s, v17.4s\n" /* a1b1c1d1 */ - "zip2 v23.4s, v18.4s, v19.4s\n" /* e1f1g1h1 */ - - "ldp q24, q25, [%[inptr8]], #32\n" /* r8, i0~i7 */ - "ldp q26, q27, [%[inptr9]], #32\n" /* r9, j0~j7 */ - "ldp q28, q29, [%[inptr10]], #32\n" /* r10, k0~k7 */ - "ldp q30, q31, [%[inptr11]], #32\n" /* r11, l0~l7 */ - - "stp q20, q21, [%[outptr]], #32\n" /* save a0~h0 */ - "prfm pldl1keep, [%[inptr3], #128]\n" - - "zip1 v16.4s, v24.4s, v28.4s\n" /* i0k0i1k1 */ - "zip1 v17.4s, v26.4s, v30.4s\n" /* j0l0j1l1 */ - "prfm pldl1keep, [%[inptr4], #128]\n" - "zip1 v18.4s, v16.4s, v17.4s\n" /* i0j0k0l0 */ - "zip2 v19.4s, v16.4s, v17.4s\n" /* i1j1k1l1 */ - "prfm pldl1keep, [%[inptr5], #128]\n" - "zip2 v16.4s, v0.4s, v4.4s\n" /* a2c2a3c3 */ - "zip2 v17.4s, v2.4s, v6.4s\n" /* b2d2b3d3 */ - - "str q18, [%[outptr]], #16\n" /* save j0~l0 */ - "stp q22, q23, [%[outptr]], #32\n" /* save a1~h1 */ - "str q19, [%[outptr]], #16\n" /* save j1~l1 */ - - "zip2 v18.4s, v8.4s, v12.4s\n" /* e2g2e3g3 */ - "zip2 v19.4s, v10.4s, v14.4s\n" /* f2h2f3h3 */ - "prfm pldl1keep, [%[inptr6], #128]\n" - "zip1 v20.4s, v16.4s, v17.4s\n" /* a2b2c2d2 */ - "zip1 v21.4s, v18.4s, v19.4s\n" /* e2f2g2h2 */ - "prfm pldl1keep, [%[inptr7], #128]\n" - "zip2 v22.4s, v16.4s, v17.4s\n" /* a3b3c3d3 */ - "zip2 v23.4s, v18.4s, v19.4s\n" /* e3f3g3h3 */ - "prfm pldl1keep, [%[inptr8], #128]\n" - "zip2 v16.4s, v24.4s, v28.4s\n" /* i2k2i3k3 */ - "zip2 v17.4s, v26.4s, v30.4s\n" /* j2l2j3l3 */ - - "stp q20, q21, [%[outptr]], #32\n" /* save a2~h2 */ - - "zip1 v18.4s, v16.4s, v17.4s\n" /* i2j2k2l2 */ - "zip2 v19.4s, v16.4s, v17.4s\n" /* i3j3k3l3 */ - "prfm pldl1keep, [%[inptr9], #128]\n" - "zip1 v16.4s, v1.4s, v5.4s\n" /* a4c4a5c5 */ - "zip1 v17.4s, v3.4s, v7.4s\n" /* b4d4b5d5 */ - - "str q18, [%[outptr]], #16\n" /* save i2~l2 */ - "stp q22, q23, [%[outptr]], #32\n" /* save a3~h3 */ - "str q19, [%[outptr]], #16\n" /* save i3~l3 */ - - "zip1 v18.4s, v9.4s, v13.4s\n" /* e4g4e5g5 */ - "zip1 v19.4s, v11.4s, v15.4s\n" /* f4h4f5h5 */ - "prfm pldl1keep, [%[inptr10], #128]\n" - "zip1 v20.4s, v16.4s, v17.4s\n" /* a4b4c4d4 */ - "zip1 v21.4s, v18.4s, v19.4s\n" /* e4f4g4h4 */ - "prfm pldl1keep, [%[inptr11], #128]\n" - "zip2 v22.4s, v16.4s, 
v17.4s\n" /* a5b5c5d5 */ - "zip2 v23.4s, v18.4s, v19.4s\n" /* e5f5g5h5 */ - "zip1 v16.4s, v25.4s, v29.4s\n" /* i4k4i5k5 */ - "zip1 v17.4s, v27.4s, v31.4s\n" /* j4l4j5l5 */ - - "stp q20, q21, [%[outptr]], #32\n" /* save a4~h4 */ - - "zip1 v18.4s, v16.4s, v17.4s\n" /* i4j4k4l4 */ - "zip2 v19.4s, v16.4s, v17.4s\n" /* i5j5k5l5 */ - "zip2 v16.4s, v1.4s, v5.4s\n" /* a6c6a7c7 */ - "zip2 v17.4s, v3.4s, v7.4s\n" /* b6d6b7d7 */ - - "str q18, [%[outptr]], #16\n" /* save i4~l4 */ - "stp q22, q23, [%[outptr]], #32\n" /* save a5~h5 */ - "str q19, [%[outptr]], #16\n" /* save i5~l5 */ - - "zip2 v18.4s, v9.4s, v13.4s\n" /* e6g6e7g7 */ - "zip2 v19.4s, v11.4s, v15.4s\n" /* f6h6f7h7 */ - "zip1 v20.4s, v16.4s, v17.4s\n" /* a6b6c6d6 */ - "zip1 v21.4s, v18.4s, v19.4s\n" /* e6f6g6h6 */ - "zip2 v22.4s, v16.4s, v17.4s\n" /* a7b7c7d7 */ - "zip2 v23.4s, v18.4s, v19.4s\n" /* e7f7g7h7 */ - "zip2 v16.4s, v25.4s, v29.4s\n" /* i6k6i7k7 */ - "zip2 v17.4s, v27.4s, v31.4s\n" /* j6l6j7l7 */ - - "stp q20, q21, [%[outptr]], #32\n" /* save a6~h6 */ - - "zip1 v18.4s, v16.4s, v17.4s\n" /* i6j6k6l6 */ - "zip2 v19.4s, v16.4s, v17.4s\n" /* i7j7k7l7 */ - - "str q18, [%[outptr]], #16\n" /* save i6~l6 */ - "stp q22, q23, [%[outptr]], #32\n" /* save a7~h7 */ - "str q19, [%[outptr]], #16\n" /* save i7~l7 */ - : [inptr0] "+r"(inptr0), - [inptr1] "+r"(inptr1), - [inptr2] "+r"(inptr2), - [inptr3] "+r"(inptr3), - [inptr4] "+r"(inptr4), - [inptr5] "+r"(inptr5), - [inptr6] "+r"(inptr6), - [inptr7] "+r"(inptr7), - [inptr8] "+r"(inptr8), - [inptr9] "+r"(inptr9), - [inptr10] "+r"(inptr10), - [inptr11] "+r"(inptr11), - [outptr] "+r"(outptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31", - "cc", - "memory"); - } - - for (; x > 0; x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - *outptr++ = *inptr8++; - *outptr++ = *inptr9++; - *outptr++ = *inptr10++; - *outptr++ = *inptr11++; - } - } -} - -#else // __aarch64__ -void loadb( - float* out, const float* in, int ldin, int k0, int kmax, int n0, int nmax) { - auto outptr = reinterpret_cast(out); - auto inptr = reinterpret_cast(in) + k0 * ldin + n0; - uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - int x_len = nmax - n0; - int y_len = kmax - k0; - int right_remain = x_len - 8 * (x_len / 8); - int right_pad = 8 - right_remain; - - uint32_t* outptr_row = outptr; - int stride_out = 8 * y_len; - - uint32x4_t vzero = vdupq_n_u32(0); - uint32x4_t vmask1 = - vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); - uint32x4_t vmask2 = - vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); - -#pragma omp parallel for - for (int y = 0; y < y_len - 3; y += 4) { - const uint32_t* ptr0 = inptr + y * ldin; - const uint32_t* ptr1 = ptr0 + ldin; - const uint32_t* ptr2 = ptr1 + ldin; - const uint32_t* ptr3 = ptr2 + ldin; - uint32_t* outptr_row_col = outptr_row + y * 8; - int i = 0; - for (; i < x_len - 7; i += 8) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" - "vld1.32 {d4-d7}, [%[ptr1]]! @ load r1, 8 elements\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d7}, [%[outptr]]! 
@ write to output ptr\n" - - "vld1.32 {d0-d3}, [%[ptr2]]! @ load r2, 8 elements\n" - "vld1.32 {d4-d7}, [%[ptr3]]! @ load r3, 8 elements\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" - : [outptr] "+r"(ptr_out), - [ptr0] "+r"(ptr0), - [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3) - : - : "q0", "q1", "q2", "q3", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_remain > 0) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" - "vld1.32 {d4-d7}, [%[ptr1]]! @ load r1, 8 elements\n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - //"vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q3, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" - - "vld1.32 {d0-d3}, [%[ptr2]]! @ load r2, 8 elements\n" - "vld1.32 {d4-d7}, [%[ptr3]]! @ load r3, 8 elements\n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - //"vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q3, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" - : [outptr] "+r"(ptr_out), - [ptr0] "+r"(ptr0), - [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3) - : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [vzero] "w"(vzero) - : "q0", "q1", "q2", "q3", "cc", "memory"); - } - } -#pragma omp parallel for - for (int y = 4 * (y_len / 4); y < y_len; ++y) { - const uint32_t* ptr0 = inptr + y * ldin; - uint32_t* outptr_row_col = outptr_row + y * 8; - int i = 0; - for (; i < x_len - 7; i += 8) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : - : "q0", "q1", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_remain > 0) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [vzero] "w"(vzero) - : "q0", "q1", "cc", "memory"); - } - } -} - -void loadb_trans( - float* out, const float* in, int ldin, int k0, int kmax, int n0, int nmax) { - int x_len = kmax - k0; - uint32_t zerobuff[x_len]; // NOLINT - memset(zerobuff, 0, sizeof(uint32_t) * x_len); - - auto outptr = reinterpret_cast(out); - auto inptr = reinterpret_cast(in); - //! data B is not transposed, transpose B to k * 8 - for (int y = n0; y < nmax; y += 8) { - const uint32_t* inptr0 = inptr + y * ldin + k0; - const uint32_t* inptr1 = inptr0 + ldin; - const uint32_t* inptr2 = inptr1 + ldin; - const uint32_t* inptr3 = inptr2 + ldin; - const uint32_t* inptr4 = inptr3 + ldin; - const uint32_t* inptr5 = inptr4 + ldin; - const uint32_t* inptr6 = inptr5 + ldin; - const uint32_t* inptr7 = inptr6 + ldin; - - int x = x_len; - - //! 
cope with row index exceed real size, set to zero buffer - if ((y + 7) >= nmax) { - switch ((y + 7) - nmax) { - case 6: - inptr1 = zerobuff; - case 5: - inptr2 = zerobuff; - case 4: - inptr3 = zerobuff; - case 3: - inptr4 = zerobuff; - case 2: - inptr5 = zerobuff; - case 1: - inptr6 = zerobuff; - case 0: - inptr7 = zerobuff; - default: - break; - } - } - - for (; x > 7; x -= 8) { - //! zip load 8 elements (2 neon Q registers) from each of 8 rows - asm volatile( - "vld4.32 {d0-d3}, [%[inptr0]]! @ zip load r0, " - "q0,q1=r00,r04,r01,r05,r02,r06,r03,r07\n" - "vld4.32 {d4-d7}, [%[inptr1]]! @ zip load r1, " - "q2,q3=r10,r14,r11,r15,r12,r16,r13,r17\n" - "vtrn.32 q0, q2 @ trans data: q0=r00,r10,r01,r11; " - "q2=r04,r14,r05,r15\n" - "vst1.32 {d0}, [%[outptr]]! @ write d0(q0,low),r00,r10\n" - - "vld4.32 {d8-d11}, [%[inptr2]]! @ zip load r2, " - "q4,q5=r20,r24,r21,r25,r22,r26,r23,r27\n" - "vld4.32 {d12-d15}, [%[inptr3]]! @ zip load r3, " - "q6,q7=r30,r34,r31,r35,r32,r36,r33,r37\n" - "vtrn.32 q4, q6 @ trans data: q4=r20,r30,r21,r31; " - "q6=r24,r34,r25,r35\n" - "vst1.32 {d8}, [%[outptr]]! @ write d8(q4,low),r20,r30\n" - - "vld4.32 {d16-d19}, [%[inptr4]]! @ zip load r4, " - "q8,q9=r40,r44,r41,r45,r42,r46,r43,r47\n" - "vld4.32 {d20-d23}, [%[inptr5]]! @ zip load r5, " - "q10,q11=r50,r54,r51,r55,r52,r56,r53,r57\n" - "vtrn.32 q8, q10 @ trans data: q8=r40,r50,r41,r51; " - "q10=r44,r54,r45,r55\n" - "vst1.32 {d16}, [%[outptr]]! @ write d16(q8,low),r40,r50\n" - - "vld4.32 {d24-d27}, [%[inptr6]]! @ zip load r6, " - "q12,q13=r60,r64,r61,r65,r62,r66,r63,r67\n" - "vld4.32 {d28-d31}, [%[inptr7]]! @ zip load r7, " - "q14,q15=r70,r74,r71,r75,r72,r76,r73,r77\n" - "vtrn.32 q12, q14 @ trans data:q12=r60,r70,r61,r71; " - "q14=r64,r74,r65,r75\n" - "vst1.32 {d24}, [%[outptr]]! @ write d24(q8,low),r60,r70\n" - - //"pld [%[inptr0], #128] @ preload r0 data to cache, fill - // pipeline\n" - "vst1.32 {d1}, [%[outptr]]! @ write d1(q0,high),r01,r11\n" - "vst1.32 {d9}, [%[outptr]]! @ write d9(q4,high),r21,r31\n" - "vst1.32 {d17}, [%[outptr]]! @ write d17(q8,high),r41,r51\n" - "vst1.32 {d25}, [%[outptr]]! @ write d25(q12,high),r61,r71\n" - - "vtrn.32 q1, q3 @ trans data: q1=r02,r12,r03,r13; " - "q3=r06,r16,r07,r17\n" - "vst1.32 {d2}, [%[outptr]]! @ write d2(q1,low),r02,r12\n" - "vtrn.32 q5, q7 @ trans data: q5=r22,r32,r23,r33; " - "q7=r26,r36,r27,r37\n" - "vst1.32 {d10}, [%[outptr]]! @ write d10(q5,low),r22,r32\n" - "vtrn.32 q9, q11 @ trans data: q9=r42,r52,r43,r53; " - "q11=r46,r56,r47,r57\n" - "vst1.32 {d18}, [%[outptr]]! @ write d18(q9,low),r42,r52\n" - "vtrn.32 q13, q15 @ trans data:q13=r62,r72,r63,r73; " - "q15=r66,r76,r67,r77\n" - "vst1.32 {d26}, [%[outptr]]! @ write d18(q9,low),r62,r72\n" - - //"pld [%[inptr1], #128] @ preload r1 data to cache, fill - // pipeline\n" - "vst1.32 {d3}, [%[outptr]]! @ write d3(q1,high),r03,r13\n" - "vst1.32 {d11}, [%[outptr]]! @ write d11(q5,high),r23,r33\n" - "vst1.32 {d19}, [%[outptr]]! @ write d19(q9,high),r43,r53\n" - "vst1.32 {d27}, [%[outptr]]! @ write d27(q13,high),r63,r73\n" - - //"pld [%[inptr2], #128] @ preload r2 data to cache, fill - // pipeline\n" - "vst1.32 {d4}, [%[outptr]]! @ write d4(q2,low),r04,r14\n" - "vst1.32 {d12}, [%[outptr]]! @ write d12(q6,low),r24,r34\n" - "vst1.32 {d20}, [%[outptr]]! @ write d20(q10,low),r44,r54\n" - "vst1.32 {d28}, [%[outptr]]! @ write d28(q14,low),r64,r74\n" - - //"pld [%[inptr3], #128] @ preload r3 data to cache, fill - // pipeline\n" - "vst1.32 {d5}, [%[outptr]]! @ write d5(q2,high),r05,r15\n" - "vst1.32 {d13}, [%[outptr]]! 
@ write d13(q6,high),r25,r35\n" - "vst1.32 {d21}, [%[outptr]]! @ write d21(q10,high),r45,r55\n" - "vst1.32 {d29}, [%[outptr]]! @ write d29(q14,high),r65,r75\n" - - //"pld [%[inptr4], #128] @ preload r4 data to cache, fill - // pipeline\n" - "vst1.32 {d6}, [%[outptr]]! @ write d6(q3,low),r06,r16\n" - "vst1.32 {d14}, [%[outptr]]! @ write d14(q7,low),r26,r36\n" - "vst1.32 {d22}, [%[outptr]]! @ write d22(q11,low),r46,r56\n" - "vst1.32 {d30}, [%[outptr]]! @ write d30(q15,low),r66,r76\n" - - //"pld [%[inptr5], #128] @ preload r5 data to cache, fill - // pipeline\n" - "vst1.32 {d7}, [%[outptr]]! @ write d7(q3,high),r07,r17\n" - "vst1.32 {d15}, [%[outptr]]! @ write d15(q7,high),r27,r37\n" - "vst1.32 {d23}, [%[outptr]]! @ write d23(q11,high),r47,r57\n" - "vst1.32 {d31}, [%[outptr]]! @ write d31(q15,high),r67,r77\n" - : [inptr0] "+r"(inptr0), - [inptr1] "+r"(inptr1), - [inptr2] "+r"(inptr2), - [inptr3] "+r"(inptr3), - [inptr4] "+r"(inptr4), - [inptr5] "+r"(inptr5), - [inptr6] "+r"(inptr6), - [inptr7] "+r"(inptr7), - [outptr] "+r"(outptr) - : - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15", - "cc", - "memory"); - } - - for (; x > 0; x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -#endif // __aarch64__ - -#ifdef __aarch64__ -void sgemm_prepacked_8x12(bool is_transB, - int M, - int N, - int K, - const float *A_packed, - const float *B, - int ldb, - float beta, - float *C, - int ldc, - const float *bias, - bool has_bias, - bool has_relu, - ARMContext *ctx) { - size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; - auto workspace = ctx->workspace_data(); - int threads = ctx->threads(); - //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 - int x_block = (l2_cache - (MBLOCK * K)) / (sizeof(float) * (K + MBLOCK)); - x_block /= NBLOCK; - x_block *= NBLOCK; - int x_num = (N + (x_block - 1)) / x_block; - x_block = (N + x_num - 1) / x_num; - x_block = (x_block + NBLOCK - 1) / NBLOCK; - x_block *= NBLOCK; - x_block = x_block < NBLOCK ? NBLOCK : x_block; - - // unroll 2 loop - int tail_pre = (K & (KBLOCK - 1)); - int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; - - bool flag_p_remain = false; - int remain = 0; - - int has_beta = fabsf(beta) > 1e-8f ? 1 : 0; - - //! apanel is pre_compute outside gemm - for (unsigned int x0 = 0; x0 < N; x0 += x_block) { - unsigned int xmax = x0 + x_block; - if (xmax > N) { - xmax = N; - } - int bblocks = (xmax - x0 + NBLOCK - 1) / NBLOCK; - remain = xmax - x0 - (bblocks - 1) * NBLOCK; - if (remain > 0) { - flag_p_remain = true; - } - //! 
load bpanel - float *b_pannel = workspace; - if (is_transB) { - loadb_trans(b_pannel, B, ldb, 0, K, x0, xmax); - } else { - loadb(b_pannel, B, ldb, 0, K, x0, xmax); - } -#pragma omp parallel for num_threads(threads) - for (unsigned int y = 0; y < M; y += MBLOCK) { - unsigned int ymax = y + MBLOCK; - if (ymax > M) { - ymax = M; - } - - float bias_local[8] = {0}; - if (has_bias) { - bias_local[0] = bias[y]; - bias_local[1] = bias[y + 1]; - bias_local[2] = bias[y + 2]; - bias_local[3] = bias[y + 3]; - bias_local[4] = bias[y + 4]; - bias_local[5] = bias[y + 5]; - bias_local[6] = bias[y + 6]; - bias_local[7] = bias[y + 7]; - } - - float cout0[NBLOCK]; - float cout1[NBLOCK]; - float cout2[NBLOCK]; - float cout3[NBLOCK]; - float cout4[NBLOCK]; - float cout5[NBLOCK]; - float cout6[NBLOCK]; - float cout7[NBLOCK]; - - float *c_ptr0 = C + y * ldc + x0; - float *c_ptr1 = c_ptr0 + ldc; - float *c_ptr2 = c_ptr1 + ldc; - float *c_ptr3 = c_ptr2 + ldc; - float *c_ptr4 = c_ptr3 + ldc; - float *c_ptr5 = c_ptr4 + ldc; - float *c_ptr6 = c_ptr5 + ldc; - float *c_ptr7 = c_ptr6 + ldc; - - float *pout0 = c_ptr0; - float *pout1 = c_ptr1; - float *pout2 = c_ptr2; - float *pout3 = c_ptr3; - float *pout4 = c_ptr4; - float *pout5 = c_ptr5; - float *pout6 = c_ptr6; - float *pout7 = c_ptr7; - - const float *a_ptr_l = A_packed + y * K; - const float *b_ptr = b_pannel; - for (int xb = 0; xb < bblocks; xb++) { - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - case 6: - c_ptr1 = cout1; - case 5: - c_ptr2 = cout2; - case 4: - c_ptr3 = cout3; - case 3: - c_ptr4 = cout4; - case 2: - c_ptr5 = cout5; - case 1: - c_ptr6 = cout6; - case 0: - c_ptr7 = cout7; - default: - break; - } - } - if (flag_p_remain && (xb == bblocks - 1)) { - pout0 = c_ptr0; - pout1 = c_ptr1; - pout2 = c_ptr2; - pout3 = c_ptr3; - pout4 = c_ptr4; - pout5 = c_ptr5; - pout6 = c_ptr6; - pout7 = c_ptr7; - - c_ptr0 = cout0; - c_ptr1 = cout1; - c_ptr2 = cout2; - c_ptr3 = cout3; - c_ptr4 = cout4; - c_ptr5 = cout5; - c_ptr6 = cout6; - c_ptr7 = cout7; - if (has_beta) { - for (int i = 0; i < remain; ++i) { - cout0[i] = pout0[i]; - cout1[i] = pout1[i]; - cout2[i] = pout2[i]; - cout3[i] = pout3[i]; - cout4[i] = pout4[i]; - cout5[i] = pout5[i]; - cout6[i] = pout6[i]; - cout7[i] = pout7[i]; - } - } - } - const float *a_ptr = a_ptr_l; - int tail = tail_pre; - int k = k_pre; - - asm volatile( - "prfm pldl1keep, [%[a_ptr]]\n" /* preload a*/ - "ldp q2, q3, [%[bias_ptr]]\n" /* load bias to q2, q3*/ - "dup v8.4s, v2.s[0]\n" /* out0 = 0 */ - "dup v9.4s, v2.s[0]\n" /* out1 = 0*/ - "dup v10.4s, v2.s[0]\n" /* out2 = 0*/ - "prfm pldl1keep, [%[b_ptr]]\n" /* preload b*/ - "dup v11.4s, v2.s[1]\n" /* out3 = 0*/ - "dup v12.4s, v2.s[1]\n" /* out4 = 0*/ - "prfm pldl1keep, [%[b_ptr], #64]\n" /* preload b*/ - "dup v13.4s, v2.s[1]\n" /* out5 = 0*/ - "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ - "dup v14.4s, v2.s[2]\n" /* out6 = 0*/ - "prfm pldl1keep, [%[b_ptr], #128]\n" /* preload b*/ - "dup v15.4s, v2.s[2]\n" /* out7 = 0*/ - "prfm pldl1keep, [%[a_ptr], #128]\n" /* preload a*/ - "dup v16.4s, v2.s[2]\n" /* out8 = 0*/ - "prfm pldl1keep, [%[b_ptr], #192]\n" /* preload b*/ - "dup v17.4s, v2.s[3]\n" /* out9 = 0*/ - "prfm pldl1keep, [%[b_ptr], #256]\n" /* preload b*/ - "dup v18.4s, v2.s[3]\n" /* out10 = 0*/ - "prfm pldl1keep, [%[a_ptr], #192]\n" /* preload a*/ - "dup v19.4s, v2.s[3]\n" /* out11 = 0*/ - "prfm pldl1keep, [%[b_ptr], #320]\n" /* preload b*/ - "dup v20.4s, v3.s[0]\n" /* out12 = 0*/ - "prfm pldl1keep, [%[a_ptr], #256]\n" /* preload a*/ - "dup v21.4s, v3.s[0]\n" /* out13 = 0*/ - 
"prfm pldl1keep, [%[b_ptr], #384]\n" /* preload b*/ - "dup v22.4s, v3.s[0]\n" /* out14 = 0*/ - "dup v23.4s, v3.s[1]\n" /* out15 = 0*/ - "dup v24.4s, v3.s[1]\n" /* out16 = 0*/ - "dup v25.4s, v3.s[1]\n" /* out17 = 0*/ - "dup v26.4s, v3.s[2]\n" /* out18 = 0*/ - "dup v27.4s, v3.s[2]\n" /* out19 = 0*/ - "dup v28.4s, v3.s[2]\n" /* out20 = 0*/ - "dup v29.4s, v3.s[3]\n" /* out21 = 0*/ - "dup v30.4s, v3.s[3]\n" /* out22 = 0*/ - "dup v31.4s, v3.s[3]\n" /* out23 = 0*/ - "cbz %w[has_beta], 0f\n" /* check beta == 0? */ - /* process beta */ - "dup v7.4s, %w[beta]\n" /* beta to vector */ - "ld1 {v0.4s, v1.4s, v2.4s}, [%[c_ptr0]]\n" /* load output r0 */ - "ld1 {v3.4s, v4.4s, v5.4s}, [%[c_ptr1]]\n" /* load output r1 */ - "fmla v8.4s, v0.4s, v7.4s\n" /* cr00 += beta * c_r00*/ - "fmla v9.4s, v1.4s, v7.4s\n" /* cr01 += beta * c_r01*/ - "fmla v10.4s, v2.4s, v7.4s\n" /* cr02 += beta * c_r02*/ - "ld1 {v0.4s, v1.4s, v2.4s}, [%[c_ptr2]]\n" /* load output r2*/ - "fmla v11.4s, v3.4s, v7.4s\n" /* cr10 += beta * c_r10*/ - "fmla v12.4s, v4.4s, v7.4s\n" /* cr11 += beta * c_r11*/ - "fmla v13.4s, v5.4s, v7.4s\n" /* cr12 += beta * c_r12*/ - "ld1 {v3.4s, v4.4s, v5.4s}, [%[c_ptr3]]\n" /* load output r3*/ - "fmla v14.4s, v0.4s, v7.4s\n" /* cr20 += beta * c_r20*/ - "fmla v15.4s, v1.4s, v7.4s\n" /* cr21 += beta * c_r21*/ - "fmla v16.4s, v2.4s, v7.4s\n" /* cr22 += beta * c_r22*/ - "ld1 {v0.4s, v1.4s, v2.4s}, [%[c_ptr4]]\n" /* load output r4*/ - "fmla v17.4s, v3.4s, v7.4s\n" /* cr30 += beta * c_r30*/ - "fmla v18.4s, v4.4s, v7.4s\n" /* cr31 += beta * c_r31*/ - "fmla v19.4s, v5.4s, v7.4s\n" /* cr32 += beta * c_r32*/ - "ld1 {v3.4s, v4.4s, v5.4s}, [%[c_ptr5]]\n" /* load output r5*/ - "fmla v20.4s, v0.4s, v7.4s\n" /* cr40 += beta * c_r40*/ - "fmla v21.4s, v1.4s, v7.4s\n" /* cr41 += beta * c_r41*/ - "fmla v22.4s, v2.4s, v7.4s\n" /* cr42 += beta * c_r42*/ - "ld1 {v0.4s, v1.4s, v2.4s}, [%[c_ptr6]]\n" /* load output r6*/ - "fmla v23.4s, v3.4s, v7.4s\n" /* cr50 += beta * c_r50*/ - "fmla v24.4s, v4.4s, v7.4s\n" /* cr51 += beta * c_r51*/ - "fmla v25.4s, v5.4s, v7.4s\n" /* cr52 += beta * c_r52*/ - "ld1 {v3.4s, v4.4s, v5.4s}, [%[c_ptr7]]\n" /* load output r7*/ - "fmla v26.4s, v0.4s, v7.4s\n" /* cr60 += beta * c_r60*/ - "fmla v27.4s, v1.4s, v7.4s\n" /* cr61 += beta * c_r61*/ - "fmla v28.4s, v2.4s, v7.4s\n" /* cr62 += beta * c_r62*/ - "fmla v29.4s, v3.4s, v7.4s\n" /* cr70 += beta * c_r70*/ - "fmla v30.4s, v4.4s, v7.4s\n" /* cr71 += beta * c_r71*/ - "fmla v31.4s, v5.4s, v7.4s\n" /* cr72 += beta * c_r72*/ - "0: \n" /* check loop count */ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00,a01 to q0, q1*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ - "cbz %w[k], 2f\n" /* check loop count > 0 */ - /* main loop */ - /* unrool 0*/ - "1:\n" /* main loop */ - "fmla v8.4s , v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ - "fmla v11.4s , v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 - */ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b0 to q6, q7 */ - "fmla v14.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ - "fmla v17.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4 */ - "fmla v20.4s, v4.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 =q4 */ - "fmla v23.4s, v4.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 =q4 */ - "fmla v26.4s, v4.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 =q4 */ - "fmla v29.4s, v4.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 =q4 */ - - "fmla v9.4s, v5.4s, v0.s[0]\n" /* out8 = b1 * a00[0], b1 =q5 */ - "fmla v12.4s, v5.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 =q5 */ 
- "fmla v15.4s, v5.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 =q5*/ - "fmla v18.4s, v5.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 =q5*/ - "fmla v21.4s, v5.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 =q5*/ - "fmla v24.4s, v5.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 =q5*/ - "fmla v27.4s, v5.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 =q5*/ - "fmla v30.4s, v5.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 =q5*/ - - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b1, b2 to q4, q5 */ - - "fmla v10.4s, v6.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 =q6*/ - "fmla v13.4s, v6.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 =q6*/ - "prfm pldl1keep, [%[b_ptr], #384]\n" - "fmla v16.4s, v6.4s, v0.s[2]\n" /* out18 = b2 * a00[2], b2 =q6*/ - "fmla v19.4s, v6.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 =q6*/ - "fmla v22.4s, v6.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 =q6*/ - "fmla v25.4s, v6.4s, v1.s[1]\n" /* out21 = b2 * a00[1], b2 =q6*/ - "fmla v28.4s, v6.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 =q6*/ - "fmla v31.4s, v6.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 =q6*/ - - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1 */ - - /* unrool 1 */ - "fmla v8.4s , v7.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 =q7 */ - "fmla v11.4s , v7.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 =q7 */ - "fmla v14.4s, v7.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 =q7 */ - "prfm pldl1keep, [%[a_ptr], #256]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 =q7 */ - "fmla v20.4s, v7.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 =q7 */ - "fmla v23.4s, v7.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 = q7*/ - "fmla v26.4s, v7.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 =q7 */ - "fmla v29.4s, v7.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 =q7 */ - - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b0, b1 to q6, q7 */ - - "fmla v9.4s, v4.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 =q4 */ - "fmla v12.4s, v4.4s, v2.s[1]\n" /* out9 = b0 * a10[1], b1 =q4 */ - "fmla v15.4s, v4.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 =q4*/ - "fmla v18.4s, v4.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 =q4*/ - "fmla v21.4s, v4.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 =q4*/ - "fmla v24.4s, v4.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 =q4*/ - "fmla v27.4s, v4.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 =q4*/ - "fmla v30.4s, v4.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 =q4*/ - - "fmla v10.4s, v5.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 =q5*/ - "fmla v13.4s, v5.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 =q5*/ - "fmla v16.4s, v5.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 =q5*/ - "fmla v19.4s, v5.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 =q5*/ - "fmla v22.4s, v5.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 =q5*/ - "fmla v25.4s, v5.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 =q5*/ - "fmla v28.4s, v5.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 =q5*/ - "fmla v31.4s, v5.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 =q5*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b2, b0 to q4, q5 */ - /* unrool 2*/ - "fmla v8.4s , v6.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q6 */ - "fmla v11.4s , v6.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q6 - */ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4*/ - "fmla v14.4s, v6.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q6*/ - "fmla v17.4s, v6.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q6*/ - "fmla v20.4s, v6.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 =q6*/ - "fmla v23.4s, v6.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 =q6*/ - "fmla v26.4s, v6.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 =q6*/ - "fmla v29.4s, v6.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 =q6*/ - "fmla v9.4s, v7.4s, v0.s[0]\n" /* 
out8 = b1 * a00[0], b1 =q7*/ - "fmla v12.4s, v7.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 =q7*/ - "prfm pldl1keep, [%[b_ptr], #384]\n" - "fmla v15.4s, v7.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 =q7*/ - "fmla v18.4s, v7.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 =q7*/ - "fmla v21.4s, v7.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 =q7*/ - "fmla v24.4s, v7.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 =q7*/ - "fmla v27.4s, v7.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 =q7*/ - "fmla v30.4s, v7.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 =q7*/ - - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b1, b2 to q6, q7*/ - - "fmla v10.4s, v4.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 =q4*/ - "fmla v13.4s, v4.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 =q4*/ - "fmla v16.4s, v4.4s, v0.s[2]\n" /* out18 = b2 * a00[2], b2 =q4*/ - "fmla v19.4s, v4.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 =q4*/ - "fmla v22.4s, v4.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 =q4*/ - "fmla v25.4s, v4.4s, v1.s[1]\n" /* out21 = b2 * a00[1], b2 =q4*/ - "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 =q4*/ - "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 =q4*/ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1*/ - /* unrool 3*/ - "fmla v8.4s , v5.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 =q5*/ - "fmla v11.4s , v5.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 =q5*/ - "fmla v14.4s, v5.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 =q5*/ - "fmla v17.4s, v5.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 =q5*/ - "fmla v20.4s, v5.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 =q5*/ - "fmla v23.4s, v5.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 =q5*/ - "fmla v26.4s, v5.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 =q5*/ - "fmla v29.4s, v5.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 =q5*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ - "fmla v9.4s, v6.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 =q6*/ - "fmla v12.4s, v6.4s, v2.s[1]\n" /* out9 = b0 * a10[1], b1 =q6*/ - "prfm pldl1keep, [%[a_ptr], #256]\n" - "fmla v15.4s, v6.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 =q6*/ - "fmla v18.4s, v6.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 =q6*/ - "fmla v21.4s, v6.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 =q6*/ - "fmla v24.4s, v6.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 =q6*/ - "fmla v27.4s, v6.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 =q6*/ - "prfm pldl1keep, [%[b_ptr], #384]\n" - "fmla v30.4s, v6.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 =q6*/ - "fmla v10.4s, v7.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 =q7*/ - "fmla v13.4s, v7.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 =q7*/ - "fmla v16.4s, v7.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 =q7*/ - "fmla v19.4s, v7.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 =q7*/ - "fmla v22.4s, v7.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 =q7*/ - "fmla v25.4s, v7.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 =q7*/ - "subs %w[k], %w[k], #1\n" /* loop count - 1*/ - "fmla v28.4s, v7.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ - "fmla v31.4s, v7.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ - "bne 1b\n" - "2:\n" /* process tail*/ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ - "beq 3f\n" /*jump to tail = 1*/ - /* final unrool 0*/ - /* unrool 0, tail > 1*/ - "fmla v8.4s , v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4*/ - "fmla v11.4s , v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4*/ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b0 to q6, q7*/ - "fmla v14.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4*/ - "fmla v17.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4*/ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q2, 
q3*/ - "fmla v20.4s, v4.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 =q4*/ - "fmla v23.4s, v4.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 =q4*/ - "fmla v26.4s, v4.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 =q4*/ - "fmla v29.4s, v4.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 =q4*/ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ - "fmla v9.4s, v5.4s, v0.s[0]\n" /* out8 = b1 * a00[0], b1 =q5*/ - "fmla v12.4s, v5.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 =q5*/ - "fmla v15.4s, v5.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 =q5*/ - "fmla v18.4s, v5.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 =q5*/ - "fmla v21.4s, v5.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 =q5*/ - "fmla v24.4s, v5.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 =q5*/ - "fmla v27.4s, v5.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 =q5*/ - "fmla v30.4s, v5.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 =q5*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b1, b2 to q4, q5*/ - "fmla v10.4s, v6.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 =q6*/ - "fmla v13.4s, v6.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 =q6*/ - "fmla v16.4s, v6.4s, v0.s[2]\n" /* out18 = b2 * a00[2], b2 =q6*/ - "fmla v19.4s, v6.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 =q6*/ - "fmla v22.4s, v6.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 =q6*/ - "fmla v25.4s, v6.4s, v1.s[1]\n" /* out21 = b2 * a00[1], b2 =q6*/ - "fmla v28.4s, v6.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 =q6*/ - "fmla v31.4s, v6.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 =q6*/ - "beq 4f\n" /*jump to tail = 2*/ - /* unrool 1, tail > 2*/ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1*/ - "fmla v8.4s , v7.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 =q7*/ - "fmla v11.4s , v7.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 =q7*/ - "fmla v14.4s, v7.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 =q7*/ - "fmla v17.4s, v7.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 =q7*/ - "fmla v20.4s, v7.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 =q7*/ - "fmla v23.4s, v7.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 =q7*/ - "fmla v26.4s, v7.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 =q7*/ - "fmla v29.4s, v7.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 =q7*/ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b0, b1 to q6, q7*/ - "fmla v9.4s, v4.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 =q4*/ - "fmla v12.4s, v4.4s, v2.s[1]\n" /* out9 = b0 * a10[1], b1 =q4*/ - "fmla v15.4s, v4.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 =q4*/ - "fmla v18.4s, v4.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 =q4*/ - "fmla v21.4s, v4.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 =q4*/ - "fmla v24.4s, v4.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 =q4*/ - "fmla v27.4s, v4.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 =q4*/ - "fmla v30.4s, v4.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 =q4*/ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ - "fmla v10.4s, v5.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 =q5*/ - "fmla v13.4s, v5.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 =q5*/ - "fmla v16.4s, v5.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 =q5*/ - "fmla v19.4s, v5.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 =q5*/ - "fmla v22.4s, v5.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 =q5*/ - "fmla v25.4s, v5.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 =q5*/ - "fmla v28.4s, v5.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 =q5*/ - "fmla v31.4s, v5.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 =q5*/ - "beq 5f\n" /*jump to tail = 3*/ - /* unrool 2, tail = 4*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b2, b0 to q4, q5*/ - "fmla v8.4s , v6.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q6*/ - "fmla v11.4s , v6.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q6*/ - "ldp q2, q3, [%[a_ptr]], 
#32\n" /* load a10, a11 to q3, q4*/ - "fmla v14.4s, v6.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q6*/ - "fmla v17.4s, v6.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q6*/ - "fmla v20.4s, v6.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 =q6*/ - "fmla v23.4s, v6.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 =q6*/ - "fmla v26.4s, v6.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 =q6*/ - "fmla v29.4s, v6.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 =q6*/ - "fmla v9.4s, v7.4s, v0.s[0]\n" /* out8 = b1 * a00[0], b1 =q7*/ - "fmla v12.4s, v7.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 =q7*/ - "fmla v15.4s, v7.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 =q7*/ - "fmla v18.4s, v7.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 =q7*/ - "fmla v21.4s, v7.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 =q7*/ - "fmla v24.4s, v7.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 =q7*/ - "fmla v27.4s, v7.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 =q7*/ - "fmla v30.4s, v7.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 =q7*/ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b1, b2 to q6, q7*/ - "fmla v10.4s, v4.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 =q4*/ - "fmla v13.4s, v4.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 =q4*/ - "fmla v16.4s, v4.4s, v0.s[2]\n" /* out18 = b2 * a00[2], b2 =q4*/ - "fmla v19.4s, v4.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 =q4*/ - "fmla v22.4s, v4.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 =q4*/ - "fmla v25.4s, v4.4s, v1.s[1]\n" /* out21 = b2 * a00[1], b2 =q4*/ - "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 =q4*/ - "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 =q4*/ - /* unrool 3, tail = 4*/ - "fmla v8.4s , v5.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 =q5*/ - "fmla v11.4s , v5.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 =q5*/ - "fmla v14.4s, v5.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 =q5*/ - "fmla v17.4s, v5.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 =q5*/ - "fmla v20.4s, v5.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 =q5*/ - "fmla v23.4s, v5.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 =q5*/ - "fmla v26.4s, v5.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 =q5*/ - "fmla v29.4s, v5.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 =q5*/ - "fmla v9.4s, v6.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 =q6*/ - "fmla v12.4s, v6.4s, v2.s[1]\n" /* out9 = b1 * a10[1], b1 =q6*/ - "fmla v15.4s, v6.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 =q6*/ - "fmla v18.4s, v6.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 =q6*/ - "fmla v21.4s, v6.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 =q6*/ - "fmla v24.4s, v6.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 =q6*/ - "fmla v27.4s, v6.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 =q6*/ - "fmla v30.4s, v6.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 =q6*/ - "fmla v10.4s, v7.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 =q7*/ - "fmla v13.4s, v7.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 =q7*/ - "fmla v16.4s, v7.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 =q7*/ - "fmla v19.4s, v7.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 =q7*/ - "fmla v22.4s, v7.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 =q7*/ - "fmla v25.4s, v7.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 =q7*/ - "fmla v28.4s, v7.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ - "fmla v31.4s, v7.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ - "b 11f\n" - /* tails==1 final tail*/ - "3: \n" /* tail=1*/ - "ldr q6, [%[b_ptr]], #16\n" /* load b2 to q6*/ - "fmla v8.4s , v4.4s, v0.s[0]\n" /* out0 = b0 * a10[0], b0 =q5*/ - "fmla v11.4s , v4.4s, v0.s[1]\n" /* out1 = b0 * a10[1], b0 =q5*/ - "fmla v14.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a10[2], b0 =q5*/ - "fmla v17.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a10[3], b0 
=q5*/ - "fmla v20.4s, v4.4s, v1.s[0]\n" /* out4 = b0 * a11[0], b0 =q5*/ - "fmla v23.4s, v4.4s, v1.s[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ - "fmla v26.4s, v4.4s, v1.s[2]\n" /* out6 = b0 * a11[2], b0 =q5*/ - "fmla v29.4s, v4.4s, v1.s[3]\n" /* out7 = b0 * a11[3], b0 =q5*/ - "fmla v9.4s, v5.4s, v0.s[0]\n" /* out8 = b0 * a10[0], b1 =q6*/ - "fmla v12.4s, v5.4s, v0.s[1]\n" /* out9 = b1 * a10[1], b1 =q6*/ - "fmla v15.4s, v5.4s, v0.s[2]\n" /* out10 = b1 * a10[2], b1 =q6*/ - "fmla v18.4s, v5.4s, v0.s[3]\n" /* out11 = b1 * a10[3], b1 =q6*/ - "fmla v21.4s, v5.4s, v1.s[0]\n" /* out12 = b1 * a10[0], b1 =q6*/ - "fmla v24.4s, v5.4s, v1.s[1]\n" /* out13 = b1 * a10[1], b1 =q6*/ - "fmla v27.4s, v5.4s, v1.s[2]\n" /* out14 = b1 * a10[2], b1 =q6*/ - "fmla v30.4s, v5.4s, v1.s[3]\n" /* out15 = b1 * a10[3], b1 =q6*/ - "fmla v10.4s, v6.4s, v0.s[0]\n" /* out16 = b2 * a10[0], b2 =q7*/ - "fmla v13.4s, v6.4s, v0.s[1]\n" /* out17 = b2 * a10[0], b2 =q7*/ - "fmla v16.4s, v6.4s, v0.s[2]\n" /* out18 = b2 * a10[0], b2 =q7*/ - "fmla v19.4s, v6.4s, v0.s[3]\n" /* out19 = b2 * a10[0], b2 =q7*/ - "fmla v22.4s, v6.4s, v1.s[0]\n" /* out20 = b2 * a10[0], b2 =q7*/ - "fmla v25.4s, v6.4s, v1.s[1]\n" /* out21 = b2 * a10[0], b2 =q7*/ - "fmla v28.4s, v6.4s, v1.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ - "fmla v31.4s, v6.4s, v1.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ - "b 11f\n" - /* tails==2 final tail*/ - "4:\n" /* tail = 2*/ - "fmla v8.4s , v7.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 =q5*/ - "fmla v11.4s , v7.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 =q5*/ - "fmla v14.4s, v7.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 =q5*/ - "fmla v17.4s, v7.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 =q5*/ - "fmla v20.4s, v7.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 =q5*/ - "fmla v23.4s, v7.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ - "fmla v26.4s, v7.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 =q5*/ - "fmla v29.4s, v7.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 =q5*/ - "fmla v9.4s, v4.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 =q6*/ - "fmla v12.4s, v4.4s, v2.s[1]\n" /* out9 = b1 * a10[1], b1 =q6*/ - "fmla v15.4s, v4.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 =q6*/ - "fmla v18.4s, v4.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 =q6*/ - "fmla v21.4s, v4.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 =q6*/ - "fmla v24.4s, v4.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 =q6*/ - "fmla v27.4s, v4.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 =q6*/ - "fmla v30.4s, v4.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 =q6*/ - "fmla v10.4s, v5.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 =q7*/ - "fmla v13.4s, v5.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 =q7*/ - "fmla v16.4s, v5.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 =q7*/ - "fmla v19.4s, v5.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 =q7*/ - "fmla v22.4s, v5.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 =q7*/ - "fmla v25.4s, v5.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 =q7*/ - "fmla v28.4s, v5.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ - "fmla v31.4s, v5.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ - "b 11f\n" - /* tails==3 final tail*/ - "5:\n" /* tail = 3*/ - "ldr q4, [%[b_ptr]], #16\n" /* load b2, b0 to q4*/ - "fmla v8.4s , v6.4s, v0.s[0]\n" /* out0 = b0 * a10[0], b0 =q5*/ - "fmla v11.4s , v6.4s, v0.s[1]\n" /* out1 = b0 * a10[1], b0 =q5*/ - "fmla v14.4s, v6.4s, v0.s[2]\n" /* out2 = b0 * a10[2], b0 =q5*/ - "fmla v17.4s, v6.4s, v0.s[3]\n" /* out3 = b0 * a10[3], b0 =q5*/ - "fmla v20.4s, v6.4s, v1.s[0]\n" /* out4 = b0 * a11[0], b0 =q5*/ - "fmla v23.4s, v6.4s, v1.s[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ - "fmla v26.4s, v6.4s, v1.s[2]\n" /* 
out6 = b0 * a11[2], b0 =q5*/ - "fmla v29.4s, v6.4s, v1.s[3]\n" /* out7 = b0 * a11[3], b0 =q5*/ - "fmla v9.4s, v7.4s, v0.s[0]\n" /* out8 = b0 * a10[0], b1 =q6*/ - "fmla v12.4s, v7.4s, v0.s[1]\n" /* out9 = b1 * a10[1], b1 =q6*/ - "fmla v15.4s, v7.4s, v0.s[2]\n" /* out10 = b1 * a10[2], b1 =q6*/ - "fmla v18.4s, v7.4s, v0.s[3]\n" /* out11 = b1 * a10[3], b1 =q6*/ - "fmla v21.4s, v7.4s, v1.s[0]\n" /* out12 = b1 * a10[0], b1 =q6*/ - "fmla v24.4s, v7.4s, v1.s[1]\n" /* out13 = b1 * a10[1], b1 =q6*/ - "fmla v27.4s, v7.4s, v1.s[2]\n" /* out14 = b1 * a10[2], b1 =q6*/ - "fmla v30.4s, v7.4s, v1.s[3]\n" /* out15 = b1 * a10[3], b1 =q6*/ - "fmla v10.4s, v4.4s, v0.s[0]\n" /* out16 = b2 * a10[0], b2 =q7*/ - "fmla v13.4s, v4.4s, v0.s[1]\n" /* out17 = b2 * a10[0], b2 =q7*/ - "fmla v16.4s, v4.4s, v0.s[2]\n" /* out18 = b2 * a10[0], b2 =q7*/ - "fmla v19.4s, v4.4s, v0.s[3]\n" /* out19 = b2 * a10[0], b2 =q7*/ - "fmla v22.4s, v4.4s, v1.s[0]\n" /* out20 = b2 * a10[0], b2 =q7*/ - "fmla v25.4s, v4.4s, v1.s[1]\n" /* out21 = b2 * a10[0], b2 =q7*/ - "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ - "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ - "11: \n" /* check if relu */ - "cbz %w[relu], 12f\n" /* skip relu */ - "movi v2.4s, #0\n" /* for relu*/ - "fmax v8.4s, v8.4s, v2.4s\n" /* relu*/ - "fmax v9.4s, v9.4s, v2.4s\n" /* relu*/ - "fmax v10.4s, v10.4s, v2.4s\n" /* relu*/ - "fmax v11.4s, v11.4s, v2.4s\n" /* relu*/ - "fmax v12.4s, v12.4s, v2.4s\n" /* relu*/ - "fmax v13.4s, v13.4s, v2.4s\n" /* relu*/ - "fmax v14.4s, v14.4s, v2.4s\n" /* relu*/ - "fmax v15.4s, v15.4s, v2.4s\n" /* relu*/ - "fmax v16.4s,v16.4s,v2.4s\n" /* relu*/ - "fmax v17.4s,v17.4s,v2.4s\n" /* relu*/ - "fmax v18.4s, v18.4s, v2.4s\n" /* relu*/ - "fmax v19.4s, v19.4s, v2.4s\n" /* relu*/ - "fmax v20.4s, v20.4s, v2.4s\n" /* relu*/ - "fmax v21.4s, v21.4s, v2.4s\n" /* relu*/ - "fmax v22.4s, v22.4s, v2.4s\n" /* relu*/ - "fmax v23.4s, v23.4s, v2.4s\n" /* relu*/ - "fmax v24.4s,v24.4s,v2.4s\n" /* relu*/ - "fmax v25.4s,v25.4s,v2.4s\n" /* relu*/ - "fmax v26.4s, v26.4s, v2.4s\n" /* relu*/ - "fmax v27.4s, v27.4s, v2.4s\n" /* relu*/ - "fmax v28.4s, v28.4s, v2.4s\n" /* relu*/ - "fmax v29.4s, v29.4s, v2.4s\n" /* relu*/ - "fmax v30.4s, v30.4s, v2.4s\n" /* relu*/ - "fmax v31.4s, v31.4s, v2.4s\n" /* relu*/ - "12: \n" - "st1 {v8.4s, v9.4s, v10.4s},[%[c_ptr0]], #48\n" /* store r0 */ - "st1 {v11.4s, v12.4s, v13.4s},[%[c_ptr1]], #48\n" /* store r1 */ - "st1 {v14.4s, v15.4s, v16.4s},[%[c_ptr2]], #48\n" /* store r2 */ - "st1 {v17.4s, v18.4s, v19.4s},[%[c_ptr3]], #48\n" /* store r3 */ - "st1 {v20.4s, v21.4s, v22.4s},[%[c_ptr4]], #48\n" /* store r4 */ - "st1 {v23.4s, v24.4s, v25.4s},[%[c_ptr5]], #48\n" /* store r5 */ - "st1 {v26.4s, v27.4s, v28.4s},[%[c_ptr6]], #48\n" /* store r6 */ - "st1 {v29.4s, v30.4s, v31.4s},[%[c_ptr7]], #48\n" /* store r7 */ - - : [a_ptr] "+r"(a_ptr), - [b_ptr] "+r"(b_ptr), - [k] "+r"(k), - [tail] "+r"(tail), - [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), - [c_ptr4] "+r"(c_ptr4), - [c_ptr5] "+r"(c_ptr5), - [c_ptr6] "+r"(c_ptr6), - [c_ptr7] "+r"(c_ptr7) - : [bias_ptr] "r"(bias_local), - [relu] "r"(has_relu), - [has_beta] "r"(has_beta), - [beta] "r"(beta) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); - if (flag_p_remain && (xb == 
bblocks - 1)) { - for (int i = 0; i < remain; ++i) { - *pout0++ = cout0[i]; - *pout1++ = cout1[i]; - *pout2++ = cout2[i]; - *pout3++ = cout3[i]; - *pout4++ = cout4[i]; - *pout5++ = cout5[i]; - *pout6++ = cout6[i]; - *pout7++ = cout7[i]; - } - } - } - } - } -} -#else // __aarch64__ -/** - * \brief gemm with ablock = 6, bblock = 8, output 6x8 - * @param A - * @param B - * @param C - * @param M - * @param N - * @param K - * @param threads - * @param workspace - */ -void sgemm_prepacked_6x8(bool is_transB, - int M, - int N, - int K, - const float* A_packed, - const float* B, - int ldb, - float beta, - float* C, - int ldc, - const float* bias, - bool has_bias, - bool has_relu, - ARMContext* ctx) { - size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; - auto* workspace = ctx->workspace_data(); - int threads = ctx->threads(); - //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 - int x_block = - (l2_cache - (MBLOCK_OTH * K)) / (sizeof(float) * (K + MBLOCK_OTH)); - x_block /= NBLOCK; - x_block *= NBLOCK; - int x_num = (N + (x_block - 1)) / x_block; - x_block = (N + x_num - 1) / x_num; - x_block = (x_block + NBLOCK - 1) / NBLOCK; - x_block *= NBLOCK; - x_block = x_block < NBLOCK ? NBLOCK : x_block; - - int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; - int tail_pre = (K & (KBLOCK - 1)); - if (tail_pre == 0) { - tail_pre = KBLOCK; - } - - bool flag_p_remain = false; - int remain = 0; - - int has_beta = fabsf(beta) > 1e-8f ? 1 : 0; - - //! apanel is pre_compute outside gemm - for (unsigned int x0 = 0; x0 < N; x0 += x_block) { - unsigned int xmax = x0 + x_block; - if (xmax > N) { - xmax = N; - } - int bblocks = (xmax - x0 + NBLOCK - 1) / NBLOCK; - remain = xmax - x0 - (bblocks - 1) * NBLOCK; - if (remain > 0) { - flag_p_remain = true; - } - //! 
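
The x_block arithmetic just above (in the deleted sgemm_prepacked_6x8) sizes the B panel so that one packed A block of MBLOCK_OTH x K, an x_block x K slice of B, and the corresponding output tile roughly fit in L2 together, then snaps x_block to a multiple of NBLOCK and rebalances it across the resulting number of passes over N. A standalone sketch of that computation, with hypothetical names (mblock/nblock/l2_size stand in for MBLOCK_OTH/NBLOCK/ctx->llc_size()):

    #include <algorithm>
    #include <cstddef>

    // Mirrors the deleted blocking expression: reserve room for the A block,
    // split the rest of L2 between a K-deep B panel and its output columns.
    int compute_x_block(int N, int K, size_t l2_size, int mblock, int nblock) {
      int x_block = static_cast<int>(
          (l2_size - mblock * K) / (sizeof(float) * (K + mblock)));
      x_block = (x_block / nblock) * nblock;    // round down to full NBLOCK strips
      x_block = std::max(x_block, nblock);      // guard the degenerate case the
                                                // original leaves implicit
      int x_num = (N + x_block - 1) / x_block;  // passes needed over N
      x_block = (N + x_num - 1) / x_num;        // even out the pass widths
      x_block = ((x_block + nblock - 1) / nblock) * nblock;  // round back up
      return std::max(x_block, nblock);
    }
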
load bpanel
-      auto b_pannel = static_cast<float*>(workspace);
-      if (is_transB) {
-        loadb_trans(b_pannel, B, ldb, 0, K, x0, xmax);
-      } else {
-        loadb(b_pannel, B, ldb, 0, K, x0, xmax);
-      }
-#pragma omp parallel for num_threads(threads)
-      for (unsigned int y = 0; y < M; y += MBLOCK_OTH) {
-        unsigned int ymax = y + MBLOCK_OTH;
-        if (ymax > M) {
-          ymax = M;
-        }
-        float* c_ptr0 = C + y * ldc + x0;
-        float* c_ptr1 = c_ptr0 + ldc;
-        float* c_ptr2 = c_ptr1 + ldc;
-        float* c_ptr3 = c_ptr2 + ldc;
-        float* c_ptr4 = c_ptr3 + ldc;
-        float* c_ptr5 = c_ptr4 + ldc;
-
-        float* pout0 = c_ptr0;
-        float* pout1 = c_ptr1;
-        float* pout2 = c_ptr2;
-        float* pout3 = c_ptr3;
-        float* pout4 = c_ptr4;
-        float* pout5 = c_ptr5;
-
-        float bias_local[6] = {0};
-        if (has_bias) {
-          bias_local[0] = bias[y];
-          bias_local[1] = bias[y + 1];
-          bias_local[2] = bias[y + 2];
-          bias_local[3] = bias[y + 3];
-          bias_local[4] = bias[y + 4];
-          bias_local[5] = bias[y + 5];
-        }
-
-        float cout0[NBLOCK];
-        float cout1[NBLOCK];
-        float cout2[NBLOCK];
-        float cout3[NBLOCK];
-        float cout4[NBLOCK];
-        float cout5[NBLOCK];
-
-        const float* a_ptr_l = A_packed + y * K;
-        const float* b_ptr = b_pannel;
-        for (int xb = 0; xb < bblocks; xb++) {
-          if ((y + 5) >= ymax) {
-            switch ((y + 5) - ymax) {
-              case 4:
-                c_ptr1 = cout1;
-              case 3:
-                c_ptr2 = cout2;
-              case 2:
-                c_ptr3 = cout3;
-              case 1:
-                c_ptr4 = cout4;
-              case 0:
-                c_ptr5 = cout5;
-              default:
-                break;
-            }
-          }
-          if (flag_p_remain && (xb == bblocks - 1)) {
-            pout0 = c_ptr0;
-            pout1 = c_ptr1;
-            pout2 = c_ptr2;
-            pout3 = c_ptr3;
-            pout4 = c_ptr4;
-            pout5 = c_ptr5;
-
-            c_ptr0 = cout0;
-            c_ptr1 = cout1;
-            c_ptr2 = cout2;
-            c_ptr3 = cout3;
-            c_ptr4 = cout4;
-            c_ptr5 = cout5;
-            if (has_beta) {
-              for (int i = 0; i < remain; ++i) {
-                cout0[i] = pout0[i];
-                cout1[i] = pout1[i];
-                cout2[i] = pout2[i];
-                cout3[i] = pout3[i];
-                cout4[i] = pout4[i];
-                cout5[i] = pout5[i];
-              }
-            }
-          }
-          const float* a_ptr = a_ptr_l;
-          int tails = tail_pre;
-          int k = k_pre;
-          asm volatile(
-              // sgemm 6x8
-              "vld1.32 {d2-d4}, [%[bias_ptr]]   @ load bias 6 elements\n"
-              "pld [%[a_ptr]]                   @ preload a\n"
-              "vdup.i32 q12, d4[0]              @ out40 = bias[4]\n"
-              "pld [%[b_ptr]]                   @ preload b\n"
-              "vdup.i32 q13, d4[0]              @ out41 = bias[4]\n"
-              "pld [%[a_ptr], #64]              @ preload a\n"
-              "vdup.i32 q14, d4[1]              @ out50 = bias[5]\n"
-              "pld [%[b_ptr], #64]              @ preload b\n"
-              "vdup.i32 q15, d4[1]              @ out51 = bias[5]\n"
-              "pld [%[a_ptr], #128]             @ preload a\n"
-              "vdup.i32 q4, d2[0]               @ out00 = bias[0]\n"
-              "pld [%[b_ptr], #128]             @ preload b\n"
-              "vdup.i32 q5, d2[0]               @ out01 = bias[0]\n"
-              "vdup.i32 q6, d2[1]               @ out10 = bias[1]\n"
-              "pld [%[a_ptr], #192]             @ preload a\n"
-              "vdup.i32 q7, d2[1]               @ out11 = bias[1]\n"
-              "pld [%[b_ptr], #192]             @ preload b\n"
-              "vdup.i32 q8, d3[0]               @ out20 = bias[2]\n"
-              "pld [%[a_ptr], #256]             @ preload a\n"
-              "vdup.i32 q9, d3[0]               @ out21 = bias[2]\n"
-              "pld [%[b_ptr], #256]             @ preload b\n"
-              "vdup.i32 q10, d3[1]              @ out30 = bias[3]\n"
-              "pld [%[b_ptr], #320]             @ preload b\n"
-              "vdup.i32 q11, d3[1]              @ out31 = bias[3]\n"
-              "pld [%[b_ptr], #384]             @ preload b\n"
-              "cmp %[has_beta], #0\n"
-              "beq 11f\n" /* check beta == 0?
*/ - /* process beta */ - "vdup.32 q3, %[beta]\n" /* beta to vector */ - "vld1.32 {d0-d3}, [%[c_ptr0]]\n" /* load output r0 */ - "vmla.f32 q4, q0, q3\n" /* cr00 += beta * c_r00 */ - "vmla.f32 q5, q1, q3\n" /* cr01 += beta * c_r01 */ - "vld1.32 {d0-d3}, [%[c_ptr1]]\n" /* load output r1 */ - "vmla.f32 q6, q0, q3\n" /* cr10 += beta * c_r10 */ - "vmla.f32 q7, q1, q3\n" /* cr11 += beta * c_r11 */ - "vld1.32 {d0-d3}, [%[c_ptr2]]\n" /* load output r2 */ - "vmla.f32 q8, q0, q3\n" /* cr20 += beta * c_r20 */ - "vmla.f32 q9, q1, q3\n" /* cr21 += beta * c_r21 */ - "vld1.32 {d0-d3}, [%[c_ptr3]]\n" /* load output r3 */ - "vmla.f32 q10, q0, q3\n" /* cr30 += beta * c_r30 */ - "vmla.f32 q11, q1, q3\n" /* cr31 += beta * c_r31 */ - "vld1.32 {d0-d3}, [%[c_ptr4]]\n" /* load output r4 */ - "vmla.f32 q12, q0, q3\n" /* cr40 += beta * c_r40 */ - "vmla.f32 q13, q1, q3\n" /* cr41 += beta * c_r41 */ - "vld1.32 {d0-d3}, [%[c_ptr5]]\n" /* load output r5 */ - "vmla.f32 q14, q0, q3\n" /* cr50 += beta * c_r50 */ - "vmla.f32 q15, q1, q3\n" /* cr51 += beta * c_r51 */ - "11: \n" /* check loop count */ - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a0~a3\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "cmp %[k], #0 @ check weather k is bigger than " - "0\n" - "beq 0f @ jump to tail\n" - "1: @ main loop for k\n" - /* Unroll 0*/ - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a4, a5, and next a0, " - "a1\n" - "vmla.f32 q4, q2, d0[0] @ out0 += b1 * a0\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "vmla.f32 q6, q2, d0[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d2[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d2[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d0[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d0[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d1[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d1[1] @ out9 += b2 * a3\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a2~a5\n" - "vmla.f32 q13, q3, d2[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d2[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - /* Unroll 1 */ - "vmla.f32 q4, q2, d3[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d3[1] @ out1 += b1 * a1\n" - /*"pld [%[a_ptr], #64] @ preload a\n"*/ - "vmla.f32 q8, q2, d0[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d0[1] @ out3 += b1 * a3\n" - /*"pld [%[b_ptr], #192]\n"*/ - "vmla.f32 q12, q2, d1[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d1[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d3[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d3[1] @ out7 += b2 * a1\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a0~a3\n" - "vmla.f32 q9, q3, d0[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d0[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d1[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d1[1] @ out11 += b2 * a5\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a4, a5, a0, a1\n" - /* Unroll 2 */ - "vmla.f32 q4, q2, d2[0] @ out0 += b1 * a0\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "vmla.f32 q6, q2, d2[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d3[1] @ out3 += b1 * a3\n" - /*"pld [%[a_ptr], #240] @ preload\n"*/ - "vmla.f32 q12, q2, d0[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d0[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! 
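
The `has_beta` block at the start of this asm section, together with the bias broadcast before it and the `fmax`/`vmax` pass before the stores, means each micro-tile computes a fused out = relu_opt(bias + beta * C_old + A_panel * B_panel). A scalar reference of that contract (a sketch for checking the vector kernels, not the original code):

    #include <algorithm>

    // One output tile, scalar reference. lda/ldb/ldc are row strides; the real
    // kernels read A from a prepacked panel rather than a row-major matrix.
    void micro_tile_ref(const float* A, const float* B, float* C,
                        int M, int N, int K, int lda, int ldb, int ldc,
                        const float* bias, bool has_bias, float beta,
                        bool has_relu) {
      for (int m = 0; m < M; ++m) {
        for (int n = 0; n < N; ++n) {
          float acc = has_bias ? bias[m] : 0.f;
          acc += beta * C[m * ldc + n];  // skipped entirely when beta == 0
          for (int k = 0; k < K; ++k) {
            acc += A[m * lda + k] * B[k * ldb + n];
          }
          C[m * ldc + n] = has_relu ? std::max(acc, 0.f) : acc;
        }
      }
    }
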
@ load b1\n" - "vmla.f32 q5, q3, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d2[1] @ out7 += b2 * a1\n" - /*"pld [%[b_ptr], #208]\n"*/ - "vmla.f32 q9, q3, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d3[1] @ out9 += b2 * a3\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a2~a5\n" - "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - /* Unroll 3 */ - "vmla.f32 q4, q2, d1[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d1[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d2[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d2[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d3[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d3[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d1[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d1[1] @ out7 += b2 * a1\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a0~a3\n" - "vmla.f32 q9, q3, d2[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d2[1] @ out9 += b2 * a3\n" - "subs %[k], %[k], #1 @ k--\n" - "vmla.f32 q13, q3, d3[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d3[1] @ out11 += b2 * a5\n" - "bne 1b @ jump to main loop\n" - "0: @ process tail\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "beq 3f @ jump to tail = 1\n" - /* Unroll 0*/ - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "vmla.f32 q4, q2, d0[0] @ out0 += b1 * a0\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a4,5, a0, a1\n" - "vmla.f32 q6, q2, d0[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d2[0] @ out4 += b1 * a4\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "vmla.f32 q14, q2, d2[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d0[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d0[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d1[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d1[1] @ out9 += b2 * a3\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a2~a5\n" - "vmla.f32 q13, q3, d2[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d2[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "beq 4f @ jump to tail==2\n" - /* Unroll 1*/ - "vmla.f32 q4, q2, d3[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d3[1] @ out1 += b1 * a1\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "vmla.f32 q8, q2, d0[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d0[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d1[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d1[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d3[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d3[1] @ out7 += b2 * a1\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a0~a3\n" - "vmla.f32 q9, q3, d0[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d0[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d1[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d1[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "beq 5f @ jump to tail==3\n" - /* Unroll 2 */ - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a4,a5, a0,a1\n" - "vmla.f32 q4, q2, d2[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d2[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d0[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d0[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! 
@ load b1\n" - "vmla.f32 q5, q3, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d2[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d3[1] @ out9 += b2 * a3\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a2~a5\n" - "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - /* Unroll 3*/ - "vmla.f32 q4, q2, d1[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d1[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d2[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d2[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d3[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d3[1] @ out5 += b1 * a5\n" - "vmla.f32 q5, q3, d1[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d1[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d2[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d2[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d3[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d3[1] @ out11 += b2 * a5\n" - "b 2f\n" - /* tails==1 final tail*/ - "3: @ tail=1\n" - "vmla.f32 q4, q2, d0[0] @ out0 += b1 * a0\n" - "vld1.32 {d2}, [%[a_ptr] :64]! @ load a4,a5\n" - "vmla.f32 q6, q2, d0[1] @ out1 += b1 * a1\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "vmla.f32 q8, q2, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d2[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d2[1] @ out5 += b1 * a5\n" - "vmla.f32 q5, q3, d0[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d0[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d1[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d1[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d2[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d2[1] @ out11 += b2 * a5\n" - "b 2f @ jump to end\n" - /* tails==2 final tail*/ - "4: @ tail == 2\n" - "vmla.f32 q4, q2, d3[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d3[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d0[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d0[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d1[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d1[1] @ out5 += b1 * a5\n" - "vmla.f32 q5, q3, d3[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d3[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d0[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d0[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d1[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d1[1] @ out11 += b2 * a5\n" - "b 2f @ jump to end\n" - /* tails==3 final tail*/ - "5: @ tail=3\n" - "vmla.f32 q4, q2, d2[0] @ out0 += b1 * a0\n" - "vld1.32 {d0}, [%[a_ptr] :64]! 
@ load a4,a5\n" - "vmla.f32 q6, q2, d2[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d0[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d0[1] @ out5 += b1 * a5\n" - "vmla.f32 q5, q3, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d2[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d3[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" - "2: @ check relu\n" - "cmp %[relu], #0 @ check if has relu\n" - "ble 6f @ skip relu if relu <= 0\n" - "vmov.u32 q0, #0 @ for relu\n" - "vmax.f32 q4, q4, q0 @ for relu\n" - "vmax.f32 q5, q5, q0 @ for relu\n" - "vmax.f32 q6, q6, q0 @ for relu\n" - "vmax.f32 q7, q7, q0 @ for relu\n" - "vmax.f32 q8, q8, q0 @ for relu\n" - "vmax.f32 q9, q9, q0 @ for relu\n" - "vmax.f32 q10, q10, q0 @ for relu\n" - "vmax.f32 q11, q11, q0 @ for relu\n" - "vmax.f32 q12, q12, q0 @ for relu\n" - "vmax.f32 q13, q13, q0 @ for relu\n" - "vmax.f32 q14, q14, q0 @ for relu\n" - "vmax.f32 q15, q15, q0 @ for relu\n" - "6: @ store result\n" - "vst1.32 {d8-d11}, [%[c_ptr0]]! @ store r0\n" - "vst1.32 {d12-d15}, [%[c_ptr1]]! @ store r1\n" - "vst1.32 {d16-d19}, [%[c_ptr2]]! @ store r2\n" - "vst1.32 {d20-d23}, [%[c_ptr3]]! @ store r3\n" - "vst1.32 {d24-d27}, [%[c_ptr4]]! @ store r4\n" - "vst1.32 {d28-d31}, [%[c_ptr5]]! @ store r5\n" - : [a_ptr] "+r"(a_ptr), - [b_ptr] "+r"(b_ptr), - [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), - [c_ptr4] "+r"(c_ptr4), - [c_ptr5] "+r"(c_ptr5), - [k] "+r"(k), - [tails] "+r"(tails) - : [bias_ptr] "r"(bias_local), - [relu] "r"(has_relu), - [has_beta] "r"(has_beta), - [beta] "r"(beta) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15", - "cc", - "memory"); - - if (flag_p_remain && (xb == bblocks - 1)) { - for (int i = 0; i < remain; ++i) { - *pout0++ = cout0[i]; - *pout1++ = cout1[i]; - *pout2++ = cout2[i]; - *pout3++ = cout3[i]; - *pout4++ = cout4[i]; - *pout5++ = cout5[i]; - } - } - } - } - } -} - -void sgemm_prepacked_4x8(bool is_transB, - int M, - int N, - int K, - const float* A_packed, - const float* B, - int ldb, - float beta, - float* C, - int ldc, - const float* bias, - bool has_bias, - bool has_relu, - ARMContext* ctx) { - size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; - auto* workspace = ctx->workspace_data(); - int threads = ctx->threads(); - //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 - int x_block = - (l2_cache - (MBLOCK_A73 * K)) / (sizeof(float) * (K + MBLOCK_A73)); - x_block /= NBLOCK; - x_block *= NBLOCK; - int x_num = (N + (x_block - 1)) / x_block; - x_block = (N + x_num - 1) / x_num; - x_block = (x_block + NBLOCK - 1) / NBLOCK; - x_block *= NBLOCK; - x_block = x_block < NBLOCK ? NBLOCK : x_block; - - int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; - int tail_pre = (K & (KBLOCK - 1)); - if (tail_pre == 0) { - tail_pre = KBLOCK; - } - - bool flag_p_remain = false; - int remain = 0; - - int has_beta = fabsf(beta) > 1e-8f ? 1 : 0; - - //! apanel is pre_compute outside gemm - for (unsigned int x0 = 0; x0 < N; x0 += x_block) { - unsigned int xmax = x0 + x_block; - if (xmax > N) { - xmax = N; - } - int bblocks = (xmax - x0 + NBLOCK - 1) / NBLOCK; - remain = xmax - x0 - (bblocks - 1) * NBLOCK; - if (remain > 0) { - flag_p_remain = true; - } - //! 
load bpanel
-      auto b_pannel = static_cast<float*>(workspace);
-      if (is_transB) {
-        loadb_trans(b_pannel, B, ldb, 0, K, x0, xmax);
-      } else {
-        loadb(b_pannel, B, ldb, 0, K, x0, xmax);
-      }
-#pragma omp parallel for num_threads(threads)
-      for (unsigned int y = 0; y < M; y += MBLOCK_A73) {
-        unsigned int ymax = y + MBLOCK_A73;
-        if (ymax > M) {
-          ymax = M;
-        }
-
-        float cout0[NBLOCK];
-        float cout1[NBLOCK];
-        float cout2[NBLOCK];
-        float cout3[NBLOCK];
-
-        float bias_local[4] = {0};
-        if (has_bias) {
-          bias_local[0] = bias[y];
-          bias_local[1] = bias[y + 1];
-          bias_local[2] = bias[y + 2];
-          bias_local[3] = bias[y + 3];
-        }
-
-        float* c_ptr0 = C + y * ldc + x0;
-        float* c_ptr1 = c_ptr0 + ldc;
-        float* c_ptr2 = c_ptr1 + ldc;
-        float* c_ptr3 = c_ptr2 + ldc;
-
-        float* pout0 = c_ptr0;
-        float* pout1 = c_ptr1;
-        float* pout2 = c_ptr2;
-        float* pout3 = c_ptr3;
-
-        const float* a_ptr_l = A_packed + y * K;
-        const float* b_ptr = b_pannel;
-        for (int xb = 0; xb < bblocks; xb++) {
-          if ((y + 3) >= ymax) {
-            switch ((y + 3) - ymax) {
-              case 2:
-                c_ptr1 = cout1;
-              case 1:
-                c_ptr2 = cout1;
-              case 0:
-                c_ptr3 = cout1;
-              default:
-                break;
-            }
-          }
-          if (flag_p_remain && (xb == bblocks - 1)) {
-            pout0 = c_ptr0;
-            pout1 = c_ptr1;
-            pout2 = c_ptr2;
-            pout3 = c_ptr3;
-
-            c_ptr0 = cout0;
-            c_ptr1 = cout1;
-            c_ptr2 = cout2;
-            c_ptr3 = cout3;
-
-            if (has_beta) {
-              for (int i = 0; i < remain; ++i) {
-                cout0[i] = pout0[i];
-                cout1[i] = pout1[i];
-                cout2[i] = pout2[i];
-                cout3[i] = pout3[i];
-              }
-            }
-          }
-          const float* a_ptr = a_ptr_l;
-          int tails = tail_pre;
-          int k = k_pre;
-          asm volatile(
-              "vld1.32 {d4-d5}, [%[bias_ptr]]   @ load bias\n"
-              "vdup.32 q8, d4[0]                @ add bias to out00\n"
-              "pld [%[a_ptr]]                   @ preload a, 64byte\n"
-              "vdup.32 q9, d4[0]                @ add bias to out01\n"
-              "pld [%[b_ptr]]                   @ preload b\n"
-              "vdup.32 q10, d4[1]               @ add bias to out10\n"
-              "pld [%[a_ptr], #64]              @ preload a\n"
-              "vdup.32 q11, d4[1]               @ add bias to out11\n"
-              "vdup.32 q12, d5[0]               @ add bias to out20\n"
-              "pld [%[b_ptr], #64]              @ preload b\n"
-              "vdup.32 q13, d5[0]               @ add bias to out21\n"
-              "pld [%[a_ptr], #128]             @ preload a\n"
-              "vdup.32 q14, d5[1]               @ add bias to out30\n"
-              "pld [%[b_ptr], #128]             @ preload b\n"
-              "vdup.32 q15, d5[1]               @ add bias to out31\n"
-              "pld [%[b_ptr], #192]             @ preload b\n"
-              "cmp %[has_beta], #0\n"
-              "beq 11f\n" /* check beta == 0? */
-              /* process beta */
-              "vdup.32 q4, %[beta]\n"           /* beta to vector */
-              "vld1.32 {d0-d3}, [%[c_ptr0]]\n"  /* load output r0 */
-              "vld1.32 {d4-d7}, [%[c_ptr1]]\n"  /* load output r1 */
-              "vmla.f32 q8, q0, q4\n"           /* cr00 += beta * c_r00 */
-              "vmla.f32 q9, q1, q4\n"           /* cr01 += beta * c_r01 */
-              "vld1.32 {d0-d3}, [%[c_ptr2]]\n"  /* load output r2 */
-              "vmla.f32 q10, q2, q4\n"          /* cr10 += beta * c_r10 */
-              "vmla.f32 q11, q3, q4\n"          /* cr11 += beta * c_r11 */
-              "vld1.32 {d4-d7}, [%[c_ptr3]]\n"  /* load output r3 */
-              "vmla.f32 q12, q0, q4\n"          /* cr20 += beta * c_r20 */
-              "vmla.f32 q13, q1, q4\n"          /* cr21 += beta * c_r21 */
-              "vmla.f32 q14, q2, q4\n"          /* cr30 += beta * c_r30 */
-              "vmla.f32 q15, q3, q4\n"          /* cr31 += beta * c_r31 */
-              "11: \n"                          /* check loop count */
-              "vld1.32 {d0-d3}, [%[a_ptr] :128]!   @ load a0~a3\n"
-              "vld1.32 {d8-d11}, [%[b_ptr] :128]!  @ load b1\n"
-              "cmp %[k], #0                     @ check whether k is bigger than 0\n"
-              "beq 0f                           @ jump to tail\n"
-              "1:                               @ main loop for k\n"
-              /* Unroll 0 */
-              "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1, b2\n"
-              "vmla.f32 q8, q4, d0[0]           @ out0 += b1 * a0\n"
-              "vld1.32 {d4-d7}, [%[a_ptr] :128]!
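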
@ load next 2xa0~a3\n" - "vmla.f32 q10, q4, d0[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d0[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d0[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d1[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d1[1] @ out7 += b2 * a3\n" - "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load next b1, b2\n" - /* Unroll 1 */ - "vmla.f32 q8, q6, d2[0] @ out0 += b1 * a0\n" - "pld [%[b_ptr], #64] @ preload b\n" - "vmla.f32 q10, q6, d2[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q6, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q11, q7, d2[1] @ out7 += b2 * a1\n" - "vmla.f32 q13, q7, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q15, q7, d3[1] @ out9 += b2 * a3\n" - "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1,b2\n" - /* Unroll 2 */ - "vmla.f32 q8, q4, d4[0] @ out0 += b1 * a0\n" - "vld1.32 {d0-d3}, [%[a_ptr] :128]! @ load next a0~a3\n" - "vmla.f32 q10, q4, d4[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d5[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d5[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d4[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d4[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d5[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d5[1] @ out7 += b2 * a3\n" - "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load next b1, b2\n" - /* Unroll 3 */ - "vmla.f32 q8, q6, d6[0] @ out0 += b1 * a0\n" - "pld [%[a_ptr], #64] @ preload a\n" - "vmla.f32 q10, q6, d6[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q6, d7[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d7[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d6[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q7, d6[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q7, d7[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q7, d7[1] @ out7 += b2 * a3\n" - "subs %[k], %[k], #1 @ k--\n" - "bne 1b @ jump to main loop\n" - "0: @ process tail\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "beq 3f @ jump to tail = 1\n" - /* Unroll 0*/ - "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1, b2\n" - "vmla.f32 q8, q4, d0[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q4, d0[1] @ out1 += b1 * a1\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "vmla.f32 q12, q4, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d0[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d0[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d1[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d1[1] @ out7 += b2 * a3\n" - "beq 4f @ jump to tail==2\n" - /* Unroll 1 */ - "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load next b1, b2\n" - "vmla.f32 q8, q6, d2[0] @ out0 += b1 * a0\n" - "vld1.32 {d4-d7}, [%[a_ptr] :128]! @ load next 2xa0~a3\n" - "vmla.f32 q10, q6, d2[1] @ out1 += b1 * a1\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "vmla.f32 q12, q6, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q11, q7, d2[1] @ out7 += b2 * a1\n" - "vmla.f32 q13, q7, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q15, q7, d3[1] @ out9 += b2 * a3\n" - "beq 5f @ jump to tail==3\n" - /* Unroll 2 */ - "vld1.32 {d12-d15}, [%[b_ptr] :128]! 
@ load next b1,b2\n" - "vmla.f32 q8, q4, d4[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q4, d4[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d5[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d5[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d4[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d4[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d5[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d5[1] @ out7 += b2 * a3\n" - /* Unroll 3 */ - "vmla.f32 q8, q6, d6[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q6, d6[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q6, d7[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d7[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d6[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q7, d6[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q7, d7[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q7, d7[1] @ out7 += b2 * a3\n" - "b 2f\n" - /* tails==1 final tail */ - "3: @ tail=1\n" - "vmla.f32 q8, q4, d0[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q4, d0[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d0[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d0[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d1[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d1[1] @ out7 += b2 * a3\n" - /*aptr - 16 */ - "sub %[a_ptr], %[a_ptr], #16 @ tail--\n" - "b 2f @ jump to end\n" - /* tails==2 final tail*/ - "4: @ tail == 2\n" - "vmla.f32 q8, q6, d2[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q6, d2[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q6, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d2[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q7, d2[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q7, d3[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q7, d3[1] @ out7 += b2 * a3\n" - "b 2f @ jump to end\n" - /* tails==3 final tail*/ - "5: @ tail=3\n" - "vmla.f32 q8, q4, d4[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q4, d4[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d5[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d5[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d4[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d4[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d5[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d5[1] @ out7 += b2 * a3\n" - /*aptr - 16*/ - "sub %[a_ptr], %[a_ptr], #16 @ tail--\n" - "2: @ check relu\n" - "cmp %[relu], #0 @ check if has relu\n" - "ble 6f @ skip relu if relu <= 0\n" - "vmov.u32 q0, #0 @ for relu\n" - "vmax.f32 q8, q8, q0 @ for relu\n" - "vmax.f32 q9, q9, q0 @ for relu\n" - "vmax.f32 q10, q10, q0 @ for relu\n" - "vmax.f32 q11, q11, q0 @ for relu\n" - "vmax.f32 q12, q12, q0 @ for relu\n" - "vmax.f32 q13, q13, q0 @ for relu\n" - "vmax.f32 q14, q14, q0 @ for relu\n" - "vmax.f32 q15, q15, q0 @ for relu\n" - "6: @ store result\n" - "vst1.32 {d16-d19}, [%[c_ptr0]]! @ store r0\n" - "vst1.32 {d20-d23}, [%[c_ptr1]]! @ store r1\n" - "vst1.32 {d24-d27}, [%[c_ptr2]]! @ store r2\n" - "vst1.32 {d28-d31}, [%[c_ptr3]]! 
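
Throughout these kernels, the last B strip of a panel may cover fewer than NBLOCK columns. The asm then writes its full-width tile into the cout scratch rows, and only `remain` values per row are copied back through the pout pointers in the loop that follows each asm block. A sketch of that writeback, with the array-of-rows packaging being my own:

    // Copy the valid prefix of each scratch row back to the real C rows.
    // cout[r] is an NBLOCK-wide scratch row; pout[r] points into C.
    void copy_remainder(float* const cout[], float* pout[],
                        int rows, int remain) {
      for (int r = 0; r < rows; ++r) {
        for (int i = 0; i < remain; ++i) {
          *pout[r]++ = cout[r][i];
        }
      }
    }
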
@ store r3\n" - : [a_ptr] "+r"(a_ptr), - [b_ptr] "+r"(b_ptr), - [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), - [k] "+r"(k), - [tails] "+r"(tails) - : [bias_ptr] "r"(bias_local), - [relu] "r"(has_relu), - [has_beta] "r"(has_beta), - [beta] "r"(beta) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15", - "cc", - "memory"); - - if (flag_p_remain && (xb == bblocks - 1)) { - for (int i = 0; i < remain; ++i) { - *pout0++ = cout0[i]; - *pout1++ = cout1[i]; - *pout2++ = cout2[i]; - *pout3++ = cout3[i]; - } - } - } - } - } -} -#endif // __aarch64__ - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/packed_sgemm.h b/lite/backends/arm/math/packed_sgemm.h deleted file mode 100644 index 396ca7beb9..0000000000 --- a/lite/backends/arm/math/packed_sgemm.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "lite/core/context.h" -#include "lite/core/device_info.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -#ifdef __aarch64__ -constexpr int MBLOCK = 8; -constexpr int NBLOCK = 12; -constexpr int KBLOCK = 4; -inline int get_hblock(ARMArch arch) { return MBLOCK; } -#else -constexpr int MBLOCK_A73 = 4; -constexpr int MBLOCK_OTH = 6; -constexpr int NBLOCK = 8; -constexpr int KBLOCK = 4; -inline int get_hblock(ARMArch arch) { - if (arch == kA73) { - return MBLOCK_A73; - } else { - return MBLOCK_OTH; - } -} -#endif // __aarch64__ - -void prepackA(float* out, - const float* in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax, - bool is_trans, - ARMContext* ctx); - -void prepackA(TensorLite* tout, - const TensorLite& tin, - float alpha, - int m, - int k, - int group, - bool is_trans, - ARMContext* ctx); - -void sgemm_prepack(bool is_transB, - int M, - int N, - int K, - const float* A_packed, - const float* B, - int ldb, - float beta, - float* C, - int ldc, - const float* bias, - bool has_bias, - bool has_relu, - ARMContext* ctx); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/pad2d.cc b/lite/backends/arm/math/pad2d.cc deleted file mode 100644 index 35c4fafb77..0000000000 --- a/lite/backends/arm/math/pad2d.cc +++ /dev/null @@ -1,413 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/pad2d.h" -#include -#include -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void pad_constant(const float* din, - float* dout, - int n, - int c, - int h, - int w, - const int pad_top, - const int pad_bottom, - const int pad_left, - const int pad_right, - const float pad_value) { - int h_in = h - pad_top - pad_bottom; - int w_in = w - pad_left - pad_right; - int spatial_size_out = w * h; - int spatial_size_in = h_in * w_in; -#pragma omp parallel for - for (int s = 0; s < n * c; ++s) { - const float* din_s = din + s * spatial_size_in; - float* dout_s = dout + s * spatial_size_out; - int top_loop = (w * pad_top) >> 3; - int top_loop_remain = (w * pad_top) & 7; - float32x4_t vpad_value = vdupq_n_f32(pad_value); - // process top - for (int i = 0; i < top_loop; ++i) { - vst1q_f32(dout_s, vpad_value); - vst1q_f32(dout_s + 4, vpad_value); - dout_s += 8; - } - for (int i = 0; i < top_loop_remain; ++i) { - *dout_s++ = pad_value; - } - // process med - int left_loop = pad_left >> 2; - int left_loop_remain = pad_left & 3; - int med_loop = w_in >> 3; - int med_loop_remain = w_in & 7; - for (int i = 0; i < left_loop; ++i) { - vst1q_f32(dout_s, vpad_value); - dout_s += 4; - } - - for (int i = 0; i < left_loop_remain; ++i) { - *dout_s++ = pad_value; - } - - for (int i = 0; i < med_loop; ++i) { - float32x4_t val = vld1q_f32(din_s); - float32x4_t val1 = vld1q_f32(din_s + 4); - vst1q_f32(dout_s, val); - vst1q_f32(dout_s + 4, val1); - dout_s += 8; - din_s += 8; - } - for (int i = 0; i < med_loop_remain; ++i) { - float val = *din_s++; - *dout_s++ = val; - } - - int loop = (pad_right + pad_left) >> 2; - int loop_remain = (pad_right + pad_left) & 3; - for (int j = 0; j < h_in - 1; ++j) { - for (int i = 0; i < loop; ++i) { - vst1q_f32(dout_s, vpad_value); - dout_s += 4; - } - - for (int i = 0; i < loop_remain; ++i) { - *dout_s++ = pad_value; - } - - for (int i = 0; i < med_loop; ++i) { - float32x4_t val = vld1q_f32(din_s); - float32x4_t val1 = vld1q_f32(din_s + 4); - vst1q_f32(dout_s, val); - vst1q_f32(dout_s + 4, val1); - dout_s += 8; - din_s += 8; - } - - for (int i = 0; i < med_loop_remain; ++i) { - *dout_s++ = *din_s++; - } - } - int right_loop = pad_right >> 2; - int right_loop_remain = pad_right & 3; - - for (int i = 0; i < right_loop; ++i) { - vst1q_f32(dout_s, vpad_value); - dout_s += 4; - } - - for (int i = 0; i < right_loop_remain; ++i) { - *dout_s++ = pad_value; - } - // process bottom - int bottom_loop = (pad_bottom * w) >> 3; - int bottom_loop_remain = (pad_bottom * w) & 7; - for (int i = 0; i < bottom_loop; ++i) { - vst1q_f32(dout_s, vpad_value); - vst1q_f32(dout_s + 4, vpad_value); - dout_s += 8; - } - for (int i = 0; i < bottom_loop_remain; ++i) { - *dout_s++ = pad_value; - } - } -} - -void pad_edge(const float* din, - float* dout, - int n, - int c, - int h, - int w, - const int pad_top, - const int pad_bottom, - const int pad_left, - const int pad_right, - const float pad_value) { - int h_in = h - pad_top - pad_bottom; - int w_in = w - pad_left - pad_right; - 
int spatial_size_out = w * h; - int spatial_size_in = h_in * w_in; -#pragma omp parallel for - for (int s = 0; s < n * c; ++s) { - const float* din_s = din + s * spatial_size_in; - float* dout_s = dout + s * spatial_size_out; - - // process med - int left_loop = pad_left >> 2; - int right_loop = pad_right >> 2; - int med_loop = w_in >> 3; - int med_loop_remain = w_in & 7; - int left_loop_remain = pad_left & 3; - int right_loop_remain = pad_right & 3; - float* dout_med = dout_s + w * pad_top; - for (int j = 0; j < h_in; ++j) { - float edge_val = din_s[0]; - float32x4_t vedge = vdupq_n_f32(edge_val); - for (int i = 0; i < left_loop; ++i) { - vst1q_f32(dout_med, vedge); - dout_med += 4; - } - for (int i = 0; i < left_loop_remain; ++i) { - *dout_med++ = edge_val; - } - for (int i = 0; i < med_loop; ++i) { - float32x4_t val = vld1q_f32(din_s); - float32x4_t val1 = vld1q_f32(din_s + 4); - vst1q_f32(dout_med, val); - vst1q_f32(dout_med + 4, val1); - din_s += 8; - dout_med += 8; - } - for (int i = 0; i < med_loop_remain; ++i) { - *dout_med++ = *din_s++; - } - edge_val = din_s[-1]; - vedge = vdupq_n_f32(edge_val); - for (int i = 0; i < right_loop; ++i) { - vst1q_f32(dout_med, vedge); - dout_med += 4; - } - for (int i = 0; i < right_loop_remain; ++i) { - *dout_med++ = edge_val; - } - } - - // process bottom - float* dout_bottom = dout_med; - for (int i = 0; i < pad_bottom; ++i) { - memcpy(dout_bottom, dout_s + w * (pad_top + h_in - 1), w * sizeof(float)); - dout_bottom += w; - } - - // process top - float* dout_top = dout_s; - for (int i = 0; i < pad_top; ++i) { - memcpy(dout_top, dout_s + w * pad_top, w * sizeof(float)); - dout_top += w; - } - } -} - -void pad_reflect(const float* din, - float* dout, - int n, - int c, - int h, - int w, - const int pad_top, - const int pad_bottom, - const int pad_left, - const int pad_right, - const float pad_value) { - int h_in = h - pad_top - pad_bottom; - int w_in = w - pad_left - pad_right; - int spatial_size_out = w * h; - int spatial_size_in = h_in * w_in; -#pragma omp parallel for - for (int s = 0; s < n * c; ++s) { - const float* din_s = din + s * spatial_size_in; - float* dout_s = dout + s * spatial_size_out; - - // process med - int left_loop = pad_left >> 2; - int right_loop = pad_right >> 2; - int med_loop = w_in >> 3; - int med_loop_remain = w_in & 7; - int left_loop_remain = pad_left & 3; - int right_loop_remain = pad_right & 3; - float* dout_med = dout_s + w * pad_top; - for (int j = 0; j < h_in; ++j) { -#ifdef __aarch64__ - for (int i = 0; i < left_loop; ++i) { - float32x4_t val = vld1q_f32(din_s + left_loop_remain + - ((left_loop - i - 1) << 2) + 1); - val = vrev64q_f32(val); - float32x2_t low = vget_low_f32(val); - float32x2_t high = vget_high_f32(val); - float32x2_t tmp = low; - low = high; - high = tmp; - float32x4_t val1 = vcombine_f32(low, high); - vst1q_f32(dout_med, val1); - dout_med += 4; - } -#else - const float* din_s_ptr = - din_s + left_loop_remain + ((left_loop - 1) << 2) + 1; - int cnt = left_loop; - if (cnt > 0) { - asm volatile( - "1: \n" - "vld1.32 {d0-d1}, [%[din_s]] \n" - "subs %[cnt], #1 \n" - "sub %[din_s], #16 \n" - "vrev64.32 q1, q0 \n" - "vswp d2, d3 \n" - "vst1.32 {d2-d3}, [%[dout_med]]!\n" - "bne 1b \n" - : - [din_s] "+r"(din_s_ptr), [dout_med] "+r"(dout_med), [cnt] "+r"(cnt) - : - : "cc", "memory", "q0", "q1"); - } -#endif // __aarch64__ - for (int i = 0; i < left_loop_remain; ++i) { - *dout_med++ = *(din_s + left_loop_remain - i); - } - for (int i = 0; i < med_loop; ++i) { - float32x4_t val = vld1q_f32(din_s); - 
float32x4_t val1 = vld1q_f32(din_s + 4);
-        vst1q_f32(dout_med, val);
-        vst1q_f32(dout_med + 4, val1);
-        din_s += 8;
-        dout_med += 8;
-      }
-      for (int i = 0; i < med_loop_remain; ++i) {
-        *dout_med++ = *din_s++;
-      }
-#ifdef __aarch64__
-      for (int i = 0; i < right_loop; ++i) {
-        float32x4_t val = vld1q_f32(din_s - ((i + 1) << 2) - 1);
-        val = vrev64q_f32(val);
-        float32x2_t low = vget_low_f32(val);
-        float32x2_t high = vget_high_f32(val);
-        float32x2_t tmp = low;
-        low = high;
-        high = tmp;
-        float32x4_t val1 = vcombine_f32(low, high);
-        vst1q_f32(dout_med, val1);
-        dout_med += 4;
-      }
-#else
-      din_s_ptr = din_s - 5;
-      cnt = right_loop;
-      if (cnt > 0) {
-        asm volatile(
-            "1:                             \n"
-            "vld1.32 {d0-d1}, [%[din_s]]    \n"
-            "subs %[cnt], #1                \n"
-            "sub %[din_s], #16              \n"
-            "vrev64.32 q1, q0               \n"
-            "vswp d2, d3                    \n"
-            "vst1.32 {d2-d3}, [%[dout_med]]!\n"
-            "bne 1b                         \n"
-            : [din_s] "+r"(din_s_ptr), [dout_med] "+r"(dout_med), [cnt] "+r"(cnt)
-            :
-            : "cc", "memory", "q0", "q1");
-      }
-#endif  // __aarch64__
-      const float* remain = din_s - (right_loop << 2) - 2;
-      for (int i = 0; i < right_loop_remain; ++i) {
-        *dout_med++ = *remain--;
-      }
-    }
-
-    // process bottom
-    float* dout_bottom = dout_med;
-    float* dout_bottom_reflect = dout_med - (w << 1);
-    for (int i = 0; i < pad_bottom; ++i) {
-      memcpy(dout_bottom, dout_bottom_reflect, w * sizeof(float));
-      dout_bottom += w;
-      dout_bottom_reflect -= w;
-    }
-
-    // process top
-    float* dout_top = dout_s;
-    float* dout_top_reflect = dout_s + w * (pad_top << 1);
-    for (int i = 0; i < pad_top; ++i) {
-      memcpy(dout_top, dout_top_reflect, w * sizeof(float));
-      dout_top += w;
-      dout_top_reflect -= w;
-    }
-  }
-}
-
-// void pad2d_func(const lite::Tensor* input, lite::Tensor* output)
-void pad2d_func(const lite::Tensor* input,
-                lite::Tensor* output,
-                int _mode,
-                std::vector<int> _pad_h,
-                std::vector<int> _pad_w,
-                float _pad_value) {
-  float* dout = output->mutable_data<float>();  // modified by zhiqiang
-  const float* din = input->data<float>();      // modified by zhiqiang
-
-  auto output_dims = output->dims();
-  // nchw
-  int on = output_dims[0];
-  int oc = output_dims[1];
-  int oh = output_dims[2];
-  int ow = output_dims[3];
-  /* _mode is the PadMode:
-     typedef enum {
-       PAD_CONSTANT = 0,
-       PAD_EDGE = 1,
-       PAD_REFLECT = 2,
-     } PadMode; */
-  if (_mode == 0) {
-    pad_constant(din, dout, on, oc, oh, ow,
-                 _pad_h[0], _pad_h[1], _pad_w[0], _pad_w[1], _pad_value);
-  } else if (_mode == 1) {
-    pad_edge(din, dout, on, oc, oh, ow,
-             _pad_h[0], _pad_h[1], _pad_w[0], _pad_w[1], _pad_value);
-  } else if (_mode == 2) {
-    pad_reflect(din, dout, on, oc, oh, ow,
-                _pad_h[0], _pad_h[1], _pad_w[0], _pad_w[1], _pad_value);
-  } else {
-    LOG(ERROR) << "ERROR: unknown pad mode " << _mode;
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/pad2d.h b/lite/backends/arm/math/pad2d.h
deleted file mode 100644
index 08c5c8c1a2..0000000000
--- a/lite/backends/arm/math/pad2d.h
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include "lite/operators/op_params.h" -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void pad_constant(const float* din, - float* dout, - int n, - int c, - int h, - int w, - const int pad_top, - const int pad_bottom, - const int pad_left, - const int pad_right, - const float pad_value); -void pad_edge(const float* din, - float* dout, - int n, - int c, - int h, - int w, - const int pad_top, - const int pad_bottom, - const int pad_left, - const int pad_right, - const float pad_value); -void pad_reflect(const float* din, - float* dout, - int n, - int c, - int h, - int w, - const int pad_top, - const int pad_bottom, - const int pad_left, - const int pad_right, - const float pad_value); -void pad2d_func(const lite::Tensor* input, - lite::Tensor* output, - int _mode, - std::vector _pad_h, - std::vector _pad_w, - float _pad_value); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc deleted file mode 100644 index 38078580c2..0000000000 --- a/lite/backends/arm/math/pooling.cc +++ /dev/null @@ -1,3173 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
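
pad2d.h above declares one entry point per border mode. The modes differ only in how an out-of-range source index is remapped, so a scalar sketch for a single row pins down the semantics (the mode numbering follows pad2d_func; the function itself is mine, and reflect assumes pad sizes smaller than the row width, as mirror padding requires):

    #include <algorithm>

    // mode: 0 = constant fill, 1 = edge (replicate), 2 = reflect (mirror,
    // edge element not repeated).
    void pad_row_ref(const float* in, float* out, int w_in,
                     int pad_left, int pad_right, int mode, float value) {
      int w_out = w_in + pad_left + pad_right;
      for (int x = 0; x < w_out; ++x) {
        int src = x - pad_left;
        if (src >= 0 && src < w_in) {
          out[x] = in[src];
          continue;
        }
        switch (mode) {
          case 0: out[x] = value; break;
          case 1: out[x] = in[std::min(std::max(src, 0), w_in - 1)]; break;
          case 2: out[x] = in[src < 0 ? -src : 2 * (w_in - 1) - src]; break;
        }
      }
    }
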
- -#include "lite/backends/arm/math/pooling.h" -#include -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void pooling_basic(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - bool global_pooling, - bool exclusive, - bool adaptive, - bool ceil_mode, - bool use_quantizer, - const std::string& pooling_type) { - // no need to pad input tensor, border is zero pad inside this function - int kernel_h = ksize[0]; - int kernel_w = ksize[1]; - int stride_h = strides[0]; - int stride_w = strides[1]; - int pad_h = paddings[0]; - int pad_w = paddings[1]; - int size_channel_in = win * hin; - int size_channel_out = wout * hout; - if (global_pooling) { - if (pooling_type == "max") { // Pooling_max - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; ++c) { - const float* din_ch = din_batch + c * size_channel_in; // in address - float tmp1 = din_ch[0]; - for (int i = 0; i < size_channel_in; ++i) { - float tmp2 = din_ch[i]; - tmp1 = tmp1 > tmp2 ? tmp1 : tmp2; - } - dout_batch[c] = tmp1; - } - } - } else if (pooling_type == "avg") { - // Pooling_average_include_padding - // Pooling_average_exclude_padding - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; ++c) { - const float* din_ch = din_batch + c * size_channel_in; // in address - float sum = 0.f; - for (int i = 0; i < size_channel_in; ++i) { - sum += din_ch[i]; - } - dout_batch[c] = sum / size_channel_in; - } - } - } else { - LOG(FATAL) << "unsupported pooling type: " << pooling_type; - } - } else { - if (pooling_type == "max") { - // Pooling_max - for (int n = 0; n < num; ++n) { - float* dout_ch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_row = dout_ch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - for (int i = 0; i < hout; i++) { - for (int j = 0; j < wout; j++) { - int hstart = i * stride_h - pad_h; - int wstart = j * stride_w - pad_w; - int hend = std::min(hstart + kernel_h, hin + pad_h); - int wend = std::min(wstart + kernel_w, win + pad_w); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - hend = std::min(hend, hin); - wend = std::min(wend, win); - int pool_size = (hend - hstart) * (wend - wstart); - if (pool_size == 0) continue; - float tmp1 = din_ch[hstart * win + wstart]; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - float tmp2 = din_ch[h * win + w]; - tmp1 = tmp1 > tmp2 ? 
tmp1 : tmp2; - } - } - dout_row[j] = tmp1; - } - dout_row += wout; - } - } - } - } else if (pooling_type == "avg") { - if (exclusive) { - // Pooling_average_exclude_padding - for (int n = 0; n < num; ++n) { - float* dout_ch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_row = dout_ch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - for (int i = 0; i < hout; i++) { - for (int j = 0; j < wout; j++) { - int hstart = i * stride_h - pad_h; - int wstart = j * stride_w - pad_w; - int hend = std::min(hstart + kernel_h, hin + pad_h); - int wend = std::min(wstart + kernel_w, win + pad_w); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - hend = std::min(hend, hin); - wend = std::min(wend, win); - int pool_size = (hend - hstart) * (wend - wstart); - if (pool_size == 0) continue; - float sum = 0.f; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - sum += din_ch[h * win + w]; - } - } - dout_row[j] = sum / pool_size; - } - dout_row += wout; - } - } - } - } else { // Pooling_average_include_padding - for (int n = 0; n < num; ++n) { - float* dout_ch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_row = dout_ch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - for (int i = 0; i < hout; i++) { - for (int j = 0; j < wout; j++) { - int hstart = i * stride_h - pad_h; - int wstart = j * stride_w - pad_w; - int hend = std::min(hstart + kernel_h, hin + pad_h); - int wend = std::min(wstart + kernel_w, win + pad_w); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - hend = std::min(hend, hin); - wend = std::min(wend, win); - int pool_size = (hend - hstart) * (wend - wstart); - if (pool_size == 0) continue; - float sum = 0.f; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - sum += din_ch[h * win + w]; - } - } - dout_row[j] = sum / (kernel_w * kernel_h); - } - dout_row += wout; - } - } - } - } - } else { - LOG(FATAL) << "unsupported pooling type: " << pooling_type; - } - } -} - -void pooling_global_max(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win) { - int size_channel_in = win * hin; - int cnt = size_channel_in / 8; - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; ++c) { - const float* din_ch = din_batch + c * size_channel_in; - int i = 0; - float minval = std::numeric_limits::lowest(); - float32x4_t vmax = vdupq_n_f32(minval); -#ifdef __aarch64__ - for (; i < cnt; i++) { - float32x4_t vdin1 = vld1q_f32(din_ch); - vmax = vmaxq_f32(vdin1, vmax); - float32x4_t vdin2 = vld1q_f32(din_ch + 4); - vmax = vmaxq_f32(vmax, vdin2); - din_ch += 8; - } -#else - int cnt_num = cnt; - if (cnt_num > 0) { - asm volatile( - "max_loop: @main loop\n" - "vld1.f32 {d0-d1}, [%[din_ch]]! @load q1,din_ch\n" - "vmax.f32 %q[vmax], %q[vmax], q0 @max vmax,vmax,din_ch\n" - "vld1.f32 {d2-d3}, [%[din_ch]]! 
@load 2nd 4 data\n" - "vmax.f32 %q[vmax], %q[vmax], q1 @compare 2nd 4 datas\n" - "subs %[cnt_num], #1 @cnt_num--\n" - "bne max_loop @bne cnt_num\n" - : [din_ch] "+r"(din_ch), [cnt_num] "+r"(cnt_num), [vmax] "+w"(vmax) - : - : "cc", "memory", "q0", "q1"); - } -#endif // __aarch64__ - float32x2_t vmax_tmp = vmax_f32(vget_low_f32(vmax), vget_high_f32(vmax)); - float tmp1 = vget_lane_f32(vmax_tmp, 0); - float tmp2 = vget_lane_f32(vmax_tmp, 1); - float max_tmp = tmp1 > tmp2 ? tmp1 : tmp2; - for (i = cnt * 8; i < size_channel_in; ++i) { - /* code */ - max_tmp = max_tmp > din_ch[0] ? max_tmp : din_ch[0]; - din_ch++; - } - dout_batch[c] = max_tmp; - } - } -} - -void pooling_global_avg(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win) { - int size_channel_in = win * hin; - int cnt = size_channel_in / 4; - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - const float* din_ch = din_batch + c * size_channel_in; // in address - int i = 0; - float32x4_t vsum = vdupq_n_f32(0.0f); -#ifdef __aarch64__ - for (; i < cnt; i++) { - vsum = vaddq_f32(vld1q_f32(din_ch), vsum); - din_ch += 4; - } -#else - int cnt_num = cnt; - if (cnt_num > 0) { - asm volatile( - "add_loop: @main loop\n" - "vld1.f32 {d0-d1}, [%[din_ch]]! @load q1,din_ch\n" - "vadd.f32 %q[vsum], %q[vsum], q0 @add vmax,vmax, din_ch\n" - "subs %[cnt_num], #1 @cnt_num--\n" - "bne add_loop @bne num\n" - : [din_ch] "+r"(din_ch), [cnt_num] "+r"(cnt_num), [vsum] "+w"(vsum) - : - : "cc", "memory", "q0"); - } -#endif // __aarch64__ - float32x2_t vsum_tmp = vadd_f32(vget_low_f32(vsum), vget_high_f32(vsum)); - float sum = vget_lane_f32(vsum_tmp, 0) + vget_lane_f32(vsum_tmp, 1); - for (i = cnt * 4; i < size_channel_in; i++) { - sum += din_ch[0]; - din_ch++; - } - dout_batch[c] = sum / size_channel_in; - } - } -} - -void pooling2x2s2_max(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win) { - int kernel = 2; - int stride = 2; - int padding = 0; - int size_channel_out = wout * hout; - int size_channel_in = win * hin; - - int w_needed = (wout << 1); - int h_needed = (hout << 1); - int w_limit = w_needed > win ? win : w_needed; - int h_limit = h_needed > hin ? 
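
pooling_global_max above reduces a whole H x W plane per channel: a vector accumulator takes the running maximum of 8 floats per step, then a pairwise horizontal max plus a scalar loop finish the leftover elements. A compact intrinsics-only sketch of the same reduction (4-wide body for brevity; ARM-only, as it needs arm_neon.h):

    #include <arm_neon.h>
    #include <algorithm>
    #include <limits>

    // Maximum over one channel plane of `size` floats.
    float channel_max(const float* din, int size) {
      float32x4_t vmax = vdupq_n_f32(std::numeric_limits<float>::lowest());
      int i = 0;
      for (; i + 4 <= size; i += 4) {
        vmax = vmaxq_f32(vmax, vld1q_f32(din + i));   // vector body
      }
      float32x2_t v2 = vmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
      float m = std::max(vget_lane_f32(v2, 0), vget_lane_f32(v2, 1));
      for (; i < size; ++i) m = std::max(m, din[i]);  // scalar tail
      return m;
    }
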
hin : h_needed; - int w_even = (w_limit >> 1) << 1; - int h_even = (h_limit >> 1) << 1; - int w_unroll_size = (w_even >> 3) << 3; - // int w_unroll_remain = w_even - w_unroll_size; - int w_in_2 = win << 1; - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_ch = dout_batch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - const float* r0 = din_ch; - const float* r1 = r0 + win; - int h = 0; - for (; h < h_even; h += 2) { - int w = 0; -#ifdef __aarch64__ - for (; w < w_unroll_size; w += 8) { - float32x4_t dr00 = vld1q_f32(&r0[w]); - float32x4_t dr01 = vld1q_f32(&r0[w + 4]); - float32x4_t dr10 = vld1q_f32(&r1[w]); - float32x4_t dr11 = vld1q_f32(&r1[w + 4]); - float32x4_t dmax1 = vmaxq_f32(dr00, dr10); - float32x4_t dmax2 = vmaxq_f32(dr01, dr11); -#ifdef __aarch64__ - float32x4_t dmax = vpmaxq_f32(dmax1, dmax2); -#else - float32x2_t dmaxl = - vpmax_f32(vget_low_f32(dmax1), vget_high_f32(dmax1)); - float32x2_t dmaxh = - vpmax_f32(vget_low_f32(dmax2), vget_high_f32(dmax2)); - float32x4_t dmax = vcombine_f32(dmaxl, dmaxh); -#endif - vst1q_f32(&dout_ch[w >> 1], dmax); - } -#else - float* dr_out = dout_ch; - const float* dr0 = r0; - const float* dr1 = r1; - int cnt_num = w_unroll_size >> 3; - if (cnt_num > 0) { - asm volatile( - "s2_max_loop: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load q0,dr0\n" - "vld1.f32 {d4-d7}, [%[dr1]]! @load q1,dr1\n" - "vmax.f32 q0, q0, q2 @max q0,q0,q2\n" - "vmax.f32 q1, q1, q3 @max q1,q1,q2\n" - "vpmax.f32 d4, d0, d1 @max d4,d0,d1\n" - "vpmax.f32 d5, d2, d3 @max d5,d2,d3\n" - "vst1.f32 {d4-d5}, [%[dr_out]]! @vst1 q2,dr_out\n" - "subs %[cnt_num], #1 @cnt_num--\n" - "bne s2_max_loop @bne cnt_num\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num) - : - : "cc", "memory", "q0", "q1", "q2", "q3"); - } - w = w_unroll_size; -#endif // __aarch64__ - for (; w < w_even; w += 2) { - dout_ch[w >> 1] = - std::max(std::max(r0[w], r0[w + 1]), std::max(r1[w], r1[w + 1])); - } - for (; w < w_limit; ++w) { // run 0 or 1 time - dout_ch[w >> 1] = std::max(r0[w], r1[w]); - } - r0 += w_in_2; // << 1; - r1 += w_in_2; // << 1; - dout_ch += wout; - } - // process remain row (odd, last row) - for (; h < h_limit; h++) { // run 0 or 1 time - int w = 0; -#ifdef __aarch64__ - for (; w < w_unroll_size; w += 8) { - float32x4_t dr00 = vld1q_f32(&r0[w]); - float32x4_t dr01 = vld1q_f32(&r0[w + 4]); -#ifdef __aarch64__ - float32x4_t dmax = vpmaxq_f32(dr00, dr01); -#else - float32x2_t dmaxl = - vpmax_f32(vget_low_f32(dr00), vget_high_f32(dr00)); - float32x2_t dmaxh = - vpmax_f32(vget_low_f32(dr01), vget_high_f32(dr01)); - float32x4_t dmax = vcombine_f32(dmaxl, dmaxh); -#endif - vst1q_f32(&dout_ch[w >> 1], dmax); - } -#else - float* dr_out = dout_ch; - const float* dr0 = r0; - int cnt_num = w_unroll_size >> 3; - if (cnt_num > 0) { - asm volatile( - "s2_max_loop1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load q0,dr0\n" - "vpmax.f32 d4, d0, d1 @max d4,d0,d1\n" - "vpmax.f32 d5, d2, d3 @max d5,d2,d3\n" - "vst1.f32 {d4-d5}, [%[dr_out]]! 
@vst1 q2,dr_out\n" - "subs %[cnt_num], #1 @cnt_num--\n" - "bne s2_max_loop1 @bne cnt_num\n" - : [dr0] "+r"(dr0), [dr_out] "+r"(dr_out), [cnt_num] "+r"(cnt_num) - : - : "cc", "memory", "q0", "q1", "q2"); - } - w = w_unroll_size; -#endif // __aarch64__ - for (; w < w_even; w += 2) { - dout_ch[w >> 1] = std::max(r0[w], r0[w + 1]); - } - for (; w < w_limit; ++w) { // run 0 or 1 time - dout_ch[w >> 1] = r0[w]; - } - } - } - } -} - -void pooling2x2s2_avg(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - bool exclusive) { - int kernel = 2; - int stride = 2; - int padding = 0; - int size_channel_out = wout * hout; - int size_channel_in = win * hin; - - int w_needed = (wout << 1); - int h_needed = (hout << 1); - int w_limit = w_needed > win ? win : w_needed; - int h_limit = h_needed > hin ? hin : h_needed; - int w_even = (w_limit >> 1) << 1; - int h_even = (h_limit >> 1) << 1; - int w_unroll_size = (w_even >> 3) << 3; - // int w_unroll_remain = w_even - w_unroll_size; - int w_in_2 = win << 1; - const float coef = 1.f / 4.f; - const float coef_1 = exclusive ? 1.f : coef; - const float coef_2 = exclusive ? 1.f / 2.f : coef; - float32x4_t vcoef = vdupq_n_f32(coef); - float32x4_t vcoef_1 = vdupq_n_f32(coef_1); - float32x4_t vcoef_2 = vdupq_n_f32(coef_2); - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_ch = dout_batch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - const float* r0 = din_ch; - const float* r1 = r0 + win; - int h = 0; - for (; h < h_even; h += 2) { - int w = 0; -#ifdef __aarch64__ - for (; w < w_unroll_size; w += 8) { - float32x4_t dr00 = vld1q_f32(&r0[w]); - float32x4_t dr01 = vld1q_f32(&r0[w + 4]); - float32x4_t dr10 = vld1q_f32(&r1[w]); - float32x4_t dr11 = vld1q_f32(&r1[w + 4]); - float32x4_t dsum1 = vaddq_f32(dr00, dr10); - float32x4_t dsum2 = vaddq_f32(dr01, dr11); -#ifdef __aarch64__ - float32x4_t dsum = vpaddq_f32(dsum1, dsum2); -#else - float32x2_t dsuml = - vpadd_f32(vget_low_f32(dsum1), vget_high_f32(dsum1)); - float32x2_t dsumh = - vpadd_f32(vget_low_f32(dsum2), vget_high_f32(dsum2)); - float32x4_t dsum = vcombine_f32(dsuml, dsumh); -#endif - float32x4_t res = vmulq_f32(dsum, vcoef); - vst1q_f32(&dout_ch[w >> 1], res); - } -#else - float* dr_out = dout_ch; - const float* dr0 = r0; - const float* dr1 = r1; - int cnt_num = w_unroll_size >> 3; - if (cnt_num > 0) { - asm volatile( - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load q0,dr0\n" - "vld1.f32 {d4-d7}, [%[dr1]]! @load q1,dr1\n" - "vadd.f32 q0, q0, q2 @add q0,q0,q2\n" - "vadd.f32 q1, q1, q3 @add q1,q1,q2\n" - "vpadd.f32 d4, d0, d1 @add d4,d0,d1\n" - "vpadd.f32 d5, d2, d3 @add d5,d2,d3\n" - "vmul.f32 q2, q2, %q[vcoef] @mul q2,q2,vcoef\n" - "vst1.f32 {d4-d5}, [%[dr_out]]! 
@vst1 q2,dr_out\n" - "subs %[cnt_num], #1 @cnt_num--\n" - "bne 1b @bne cnt_num\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [vcoef] "+w"(vcoef), - [cnt_num] "+r"(cnt_num) - : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num), "w"(vcoef) - : "cc", "memory", "q0", "q1", "q2", "q3"); - } - w = w_unroll_size; -#endif // __aarch64__ - for (; w < w_even; w += 2) { - dout_ch[w >> 1] = (r0[w] + r0[w + 1] + r1[w] + r1[w + 1]) * coef; - } - for (; w < w_limit; ++w) { // run 0 or 1 time - dout_ch[w >> 1] = (r0[w] + r1[w]) * coef_2; - } - r0 += w_in_2; // << 1; - r1 += w_in_2; // << 1; - dout_ch += wout; - } - // process remain row (odd, last row) - for (; h < h_limit; h++) { // run 0 or 1 time - int w = 0; -#ifdef __aarch64__ - for (; w < w_unroll_size; w += 8) { - float32x4_t dr00 = vld1q_f32(&r0[w]); - float32x4_t dr01 = vld1q_f32(&r0[w + 4]); -#ifdef __aarch64__ - float32x4_t dsum = vpaddq_f32(dr00, dr01); -#else - float32x2_t dsuml = - vpadd_f32(vget_low_f32(dr00), vget_high_f32(dr00)); - float32x2_t dsumh = - vpadd_f32(vget_low_f32(dr01), vget_high_f32(dr01)); - float32x4_t dsum = vcombine_f32(dsuml, dsumh); -#endif - float32x4_t res = vmulq_f32(dsum, vcoef_2); - vst1q_f32(&dout_ch[w >> 1], res); - } -#else - float* dr_out = dout_ch; - const float* dr0 = r0; - int cnt_num = w_unroll_size >> 3; - if (cnt_num > 0) { - asm volatile( - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load q0,dr0\n" - "vpadd.f32 d4, d0, d1 @add d4,d0,d1\n" - "vpadd.f32 d5, d2, d3 @add d5,d2,d3\n" - "vmul.f32 q2, q2, %q[vcoef_2] @mul q2,q2,vcoef_2\n" - "vst1.f32 {d4-d5}, [%[dr_out]]! @vst1 q2,dr_out\n" - "subs %[cnt_num], #1 @cnt_num--\n" - "bne 1b @bne cnt_num\n" - : [dr0] "+r"(dr0), - [dr_out] "+r"(dr_out), - [vcoef_2] "+w"(vcoef_2), - [cnt_num] "+r"(cnt_num) - : "r"(dr0), "r"(dr_out), "r"(cnt_num), "w"(vcoef_2) - : "cc", "memory", "q0", "q1", "q2"); - } - w = w_unroll_size; -#endif // __aarch64__ - for (; w < w_even; w += 2) { - dout_ch[w >> 1] = (r0[w] + r0[w + 1]) * coef_2; - } - for (; w < w_limit; ++w) { // run 0 or 1 time - dout_ch[w >> 1] = r0[w] * coef_1; - } - } - } - } -} - -void pooling3x3s1p1_max(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win) { - int kernel = 3; - int stride = 1; - int padding = 1; - int size_channel_out = wout * hout; - int size_channel_in = win * hin; - - int w_unroll_size = ((win - 2) >> 2) << 2; - int w_unroll_remain = win - 2 - w_unroll_size; - const float minval = std::numeric_limits::lowest(); - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_ch = dout_batch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - const float* r0 = din_ch; - const float* r1 = r0 + win; - const float* r2 = r1 + win; - int cnt_num = w_unroll_size >> 2; // w_unroll_size / 4 - float* dr_out = dout_ch; - const float* dr0 = r0; - const float* dr1 = r1; - const float* dr2 = r2; - int w = 0; - int cnt = 1; - // left - dout_ch[0] = std::max(std::max(r0[0], r0[1]), std::max(r1[0], r1[1])); -// first row with zero pad -#ifdef __aarch64__ - for (; w < w_unroll_size; w += 4) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); - 
float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); - - float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); - float32x4_t vmax_3456 = vextq_f32(vmax_1234, vmax_5678, 2); - float32x2_t vmax_12_34 = - vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); - float32x2_t vmax_23_45 = - vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); - float32x2_t vmax_34_56 = - vpmax_f32(vget_low_f32(vmax_3456), vget_high_f32(vmax_3456)); - float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); - float32x2_t vmax_234_456 = vmax_f32(vmax_23_45, vmax_34_56); - float32x4_t vmax = vdupq_n_f32(vget_lane_f32(vmax_123_345, 0)); - vmax = vsetq_lane_f32(vget_lane_f32(vmax_234_456, 0), vmax, 1); - vmax = vsetq_lane_f32(vget_lane_f32(vmax_123_345, 1), vmax, 2); - vmax = vsetq_lane_f32(vget_lane_f32(vmax_234_456, 1), vmax, 3); - vst1q_f32(&dout_ch[cnt], vmax); - cnt += 4; - } - -#else - dr_out = dr_out + 1; - if (cnt_num > 0) { - asm volatile( - "1: @main loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d4-d5}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d2}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6}, [%[dr1]]! @load d4-d7,dr1\n" - "vmax.f32 q5, q0, q2 @max r0_1234,r1_1234\n" - "vmax.f32 d12, d2, d6 @max r0_5678,r1_5678\n" - //"vmov.f32 s7,s6 @mov s7,s6\n" - "vext.f32 q0, q5, q6, #1 @vext max_2345\n" - "vext.f32 q2, q5, q6, #2 @vext max_3456\n" - "vpmax.f32 d2, d10, d11 @pmax d4,max_1234,max_1234\n" - "vpmax.f32 d3, d0, d1 @pmax d4,max_2345,max_2345\n" - "vpmax.f32 d6, d4, d5 @pmax d6,max_3456,max_3456\n" - "vmax.f32 d8, d2, d3 @max d2,vmax_12_34,vmax_23_45\n" - "vmax.f32 d9, d3, d6 @max d2,vmax_23_45,vmax_34_56\n" - "sub %[dr0], #8 @sub w,8\n" - "sub %[dr1], #8 @sub w,8\n" - // swap - "vmov.f32 s0, s17 @mov\n" - "vmov.f32 s17, s18 @mov\n" - "vmov.f32 s18, s0 @mov\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! 
@vst1 d0,dr_out\n" - "bne 1b @bne s1_max_loop\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num) - : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); - } - -#endif - // remain - w = w_unroll_size; - for (int j = 0; j < w_unroll_remain; j++) { - float tmp_max = std::max(r0[j + w], r1[j + w]); - tmp_max = std::max(tmp_max, std::max(r0[j + w + 1], r1[j + w + 1])); - tmp_max = std::max(tmp_max, std::max(r0[j + w + 2], r1[j + w + 2])); - dout_ch[j + w + 1] = tmp_max; - } - // right - float tmp = std::max(r0[win - 2], r1[win - 2]); - tmp = std::max(tmp, std::max(r0[win - 1], r1[win - 1])); - dout_ch[wout - 1] = tmp; - - // r0 = r1; - // r1 = r0 + w_in; - // r2 = r1 + w_in; - dout_ch += wout; - int h = 0; - for (; h < hin - 2; h += 1) { - // deal with left pad - float maxr0 = std::max(r0[0], r0[1]); - float maxr1 = std::max(r1[0], r1[1]); - float maxr2 = std::max(r2[0], r2[1]); - dout_ch[0] = std::max(std::max(maxr0, maxr1), maxr2); -#ifdef __aarch64__ - w = 0; - cnt = 1; - for (; w < w_unroll_size; w += 4) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr2_1234 = vld1q_f32(&r2[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr2_5678 = vld1q_f32(&r2[w + 4]); - float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); - vmax_1234 = vmaxq_f32(vmax_1234, vr2_1234); - float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); - vmax_5678 = vmaxq_f32(vmax_5678, vr2_5678); - - float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); - float32x4_t vmax_3456 = vextq_f32(vmax_1234, vmax_5678, 2); - float32x2_t vmax_12_34 = - vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); - float32x2_t vmax_23_45 = - vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); - float32x2_t vmax_34_56 = - vpmax_f32(vget_low_f32(vmax_3456), vget_high_f32(vmax_3456)); - float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); - float32x2_t vmax_234_456 = vmax_f32(vmax_23_45, vmax_34_56); - float32x4_t vmax = vdupq_n_f32(vget_lane_f32(vmax_123_345, 0)); - vmax = vsetq_lane_f32(vget_lane_f32(vmax_234_456, 0), vmax, 1); - vmax = vsetq_lane_f32(vget_lane_f32(vmax_123_345, 1), vmax, 2); - vmax = vsetq_lane_f32(vget_lane_f32(vmax_234_456, 1), vmax, 3); - vst1q_f32(&dout_ch[cnt], vmax); - cnt += 4; - } -#else - dr_out = dout_ch + 1; - dr0 = r0; - dr1 = r1; - dr2 = r2; - cnt_num = w_unroll_size >> 2; - if (cnt_num > 0) { - asm volatile( - "1: @main loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d4-d5}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d8-d9}, [%[dr2]]! @load d4-d7,dr1\n" - "vld1.f32 {d2}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d10}, [%[dr2]]! 
@load d4-d7, dr1\n" - "vmax.f32 q7, q0, q2 @max r0_1234,r1_1234\n" - "vmax.f32 d16, d2, d6 @max r0_5678,r1_5678\n" - "vmax.f32 q3, q7, q4 @max r0_1234,r1_1234\n" - "vmax.f32 d12, d16, d10 @max r0_5678,r1_5678\n" - //"vmov.f32 s7,s6 @mov s7,s6\n" - "vext.f32 q0, q3, q6, #1 @vext max_2345\n" - "vext.f32 q2, q3, q6, #2 @vext max_3456\n" - "vpmax.f32 d2, d6, d7 @pmax d4,max_1234,max_1234\n" - "vpmax.f32 d3, d0, d1 @pmax d4,max_2345,max_2345\n" - "vpmax.f32 d6, d4, d5 @pmax d6,max_3456,max_3456\n" - "vmax.f32 d8, d2, d3 @max d2,vmax_12_34,vmax_23_45\n" - "vmax.f32 d9, d3, d6 @max d2,vmax_23_45,vmax_34_56\n" - "sub %[dr0], #8 @sub w,8\n" - "sub %[dr1], #8 @sub w,8\n" - "sub %[dr2], #8 @sub w,8\n" - // swap - "vmov.f32 s0, s17 @mov\n" - "vmov.f32 s17, s18 @mov\n" - "vmov.f32 s18, s0 @mov\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "bne 1b @bne s1_max_loop\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr2] "+r"(dr2), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num) - : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8"); - } -#endif - // remain - w = w_unroll_size; - for (int j = 0; j < w_unroll_remain; j++) { - float tmp_max = std::max(r0[j + w], r1[j + w]); - tmp_max = std::max(tmp_max, std::max(r0[j + w + 1], r1[j + w + 1])); - tmp_max = std::max(tmp_max, std::max(r0[j + w + 2], r1[j + w + 2])); - tmp_max = std::max(tmp_max, std::max(r2[j + w], r2[j + w + 1])); - tmp_max = std::max(tmp_max, r2[j + w + 2]); - dout_ch[j + w + 1] = tmp_max; - } - // right - tmp = std::max(r0[win - 2], r1[win - 2]); - tmp = std::max(tmp, std::max(r0[win - 1], r1[win - 1])); - tmp = std::max(tmp, std::max(r2[win - 2], r2[win - 1])); - dout_ch[wout - 1] = tmp; - - r0 = r1; - r1 = r2; - r2 = r1 + win; - dout_ch += wout; - } - - // the last two line - float maxr0 = std::max(r0[0], r0[1]); - float maxr1 = std::max(r1[0], r1[1]); - dout_ch[0] = std::max(maxr0, maxr1); -#ifdef __aarch64__ - w = 0; - cnt = 1; - for (; w < w_unroll_size; w += 4) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); - float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); - - float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); - float32x4_t vmax_3456 = vextq_f32(vmax_1234, vmax_5678, 2); - float32x2_t vmax_12_34 = - vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); - float32x2_t vmax_23_45 = - vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); - float32x2_t vmax_34_56 = - vpmax_f32(vget_low_f32(vmax_3456), vget_high_f32(vmax_3456)); - float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); - float32x2_t vmax_234_456 = vmax_f32(vmax_23_45, vmax_34_56); - float32x4_t vmax = vdupq_n_f32(vget_lane_f32(vmax_123_345, 0)); - vmax = vsetq_lane_f32(vget_lane_f32(vmax_234_456, 0), vmax, 1); - vmax = vsetq_lane_f32(vget_lane_f32(vmax_123_345, 1), vmax, 2); - vmax = vsetq_lane_f32(vget_lane_f32(vmax_234_456, 1), vmax, 3); - vst1q_f32(&dout_ch[cnt], vmax); - cnt += 4; - } -#else - dr_out = dout_ch + 1; - dr0 = r0; - dr1 = r1; - cnt_num = w_unroll_size >> 2; - if (cnt_num > 0) { - asm volatile( - "1: @main loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d4-d5}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d2}, [%[dr0]]! 
@load d0-d5,dr0\n" - "vld1.f32 {d6}, [%[dr1]]! @load d4-d7,dr1\n" - "vmax.f32 q5, q0, q2 @max r0_1234,r1_1234\n" - "vmax.f32 d12, d2, d6 @max r0_5678,r1_5678\n" - //"vmov.f32 s7,s6 @mov s7,s6\n" - "vext.f32 q0, q5, q6, #1 @vext max_2345\n" - "vext.f32 q2, q5, q6, #2 @vext max_3456\n" - "vpmax.f32 d2, d10, d11 @pmax d4,max_1234,max_1234\n" - "vpmax.f32 d3, d0, d1 @pmax d4,max_2345,max_2345\n" - "vpmax.f32 d6, d4, d5 @pmax d6,max_3456,max_3456\n" - "vmax.f32 d8, d2, d3 @max d2,vmax_12_34,vmax_23_45\n" - "vmax.f32 d9, d3, d6 @max d2,vmax_23_45,vmax_34_56\n" - "sub %[dr0], #8 @sub w,8\n" - "sub %[dr1], #8 @sub w,8\n" - // swap - "vmov.f32 s0, s17 @mov\n" - "vmov.f32 s17, s18 @mov\n" - "vmov.f32 s18, s0 @mov\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "bne 1b @bne s1_max_loop\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num) - : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); - } -#endif - // remian - w = w_unroll_size; - for (int j = 0; j < w_unroll_remain; j++) { - float tmp_max = std::max(r0[j + w], r1[j + w]); - tmp_max = std::max(tmp_max, std::max(r0[j + w + 1], r1[j + w + 1])); - tmp_max = std::max(tmp_max, std::max(r0[j + w + 2], r1[j + w + 2])); - dout_ch[j + w + 1] = tmp_max; - } - tmp = std::max(r0[win - 2], r1[win - 2]); - tmp = std::max(tmp, std::max(r0[win - 1], r1[win - 1])); - dout_ch[wout - 1] = tmp; - } - } -} - -void pooling3x3s1p1_avg(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - bool exclusive) { - int kernel = 3; - int stride = 1; - int padding = 1; - int size_channel_out = wout * hout; - int size_channel_in = win * hin; - - int w_unroll_size = ((win - 2) >> 2) << 2; - int w_unroll_remain = win - 2 - w_unroll_size; - const float coef = 1.f / 9.f; - const float coef_2 = exclusive ? 1.f / 2.f : coef; - const float coef_4 = exclusive ? 1.f / 4.f : coef; - const float coef_6 = exclusive ? 
1.f / 6.f : coef; - float32x4_t vcoef = vdupq_n_f32(coef); - float32x4_t vcoef_2 = vdupq_n_f32(coef_2); - float32x4_t vcoef_4 = vdupq_n_f32(coef_4); - float32x4_t vcoef_6 = vdupq_n_f32(coef_6); - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_ch = dout_batch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - const float* r0 = din_ch; - const float* r1 = r0 + win; - const float* r2 = r1 + win; - int cnt_num = w_unroll_size >> 2; // w_unroll_size / 4 - float* dr_out = dout_ch; - const float* dr0 = r0; - const float* dr1 = r1; - const float* dr2 = r2; - int w = 0; - int cnt = 1; - // left - dout_ch[0] = (r0[0] + r0[1] + r1[0] + r1[1]) * coef_4; -// first row with zero pad -#ifdef __aarch64__ - for (; w < w_unroll_size; w += 4) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); - float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); - - float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); - float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); - float32x4_t vsum = vaddq_f32(vsum_1234, vsum_2345); - vsum = vaddq_f32(vsum, vsum_3456); - vsum = vmulq_f32(vsum, vcoef_6); - vst1q_f32(&dout_ch[cnt], vsum); - cnt += 4; - } -#else - dr_out = dr_out + 1; - if (cnt_num > 0) { - asm volatile( - "1: @main loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d4-d5}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d2}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6}, [%[dr1]]! @load d4-d7,dr1\n" - "vadd.f32 q5, q0, q2 @max r0_1234,r1_1234\n" - "vadd.f32 d12, d2, d6 @max r0_5678,r1_5678\n" - //"vmov.f32 s7,s6 @mov s7,s6\n" - "vext.f32 q0, q5, q6, #1 @vext max_2345\n" - "vext.f32 q2, q5, q6, #2 @vext max_3456\n" - "vadd.f32 q1, q5, q0 @add 1234+2345\n" - "vadd.f32 q1, q1, q2 @add + 3456\n" - "vmul.f32 q4, q1, %q[vcoef_6] @mul * 1/9.f\n" - "sub %[dr0], #8 @sub w,8\n" - "sub %[dr1], #8 @sub w,8\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! 
@vst1 d0,dr_out\n" - "bne 1b @bne s1_max_loop\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [vcoef_6] "+w"(vcoef_6) - : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); - } - -#endif - // remain - w = w_unroll_size; - for (int j = 0; j < w_unroll_remain; j++) { - float tmp_sum = r0[j + w] + r1[j + w]; - tmp_sum += (r0[j + w + 1] + r1[j + w + 1]); - tmp_sum += (r0[j + w + 2] + r1[j + w + 2]); - dout_ch[j + w + 1] = tmp_sum * coef_6; - } - // right - float tmp = r0[win - 2] + r1[win - 2]; - tmp += (r0[win - 1] + r1[win - 1]); - dout_ch[wout - 1] = tmp * coef_4; - - // r0 = r1; - // r1 = r0 + w_in; - // r2 = r1 + w_in; - dout_ch += wout; - int h = 0; - for (; h < hin - 2; h += 1) { - // deal with left pad - float maxr0 = r0[0] + r0[1]; - float maxr1 = r1[0] + r1[1]; - float maxr2 = r2[0] + r2[1]; - dout_ch[0] = (maxr0 + maxr1 + maxr2) * coef_6; -#ifdef __aarch64__ - w = 0; - cnt = 1; - for (; w < w_unroll_size; w += 4) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr2_1234 = vld1q_f32(&r2[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr2_5678 = vld1q_f32(&r2[w + 4]); - float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); - vsum_1234 = vaddq_f32(vsum_1234, vr2_1234); - float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); - vsum_5678 = vaddq_f32(vsum_5678, vr2_5678); - - float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); - float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); - float32x4_t vsum = vaddq_f32(vsum_1234, vsum_2345); - vsum = vaddq_f32(vsum, vsum_3456); - vsum = vmulq_f32(vsum, vcoef); - vst1q_f32(&dout_ch[cnt], vsum); - cnt += 4; - } -#else - dr_out = dout_ch + 1; - dr0 = r0; - dr1 = r1; - dr2 = r2; - cnt_num = w_unroll_size >> 2; - if (cnt_num > 0) { - asm volatile( - "1: @main loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d4-d5}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d8-d9}, [%[dr2]]! @load d4-d7,dr1\n" - "vld1.f32 {d2}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d10}, [%[dr2]]! @load d4-d7,dr1\n" - "vadd.f32 q7, q0, q2 @max r0_1234,r1_1234\n" - "vadd.f32 d16, d2, d6 @max r0_5678,r1_5678\n" - "vadd.f32 q3, q7, q4 @max r0_1234,r1_1234\n" - "vadd.f32 d12, d16, d10 @max r0_5678,r1_5678\n" - //"vmov.f32 s7,s6 @mov s7,s6\n" - "vext.f32 q0, q3, q6, #1 @vext max_2345\n" - "vext.f32 q2, q3, q6, #2 @vext max_3456\n" - "vadd.f32 q1, q3, q0 @add 1234+2345\n" - "vadd.f32 q1, q1, q2 @add+3456\n" - "vmul.f32 q4, q1, %q[vcoef] @mul*1/9.f\n" - "sub %[dr0], #8 @sub w,8\n" - "sub %[dr1], #8 @sub w,8\n" - "sub %[dr2], #8 @sub w,8\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! 
@vst1 d0,dr_out\n" - "bne 1b @bne s1_max_loop\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr2] "+r"(dr2), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [vcoef] "+w"(vcoef) - : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8"); - } -#endif - // remain - w = w_unroll_size; - for (int j = 0; j < w_unroll_remain; j++) { - float tmp_sum = r0[j + w] + r1[j + w]; - tmp_sum += (r0[j + w + 1] + r1[j + w + 1]); - tmp_sum += (r0[j + w + 2] + r1[j + w + 2]); - tmp_sum += (r2[j + w + 1] + r2[j + w + 2]); - tmp_sum += r2[j + w]; - dout_ch[j + w + 1] = tmp_sum * coef; - } - // right - tmp = r0[win - 2] + r1[win - 2]; - tmp += (r0[win - 1] + r1[win - 1]); - tmp += (r2[win - 2] + r2[win - 1]); - dout_ch[wout - 1] = tmp * coef_6; - - r0 = r1; - r1 = r2; - r2 = r1 + win; - dout_ch += wout; - } - - // last line - float maxr0 = (r0[0] + r0[1]); - float maxr1 = (r1[0] + r1[1]); - dout_ch[0] = (maxr0 + maxr1) * coef_4; -#ifdef __aarch64__ - w = 0; - cnt = 1; - for (; w < w_unroll_size; w += 4) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); - float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); - - float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); - float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); - float32x4_t vsum = vaddq_f32(vsum_1234, vsum_2345); - vsum = vaddq_f32(vsum, vsum_3456); - vsum = vmulq_f32(vsum, vcoef_6); - vst1q_f32(&dout_ch[cnt], vsum); - cnt += 4; - } -#else - dr_out = dout_ch + 1; - dr0 = r0; - dr1 = r1; - cnt_num = w_unroll_size >> 2; - if (cnt_num > 0) { - asm volatile( - "1: @main loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d4-d5}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d2}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6}, [%[dr1]]! @load d4-d7,dr1\n" - "vadd.f32 q5, q0, q2 @max r0_1234,r1_1234\n" - "vadd.f32 d12, d2, d6 @max r0_5678,r1_5678\n" - //"vmov.f32 s7,s6 @mov s7,s6\n" - "vext.f32 q0, q5, q6, #1 @vext max_2345\n" - "vext.f32 q2, q5, q6, #2 @vext max_3456\n" - "vadd.f32 q1, q5, q0 @add 1234+2345\n" - "vadd.f32 q1, q1, q2 @add + 3456\n" - "vmul.f32 q4, q1, %q[vcoef_6] @mul * 1/9.f\n" - "sub %[dr0], #8 @sub w,8\n" - "sub %[dr1], #8 @sub w,8\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! 
@vst1 d0,dr_out\n" - "bne 1b @bne s1_max_loop\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [vcoef_6] "+w"(vcoef_6) - : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); - } -#endif - // remain - w = w_unroll_size; - for (int j = 0; j < w_unroll_remain; j++) { - float tmp_sum = r0[j + w] + r1[j + w]; - tmp_sum += (r0[j + w + 1] + r1[j + w + 1]); - tmp_sum += (r0[j + w + 2] + r1[j + w + 2]); - dout_ch[j + w + 1] = tmp_sum * coef_6; - } - // right - tmp = r0[win - 2] + r1[win - 2]; - tmp += (r0[win - 1] + r1[win - 1]); - dout_ch[wout - 1] = tmp * coef_4; - } - } -} - -void pooling3x3s2p1_max(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win) { - int kernel = 3; - int stride = 2; - int padding = 1; - int size_channel_out = wout * hout; - int size_channel_in = win * hin; - - int w_needed = (wout << 1) + 1; - int h_needed = (hout << 1) + 1; - int w_limit = w_needed > win ? win : w_needed; - int h_limit = h_needed > hin ? hin : h_needed; - int w_even = (w_limit >> 1) << 1; - int h_even = (h_limit >> 1) << 1; - int w_unroll_size = ((w_even - 1) >> 3) << 3; - int w_unroll_remain = w_even - 1 - w_unroll_size; - int w_remain = w_needed - w_limit - padding; - int h_remain = h_needed - h_limit - padding; - int w_in_2 = win << 1; - float minval = std::numeric_limits::lowest(); - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_ch = dout_batch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - const float* r0 = din_ch; - const float* r1 = r0 + win; - const float* r2 = r1 + win; - int cnt_num = w_unroll_size >> 3; - int cnt_num_remain = w_unroll_remain >> 1; - float* dr_out = dout_ch; - const float* dr0 = r0; - const float* dr1 = r1; - const float* dr2 = r2; - int w = 1; - int cnt = 1; - dout_ch[0] = std::max(std::max(r0[0], r0[1]), std::max(r1[0], r1[1])); -// first row with zero pad -#if __aarch64__ - for (; w < w_unroll_size; w += 8) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); - float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); - float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); - float32x4_t vmax_9101112 = vmaxq_f32(vr0_9101112, vr1_9101112); - float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); - float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112, 1); - float32x2_t vmax_12_34 = - vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); - float32x2_t vmax_23_45 = - vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); - float32x2_t vmax_56_78 = - vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); - float32x2_t vmax_67_89 = - vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); - float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); - float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); - vst1_f32(&dout_ch[cnt], vmax_123_345); - vst1_f32(&dout_ch[cnt + 2], vmax_567_789); - cnt += 4; - } - for (; w < w_even - 1; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - float32x4_t vr1 = vld1q_f32(&r1[w]); - vr0 = 
vsetq_lane_f32(minval, vr0, 3); - vr1 = vsetq_lane_f32(minval, vr1, 3); - float32x4_t vmax1 = vmaxq_f32(vr0, vr1); - float32x2_t vmax2 = - vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1)); - vmax2 = vpmax_f32(vmax2, vmax2); - dout_ch[cnt] = vget_lane_f32(vmax2, 0); - cnt++; - } -#else - dr0 = dr0 + 1; - dr1 = dr1 + 1; - dr_out = dr_out + 1; - // LOG(INFO) << "cnt_num: " << cnt_num << " cnt_num_remain: " << - // cnt_num_remain; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d10-d11}, [%[dr1]]! @load d4-d7,dr1\n" - "vmax.f32 q6, q0, q3 @max r0_1234,r1_1234\n" - "vmax.f32 q7, q1, q4 @max r0_5678,r1_5678\n" - "vmax.f32 q8, q2, q5 @max r0_9101112,r1_9101112\n" - //"vmov.f32 s7,s6 @mov s7,s6\n" - "vext.f32 q0, q6, q7, #1 @vext max_2345\n" - "vext.f32 q1, q7, q8, #1 @vext max_6789\n" - "vpmax.f32 d4, d12, d13 @pmax d4,vmax_1234,vmax_1234\n" - "vpmax.f32 d6, d14, d15 @pmax d6,vmax_5678,vmax_5678\n" - "vpmax.f32 d5, d0, d1 @pmax d5,vmax_2345,vmax_2345\n" - "vpmax.f32 d7, d2, d3 @pmax d7,vmax_6789,vmax_6789\n" - "vmax.f32 d8, d4, d5 @max d2,vmax_12_34,vmax_23_45\n" - "vmax.f32 d9, d6, d7 @max d2,vmax_56_78,vmax_67_89\n" - "sub %[dr0], #16 @add w,8\n" - "sub %[dr1], #16 @add w, 8\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "subs %[cnt_num], #1 @subs cnt_num, #1\n" - "bne 1b @bne s3_max_loop\n" - "3: @loop \n" - "cmp %[cnt_num_remain], #0 @cmp cnt_num,0\n" - "ble 4f @ble exit\n" - "2: @main loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vmov.f32 s3,s2 @movs3,s2\n" - "vmov.f32 s7,s6 @movs7,s6\n" - "vmax.f32 q0, q0, q1 @max q0,q0,q1\n" - "vpmax.f32 d0, d0, d1 @pmax d0,d0,d1\n" - "vpmax.f32 d0, d0, d0 @pmax d0,d0,d0\n" - "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0],dr_out\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @subs cnt_num,#1\n" - "bne 2b @bne s3_max_loop_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain) - : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num), "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9"); - } -#endif - // int w = w_even - 1; - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp = r0[wstart]; // std::numeric_limits::min(); - for (int i = wstart; i < wend; i++) { // only run 1 or 2 times - tmp = std::max(tmp, std::max(r0[i], r1[i])); - } - dout_ch[w_even >> 1] = tmp; - // cnt ++; - } - - r0 = r1; - r1 = r0 + win; - r2 = r1 + win; - dout_ch += wout; - int h = 2; - for (; h < h_even; h += 2) { - // deal with left pad - float maxr0 = std::max(r0[0], r0[1]); - float maxr1 = std::max(r1[0], r1[1]); - float maxr2 = std::max(r2[0], r2[1]); - dout_ch[0] = std::max(std::max(maxr0, maxr1), maxr2); -#if __aarch64__ - w = 1; - cnt = 1; - for (; w < w_unroll_size; w += 8) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); - float32x4_t vr2_1234 = vld1q_f32(&r2[w]); - float32x4_t vr2_5678 = vld1q_f32(&r2[w + 4]); - float32x4_t vr2_9101112 = vld1q_f32(&r2[w + 8]); - float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); - vmax_1234 = vmaxq_f32(vmax_1234, vr2_1234); - float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); - vmax_5678 = vmaxq_f32(vmax_5678, vr2_5678); - float32x4_t vmax_9101112 = vmaxq_f32(vr0_9101112, vr1_9101112); - vmax_9101112 = vmaxq_f32(vmax_9101112, vr2_9101112); - float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); - float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112, 1); - float32x2_t vmax_12_34 = - vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); - float32x2_t vmax_23_45 = - vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); - float32x2_t vmax_56_78 = - vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); - float32x2_t vmax_67_89 = - vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); - float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); - float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); - vst1_f32(&dout_ch[cnt], vmax_123_345); - vst1_f32(&dout_ch[cnt + 2], vmax_567_789); - cnt += 4; - } - for (; w < w_even - 1; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - float32x4_t vr1 = vld1q_f32(&r1[w]); - float32x4_t vr2 = vld1q_f32(&r2[w]); - vr0 = vsetq_lane_f32(minval, vr0, 3); - vr1 = vsetq_lane_f32(minval, vr1, 3); - vr2 = vsetq_lane_f32(minval, vr2, 3); - float32x4_t vmax1 = vmaxq_f32(vr0, vr1); - vmax1 = vmaxq_f32(vmax1, vr2); - float32x2_t vmax2 = - vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1)); - float32x2_t vmax = vpmax_f32(vmax2, vmax2); - dout_ch[cnt] = vget_lane_f32(vmax, 0); - cnt++; - } -#else - dr_out = dout_ch + 1; - dr0 = (r0 + 1); - dr1 = (r1 + 1); - dr2 = (r2 + 1); - cnt_num = w_unroll_size >> 3; - cnt_num_remain = w_unroll_remain >> 1; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - 
"ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d12-d15}, [%[dr2]]! @load d4-d7,dr1\n" - "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d10-d11}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d16-d17}, [%[dr2]]! @load d4-d7,dr1\n" - "vmax.f32 q9, q0, q3 @max q0,q0,q2\n" - "vmax.f32 q10, q1, q4 @max q1,q1,q3\n" - "vmax.f32 q11, q2, q5 @max q1,q1,q3\n" - "vmax.f32 q0, q9, q6 @max q0,q0,q2 1234\n" - "vmax.f32 q3, q10, q7 @max q1,q1,q3 5678\n" - "vmax.f32 q1, q11, q8 @max q1,q1,q3 9101112\n" - //"vmov.f32 s7,s6 @mov s7, s6\n" - "vext.f32 q4, q0, q3, #1 @vext 2345\n" - "vext.f32 q2, q3, q1, #1 @vext 6789\n" - "vpmax.f32 d10, d0, d1 @pmax d10,vmax_1234,vmax_1234\n" - "vpmax.f32 d12, d6, d7 @pmax d12,vmax_5678,vmax_5678\n" - "vpmax.f32 d11, d8, d9 @pmax d11,vmax_2345,vmax_2345\n" - "vpmax.f32 d13, d4, d5 @pmax d13,vmax_6789,vmax_6789\n" - "vmax.f32 d0, d10, d11 @pmax d0,vmax_12_34,vmax_23_45\n" - "vmax.f32 d1, d12, d13 @pmax d1,vmax_56_78,vmax_67_89\n" - "sub %[dr0], #16 @add w,8\n" - "sub %[dr1], #16 @add w,8\n" - "sub %[dr2], #16 @add w,8\n" - "vst1.f32 d0, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d1, [%[dr_out]]! @vst1 d0,dr_out\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "bne 1b @bne s3_max_loop_mid\n" - "3: @loop \n" - "cmp %[cnt_num_remain], #0 @cmp cnt_num,0\n" - "ble 4f @ble exit1\n" - "2: @mid loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vld1.f32 {d4-d5}, [%[dr2]]! @load d2-d3,dr1\n" - "vmov.f32 s3,s2 @movs3,s2\n" - "vmov.f32 s7,s6 @movs7,s6\n" - "vmov.f32 s11,s10 @movs11,s10\n" - "vmax.f32 q0, q0, q1 @max q0,q0,q1\n" - "vmax.f32 q0, q0, q2 @max q0,q0,q2\n" - "vpmax.f32 d0, d0, d1 @pmax d0,d0,d1\n" - "vpmax.f32 d0, d0, d0 @pmax d0, d0,d0\n" - "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0],dr_out\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "sub %[dr2], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @subs cnt_num,#1\n" - "bne 2b @bne s3_max_loop_mid_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr2] "+r"(dr2), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain) - : "r"(dr0), - "r"(dr1), - "r"(dr2), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12"); - } -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp = r0[wstart]; // std::numeric_limits::min(); - for (int i = wstart; i < wend; i++) { - tmp = std::max(tmp, std::max(r0[i], r1[i])); - tmp = std::max(tmp, r2[i]); - } - dout_ch[w_even >> 1] = tmp; - // cnt ++; - } - r0 = r2; - r1 = r0 + win; - r2 = r1 + win; - dout_ch += wout; - } - - if (h_remain > 0) { - // deal with bottom pad - // first row with zero pad - int hstart = (h >> 1) * stride - padding; - int hend = std::min(std::min(hstart + kernel, hin + padding), hin); - if (hstart == hend - 1) { // only one lline - dout_ch[0] = std::max(r0[0], r0[1]); -#if __aarch64__ - w = 1; - cnt = 1; - for (; w < w_unroll_size; w += 8) { - float32x4_t vmax_1234 = vld1q_f32(&r0[w]); - float32x4_t vmax_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vmax_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); - float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112, 1); - float32x2_t vmax_12_34 = - vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); - float32x2_t vmax_23_45 = - vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); - float32x2_t vmax_56_78 = - vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); - float32x2_t vmax_67_89 = - vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); - float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); - float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); - vst1_f32(&dout_ch[cnt], vmax_123_345); - vst1_f32(&dout_ch[cnt + 2], vmax_567_789); - cnt += 4; - } - for (; w < w_even - 1; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - vr0 = vsetq_lane_f32(minval, vr0, 3); - float32x2_t vmax = vpmax_f32(vget_low_f32(vr0), vget_high_f32(vr0)); - vmax = vpmax_f32(vmax, vmax); - dout_ch[cnt] = vget_lane_f32(vmax, 0); - cnt++; - } -#else - dr_out = dout_ch + 1; - dr0 = (r0 + 1); - cnt_num = w_unroll_size >> 3; - cnt_num_remain = w_unroll_remain >> 1; - // LOG(INFO) << "cnt_num: " << cnt_num << " cnt_num_remain: " << - // cnt_num_remain; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d3,dr0\n" - "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d3,dr0\n" - "vext.f32 q4, q0, q1, #1 @vmax_2345\n" - "vext.f32 q5, q1, q2, #1 @vmax_6789\n" - "vpmax.f32 d12, d0, d1 @vmax_12_34\n" - "vpmax.f32 d14, d2, d3 @vmax_56_78\n" - "vpmax.f32 d13, d8, d9 @vmax_23_45\n" - "vpmax.f32 d15, d10, d11 @vmax_67_89\n" - "vmax.f32 d0, d12, d13 @12_34,23_45\n" - "vmax.f32 d1, d14, d15 @56_78,67_89\n" - "sub %[dr0], #16 @add w,6\n" - "vst1.f32 d0, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d1, [%[dr_out]]! 
@vst1 d0,dr_out\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "bne 1b @bne s3_max_loop_bot\n" - "3: @loop \n" - "cmp %[cnt_num_remain], #0 @cmp cnt_num,0\n" - "ble 4f @ble exit\n" - "2: @bot loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vmov.f32 s3,s2 @movs3, s2\n" - "vpmax.f32 d0, d0, d1 @pmax d0,d0,d1\n" - "vpmax.f32 d0, d0, d0 @pmax d0,d0,d0\n" - "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0],dr_out\n" - "sub %[dr0], #8 @add w,2\n" - "subs %[cnt_num_remain], #1 @subs cnt_num,#1\n" - "bne 2b @bne s3_max_loop_bot_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain) - : "r"(dr0), - "r"(dr1), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8"); - } -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp = r0[wstart]; // std::numeric_limits::min(); - for (int i = wstart; i < wend; i++) { - tmp = std::max(tmp, r0[i]); - } - dout_ch[w_even >> 1] = tmp; - } - } else { // two lines - dout_ch[0] = std::max(std::max(r0[0], r0[1]), std::max(r1[0], r1[1])); -#ifdef __aarch64__ - w = 1; - cnt = 1; - for (; w < w_unroll_size; w += 8) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); - float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); - float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); - float32x4_t vmax_9101112 = vmaxq_f32(vr0_9101112, vr1_9101112); - float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); - float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112, 1); - float32x2_t vmax_12_34 = - vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); - float32x2_t vmax_23_45 = - vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); - float32x2_t vmax_56_78 = - vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); - float32x2_t vmax_67_89 = - vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); - float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); - float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); - vst1_f32(&dout_ch[cnt], vmax_123_345); - vst1_f32(&dout_ch[cnt + 2], vmax_567_789); - cnt += 4; - } - for (; w < w_even - 1; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - float32x4_t vr1 = vld1q_f32(&r1[w]); - vr0 = vsetq_lane_f32(minval, vr0, 3); - vr1 = vsetq_lane_f32(minval, vr1, 3); - float32x4_t vmax1 = vmaxq_f32(vr0, vr1); - float32x2_t vmax2 = - vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1)); - vmax2 = vpmax_f32(vmax2, vmax2); - dout_ch[cnt] = vget_lane_f32(vmax2, 0); - cnt++; - } -#else - dr_out = dout_ch + 1; - dr0 = (r0 + 1); - dr1 = (r1 + 1); - cnt_num = w_unroll_size >> 3; - cnt_num_remain = w_unroll_remain >> 1; - // LOG(INFO) << "cnt_num: " << cnt_num << " cnt_num_remain: " << - // cnt_num_remain; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d3,dr0\n" - "vld1.f32 {d10-d11}, [%[dr1]]! 
@load d4-d7,dr1\n" - "vmax.f32 q6, q0, q3 @max q0,q0,q2 1234\n" - "vmax.f32 q7, q1, q4 @max q1,q1,q3 5678\n" - "vmax.f32 q8, q2, q5 @max q1,q1,q3 9101112\n" - //"vmov.f32 s7,s6 @mov s7, s6\n" - "vext.f32 q0, q6, q7, #1 @vext q0,2345\n" - "vext.f32 q1, q7, q8, #1 @vext q1,6789\n" - "vpmax.f32 d4, d12, d13 @pmax " - "d4,vmax_1234,vmax_1234\n" - "vpmax.f32 d6, d14, d15 @pmax " - "d6,vmax_5678,vmax_5678\n" - "vpmax.f32 d5, d0, d1 @pmax " - "d5,vmax_2345,vmax_2345\n" - "vpmax.f32 d7, d2, d3 @pmax " - "d7,vmax_6789,vmax_6789\n" - "vmax.f32 d8, d4, d5 @max " - "d2,vmax_12_34,vmax_23_45\n" - "vmax.f32 d9, d6, d7 @max " - "d2,vmax_56_78,vmax_67_89\n" - "sub %[dr0], #16 @add w,8\n" - "sub %[dr1], #16 @add w,8\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "bne 1b @bne s3_max_loop_bot\n" - "3: @loop \n" - "cmp %[cnt_num_remain], #0 @cmp cnt_num,0\n" - "ble 4f @ble exit\n" - "2: @bot loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vmov.f32 s3,s2 @movs3, s2\n" - "vmov.f32 s7,s6 @movs7, s6\n" - "vmax.f32 q0, q0, q1 @max q0,q0,q1\n" - "vpmax.f32 d0, d0, d1 @pmax d0,d0,d1\n" - "vpmax.f32 d0, d0, d0 @pmax d0,d0,d0\n" - "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0],dr_out\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @subs cnt_num,#1\n" - "bne 2b @bne s3_max_loop_bot_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain) - : "r"(dr0), - "r"(dr1), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9"); - } -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp = r0[wstart]; // std::numeric_limits::min(); - for (int i = wstart; i < wend; i++) { // only run 1 or 2 times - tmp = std::max(tmp, std::max(r0[i], r1[i])); - } - dout_ch[w_even >> 1] = tmp; - } - } - } - } - } -} - -void pooling3x3s2p1_avg(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - bool exclusive) { - int kernel = 3; - int stride = 2; - int padding = 1; - int size_channel_out = wout * hout; - int size_channel_in = win * hin; - - int w_needed = (wout << 1) + 1; - int h_needed = (hout << 1) + 1; - int w_limit = w_needed > win ? win : w_needed; - int h_limit = h_needed > hin ? hin : h_needed; - int w_even = (w_limit >> 1) << 1; - int h_even = (h_limit >> 1) << 1; - int w_unroll_size = ((w_even - 1) >> 3) << 3; - int w_unroll_remain = w_even - 1 - w_unroll_size; - int w_remain = w_needed - w_limit - padding; - int h_remain = h_needed - h_limit - padding; - int w_in_2 = win << 1; - const float coef = 1.f / 9.f; - const float coef_1 = exclusive ? 1.f : coef; - const float coef_2 = exclusive ? 1.f / 2.f : coef; - const float coef_3 = exclusive ? 1.f / 3.f : coef; - const float coef_4 = exclusive ? 1.f / 4.f : coef; - const float coef_6 = exclusive ? 
1.f / 6.f : coef; - float32x4_t vcoef = vdupq_n_f32(coef); - float32x4_t vcoef_1 = vdupq_n_f32(coef_1); - float32x4_t vcoef_2 = vdupq_n_f32(coef_2); - float32x4_t vcoef_3 = vdupq_n_f32(coef_3); - float32x4_t vcoef_4 = vdupq_n_f32(coef_4); - float32x4_t vcoef_6 = vdupq_n_f32(coef_6); - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_ch = dout_batch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - const float* r0 = din_ch; - const float* r1 = r0 + win; - const float* r2 = r1 + win; - int cnt_num = w_unroll_size >> 3; - int cnt_num_remain = w_unroll_remain >> 1; - float* dr_out = dout_ch; - const float* dr0 = r0; - const float* dr1 = r1; - const float* dr2 = r2; - int w = 1; - int cnt = 1; - float32x4_t vzero = vdupq_n_f32(0.f); - dout_ch[0] = (r0[0] + r0[1] + r1[0] + r1[1]) * coef_4; -// first row with zero pad -#ifdef __aarch64__ - for (; w < w_unroll_size; w += 8) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); - float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); - float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); - float32x4_t vsum_9101112 = vaddq_f32(vr0_9101112, vr1_9101112); - - float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); - float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); - float32x4_t vsum_4567 = vextq_f32(vsum_1234, vsum_5678, 3); - float32x4_t vsum_6789 = vextq_f32(vsum_5678, vsum_9101112, 1); - float32x4_t vsum_123_345 = vaddq_f32(vsum_1234, vsum_2345); - vsum_123_345 = vaddq_f32(vsum_123_345, vsum_3456); - float32x4_t vsum_567_789 = vaddq_f32(vsum_4567, vsum_5678); - vsum_567_789 = vaddq_f32(vsum_567_789, vsum_6789); - vsum_123_345 = - vsetq_lane_f32(vgetq_lane_f32(vsum_123_345, 2), vsum_123_345, 1); - vsum_123_345 = - vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 1), vsum_123_345, 2); - vsum_123_345 = - vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 3), vsum_123_345, 3); - float32x4_t vrst = vmulq_f32(vsum_123_345, vcoef_6); - vst1q_f32(&dout_ch[cnt], vrst); - cnt += 4; - } - for (; w < w_even - 1; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - float32x4_t vr1 = vld1q_f32(&r1[w]); - vr0 = vsetq_lane_f32(0.f, vr0, 3); - vr1 = vsetq_lane_f32(0.f, vr1, 3); - float32x4_t vsum1 = vaddq_f32(vr0, vr1); - float32x2_t vsum2 = - vpadd_f32(vget_low_f32(vsum1), vget_high_f32(vsum1)); - vsum2 = vpadd_f32(vsum2, vsum2); - float32x2_t vrst = vmul_f32(vsum2, vget_low_f32(vcoef_6)); - dout_ch[cnt] = vget_lane_f32(vrst, 0); - cnt++; - } -#else - dr0 = dr0 + 1; - dr1 = dr1 + 1; - dr_out = dr_out + 1; - // LOG(INFO) << "cnt_num: " << cnt_num << " cnt_num_remain: " << - // cnt_num_remain; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d10-d11}, [%[dr1]]! 
@load d4-d7,dr1\n" - "vadd.f32 q6, q0, q3 @max r0_1234,r1_1234\n" - "vadd.f32 q7, q1, q4 @max r0_5678,r1_5678\n" - "vadd.f32 q8, q2, q5 @max r0_9101112,r1_9101112\n" - //"vmov.f32 s7,s6 @mov s7, s6\n" - "vext.f32 q0, q6, q7, #1 @vext max_2345\n" - "vext.f32 q1, q6, q7, #3 @vext max_4567\n" - "vext.f32 q2, q6, q7, #2 @vext max_3456\n" - "vext.f32 q3, q7, q8, #1 @vext max_6789\n" - "vadd.f32 q4, q6, q0 @add 1234, 2345\n" - "vadd.f32 q5, q7, q1 @add 5678, 4567\n" - "vadd.f32 q4, q4, q2 @add 3456, sum1\n" - "vadd.f32 q5, q5, q3 @add 6789, sum2\n" - "vmov.f32 s17, s18 @mov\n" - "vmov.f32 s18, s21 @mov\n" - "vmov.f32 s19, s23 @mov\n" - "vmul.f32 q4, q4, %q[vcoef_6] @mul\n" - "sub %[dr0], #16 @add w,8\n" - "sub %[dr1], #16 @add w,8\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "bne 1b @bne s3_max_loop\n" - "3: @loop\n" - "cmp %[cnt_num_remain], #0 @cnt_num_remain<=0\n" - "ble 4f @ble exit\n" - "2: @main loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vext.f32 q0, %q[vzero], q0, #3 @ext v0_0123\n" - "vext.f32 q1, %q[vzero], q1, #3 @ext v1_0123\n" - "vadd.f32 q0, q0, q1 @add q0,q0,q1\n" - "vpadd.f32 d0, d0, d1 @padd d0,d0,d1\n" - "vpadd.f32 d0, d0, d0 @padd d0, d0,d0\n" - "vmul.f32 d0, d0, %e[vcoef_6] @mul\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @subs cnt_num,#1\n" - "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0],dr_out\n" - "bne 2b @bne s3_max_loop_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain), - [vcoef_6] "+w"(vcoef_6), - [vzero] "+w"(vzero) - : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num), "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9"); - } -#endif - // int w = w_even - 1; - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp1 = 0.f; // std::numeric_limits::min(); - float tmp2 = exclusive ? 
1.0f / (2.f * (wend - wstart)) : coef; - for (int i = wstart; i < wend; i++) { // only run 1 or 2 times - tmp1 += (r0[i] + r1[i]); - } - dout_ch[w_even >> 1] = tmp1 * tmp2; - // cnt ++; - } - - r0 = r1; - r1 = r0 + win; - r2 = r1 + win; - dout_ch += wout; - int h = 2; - for (; h < h_even; h += 2) { - // deal with left pad - float sum0 = r0[0] + r0[1]; - float sum1 = r1[0] + r1[1]; - float sum2 = r2[0] + r2[1]; - dout_ch[0] = (sum0 + sum1 + sum2) * coef_6; -#ifdef __aarch64__ - w = 1; - cnt = 1; - for (; w < w_unroll_size; w += 8) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); - float32x4_t vr2_1234 = vld1q_f32(&r2[w]); - float32x4_t vr2_5678 = vld1q_f32(&r2[w + 4]); - float32x4_t vr2_9101112 = vld1q_f32(&r2[w + 8]); - float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); - float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); - float32x4_t vsum_9101112 = vaddq_f32(vr0_9101112, vr1_9101112); - vsum_1234 = vaddq_f32(vsum_1234, vr2_1234); - vsum_5678 = vaddq_f32(vsum_5678, vr2_5678); - vsum_9101112 = vaddq_f32(vsum_9101112, vr2_9101112); - - float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); - float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); - float32x4_t vsum_4567 = vextq_f32(vsum_1234, vsum_5678, 3); - float32x4_t vsum_6789 = vextq_f32(vsum_5678, vsum_9101112, 1); - float32x4_t vsum_123_345 = vaddq_f32(vsum_1234, vsum_2345); - vsum_123_345 = vaddq_f32(vsum_123_345, vsum_3456); - float32x4_t vsum_567_789 = vaddq_f32(vsum_4567, vsum_5678); - vsum_567_789 = vaddq_f32(vsum_567_789, vsum_6789); - vsum_123_345 = - vsetq_lane_f32(vgetq_lane_f32(vsum_123_345, 2), vsum_123_345, 1); - vsum_123_345 = - vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 1), vsum_123_345, 2); - vsum_123_345 = - vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 3), vsum_123_345, 3); - float32x4_t vrst = vmulq_f32(vsum_123_345, vcoef); - vst1q_f32(&dout_ch[cnt], vrst); - cnt += 4; - } - for (; w < w_even - 1; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - float32x4_t vr1 = vld1q_f32(&r1[w]); - float32x4_t vr2 = vld1q_f32(&r2[w]); - vr0 = vsetq_lane_f32(0.f, vr0, 3); - vr1 = vsetq_lane_f32(0.f, vr1, 3); - vr2 = vsetq_lane_f32(0.f, vr2, 3); - float32x4_t vsum1 = vaddq_f32(vr0, vr1); - vsum1 = vaddq_f32(vsum1, vr2); - float32x2_t vsum2 = - vpadd_f32(vget_low_f32(vsum1), vget_high_f32(vsum1)); - float32x2_t vsum = vpadd_f32(vsum2, vsum2); - dout_ch[cnt] = vget_lane_f32(vsum, 0) * coef; - cnt++; - } -#else - dr_out = dout_ch + 1; - dr0 = (r0 + 1); - dr1 = (r1 + 1); - dr2 = (r2 + 1); - cnt_num = w_unroll_size >> 3; - cnt_num_remain = w_unroll_remain >> 1; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, " - "dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d12-d15}, [%[dr2]]! @load d4-d7,dr1\n" - "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d10-d11}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d16-d17}, [%[dr2]]! 
@load d4-d7,dr1\n" - "vadd.f32 q9, q0, q3 @max q0,q0,q2\n" - "vadd.f32 q10, q1, q4 @max q1,q1,q3\n" - "vadd.f32 q11, q2, q5 @max q1,q1,q3\n" - "vadd.f32 q6, q9, q6 @max q0,q0,q2 1234\n" - "vadd.f32 q7, q10, q7 @max q1,q1,q3 5678\n" - "vadd.f32 q8, q11, q8 @max q1,q1,q3 9101112\n" - //"vmov.f32 s7,s6 @mov s7, s6\n" - "vext.f32 q0, q6, q7, #1 @vext max_2345\n" - "vext.f32 q1, q6, q7, #3 @vext max_4567\n" - "vext.f32 q2, q6, q7, #2 @vext max_3456\n" - "vext.f32 q3, q7, q8, #1 @vext max_6789\n" - "vadd.f32 q4, q6, q0 @add 1234,2345\n" - "vadd.f32 q5, q7, q1 @add 5678,4567\n" - "vadd.f32 q4, q4, q2 @add 3456,sum1\n" - "vadd.f32 q5, q5, q3 @add 6789,sum2\n" - "vmov.f32 s17, s18 @mov\n" - "vmov.f32 s18, s21 @mov\n" - "vmov.f32 s19, s23 @mov\n" - "vmul.f32 q4, q4, %q[vcoef] @mul\n" - "sub %[dr0], #16 @add w,8\n" - "sub %[dr1], #16 @add w,8\n" - "sub %[dr2], #16 @add w, 8\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "bne 1b @bne s3_max_loop_mid\n" - "3: @loop\n" - "cmp %[cnt_num_remain], #0 @cnt_num_remain<=0\n" - "ble 4f @ble exit1\n" - "2: @mid loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vld1.f32 {d4-d5}, [%[dr2]]! @load d2-d3,dr1\n" - "vext.f32 q0, %q[vzero], q0, #3 @ext v0_0123\n" - "vext.f32 q1, %q[vzero], q1, #3 @ext v1_0123\n" - "vext.f32 q2, %q[vzero], q2, #3 @ext v1_0123\n" - "vadd.f32 q0, q0, q1 @add q0,q0,q1\n" - "vadd.f32 q0, q0, q2 @add q0,q0,q1\n" - "vpadd.f32 d0, d0, d1 @padd d0,d0,d1\n" - "vpadd.f32 d0, d0, d0 @padd d0,d0,d0\n" - "vmul.f32 d0, d0, %e[vcoef] @mul\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "sub %[dr2], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @cnt_num_remain--\n" - "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0],dr_out\n" - "bne 2b @bne s3_max_loop_mid_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr2] "+r"(dr2), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain), - [vcoef] "+w"(vcoef), - [vzero] "+w"(vzero) - : "r"(dr0), - "r"(dr1), - "r"(dr2), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12"); - } -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp1 = 0.f; - float tmp2 = exclusive ? 
1.0f / (3.f * (wend - wstart)) : coef; - for (int i = wstart; i < wend; i++) { - tmp1 += (r0[i] + r1[i] + r2[i]); - } - dout_ch[w_even >> 1] = tmp1 * tmp2; - // cnt ++; - } - r0 = r2; - r1 = r0 + win; - r2 = r1 + win; - dout_ch += wout; - } - - if (h_remain > 0) { - // deal with bottom pad - // first row with zero pad - int hstart = (h >> 1) * stride - padding; - int hend = std::min(std::min(hstart + kernel, hin + padding), hin); - if (hstart == hend - 1) { // only one line - dout_ch[0] = (r0[0] + r0[1]) * coef_2; -#ifdef __aarch64__ - w = 1; - cnt = 1; - for (; w < w_unroll_size; w += 8) { - float32x4_t vsum_1234 = vld1q_f32(&r0[w]); - float32x4_t vsum_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vsum_9101112 = vld1q_f32(&r0[w + 8]); - - float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); - float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); - float32x4_t vsum_4567 = vextq_f32(vsum_1234, vsum_5678, 3); - float32x4_t vsum_6789 = vextq_f32(vsum_5678, vsum_9101112, 1); - float32x4_t vsum_123_345 = vaddq_f32(vsum_1234, vsum_2345); - vsum_123_345 = vaddq_f32(vsum_123_345, vsum_3456); - float32x4_t vsum_567_789 = vaddq_f32(vsum_4567, vsum_5678); - vsum_567_789 = vaddq_f32(vsum_567_789, vsum_6789); - vsum_123_345 = vsetq_lane_f32( - vgetq_lane_f32(vsum_123_345, 2), vsum_123_345, 1); - vsum_123_345 = vsetq_lane_f32( - vgetq_lane_f32(vsum_567_789, 1), vsum_123_345, 2); - vsum_123_345 = vsetq_lane_f32( - vgetq_lane_f32(vsum_567_789, 3), vsum_123_345, 3); - float32x4_t vrst = vmulq_f32(vsum_123_345, vcoef_3); - vst1q_f32(&dout_ch[cnt], vrst); - cnt += 4; - } - for (; w < w_even - 1; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - vr0 = vsetq_lane_f32(0.f, vr0, 3); - float32x2_t vsum = vpadd_f32(vget_low_f32(vr0), vget_high_f32(vr0)); - vsum = vpadd_f32(vsum, vsum); - dout_ch[cnt] = vget_lane_f32(vsum, 0) * coef_3; - cnt++; - } -#else - dr_out = dout_ch + 1; - dr0 = (r0 + 1); - cnt_num = w_unroll_size >> 3; - cnt_num_remain = w_unroll_remain >> 1; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d12-d15}, [%[dr0]]! @load d0-d3,dr0\n" - "vld1.f32 {d16-d17}, [%[dr0]]! @load d0-d3,dr0\n" - "vext.f32 q0, q6, q7, #1 @vext max_2345\n" - "vext.f32 q1, q6, q7, #3 @vext max_4567\n" - "vext.f32 q2, q6, q7, #2 @vext max_3456\n" - "vext.f32 q3, q7, q8, #1 @vext max_6789\n" - "vadd.f32 q4, q6, q0 @add 1234,2345\n" - "vadd.f32 q5, q7, q1 @add 5678,4567\n" - "vadd.f32 q4, q4, q2 @add 3456,sum1\n" - "vadd.f32 q5, q5, q3 @add 6789,sum2\n" - "vmov.f32 s17, s18 @mov\n" - "vmov.f32 s18, s21 @mov\n" - "vmov.f32 s19, s23 @mov\n" - "vmul.f32 q4, q4, %q[vcoef_3] @mul\n" - "sub %[dr0], #16 @add w,6\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "bne 1b @bne s3_max_loop_bot\n" - "3: @loop\n" - "cmp %[cnt_num_remain], #0 @cnt_num_remain<=0\n" - "ble 4f @ble exit\n" - "2: @bot loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vext.f32 q0, %q[vzero], q0, #3 @ext v0_0123\n" - "vpadd.f32 d0, d0, d1 @padd d0,d0,d1\n" - "vpadd.f32 d0, d0, d0 @padd d0,d0,d0\n" - "vmul.f32 d0, d0, %e[vcoef_3] @mul\n" - "sub %[dr0], #8 @add w,2\n" - "subs %[cnt_num_remain], #1 @cnt_num_remain--\n" - "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0],dr_out\n" - "bne 2b @bne s3_max_loop_bot_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain), - [vcoef_3] "+w"(vcoef_3), - [vzero] "+w"(vzero) - : "r"(dr0), - "r"(dr1), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8"); - } -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp1 = 0.f; - float tmp2 = exclusive ? 1.0f / (1.f * (wend - wstart)) : coef; - for (int i = wstart; i < wend; i++) { - tmp1 += r0[i]; - } - dout_ch[w_even >> 1] = tmp1 * tmp2; - } - } else { // two lines - dout_ch[0] = (r0[0] + r0[1] + r1[0] + r1[1]) * coef_4; -#ifdef __aarch64__ - w = 1; - cnt = 1; - for (; w < w_unroll_size; w += 8) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); - - float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); - float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); - float32x4_t vsum_9101112 = vaddq_f32(vr0_9101112, vr1_9101112); - float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); - float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); - float32x4_t vsum_4567 = vextq_f32(vsum_1234, vsum_5678, 3); - float32x4_t vsum_6789 = vextq_f32(vsum_5678, vsum_9101112, 1); - float32x4_t vsum_123_345 = vaddq_f32(vsum_1234, vsum_2345); - vsum_123_345 = vaddq_f32(vsum_123_345, vsum_3456); - float32x4_t vsum_567_789 = vaddq_f32(vsum_4567, vsum_5678); - vsum_567_789 = vaddq_f32(vsum_567_789, vsum_6789); - vsum_123_345 = vsetq_lane_f32( - vgetq_lane_f32(vsum_123_345, 2), vsum_123_345, 1); - vsum_123_345 = vsetq_lane_f32( - vgetq_lane_f32(vsum_567_789, 1), vsum_123_345, 2); - vsum_123_345 = vsetq_lane_f32( - vgetq_lane_f32(vsum_567_789, 3), vsum_123_345, 3); - float32x4_t vrst = vmulq_f32(vsum_123_345, vcoef_6); - vst1q_f32(&dout_ch[cnt], vrst); - cnt += 4; - } - for (; w < w_even - 1; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - float32x4_t vr1 = vld1q_f32(&r1[w]); - vr0 = vsetq_lane_f32(0.f, vr0, 3); - vr1 = vsetq_lane_f32(0.f, vr1, 3); - float32x4_t vsum1 = vaddq_f32(vr0, vr1); - float32x2_t vsum2 = - vpadd_f32(vget_low_f32(vsum1), vget_high_f32(vsum1)); - vsum2 = vpadd_f32(vsum2, vsum2); - float32x2_t vrst = vmul_f32(vsum2, vget_low_f32(vcoef_6)); - dout_ch[cnt] = vget_lane_f32(vrst, 0); - cnt++; - } -#else - dr_out = dout_ch + 1; - dr0 = (r0 + 1); - dr1 = (r1 + 1); - cnt_num = w_unroll_size >> 3; - cnt_num_remain = w_unroll_remain >> 1; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d3,dr0\n" - "vld1.f32 {d10-d11}, [%[dr1]]! 
@load d4-d7,dr1\n" - "vadd.f32 q6, q0, q3 @add q0,q0,q2 1234\n" - "vadd.f32 q7, q1, q4 @add q1,q1,q3 5678\n" - "vadd.f32 q8, q2, q5 @add q1,q1,q3 9101112\n" - //"vmov.f32 s7,s6 @mov s7,s6\n" - "vext.f32 q0, q6, q7, #1 @vext max_2345\n" - "vext.f32 q1, q6, q7, #3 @vext max_4567\n" - "vext.f32 q2, q6, q7, #2 @vext max_3456\n" - "vext.f32 q3, q7, q8, #1 @vext max_6789\n" - "vadd.f32 q4, q6, q0 @add 1234,2345\n" - "vadd.f32 q5, q7, q1 @add 5678,4567\n" - "vadd.f32 q4, q4, q2 @add 3456,sum1\n" - "vadd.f32 q5, q5, q3 @add 6789,sum2\n" - "vmov.f32 s17, s18 @mov\n" - "vmov.f32 s18, s21 @mov\n" - "vmov.f32 s19, s23 @mov\n" - "vmul.f32 q4, q4, %q[vcoef_6] @mul\n" - "sub %[dr0], #16 @add w,8\n" - "sub %[dr1], #16 @add w,8\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0, dr_out\n" - "bne 1b @bne s3_max_loop_bot\n" - "3: @loop\n" - "cmp %[cnt_num_remain], #0 @cnt_num_remain<=0\n" - "ble 4f @ble exit\n" - "2: @bot loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vext.f32 q0, %q[vzero], q0, #3 @ext v0_0123\n" - "vext.f32 q1, %q[vzero], q1, #3 @ext v1_0123\n" - "vadd.f32 q0, q0, q1 @add q0,q0,q1\n" - "vpadd.f32 d0, d0, d1 @padd d0,d0,d1\n" - "vpadd.f32 d0, d0, d0 @padd d0,d0,d0\n" - "vmul.f32 d0, d0, %e[vcoef_6] @mul\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @cnt_num_remain--\n" - "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0],dr_out\n" - "bne 2b @bne s3_max_loop_bot_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain), - [vcoef_6] "+w"(vcoef_6), - [vzero] "+w"(vzero) - : "r"(dr0), - "r"(dr1), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9"); - } -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp1 = 0.f; - float tmp2 = exclusive ? 1.0f / (2.f * (wend - wstart)) : coef; - for (int i = wstart; i < wend; i++) { // only run 1 or 2 times - tmp1 += (r0[i] + r1[i]); - } - dout_ch[w_even >> 1] = tmp1 * tmp2; - } - } - } - } - } -} - -void pooling3x3s2p0_max(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win) { - int kernel = 3; - int stride = 2; - int padding = 0; - int size_channel_out = wout * hout; - int size_channel_in = win * hin; - - int w_needed = (wout << 1) + 1; - int h_needed = (hout << 1) + 1; - int w_limit = w_needed > win ? win : w_needed; - int h_limit = h_needed > hin ? 
hin : h_needed; - int w_even = ((w_limit - 1) >> 1) << 1; - int h_even = ((h_limit - 1) >> 1) << 1; - int w_unroll_size = (w_even >> 3) << 3; - int w_unroll_remain = w_even - w_unroll_size; - int w_remain = w_needed - w_limit; - int h_remain = h_needed - h_limit; - int w_in_2 = win << 1; - float minval = std::numeric_limits::lowest(); - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_ch = dout_batch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - const float* r0 = din_ch; - const float* r1 = r0 + win; - const float* r2 = r1 + win; - // w = w_in - 8; - float* dr_out = dout_ch; - const float* dr0 = r0; - const float* dr1 = r1; - const float* dr2 = r2; - int w = 0; - int cnt = 0; - // dout_ch[0] = std::max(std::max(r0[0], r0[1]), std::max(r1[0], - // r1[1])); - // first row with zero pad - // r0 = r1; - // r1 = r0 + w_in; - // r2 = r1 + w_in; - // dout_channel += w_out; - int h = 0; - for (; h < h_even; h += 2) { - // deal with left pad - float maxr0 = std::max(r0[0], r0[1]); - float maxr1 = std::max(r1[0], r1[1]); - float maxr2 = std::max(r2[0], r2[1]); -// dout_ch[0] = std::max(std::max(maxr0, maxr1), maxr2); -#ifdef __aarch64__ - w = 0; - cnt = 0; - for (; w < w_unroll_size; w += 8) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); - float32x4_t vr2_1234 = vld1q_f32(&r2[w]); - float32x4_t vr2_5678 = vld1q_f32(&r2[w + 4]); - float32x4_t vr2_9101112 = vld1q_f32(&r2[w + 8]); - float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); - vmax_1234 = vmaxq_f32(vmax_1234, vr2_1234); - float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); - vmax_5678 = vmaxq_f32(vmax_5678, vr2_5678); - float32x4_t vmax_9101112 = vmaxq_f32(vr0_9101112, vr1_9101112); - vmax_9101112 = vmaxq_f32(vmax_9101112, vr2_9101112); - float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); - float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112, 1); - float32x2_t vmax_12_34 = - vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); - float32x2_t vmax_23_45 = - vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); - float32x2_t vmax_56_78 = - vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); - float32x2_t vmax_67_89 = - vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); - float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); - float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); - vst1_f32(&dout_ch[cnt], vmax_123_345); - vst1_f32(&dout_ch[cnt + 2], vmax_567_789); - cnt += 4; - } - for (; w < w_even; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - float32x4_t vr1 = vld1q_f32(&r1[w]); - float32x4_t vr2 = vld1q_f32(&r2[w]); - vr0 = vsetq_lane_f32(minval, vr0, 3); - vr1 = vsetq_lane_f32(minval, vr1, 3); - vr2 = vsetq_lane_f32(minval, vr2, 3); - float32x4_t vmax1 = vmaxq_f32(vr0, vr1); - vmax1 = vmaxq_f32(vmax1, vr2); - float32x2_t vmax2 = - vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1)); - float32x2_t vmax = vpmax_f32(vmax2, vmax2); - dout_ch[cnt] = vget_lane_f32(vmax, 0); - cnt++; - } -#else - dr_out = dout_ch; // + 1; - dr0 = r0; // (r0 + 1); - dr1 = r1; // (r1 + 1); - dr2 = r2; // (r2 + 1); - int cnt_num = 
w_unroll_size >> 3; - int cnt_num_remain = w_unroll_remain >> 1; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d12-d15}, [%[dr2]]! @load d4-d7,dr1\n" - "vld1.f32 {d4}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d10}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d16}, [%[dr2]]! @load d4-d7,dr1\n" - "vmax.f32 q9, q0, q3 @max q0,q0,q2\n" - "vmax.f32 q10, q1, q4 @max q1,q1,q3\n" - "vmax.f32 d22, d4, d10 @max q1,q1,q3\n" - "vmax.f32 q0, q9, q6 @max q0,q0,q2 1234\n" - "vmax.f32 q3, q10, q7 @max q1,q1,q3 5678\n" - "vmax.f32 d2, d22, d16 @max q1,q1,q3 9101112\n" - //"vmov.f32 s7,s6 @mov s7, s6\n" - "vext.f32 q4, q0, q3, #1 @vext 2345\n" - "vext.f32 q2, q3, q1, #1 @vext 6789\n" - "vpmax.f32 d10, d0, d1 @pmax " - "d10,vmax_1234,vmax_1234\n" - "vpmax.f32 d12, d6, d7 @pmax " - "d12,vmax_5678,vmax_5678\n" - "vpmax.f32 d11, d8, d9 @pmax " - "d11,vmax_2345,vmax_2345\n" - "vpmax.f32 d13, d4, d5 @pmax " - "d13,vmax_6789,vmax_6789\n" - "vmax.f32 d0, d10, d11 @pmax " - "d0,vmax_12_34,vmax_23_45\n" - "vmax.f32 d1, d12, d13 @pmax " - "d1,vmax_56_78,vmax_67_89\n" - "sub %[dr0], #8 @add w,8\n" - "sub %[dr1], #8 @add w,8\n" - "sub %[dr2], #8 @add w,8\n" - "vst1.f32 d0, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d1, [%[dr_out]]! @vst1 d0,dr_out\n" - "subs %[cnt_num], #1 @cnt_num--\n" - "bne 1b @bne s3_max_loop_mid\n" - "3: @loop\n" - "cmp %[cnt_num_remain], #0 @cmp cnt_num_remain,0\n" - "ble 4f @ble exit1\n" - "2: @mid loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vld1.f32 {d4-d5}, [%[dr2]]! @load d2-d3,dr1\n" - "vmov.f32 s3,s2 @movs3,s2\n" - "vmov.f32 s7,s6 @movs7,s6\n" - "vmov.f32 s11,s10 @movs11,s10\n" - "vmax.f32 q0, q0, q1 @max q0,q0,q1\n" - "vmax.f32 q0, q0, q2 @max q0,q0,q2\n" - "vpmax.f32 d0, d0, d1 @pmax d0,d0,d1\n" - "vpmax.f32 d0, d0, d0 @pmax d0,d0,d0\n" - "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0],dr_out\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "sub %[dr2], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @cnt_num_remain--\n" - "bne 2b @bne s3_max_loop_mid_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr2] "+r"(dr2), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain) - : "r"(dr0), - "r"(dr1), - "r"(dr2), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12"); - } -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp = r0[wstart]; // std::numeric_limits::min(); - for (int i = wstart; i < wend; i++) { - tmp = std::max(tmp, std::max(r0[i], r1[i])); - tmp = std::max(tmp, r2[i]); - } - dout_ch[w_even >> 1] = tmp; - // cnt ++; - } - r0 = r2; - r1 = r0 + win; - r2 = r1 + win; - dout_ch += wout; - } - - if (h_remain > 0) { -// deal with bottom pad -// first row with zero pad -// int hstart = (h >> 1) * stride_h - pad_h; -// int hend = std::min(std::min(hstart + kernel_h, hin + pad_h), hin); -// dout_ch[0] = std::max(std::max(r0[0], r0[1]), std::max(r1[0], -// r1[1])); -#ifdef __aarch64__ - w = 0; - cnt = 0; - for (; w < w_unroll_size; w += 8) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); - float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); - float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); - float32x4_t vmax_9101112 = vmaxq_f32(vr0_9101112, vr1_9101112); - float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); - float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112, 1); - float32x2_t vmax_12_34 = - vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); - float32x2_t vmax_23_45 = - vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); - float32x2_t vmax_56_78 = - vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); - float32x2_t vmax_67_89 = - vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); - float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); - float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); - vst1_f32(&dout_ch[cnt], vmax_123_345); - vst1_f32(&dout_ch[cnt + 2], vmax_567_789); - cnt += 4; - } - for (; w < w_even; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - float32x4_t vr1 = vld1q_f32(&r1[w]); - vr0 = vsetq_lane_f32(minval, vr0, 3); - vr1 = vsetq_lane_f32(minval, vr1, 3); - float32x4_t vmax1 = vmaxq_f32(vr0, vr1); - float32x2_t vmax2 = - vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1)); - vmax2 = vpmax_f32(vmax2, vmax2); - dout_ch[cnt] = vget_lane_f32(vmax2, 0); - cnt++; - } -#else - dr_out = dout_ch; // + 1; - dr0 = r0; // (r0 + 1); - dr1 = r1; // (r1 + 1); - int cnt_num = w_unroll_size >> 3; - int cnt_num_remain = w_unroll_remain >> 1; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d4}, [%[dr0]]! @load d0-d3,dr0\n" - "vld1.f32 {d10}, [%[dr1]]! 
@load d4-d7,dr1\n" - "vmax.f32 q6, q0, q3 @max q0,q0,q2 1234\n" - "vmax.f32 q7, q1, q4 @max q1,q1,q3 5678\n" - "vmax.f32 d16, d4, d10 @max q1,q1,q3 9101112\n" - //"vmov.f32 s7,s6 @mov s7,s6\n" - "vext.f32 q0, q6, q7, #1 @vext q0,2345\n" - "vext.f32 q1, q7, q8, #1 @vext q1,6789\n" - "vpmax.f32 d4, d12, d13 @pmax " - "d4,vmax_1234,vmax_1234\n" - "vpmax.f32 d6, d14, d15 @pmax " - "d6,vmax_5678,vmax_5678\n" - "vpmax.f32 d5, d0, d1 @pmax " - "d5,vmax_2345,vmax_2345\n" - "vpmax.f32 d7, d2, d3 @pmax " - "d7,vmax_6789,vmax_6789\n" - "vmax.f32 d8, d4, d5 @max " - "d2,vmax_12_34,vmax_23_45\n" - "vmax.f32 d9, d6, d7 @max " - "d2,vmax_56_78,vmax_67_89\n" - "sub %[dr0], #8 @add w,8\n" - "sub %[dr1], #8 @add w,8\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "bne 1b @bne s3_max_loop_bot\n" - "3: @loop \n" - "cmp %[cnt_num_remain], #0 @cmp cnt_num_remain,0\n" - "ble 4f @ble exit\n" - "2: @bot loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vmov.f32 s3,s2 @movs3,s2\n" - "vmov.f32 s7,s6 @movs7,s6\n" - "vmax.f32 q0, q0, q1 @max q0,q0,q1\n" - "vpmax.f32 d0, d0, d1 @pmax d0,d0,d1\n" - "vpmax.f32 d0, d0, d0 @pmax d0,d0,d0\n" - "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0],dr_out\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @cnt_num_remain--\n" - "bne 2b @bne s3_max_loop_bot_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain) - : "r"(dr0), - "r"(dr1), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9"); - } -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp = r0[wstart]; // std::numeric_limits::min(); - for (int i = wstart; i < wend; i++) { // only run 1 or 2 times - tmp = std::max(tmp, std::max(r0[i], r1[i])); - } - dout_ch[w_even >> 1] = tmp; - } - } - } - } -} - -void pooling3x3s2p0_avg(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - bool exclusive) { - int kernel = 3; - int stride = 2; - int padding = 0; - int size_channel_out = wout * hout; - int size_channel_in = win * hin; - - int w_needed = (wout << 1) + 1; - int h_needed = (hout << 1) + 1; - int w_limit = w_needed > win ? win : w_needed; - int h_limit = h_needed > hin ? hin : h_needed; - int w_even = ((w_limit - 1) >> 1) << 1; - int h_even = ((h_limit - 1) >> 1) << 1; - int w_unroll_size = (w_even >> 3) << 3; - int w_unroll_remain = w_even - w_unroll_size; - int w_remain = w_needed - w_limit; - int h_remain = h_needed - h_limit; - int w_in_2 = win << 1; - const float coef = 1.f / 9.f; - const float coef_6 = exclusive ? 
1.f / 6.f : coef; - float32x4_t vcoef = vdupq_n_f32(coef); - float32x4_t vcoef_6 = vdupq_n_f32(coef_6); - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_ch = dout_batch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - const float* r0 = din_ch; - const float* r1 = r0 + win; - const float* r2 = r1 + win; - // w = w_in - 8; - float* dr_out = dout_ch; - const float* dr0 = r0; - const float* dr1 = r1; - const float* dr2 = r2; - - float32x4_t vzero = vdupq_n_f32(0.f); - - int h = 0; - for (; h < h_even; h += 2) { -// LOG(INFO) << "h: " << h <<", dr0:" << r0 << ", dr1: " << r1 << -// ",dr2: " <> 3; - int cnt_num_remain = w_unroll_remain >> 1; - // LOG(INFO) << "cnt_num: " << cnt_num << " cnt_num_remain: " << - // cnt_num_remain; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num, 0\n" - "ble 3f @ble exit\n" - "s3_ave_loop_mid_p0: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7, dr1\n" - "vld1.f32 {d12-d15}, [%[dr2]]! @load d4-d7, dr2\n" - "vld1.f32 {d4}, [%[dr0]]! @load d0-d5, dr0\n" - "vld1.f32 {d10}, [%[dr1]]! @load d4-d7, dr1\n" - "vld1.f32 {d16}, [%[dr2]]! @load d4-d7, dr2\n" - "vadd.f32 q9, q0, q3 @max q0,q0,q2\n" - "vadd.f32 q10, q1, q4 @max q1,q1,q3\n" - "vadd.f32 d22, d4, d10 @max q1,q1,q3\n" - "vadd.f32 q6, q9, q6 @max q0,q0,q2 1234\n" - "vadd.f32 q7, q10, q7 @max q1,q1,q3 5678\n" - "vadd.f32 d16, d22, d16 @max q1,q1,q3 9101112\n" - //"vmov.f32 s7,s6 @mov s7, s6\n" - "vext.f32 q0, q6, q7, #1 @vext max_2345\n" - "vext.f32 q1, q6, q7, #3 @vext max_4567\n" - "vext.f32 q2, q6, q7, #2 @vext max_3456\n" - "vext.f32 q3, q7, q8, #1 @vext max_6789\n" - "vadd.f32 q4, q6, q0 @add 1234, 2345\n" - "vadd.f32 q5, q7, q1 @add 5678, 4567\n" - "vadd.f32 q4, q4, q2 @add 3456, sum1\n" - "vadd.f32 q5, q5, q3 @add 6789, sum2\n" - "vmov.f32 s17, s18 @mov\n" - "vmov.f32 s18, s21 @mov\n" - "vmov.f32 s19, s23 @mov\n" - "vmul.f32 q4, q4, %q[vcoef] @mul\n" - "sub %[dr0], #8 @add w,8\n" - "sub %[dr1], #8 @add w,8\n" - "sub %[dr2], #8 @add w,8\n" - "subs %[cnt_num], #1 @cnt_num--\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "bne s3_ave_loop_mid_p0 @bne s3_max_loop_mid\n" - "3: @loop\n" - "cmp %[cnt_num_remain], #0 @cmp cnt_num_remain,0\n" - "ble 4f @ble exit1\n" - "s3_ave_loop_mid_1_p0: @mid loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vld1.f32 {d4-d5}, [%[dr2]]! @load d2-d3,dr1\n" - "vext.f32 q0, %q[vzero], q0, #3 @ext v0_0123\n" - "vext.f32 q1, %q[vzero], q1, #3 @ext v1_0123\n" - "vext.f32 q2, %q[vzero], q2, #3 @ext v1_0123\n" - "vadd.f32 q0, q0, q1 @add q0,q0,q1\n" - "vadd.f32 q0, q0, q2 @add q0,q0,q1\n" - "vpadd.f32 d0, d0, d1 @padd d0,d0,d1\n" - "vpadd.f32 d0, d0, d0 @padd d0,d0,d0\n" - "vmul.f32 d0, d0, %e[vcoef] @mul\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "sub %[dr2], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @cnt_num_remain--\n" - "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0],dr_out\n" - "bne s3_ave_loop_mid_1_p0 @bne s3_max_loop_mid_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr2] "+r"(dr2), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain), - [vcoef] "+w"(vcoef), - [vzero] "+w"(vzero) - : "r"(dr0), - "r"(dr1), - "r"(dr2), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12"); - } -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp1 = 0.f; - float tmp2 = exclusive ? 1.0f / (3.f * (wend - wstart)) : coef; - for (int i = wstart; i < wend; i++) { - tmp1 += (r0[i] + r1[i] + r2[i]); - } - dout_ch[w_even >> 1] = tmp1 * tmp2; - // cnt ++; - } - r0 = r2; - r1 = r0 + win; - r2 = r1 + win; - dout_ch += wout; - } - - if (h_remain > 0) { -// deal with bottom pad -// first row with zero pad -// int hstart = (h >> 1) * stride_h - pad_h; -// int hend = std::min(std::min(hstart + kernel_h, hin + padding_h), -// hin); data_out_channel[0] =(r0[0] + r0[1] + r0[2] + r1[0] + r1[1] + -// r1[2]) / 9.f; -#ifdef __aarch64__ - int w = 0; - int cnt = 0; - for (; w < w_unroll_size; w += 8) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); - - float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); - float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); - float32x4_t vsum_9101112 = vaddq_f32(vr0_9101112, vr1_9101112); - float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); - float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); - float32x4_t vsum_4567 = vextq_f32(vsum_1234, vsum_5678, 3); - float32x4_t vsum_6789 = vextq_f32(vsum_5678, vsum_9101112, 1); - float32x4_t vsum_123_345 = vaddq_f32(vsum_1234, vsum_2345); - vsum_123_345 = vaddq_f32(vsum_123_345, vsum_3456); - float32x4_t vsum_567_789 = vaddq_f32(vsum_4567, vsum_5678); - vsum_567_789 = vaddq_f32(vsum_567_789, vsum_6789); - vsum_123_345 = - vsetq_lane_f32(vgetq_lane_f32(vsum_123_345, 2), vsum_123_345, 1); - vsum_123_345 = - vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 1), vsum_123_345, 2); - vsum_123_345 = - vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 3), vsum_123_345, 3); - float32x4_t vrst = vmulq_f32(vsum_123_345, vcoef_6); - vst1q_f32(&dout_ch[cnt], vrst); - cnt += 4; - } - for (; w < w_even; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - float32x4_t vr1 = vld1q_f32(&r1[w]); - vr0 = vsetq_lane_f32(0.f, vr0, 3); - vr1 = vsetq_lane_f32(0.f, vr1, 3); - float32x4_t vsum1 = vaddq_f32(vr0, vr1); - float32x2_t vsum2 = - vpadd_f32(vget_low_f32(vsum1), vget_high_f32(vsum1)); - vsum2 = vpadd_f32(vsum2, vsum2); - float32x2_t vrst = vmul_f32(vsum2, vget_low_f32(vcoef_6)); - dout_ch[cnt] = vget_lane_f32(vrst, 0); - cnt++; - } -#else - dr_out = dout_ch; // + 1; - dr0 = r0; // (r0 + 1); - dr1 = r1; // (r1 + 1); - int cnt_num = w_unroll_size >> 3; - int cnt_num_remain = w_unroll_remain >> 1; - // LOG(INFO) << "cnt_num: " << cnt_num << " cnt_num_remain: " << - // cnt_num_remain; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 2f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! 
@load d0-d5,dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d4}, [%[dr0]]! @load d0-d3,dr0\n" - "vld1.f32 {d10}, [%[dr1]]! @load d4-d7,dr1\n" - "vadd.f32 q6, q0, q3 @max q0,q0,q2 1234\n" - "vadd.f32 q7, q1, q4 @max q1,q1,q3 5678\n" - "vadd.f32 d16, d4, d10 @max q1,q1,q3 9101112\n" - //"vmov.f32 s7,s6 @mov s7, s6\n" - "vext.f32 q0, q6, q7, #1 @vext max_2345\n" - "vext.f32 q1, q6, q7, #3 @vext max_4567\n" - "vext.f32 q2, q6, q7, #2 @vext max_3456\n" - "vext.f32 q3, q7, q8, #1 @vext max_6789\n" - "vadd.f32 q4, q6, q0 @add 1234,2345\n" - "vadd.f32 q5, q7, q1 @add 5678,4567\n" - "vadd.f32 q4, q4, q2 @add 3456,sum1\n" - "vadd.f32 q5, q5, q3 @add 6789,sum2\n" - "vmov.f32 s17, s18 @mov\n" - "vmov.f32 s18, s21 @mov\n" - "vmov.f32 s19, s23 @mov\n" - "vmul.f32 q4, q4, %q[vcoef_6] @mul\n" - "sub %[dr0], #8 @add w,8\n" - "sub %[dr1], #8 @add w,8\n" - "subs %[cnt_num], #1 @cnt_num--\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "bne 1b @bne s3_max_loop_bot\n" - "2: @loop\n" - "cmp %[cnt_num_remain], #0 @cmp cnt_num_remain, 0\n" - "ble 3f @ble exit\n" - "4: @bot loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vext.f32 q0, %q[vzero], q0, #3 @ext v0_0123\n" - "vext.f32 q1, %q[vzero], q1, #3 @ext v1_0123\n" - "vadd.f32 q0, q0, q1 @add q0,q0,q1\n" - "vpadd.f32 d0, d0, d1 @padd d0,d0,d1\n" - "vpadd.f32 d0, d0, d0 @padd d0,d0,d0\n" - "vmul.f32 d0, d0, %e[vcoef_6] @mul\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @cnt_num_remain--\n" - "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0],dr_out\n" - "bne 4b @bne s3_max_loop_bot_1\n" - "3: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain), - [vcoef_6] "+w"(vcoef_6), - [vzero] "+w"(vzero) - : "r"(dr0), - "r"(dr1), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9"); - } - -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp1 = 0.f; - float tmp2 = exclusive ? 1.0f / (2.f * (wend - wstart)) : coef; - for (int i = wstart; i < wend; i++) { // only run 1 or 2 times - tmp1 += (r0[i] + r1[i]); - } - dout_ch[w_even >> 1] = tmp1 * tmp2; - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/pooling.h b/lite/backends/arm/math/pooling.h deleted file mode 100644 index 8fc9e0c4e0..0000000000 --- a/lite/backends/arm/math/pooling.h +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
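// A minimal scalar reference for the 3x3, stride-2, pad-0 max pooling
// implemented above with NEON intrinsics and inline assembly. This is a
// sketch for cross-checking the vectorized kernels, not the tuned code:
// the "_ref" name is illustrative, parameter names mirror the kernel
// signature, and windows are clipped at the input border exactly as the
// w_remain/h_remain tails above handle them.
#include <algorithm>

static void pooling3x3s2p0_max_ref(const float* din, float* dout,
                                   int num, int ch, int hout, int wout,
                                   int hin, int win) {
  for (int n = 0; n < num; ++n) {
    for (int c = 0; c < ch; ++c) {
      const float* in = din + (n * ch + c) * hin * win;
      float* out = dout + (n * ch + c) * hout * wout;
      for (int oh = 0; oh < hout; ++oh) {
        for (int ow = 0; ow < wout; ++ow) {
          const int hs = oh * 2, ws = ow * 2;    // stride 2, no padding
          const int he = std::min(hs + 3, hin);  // 3x3 window, clipped
          const int we = std::min(ws + 3, win);  // at the input border
          float m = in[hs * win + ws];
          for (int h = hs; h < he; ++h) {
            for (int w = ws; w < we; ++w) {
              m = std::max(m, in[h * win + w]);
            }
          }
          out[oh * wout + ow] = m;
        }
      }
    }
  }
}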
-
-#pragma once
-
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-// !pooling fp32 Op
-void pooling_basic(const float* din,
-                   float* dout,
-                   int num,
-                   int chout,
-                   int hout,
-                   int wout,
-                   int chin,
-                   int hin,
-                   int win,
-                   const std::vector<int>& ksize,
-                   const std::vector<int>& strides,
-                   const std::vector<int>& paddings,
-                   bool global_pooling,
-                   bool exclusive,
-                   bool adaptive,
-                   bool ceil_mode,
-                   bool use_quantizer,
-                   const std::string& pooling_type);
-
-void pooling_global_max(const float* din,
-                        float* dout,
-                        int num,
-                        int chout,
-                        int hout,
-                        int wout,
-                        int chin,
-                        int hin,
-                        int win);
-
-void pooling_global_avg(const float* din,
-                        float* dout,
-                        int num,
-                        int chout,
-                        int hout,
-                        int wout,
-                        int chin,
-                        int hin,
-                        int win);
-
-void pooling2x2s2_max(const float* din,
-                      float* dout,
-                      int num,
-                      int chout,
-                      int hout,
-                      int wout,
-                      int chin,
-                      int hin,
-                      int win);
-
-void pooling2x2s2_avg(const float* din,
-                      float* dout,
-                      int num,
-                      int chout,
-                      int hout,
-                      int wout,
-                      int chin,
-                      int hin,
-                      int win,
-                      bool exclusive);
-
-void pooling3x3s1p1_max(const float* din,
-                        float* dout,
-                        int num,
-                        int chout,
-                        int hout,
-                        int wout,
-                        int chin,
-                        int hin,
-                        int win);
-
-void pooling3x3s1p1_avg(const float* din,
-                        float* dout,
-                        int num,
-                        int chout,
-                        int hout,
-                        int wout,
-                        int chin,
-                        int hin,
-                        int win,
-                        bool exclusive);
-
-void pooling3x3s2p1_max(const float* din,
-                        float* dout,
-                        int num,
-                        int chout,
-                        int hout,
-                        int wout,
-                        int chin,
-                        int hin,
-                        int win);
-
-void pooling3x3s2p1_avg(const float* din,
-                        float* dout,
-                        int num,
-                        int chout,
-                        int hout,
-                        int wout,
-                        int chin,
-                        int hin,
-                        int win,
-                        bool exclusive);
-
-void pooling3x3s2p0_max(const float* din,
-                        float* dout,
-                        int num,
-                        int chout,
-                        int hout,
-                        int wout,
-                        int chin,
-                        int hin,
-                        int win);
-
-void pooling3x3s2p0_avg(const float* din,
-                        float* dout,
-                        int num,
-                        int chout,
-                        int hout,
-                        int wout,
-                        int chin,
-                        int hin,
-                        int win,
-                        bool exclusive);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/power.cc b/lite/backends/arm/math/power.cc
deleted file mode 100644
index 752c63d917..0000000000
--- a/lite/backends/arm/math/power.cc
+++ /dev/null
@@ -1,96 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
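// The power op implemented just below computes out[i] = (in[i] * scale +
// shift) ^ power, with per-stage fast paths that skip the multiply, add, or
// pow when the corresponding parameter is the identity. A scalar sketch of
// the same contract (the NEON path below processes 16 floats per iteration):
#include <cmath>

static void power_ref(const float* din, float* dout, int num,
                      float scale, float shift, float power) {
  const bool do_scale = std::fabs(scale - 1.f) >= 1e-6f;
  const bool do_shift = std::fabs(shift - 0.f) >= 1e-6f;
  const bool do_power = std::fabs(power - 1.f) >= 1e-6f;
  for (int i = 0; i < num; ++i) {
    float v = din[i];
    if (do_scale) v *= scale;
    if (do_shift) v += shift;
    dout[i] = do_power ? std::pow(v, power) : v;
  }
}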
- -#include "lite/backends/arm/math/power.h" -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void power(const float* din, - float* dout, - const int num, - float scale_, - float shift_, - float power_) { - int cnt = num >> 4; - int remain = num % 16; - bool _do_power = true; - bool _do_scale = true; - bool _do_shift = true; - if (fabsf(power_ - 1.f) < 1e-6f) { - _do_power = false; - } - if (fabsf(scale_ - 1.f) < 1e-6f) { - _do_scale = false; - } - if (fabsf(shift_ - 0.f) < 1e-6f) { - _do_shift = false; - } - float* ptr_out = dout; - const float* ptr_in = din; - float32x4_t vscale = vdupq_n_f32(scale_); - float32x4_t vshift = vdupq_n_f32(shift_); - float32x4_t vpower = vdupq_n_f32(power_); -#pragma omp parallel for - for (int nums = 0; nums < cnt; ++nums) { - float32x4_t vr0 = vld1q_f32(ptr_in); - ptr_in += 4; - float32x4_t vr1 = vld1q_f32(ptr_in); - ptr_in += 4; - float32x4_t vr2 = vld1q_f32(ptr_in); - ptr_in += 4; - float32x4_t vr3 = vld1q_f32(ptr_in); - ptr_in += 4; - if (_do_scale) { - vr0 = vmulq_f32(vr0, vscale); - vr1 = vmulq_f32(vr1, vscale); - vr2 = vmulq_f32(vr2, vscale); - vr3 = vmulq_f32(vr3, vscale); - } - if (_do_shift) { - vr0 = vaddq_f32(vr0, vshift); - vr1 = vaddq_f32(vr1, vshift); - vr2 = vaddq_f32(vr2, vshift); - vr3 = vaddq_f32(vr3, vshift); - } - if (_do_power) { - vr0 = pow_ps(vr0, vpower); - vr1 = pow_ps(vr1, vpower); - vr2 = pow_ps(vr2, vpower); - vr3 = pow_ps(vr3, vpower); - } - vst1q_f32(ptr_out, vr0); - ptr_out += 4; - vst1q_f32(ptr_out, vr1); - ptr_out += 4; - vst1q_f32(ptr_out, vr2); - ptr_out += 4; - vst1q_f32(ptr_out, vr3); - ptr_out += 4; - } - for (int j = 0; j < remain; ++j) { - ptr_out[0] = std::pow((ptr_in[0] * scale_ + shift_), power_); - ptr_in++; - ptr_out++; - } -} - -} /* namespace math */ -} /* namespace arm */ -} /* namespace lite */ -} /* namespace paddle */ diff --git a/lite/backends/arm/math/power.h b/lite/backends/arm/math/power.h deleted file mode 100644 index 7b9074918d..0000000000 --- a/lite/backends/arm/math/power.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -void power(const T* din, - T* dout, - const int num, - float scale_, - float shift_, - float power_); - -} /* namespace math */ -} /* namespace arm */ -} /* namespace lite */ -} /* namespace paddle */ diff --git a/lite/backends/arm/math/prior_box.cc b/lite/backends/arm/math/prior_box.cc deleted file mode 100644 index f262e6e1d7..0000000000 --- a/lite/backends/arm/math/prior_box.cc +++ /dev/null @@ -1,362 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/prior_box.h" -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -const int MALLOC_ALIGN = 64; - -void* fast_malloc(size_t size) { - size_t offset = sizeof(void*) + MALLOC_ALIGN - 1; - char* p = static_cast(malloc(offset + size)); - - if (!p) { - return nullptr; - } - - void* r = reinterpret_cast(reinterpret_cast(p + offset) & - (~(MALLOC_ALIGN - 1))); - static_cast(r)[-1] = p; - memset(r, 0, size); - return r; -} - -void fast_free(void* ptr) { - if (ptr) { - free(static_cast(ptr)[-1]); - } -} - -void density_prior_box(const lite::Tensor* input, - const lite::Tensor* image, - lite::Tensor** boxes, - lite::Tensor** variances, - const std::vector& min_size_, - const std::vector& fixed_size_, - const std::vector& fixed_ratio_, - const std::vector& density_size_, - const std::vector& max_size_, - const std::vector& aspect_ratio_, - const std::vector& variance_, - int img_w_, - int img_h_, - float step_w_, - float step_h_, - float offset_, - int prior_num_, - bool is_flip_, - bool is_clip_, - const std::vector& order_) { - // compute output shape - int win1 = input->dims()[3]; - int hin1 = input->dims()[2]; - DDim shape_out({hin1, win1, prior_num_, 4}); - (*boxes)->Resize(shape_out); - (*variances)->Resize(shape_out); - - float* _cpu_data = (*boxes)->mutable_data(); - float* _variance_data = (*variances)->mutable_data(); - - const int width = win1; - const int height = hin1; - int img_width = img_w_; - int img_height = img_h_; - if (img_width == 0 || img_height == 0) { - img_width = image->dims()[3]; - img_height = image->dims()[2]; - } - float step_w = step_w_; - float step_h = step_h_; - if (step_w == 0 || step_h == 0) { - step_w = static_cast(img_width) / width; - step_h = static_cast(img_height) / height; - } - float offset = offset_; - int step_average = static_cast((step_w + step_h) * 0.5); // add - int channel_size = height * width * prior_num_ * 4; - int idx = 0; - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - float center_x = (w + offset) * step_w; - float center_y = (h + offset) * step_h; - float box_width; - float box_height; - if (fixed_size_.size() > 0) { - // add - for (int s = 0; s < fixed_size_.size(); ++s) { - int fixed_size = fixed_size_[s]; - int com_idx = 0; - box_width = fixed_size; - box_height = fixed_size; - - if (fixed_ratio_.size() > 0) { - for (int r = 0; r < fixed_ratio_.size(); ++r) { - float ar = fixed_ratio_[r]; - int density = density_size_[s]; - int shift = step_average / density; - float box_width_ratio = fixed_size_[s] * sqrt(ar); - float box_height_ratio = fixed_size_[s] / sqrt(ar); - - for (int p = 0; p < density; ++p) { - for (int c = 0; c < density; ++c) { - float center_x_temp = - center_x - step_average / 2.0f + shift / 2.f + c * shift; - float center_y_temp = - center_y - step_average / 2.0f + shift / 2.f + p * shift; - // xmin - _cpu_data[idx++] = - (center_x_temp - box_width_ratio / 2.f) / img_width >= 0 - ? 
(center_x_temp - box_width_ratio / 2.f) / img_width - : 0; - // ymin - _cpu_data[idx++] = - (center_y_temp - box_height_ratio / 2.f) / img_height >= 0 - ? (center_y_temp - box_height_ratio / 2.f) / - img_height - : 0; - // xmax - _cpu_data[idx++] = - (center_x_temp + box_width_ratio / 2.f) / img_width <= 1 - ? (center_x_temp + box_width_ratio / 2.f) / img_width - : 1; - // ymax - _cpu_data[idx++] = - (center_y_temp + box_height_ratio / 2.f) / img_height <= 1 - ? (center_y_temp + box_height_ratio / 2.f) / - img_height - : 1; - } - } - } - } else { - // this code for density anchor box - if (density_size_.size() > 0) { - CHECK_EQ(fixed_size_.size(), density_size_.size()) - << "fixed_size_ should be same with density_size_"; - int density = density_size_[s]; - int shift = fixed_size_[s] / density; - - for (int r = 0; r < density; ++r) { - for (int c = 0; c < density; ++c) { - float center_x_temp = - center_x - fixed_size / 2.f + shift / 2.f + c * shift; - float center_y_temp = - center_y - fixed_size / 2.f + shift / 2.f + r * shift; - // xmin - _cpu_data[idx++] = - (center_x_temp - box_width / 2.f) / img_width >= 0 - ? (center_x_temp - box_width / 2.f) / img_width - : 0; - // ymin - _cpu_data[idx++] = - (center_y_temp - box_height / 2.f) / img_height >= 0 - ? (center_y_temp - box_height / 2.f) / img_height - : 0; - // xmax - _cpu_data[idx++] = - (center_x_temp + box_width / 2.f) / img_width <= 1 - ? (center_x_temp + box_width / 2.f) / img_width - : 1; - // ymax - _cpu_data[idx++] = - (center_y_temp + box_height / 2.f) / img_height <= 1 - ? (center_y_temp + box_height / 2.f) / img_height - : 1; - } - } - } - - // rest of priors: will never come here!!! - for (int r = 0; r < aspect_ratio_.size(); ++r) { - float ar = aspect_ratio_[r]; - - if (fabs(ar - 1.) < 1e-6) { - continue; - } - - int density = density_size_[s]; - int shift = fixed_size_[s] / density; - float box_width_ratio = fixed_size_[s] * sqrt(ar); - float box_height_ratio = fixed_size_[s] / sqrt(ar); - - for (int p = 0; p < density; ++p) { - for (int c = 0; c < density; ++c) { - float center_x_temp = - center_x - fixed_size / 2.f + shift / 2.f + c * shift; - float center_y_temp = - center_y - fixed_size / 2.f + shift / 2.f + p * shift; - // xmin - _cpu_data[idx++] = - (center_x_temp - box_width_ratio / 2.f) / img_width >= 0 - ? (center_x_temp - box_width_ratio / 2.f) / img_width - : 0; - // ymin - _cpu_data[idx++] = - (center_y_temp - box_height_ratio / 2.f) / img_height >= 0 - ? (center_y_temp - box_height_ratio / 2.f) / - img_height - : 0; - // xmax - _cpu_data[idx++] = - (center_x_temp + box_width_ratio / 2.f) / img_width <= 1 - ? (center_x_temp + box_width_ratio / 2.f) / img_width - : 1; - // ymax - _cpu_data[idx++] = - (center_y_temp + box_height_ratio / 2.f) / img_height <= 1 - ? (center_y_temp + box_height_ratio / 2.f) / - img_height - : 1; - } - } - } - } - } - } else { - float* min_buf = - reinterpret_cast(fast_malloc(sizeof(float) * 4)); - float* max_buf = - reinterpret_cast(fast_malloc(sizeof(float) * 4)); - float* com_buf = reinterpret_cast( - fast_malloc(sizeof(float) * aspect_ratio_.size() * 4)); - - for (int s = 0; s < min_size_.size(); ++s) { - int min_idx = 0; - int max_idx = 0; - int com_idx = 0; - int min_size = min_size_[s]; - // first prior: aspect_ratio = 1, size = min_size - box_width = box_height = min_size; - //! xmin - min_buf[min_idx++] = (center_x - box_width / 2.f) / img_width; - //! ymin - min_buf[min_idx++] = (center_y - box_height / 2.f) / img_height; - //! 
xmax - min_buf[min_idx++] = (center_x + box_width / 2.f) / img_width; - //! ymax - min_buf[min_idx++] = (center_y + box_height / 2.f) / img_height; - - if (max_size_.size() > 0) { - int max_size = max_size_[s]; - //! second prior: aspect_ratio = 1, size = sqrt(min_size * max_size) - box_width = box_height = sqrtf(min_size * max_size); - //! xmin - max_buf[max_idx++] = (center_x - box_width / 2.f) / img_width; - //! ymin - max_buf[max_idx++] = (center_y - box_height / 2.f) / img_height; - //! xmax - max_buf[max_idx++] = (center_x + box_width / 2.f) / img_width; - //! ymax - max_buf[max_idx++] = (center_y + box_height / 2.f) / img_height; - } - - //! rest of priors - for (int r = 0; r < aspect_ratio_.size(); ++r) { - float ar = aspect_ratio_[r]; - if (fabs(ar - 1.) < 1e-6) { - continue; - } - box_width = min_size * sqrt(ar); - box_height = min_size / sqrt(ar); - //! xmin - com_buf[com_idx++] = (center_x - box_width / 2.f) / img_width; - //! ymin - com_buf[com_idx++] = (center_y - box_height / 2.f) / img_height; - //! xmax - com_buf[com_idx++] = (center_x + box_width / 2.f) / img_width; - //! ymax - com_buf[com_idx++] = (center_y + box_height / 2.f) / img_height; - } - memcpy(_cpu_data + idx, min_buf, sizeof(float) * min_idx); - idx += min_idx; - memcpy(_cpu_data + idx, com_buf, sizeof(float) * com_idx); - idx += com_idx; - memcpy(_cpu_data + idx, max_buf, sizeof(float) * max_idx); - idx += max_idx; - } - fast_free(min_buf); - fast_free(max_buf); - fast_free(com_buf); - } - } - } - //! clip the prior's coordinate such that it is within [0, 1] - if (is_clip_) { - for (int d = 0; d < channel_size; ++d) { - _cpu_data[d] = std::min(std::max(_cpu_data[d], 0.f), 1.f); - } - } - //! set the variance. - int count = 0; - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - for (int i = 0; i < prior_num_; ++i) { - for (int j = 0; j < 4; ++j) { - _variance_data[count] = variance_[j]; - ++count; - } - } - } - } -} - -void prior_box(const lite::Tensor* input, - const lite::Tensor* image, - lite::Tensor** boxes, - lite::Tensor** variances, - const std::vector& min_size, - const std::vector& max_size, - const std::vector& aspect_ratio, - const std::vector& variance, - int img_w, - int img_h, - float step_w, - float step_h, - float offset, - int prior_num, - bool is_flip, - bool is_clip, - const std::vector& order) { - density_prior_box(input, - image, - boxes, - variances, - min_size, - std::vector(), - std::vector(), - std::vector(), - max_size, - aspect_ratio, - variance, - img_w, - img_h, - step_w, - step_h, - offset, - prior_num, - is_flip, - is_clip, - order); -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/prior_box.h b/lite/backends/arm/math/prior_box.h deleted file mode 100644 index ffa821b75e..0000000000 --- a/lite/backends/arm/math/prior_box.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
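// Both generators above emit each prior as normalized (xmin, ymin, xmax,
// ymax) around a cell center ((w + offset) * step_w, (h + offset) * step_h).
// A sketch of that corner math for one box of size (bw, bh): the density
// branch clamps inline as shown here, while the min/max-size branch defers
// to the later is_clip_ pass. Names are local to this example.
#include <algorithm>

static void prior_corners(float cx, float cy, float bw, float bh,
                          int img_w, int img_h, float out[4]) {
  out[0] = std::max((cx - 0.5f * bw) / img_w, 0.f);  // xmin
  out[1] = std::max((cy - 0.5f * bh) / img_h, 0.f);  // ymin
  out[2] = std::min((cx + 0.5f * bw) / img_w, 1.f);  // xmax
  out[3] = std::min((cy + 0.5f * bh) / img_h, 1.f);  // ymax
}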
- -#pragma once - -#include -#include -#include "lite/core/op_lite.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void density_prior_box(const lite::Tensor* input, - const lite::Tensor* image, - lite::Tensor** boxes, - lite::Tensor** variances, - const std::vector& min_size_, - const std::vector& fixed_size_, - const std::vector& fixed_ratio_, - const std::vector& density_size_, - const std::vector& max_size_, - const std::vector& aspect_ratio_, - const std::vector& variance_, - int img_w_, - int img_h_, - float step_w_, - float step_h_, - float offset_, - int prior_num_, - bool is_flip_, - bool is_clip_, - const std::vector& order_); - -void prior_box(const lite::Tensor* input, - const lite::Tensor* image, - lite::Tensor** boxes, - lite::Tensor** variances, - const std::vector& min_size, - const std::vector& max_size, - const std::vector& aspect_ratio, - const std::vector& variance, - int img_w, - int img_h, - float step_w, - float step_h, - float offset, - int prior_num, - bool is_flip, - bool is_clip, - const std::vector& order); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/reduce_max.cc b/lite/backends/arm/math/reduce_max.cc deleted file mode 100644 index 5c75960d72..0000000000 --- a/lite/backends/arm/math/reduce_max.cc +++ /dev/null @@ -1,207 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/arm/math/reduce_max.h" -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void reduce_n(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - int hw_size = height_in * width_in; - int chw_size = channel_in * hw_size; - int data_index, src_index, src_index0; - for (int c = 0; c < channel_in; ++c) { - for (int h = 0; h < height_in; ++h) { - for (int w = 0; w < width_in; ++w) { - data_index = c * hw_size + h * width_in + w; - dst[data_index] = src[data_index]; - for (int n = 1; n < num_in; ++n) { - src_index = n * chw_size + data_index; - dst[data_index] = dst[data_index] > src[src_index] ? dst[data_index] - : src[src_index]; - } - } - } - } -} - -template <> -void reduce_c(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - int hw_size = height_in * width_in; - int chw_size = hw_size * channel_in; - int data_index, src_index0, src_index; - for (int n = 0; n < num_in; ++n) { - for (int h = 0; h < height_in; ++h) { - for (int w = 0; w < width_in; ++w) { - data_index = n * hw_size + h * width_in + w; - src_index0 = n * chw_size + h * width_in + w; - dst[data_index] = src[src_index0]; - for (int c = 1; c < channel_in; ++c) { - src_index = src_index0 + c * hw_size; - dst[data_index] = dst[data_index] > src[src_index] ? 
dst[data_index] - : src[src_index]; - } - } - } - } -} - -template <> -void reduce_h(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - int cw_size = channel_in * width_in; - int chw_size = cw_size * height_in; - int hw_size = height_in * width_in; - int data_index, src_index, src_index0; - for (int n = 0; n < num_in; ++n) { - for (int c = 0; c < channel_in; ++c) { - for (int w = 0; w < width_in; ++w) { - data_index = n * cw_size + c * width_in + w; - src_index0 = n * chw_size + c * hw_size + w; - dst[data_index] = src[src_index0]; - for (int h = 1; h < height_in; ++h) { - src_index = src_index0 + h * width_in; - dst[data_index] = dst[data_index] > src[src_index] ? dst[data_index] - : src[src_index]; - } - } - } - } -} - -template <> -void reduce_w(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - int ch_size = channel_in * height_in; - int hw_size = height_in * width_in; - int chw_size = ch_size * width_in; - int data_index = 0; - int src_index0 = 0; - int src_index = 0; - for (int n = 0; n < num_in; ++n) { - for (int c = 0; c < channel_in; ++c) { - for (int h = 0; h < height_in; ++h) { - data_index = n * ch_size + c * height_in + h; - src_index0 = n * chw_size + c * hw_size + h * width_in; - dst[data_index] = src[src_index0]; - for (int w = 1; w < width_in; ++w) { - src_index = src_index0 + w; - dst[data_index] = dst[data_index] > src[src_index] ? dst[data_index] - : src[src_index]; - } - } - } - } -} - -template <> -void reduce_all(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - float max = src[0]; - int src_index; - int n_id, c_id; - for (int n = 0; n < num_in; ++n) { - n_id = n * channel_in * height_in * width_in; - for (int c = 0; c < channel_in; ++c) { - c_id = c * height_in * width_in; - for (int h = 0; h < height_in; ++h) { - for (int w = 0; w < width_in; ++w) { - src_index = n_id + c_id + h * width_in + w; - max = src[src_index] > max ? src[src_index] : max; - } - } - } - } - dst[0] = max; -} - -template <> -void reduce_nc(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - // reduce n first. 
-  DDimLite ddimA({1, channel_in, height_in, width_in});
-  lite::Tensor tensor_tmp;
-  tensor_tmp.Resize(ddimA);
-  float* tmp_out = tensor_tmp.mutable_data<float>();
-  reduce_n(src, tmp_out, num_in, channel_in, height_in, width_in);
-  reduce_c(tmp_out, dst, 1, channel_in, height_in, width_in);
-}
-
-template <>
-void reduce_ch<float>(const float* src,
-                      float* dst,
-                      int num_in,
-                      int channel_in,
-                      int height_in,
-                      int width_in) {
-  // reduce c first
-  DDimLite ddimA({num_in, 1, height_in, width_in});
-  lite::Tensor tensor_tmp;
-  tensor_tmp.Resize(ddimA);
-  float* tmp_out = tensor_tmp.mutable_data<float>();
-  reduce_c(src, tmp_out, num_in, channel_in, height_in, width_in);
-  reduce_h(tmp_out, dst, num_in, 1, height_in, width_in);
-}
-
-template <>
-void reduce_hw<float>(const float* src,
-                      float* dst,
-                      int num_in,
-                      int channel_in,
-                      int height_in,
-                      int width_in) {
-  // reduce h first
-  DDimLite ddimA({num_in, channel_in, 1, width_in});
-  lite::Tensor tensor_tmp;
-  tensor_tmp.Resize(ddimA);
-  float* tmp_out = tensor_tmp.mutable_data<float>();
-  reduce_h(src, tmp_out, num_in, channel_in, height_in, width_in);
-  reduce_w(tmp_out, dst, num_in, channel_in, 1, width_in);
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/reduce_max.h b/lite/backends/arm/math/reduce_max.h
deleted file mode 100644
index dab9626182..0000000000
--- a/lite/backends/arm/math/reduce_max.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename T>
-void reduce_n(const T* src,
-              T* dst,
-              int num_in,
-              int channel_in,
-              int height_in,
-              int width_in);
-
-template <typename T>
-void reduce_c(const T* src,
-              T* dst,
-              int num_in,
-              int channel_in,
-              int height_in,
-              int width_in);
-
-template <typename T>
-void reduce_h(const T* src,
-              T* dst,
-              int num_in,
-              int channel_in,
-              int height_in,
-              int width_in);
-
-template <typename T>
-void reduce_w(const T* src,
-              T* dst,
-              int num_in,
-              int channel_in,
-              int height_in,
-              int width_in);
-
-template <typename T>
-void reduce_nc(const T* src,
-               T* dst,
-               int num_in,
-               int channel_in,
-               int height_in,
-               int width_in);
-
-template <typename T>
-void reduce_ch(const T* src,
-               T* dst,
-               int num_in,
-               int channel_in,
-               int height_in,
-               int width_in);
-
-template <typename T>
-void reduce_hw(const T* src,
-               T* dst,
-               int num_in,
-               int channel_in,
-               int height_in,
-               int width_in);
-
-template <typename T>
-void reduce_all(const T* src,
-                T* dst,
-                int num_in,
-                int channel_in,
-                int height_in,
-                int width_in);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/reduce_mean.cc b/lite/backends/arm/math/reduce_mean.cc
deleted file mode 100644
index 56104550d8..0000000000
--- a/lite/backends/arm/math/reduce_mean.cc
+++ /dev/null
@@ -1,204 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/arm/math/reduce_mean.h" -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void reduce_mean_n(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - int hw_size = height_in * width_in; - int chw_size = channel_in * hw_size; - int data_index, src_index, src_index0; - for (int c = 0; c < channel_in; ++c) { - for (int h = 0; h < height_in; ++h) { - for (int w = 0; w < width_in; ++w) { - data_index = c * hw_size + h * width_in + w; - dst[data_index] = 0.0; - for (int n = 0; n < num_in; ++n) { - src_index = n * chw_size + data_index; - dst[data_index] += static_cast(src[src_index]) / num_in; - } - } - } - } -} - -template <> -void reduce_mean_c(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - int hw_size = height_in * width_in; - int chw_size = hw_size * channel_in; - int data_index, src_index0, src_index; - for (int n = 0; n < num_in; ++n) { - for (int h = 0; h < height_in; ++h) { - for (int w = 0; w < width_in; ++w) { - data_index = n * hw_size + h * width_in + w; - src_index0 = n * chw_size + h * width_in + w; - dst[data_index] = 0.0; - for (int c = 0; c < channel_in; ++c) { - src_index = src_index0 + c * hw_size; - dst[data_index] += static_cast(src[src_index]) / channel_in; - } - } - } - } -} - -template <> -void reduce_mean_h(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - int cw_size = channel_in * width_in; - int chw_size = cw_size * height_in; - int hw_size = height_in * width_in; - int data_index, src_index, src_index0; - for (int n = 0; n < num_in; ++n) { - for (int c = 0; c < channel_in; ++c) { - for (int w = 0; w < width_in; ++w) { - data_index = n * cw_size + c * width_in + w; - src_index0 = n * chw_size + c * hw_size + w; - dst[data_index] = 0.0; - for (int h = 0; h < height_in; ++h) { - src_index = src_index0 + h * width_in; - dst[data_index] += static_cast(src[src_index]) / height_in; - } - } - } - } -} - -template <> -void reduce_mean_w(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - int ch_size = channel_in * height_in; - int hw_size = height_in * width_in; - int chw_size = ch_size * width_in; - int data_index = 0; - int src_index0 = 0; - int src_index = 0; - for (int n = 0; n < num_in; ++n) { - for (int c = 0; c < channel_in; ++c) { - for (int h = 0; h < height_in; ++h) { - data_index = n * ch_size + c * height_in + h; - src_index0 = n * chw_size + c * hw_size + h * width_in; - dst[data_index] = 0.0; - for (int w = 0; w < width_in; ++w) { - src_index = src_index0 + w; - dst[data_index] += static_cast(src[src_index]) / width_in; - } - } - } - } -} - -template <> -void reduce_mean_all(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - float mean = 0.0; - int 
src_index;
-  int n_id, c_id;
-  int all = num_in * channel_in * height_in * width_in;
-  for (int n = 0; n < num_in; ++n) {
-    n_id = n * channel_in * height_in * width_in;
-    for (int c = 0; c < channel_in; ++c) {
-      c_id = c * height_in * width_in;
-      for (int h = 0; h < height_in; ++h) {
-        for (int w = 0; w < width_in; ++w) {
-          src_index = n_id + c_id + h * width_in + w;
-          mean += src[src_index] / all;
-        }
-      }
-    }
-  }
-  dst[0] = mean;
-}
-
-template <>
-void reduce_mean_nc<float>(const float* src,
-                           float* dst,
-                           int num_in,
-                           int channel_in,
-                           int height_in,
-                           int width_in) {
-  // reduce n first.
-  DDimLite ddimA({1, channel_in, height_in, width_in});
-  lite::Tensor tensor_tmp;
-  tensor_tmp.Resize(ddimA);
-  float* tmp_out = tensor_tmp.mutable_data<float>();
-  reduce_mean_n(src, tmp_out, num_in, channel_in, height_in, width_in);
-  reduce_mean_c(tmp_out, dst, 1, channel_in, height_in, width_in);
-}
-
-template <>
-void reduce_mean_ch<float>(const float* src,
-                           float* dst,
-                           int num_in,
-                           int channel_in,
-                           int height_in,
-                           int width_in) {
-  // reduce c first
-  DDimLite ddimA({num_in, 1, height_in, width_in});
-  lite::Tensor tensor_tmp;
-  tensor_tmp.Resize(ddimA);
-  float* tmp_out = tensor_tmp.mutable_data<float>();
-  reduce_mean_c(src, tmp_out, num_in, channel_in, height_in, width_in);
-  reduce_mean_h(tmp_out, dst, num_in, 1, height_in, width_in);
-}
-
-template <>
-void reduce_mean_hw<float>(const float* src,
-                           float* dst,
-                           int num_in,
-                           int channel_in,
-                           int height_in,
-                           int width_in) {
-  // reduce h first
-  DDimLite ddimA({num_in, channel_in, 1, width_in});
-  lite::Tensor tensor_tmp;
-  tensor_tmp.Resize(ddimA);
-  float* tmp_out = tensor_tmp.mutable_data<float>();
-  reduce_mean_h(src, tmp_out, num_in, channel_in, height_in, width_in);
-  reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in);
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/reduce_mean.h b/lite/backends/arm/math/reduce_mean.h
deleted file mode 100644
index 277ed209c0..0000000000
--- a/lite/backends/arm/math/reduce_mean.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#pragma once - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -void reduce_mean_n(const T* src, - T* dst, - int num_in, - int channel_in, - int height_in, - int width_in); - -template -void reduce_mean_c(const T* src, - T* dst, - int num_in, - int channel_in, - int height_in, - int width_in); - -template -void reduce_mean_h(const T* src, - T* dst, - int num_in, - int channel_in, - int height_in, - int width_in); - -template -void reduce_mean_w(const T* src, - T* dst, - int num_in, - int channel_in, - int height_in, - int width_in); - -template -void reduce_mean_nc(const T* src, - T* dst, - int num_in, - int channel_in, - int height_in, - int width_in); - -template -void reduce_mean_ch(const T* src, - T* dst, - int num_in, - int channel_in, - int height_in, - int width_in); - -template -void reduce_mean_hw(const T* src, - T* dst, - int num_in, - int channel_in, - int height_in, - int width_in); - -template -void reduce_mean_all(const T* src, - T* dst, - int num_in, - int channel_in, - int height_in, - int width_in); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/saturate.h b/lite/backends/arm/math/saturate.h deleted file mode 100644 index 833f0f5c1c..0000000000 --- a/lite/backends/arm/math/saturate.h +++ /dev/null @@ -1,320 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -static inline _Tp saturate_cast(uint8_t v) { - return _Tp(v); -} -/** @overload */ -template -static inline _Tp saturate_cast(int8_t v) { - return _Tp(v); -} -/** @overload */ -template -static inline _Tp saturate_cast(uint16_t v) { - return _Tp(v); -} -/** @overload */ -template -static inline _Tp saturate_cast(int16_t v) { - return _Tp(v); -} -/** @overload */ -template -static inline _Tp saturate_cast(uint32_t v) { - return _Tp(v); -} -/** @overload */ -template -static inline _Tp saturate_cast(int32_t v) { - return _Tp(v); -} -/** @overload */ -template -static inline _Tp saturate_cast(float v) { - return _Tp(v); -} -/** @overload */ -template -static inline _Tp saturate_cast(double v) { - return _Tp(v); -} -/** @overload */ -template -static inline _Tp saturate_cast(int64_t v) { - return _Tp(v); -} -/** @overload */ -template -static inline _Tp saturate_cast(uint64_t v) { - return _Tp(v); -} - -template <> -inline uint8_t saturate_cast(int8_t v) { - return static_cast(std::max(static_cast(v), 0)); -} - -template <> -inline uint8_t saturate_cast(uint16_t v) { - return static_cast(std::min((unsigned)v, (unsigned)UCHAR_MAX)); -} - -template <> -inline uint8_t saturate_cast(int v) { - return static_cast( - ((unsigned)v <= UCHAR_MAX ? v : v > 0 ? 
UCHAR_MAX : 0)); -} - -template <> -inline uint8_t saturate_cast(int16_t v) { - return saturate_cast(static_cast(v)); -} - -template <> -inline uint8_t saturate_cast(unsigned v) { - return static_cast(std::min(v, (unsigned)UCHAR_MAX)); -} -template <> -inline uint8_t saturate_cast(float v) { - int iv = static_cast(roundf(v)); - return saturate_cast(iv); -} -template <> -inline uint8_t saturate_cast(double v) { - int iv = static_cast(round(v)); - return saturate_cast(iv); -} -template <> -inline uint8_t saturate_cast(int64_t v) { - return static_cast( - ((uint64_t)v <= (uint64_t)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0)); -} -template <> -inline uint8_t saturate_cast(uint64_t v) { - return static_cast(std::min(v, (uint64_t)UCHAR_MAX)); -} - -template <> -inline int8_t saturate_cast(uint8_t v) { - return static_cast(std::min(static_cast(v), SCHAR_MAX)); -} -template <> -inline int8_t saturate_cast(uint16_t v) { - return static_cast(std::min((unsigned)v, (unsigned)SCHAR_MAX)); -} -template <> -inline int8_t saturate_cast(int v) { - return static_cast(((unsigned)(v - SCHAR_MIN) <= (unsigned)UCHAR_MAX - ? v - : v > 0 ? SCHAR_MAX : SCHAR_MIN)); -} -template <> -inline int8_t saturate_cast(int16_t v) { - return saturate_cast(static_cast(v)); -} -template <> -inline int8_t saturate_cast(unsigned v) { - return static_cast(std::min(v, (unsigned)SCHAR_MAX)); -} -template <> -inline int8_t saturate_cast(float v) { - int iv = static_cast(roundf(v)); - return saturate_cast(iv); -} -template <> -inline int8_t saturate_cast(double v) { - int iv = static_cast(round(v)); - return saturate_cast(iv); -} -template <> -inline int8_t saturate_cast(int64_t v) { - return static_cast( - ((uint64_t)(static_cast(v) - SCHAR_MIN) <= (uint64_t)UCHAR_MAX - ? v - : v > 0 ? SCHAR_MAX : SCHAR_MIN)); -} -template <> -inline int8_t saturate_cast(uint64_t v) { - return static_cast(std::min(v, (uint64_t)SCHAR_MAX)); -} - -template <> -inline uint16_t saturate_cast(int8_t v) { - return static_cast(std::max(static_cast(v), 0)); -} - -template <> -inline uint16_t saturate_cast(int16_t v) { - return static_cast(std::max(static_cast(v), 0)); -} -template <> -inline uint16_t saturate_cast(int v) { - return static_cast( - (unsigned)v <= (unsigned)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); -} -template <> -inline uint16_t saturate_cast(unsigned v) { - return static_cast(std::min(v, (unsigned)USHRT_MAX)); -} -template <> -inline uint16_t saturate_cast(float v) { - int iv = static_cast(roundf(v)); - return saturate_cast(iv); -} -template <> -inline uint16_t saturate_cast(double v) { - int iv = static_cast(round(v)); - return saturate_cast(iv); -} -template <> -inline uint16_t saturate_cast(int64_t v) { - return static_cast( - (uint64_t)v <= (uint64_t)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); -} -template <> -inline uint16_t saturate_cast(uint64_t v) { - return static_cast(std::min(v, (uint64_t)USHRT_MAX)); -} - -template <> -inline int16_t saturate_cast(uint16_t v) { - return static_cast(std::min(static_cast(v), SHRT_MAX)); -} -template <> -inline int16_t saturate_cast(int v) { - return static_cast((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX - ? v - : v > 0 ? 
SHRT_MAX : SHRT_MIN); -} -template <> -inline int16_t saturate_cast(unsigned v) { - return (int16_t)std::min(v, (unsigned)SHRT_MAX); -} -template <> -inline int16_t saturate_cast(float v) { - int iv = static_cast(roundf(v)); - return saturate_cast(iv); -} -template <> -inline int16_t saturate_cast(double v) { - int iv = static_cast(round(v)); - return saturate_cast(iv); -} -template <> -inline int16_t saturate_cast(int64_t v) { - return static_cast((uint64_t)((int64_t)v - SHRT_MIN) <= - (uint64_t)USHRT_MAX - ? v - : v > 0 ? SHRT_MAX : SHRT_MIN); -} -template <> -inline int16_t saturate_cast(uint64_t v) { - return static_cast(std::min(v, (uint64_t)SHRT_MAX)); -} - -template <> -inline int saturate_cast(unsigned v) { - return static_cast(std::min(v, (unsigned)INT_MAX)); -} -template <> -inline int saturate_cast(int64_t v) { - return static_cast((uint64_t)(v - INT_MIN) <= (uint64_t)UINT_MAX - ? v - : v > 0 ? INT_MAX : INT_MIN); -} -template <> -inline int saturate_cast(uint64_t v) { - return static_cast(std::min(v, (uint64_t)INT_MAX)); -} -template <> -inline int saturate_cast(float v) { - return static_cast(roundf(v)); -} -template <> -inline int saturate_cast(double v) { - return static_cast(round(v)); -} - -template <> -inline unsigned saturate_cast(int8_t v) { - return static_cast(std::max(v, static_cast(0))); -} -template <> -inline unsigned saturate_cast(int16_t v) { - return static_cast(std::max(v, (int16_t)0)); -} -template <> -inline unsigned saturate_cast(int v) { - return static_cast(std::max(v, static_cast(0))); -} -template <> -inline unsigned saturate_cast(int64_t v) { - return static_cast( - (uint64_t)v <= (uint64_t)UINT_MAX ? v : v > 0 ? UINT_MAX : 0); -} -template <> -inline unsigned saturate_cast(uint64_t v) { - return static_cast(std::min(v, (uint64_t)UINT_MAX)); -} -// we intentionally do not clip negative numbers, to make -1 become 0xffffffff -// etc. -template <> -inline unsigned saturate_cast(float v) { - return static_cast(roundf(v)); -} -template <> -inline unsigned saturate_cast(double v) { - return static_cast(round(v)); -} - -template <> -inline uint64_t saturate_cast(int8_t v) { - return static_cast(std::max(v, static_cast(0))); -} - -template <> -inline uint64_t saturate_cast(int16_t v) { - return static_cast(std::max(v, (int16_t)0)); -} -template <> -inline uint64_t saturate_cast(int v) { - return static_cast(std::max(v, static_cast(0))); -} -template <> -inline uint64_t saturate_cast(int64_t v) { - return static_cast(std::max(v, (int64_t)0)); -} - -template <> -inline int64_t saturate_cast(uint64_t v) { - return static_cast(std::min(v, (uint64_t)LLONG_MAX)); -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/scale.cc b/lite/backends/arm/math/scale.cc deleted file mode 100644 index 7f2169a645..0000000000 --- a/lite/backends/arm/math/scale.cc +++ /dev/null @@ -1,177 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
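Note: `saturate_cast` in the hunk above follows the OpenCV convention — floating-point inputs are rounded to nearest, then clamped into the target integral range, which is exactly the behaviour int8 quantisation kernels rely on. A self-contained sketch of the float-to-int8 path (hypothetical helper name, written standalone so it compiles without the deleted header):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Sketch: round-to-nearest, then clamp to the int8_t range [-128, 127].
static int8_t saturate_to_int8(float v) {
  int iv = static_cast<int>(std::round(v));  // round first
  iv = std::min(127, std::max(-128, iv));    // then clamp
  return static_cast<int8_t>(iv);
}

int main() {
  printf("%d %d %d\n",
         saturate_to_int8(300.7f),   // -> 127  (clamped)
         saturate_to_int8(-1.4f),    // -> -1   (rounded)
         saturate_to_int8(-200.f));  // -> -128 (clamped)
}
```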
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/scale.h" -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void scale( - const float* din, float* dout, int num, float scale, float bias) { - int cnt = num >> 4; - int remain = num % 16; - float32x4_t vscale = vdupq_n_f32(scale); - float32x4_t vbias = vdupq_n_f32(bias); -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* din_ptr = din + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - float32x4_t vsum1 = vmlaq_f32(vbias, din0, vscale); - float32x4_t vsum2 = vmlaq_f32(vbias, din1, vscale); - float32x4_t vsum3 = vmlaq_f32(vbias, din2, vscale); - float32x4_t vsum4 = vmlaq_f32(vbias, din3, vscale); - - vst1q_f32(dout_ptr, vsum1); - vst1q_f32(dout_ptr + 4, vsum2); - vst1q_f32(dout_ptr + 8, vsum3); - vst1q_f32(dout_ptr + 12, vsum4); - } - if (remain > 0) { - const float* din_ptr = din + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - *dout_ptr = *din_ptr * scale + bias; - dout_ptr++; - din_ptr++; - } - } -} - -template <> -void scale(const float* din, - float* dout, - int outer_dim, - int scale_dim, - int inner_dim, - const float* scale_data, - const float* bias_data) { - int cnt = inner_dim >> 4; - int remain = inner_dim % 16; - int size = inner_dim * scale_dim; - for (int n = 0; n < outer_dim; n++) { - const float* din_ptr_n = din + n * size; - float* dout_ptr_n = dout + n * size; -#pragma omp parallel for - for (int i = 0; i < scale_dim; i++) { - const float* din_ptr = din_ptr_n + i * inner_dim; - float* dout_ptr = dout_ptr_n + i * inner_dim; - float scale = scale_data[i]; - float32x4_t vscale = vdupq_n_f32(scale); - float bias = bias_data[i]; - float32x4_t vbias = vdupq_n_f32(bias); - for (int j = 0; j < cnt; j++) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - float32x4_t vsum1 = vmlaq_f32(vbias, din0, vscale); - float32x4_t vsum2 = vmlaq_f32(vbias, din1, vscale); - float32x4_t vsum3 = vmlaq_f32(vbias, din2, vscale); - float32x4_t vsum4 = vmlaq_f32(vbias, din3, vscale); - - din_ptr += 16; - vst1q_f32(dout_ptr, vsum1); - vst1q_f32(dout_ptr + 4, vsum2); - vst1q_f32(dout_ptr + 8, vsum3); - vst1q_f32(dout_ptr + 12, vsum4); - - dout_ptr += 16; - } - for (int j = 0; j < remain; j++) { - *dout_ptr = *din_ptr * scale + bias; - dout_ptr++; - din_ptr++; - } - } - } -} - -template <> -void scale(const float* din, - float* dout, - int outer_dim, - int scale_dim, - const float* scale_data, - const float* bias_data) { - int cnt = scale_dim >> 4; - int remain = scale_dim % 16; - for (int n = 0; n < outer_dim; n++) { - const float* din_ptr_n = din + n * scale_dim; - float* dout_ptr_n = dout + n * scale_dim; -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - int idx = i << 4; - const float* din_ptr = din_ptr_n + idx; - const float* scale_ptr = scale_data + idx; - const float* bias_ptr = bias_data + idx; - float* dout_ptr = dout_ptr_n + idx; - - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t vscale0 = vld1q_f32(scale_ptr); - float32x4_t vbias0 = vld1q_f32(bias_ptr); - - float32x4_t din1 = 
vld1q_f32(din_ptr + 4);
-      float32x4_t vscale1 = vld1q_f32(scale_ptr + 4);
-      float32x4_t vbias1 = vld1q_f32(bias_ptr + 4);
-
-      float32x4_t din2 = vld1q_f32(din_ptr + 8);
-      float32x4_t vscale2 = vld1q_f32(scale_ptr + 8);
-      float32x4_t vbias2 = vld1q_f32(bias_ptr + 8);
-
-      float32x4_t vsum1 = vmlaq_f32(vbias0, din0, vscale0);
-      float32x4_t vsum2 = vmlaq_f32(vbias1, din1, vscale1);
-
-      float32x4_t din3 = vld1q_f32(din_ptr + 12);
-      float32x4_t vscale3 = vld1q_f32(scale_ptr + 12);
-      float32x4_t vbias3 = vld1q_f32(bias_ptr + 12);
-
-      vst1q_f32(dout_ptr, vsum1);
-      vst1q_f32(dout_ptr + 4, vsum2);
-
-      float32x4_t vsum3 = vmlaq_f32(vbias2, din2, vscale2);
-      float32x4_t vsum4 = vmlaq_f32(vbias3, din3, vscale3);
-
-      vst1q_f32(dout_ptr + 8, vsum3);
-      vst1q_f32(dout_ptr + 12, vsum4);
-    }
-    int idx = cnt << 4;
-    const float* din_ptr = din_ptr_n + idx;
-    float* dout_ptr = dout_ptr_n + idx;
-    const float* scale_ptr = scale_data + idx;
-    const float* bias_ptr = bias_data + idx;
-    for (int j = 0; j < remain; j++) {
-      *dout_ptr = *din_ptr * (*scale_ptr) + (*bias_ptr);
-      dout_ptr++;
-      din_ptr++;
-      scale_ptr++;
-      bias_ptr++;
-    }
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/scale.h b/lite/backends/arm/math/scale.h
deleted file mode 100644
index a86528c9df..0000000000
--- a/lite/backends/arm/math/scale.h
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename T>
-void scale(const T* din, T* dout, int num, float scale, float bias);
-
-template <typename T>
-void scale(const T* din,
-           T* dout,
-           int outer_dim,
-           int scale_dim,
-           int inner_dim,
-           const float* scale_data,
-           const float* bias_data);
-
-template <typename T>
-void scale(const T* din,
-           T* dout,
-           int outer_dim,
-           int scale_dim,
-           const float* scale_data,
-           const float* bias_data);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/sequence2batch.h b/lite/backends/arm/math/sequence2batch.h
deleted file mode 100644
index d982ad6667..0000000000
--- a/lite/backends/arm/math/sequence2batch.h
+++ /dev/null
@@ -1,210 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
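Note: all three `scale` overloads above compute `dout[i] = din[i] * scale + bias`; the NEON bodies just do it sixteen lanes per iteration (four `vmlaq_f32` multiply-accumulates on q registers) and fall back to a scalar loop for the `num % 16` tail. A scalar reference sketch of the broadcast variant:

```cpp
// Sketch: scalar reference for the vectorised scale kernel; the NEON path
// computes the same dout[i] = din[i] * scale + bias, and uses this scalar
// form as its tail path.
void scale_ref(const float* din, float* dout, int num, float scale,
               float bias) {
  for (int i = 0; i < num; ++i) dout[i] = din[i] * scale + bias;
}
```

The second and third overloads differ only in reading per-channel or per-element `scale_data`/`bias_data` instead of broadcast scalars.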
- -#pragma once - -#include -#include -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -class CopyMatrixRowsFunctor { - public: - // If is_src_index is true, - // copy the indexed rows of input src to the output dst. - // If is_src_index is false, - // copy the input src to the indexed rows of output dst. - // The indexed rows are based on the input index. - void operator()(const Tensor& src, - std::vector index_lod, - Tensor* dst, - bool is_src_index) { - auto index = index_lod.data(); - auto src_dims = src.dims(); - auto dst_dims = dst->dims(); - CHECK_EQ(src_dims.size(), 2UL) << "The src must be matrix with rank 2."; - CHECK_EQ(dst_dims.size(), 2UL) << "The dst must be matrix with rank 2."; - CHECK_EQ(src_dims[1], dst_dims[1]) - << "The width of src and dst must be same."; - auto height = dst_dims[0]; - auto width = dst_dims[1]; - auto* src_data = src.data(); - auto* dst_data = dst->mutable_data(); - const int sz = width * sizeof(T); - if (is_src_index) { - for (int i = 0; i < height; ++i) { - TargetCopy(TARGET(kARM), - dst_data + i * width, - src_data + index[i] * width, - sz); - } - } else { - for (int i = 0; i < height; ++i) { - TargetCopy(TARGET(kARM), - dst_data + index[i] * width, - src_data + i * width, - sz); - } - } - } -}; - -template -class LoDTensor2BatchFunctor { - // Calculate the length of each sequence and - // sort sequence index by the length. - // example: sequences = {s0, s1, s2} - // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 - // seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)} - // - struct SeqInfo { - SeqInfo(int start, int length, int seq_idx) - : start(start), length(length), seq_idx(seq_idx) {} - int start; - int length; - int seq_idx; - }; - - public: - void operator()(const Tensor& lod_tensor, - Tensor* batch, - bool is_cal_batch_lod, - bool is_reverse = false) const { - if (!is_cal_batch_lod) { - auto lods = batch->lod(); - CHECK_GT(lods.size(), 2UL) - << "The LoD of LoDTensor should inlcude at least 2-level " - "sequence information."; - CHECK_EQ(lods[1].size(), static_cast(lod_tensor.dims()[0])) - << "The LoD information should be consistent with the dims."; - CopyMatrixRowsFunctor to_batch; - to_batch(lod_tensor, lods[1], batch, true); - return; - } - - auto lods = lod_tensor.lod(); - CHECK_EQ(lods.size(), 1UL) << "Only support one level sequence now."; - - const auto& lod = lods[0]; - - std::vector seq_info; - for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) { - int length = lod[seq_id + 1] - lod[seq_id]; - seq_info.emplace_back(lod[seq_id], length, seq_id); - } - - std::sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) { - return a.length > b.length; - }); - - // Calculate the start position of each batch. - // example: sequences = {s0, s1, s2} - // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 - // max_seqlen = 5, - // batchIndex = {b0, b1, b2, b3, b4} - // b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1 - // batch_start_positions[6] = {0, 3, 6, 9, 11, 12} - // batch_start_positions[0] = len(b0) - // batch_start_positions[1] = len(b0) + len(b1) - // batch_start_positions[2] = len(b0) + len(b1) + len(b2) - // ... - // seq2batch_idx[12] = {4, 0, 9, - // 5, 1, 10, - // 6, 2, 11, - // 7, 3, - // 8} - // seq_order = {1, 0, 2}, the sort order. - // where 1 is the second sequence, - // 0 is the first sequence, - // 2 is the third sequence. - // The max_seqlen represents batch size after rearranging the - // input LodTensor. It is also the maximum length of input sequence. 
- - LoD batch_lods; - batch_lods.emplace_back(std::vector{0}); - batch_lods.emplace_back(std::vector{0}); - batch_lods.emplace_back(std::vector{0}); - - // batch_lods[0] is the start positions for batch LoDTensor - int max_seqlen = seq_info[0].length; - batch_lods[0].resize(static_cast(max_seqlen + 1)); - // batch_lods[1] is the raw index in the input LoDTensor - batch_lods[1].resize(static_cast(lod_tensor.dims()[0])); - // batch_lods[2] is the sort order for the input LoDTensor. - batch_lods[2].resize(seq_info.size()); - - auto batch_starts = batch_lods[0].data(); - auto seq2batch_idx = batch_lods[1].data(); - batch_starts[0] = 0; - for (int n = 0; n < max_seqlen; n++) { - auto batch_id = static_cast(batch_starts[n]); - for (size_t i = 0; i < seq_info.size(); ++i) { - int seq_len = seq_info[i].length; - int start = seq_info[i].start; - if (n < seq_len) { - seq2batch_idx[batch_id] = - is_reverse ? start + seq_len - 1 - n : start + n; - batch_id++; - } else { - break; - } - } - batch_starts[n + 1] = static_cast(batch_id); - } - auto seq_order = batch_lods[2].data(); - for (size_t i = 0; i < seq_info.size(); ++i) { - seq_order[i] = seq_info[i].seq_idx; - } - *(batch->mutable_lod()) = batch_lods; - - CopyMatrixRowsFunctor to_batch; - to_batch(lod_tensor, batch_lods[1], batch, true); - } -}; - -template -class Batch2LoDTensorFunctor { - public: - void operator()(const Tensor& batch, Tensor* lod_tensor) const { - auto in_lod = batch.lod(); - CHECK_GT(in_lod.size(), 2UL) - << "The LoD of LoDTensor should inlcude at least 2-level " - "sequence information."; - CHECK_EQ(in_lod[1].size(), static_cast(lod_tensor->dims()[0])) - << "The LoD information should be consistent with the dims."; - CopyMatrixRowsFunctor to_seq; - to_seq(batch, in_lod[1], lod_tensor, false); - } -}; - -template -inline void ReorderInitState(const Tensor& src, - const std::vector& index_lod, - Tensor* dst, - bool indexed_src) { - CopyMatrixRowsFunctor row_shuffle; - dst->Resize(src.dims()); - dst->mutable_data(); - row_shuffle(src, index_lod, dst, indexed_src); -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/sequence_expand.cc b/lite/backends/arm/math/sequence_expand.cc deleted file mode 100644 index 63a2e91793..0000000000 --- a/lite/backends/arm/math/sequence_expand.cc +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
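Note: the index construction documented in the long `LoDTensor2BatchFunctor` comment above can be checked with a few lines of plain C++; for `lod = {0, 4, 9, 12}` this prints the documented `seq2batch_idx` order `4 0 9 5 1 10 6 2 11 7 3 8` (illustrative sketch, not the library code):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Sketch: sort sequences by descending length, then emit one element from
// each still-active sequence per batch step, as the functor above does.
int main() {
  std::vector<int> lod = {0, 4, 9, 12};  // s0 len 4, s1 len 5, s2 len 3
  struct Seq { int start, length; };
  std::vector<Seq> seqs;
  for (size_t i = 0; i + 1 < lod.size(); ++i)
    seqs.push_back({lod[i], lod[i + 1] - lod[i]});
  std::sort(seqs.begin(), seqs.end(),
            [](const Seq& a, const Seq& b) { return a.length > b.length; });
  int max_seqlen = seqs[0].length;
  for (int n = 0; n < max_seqlen; ++n)   // one batch step per iteration
    for (const Seq& s : seqs)            // longest sequences first
      if (n < s.length) printf("%d ", s.start + n);
  printf("\n");
}
```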
- -#include "lite/backends/arm/math/sequence_expand.h" -#include -#include -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void SequenceExpandImpl(const float* x_data, - const LoD& x_lod, - int width, - const std::vector& ref_lod, - lite::Tensor* output) { - float* output_data = output->mutable_data(); - if (x_lod.size() == 0) { - for (int i = 0; i < ref_lod.size() - 1; i++) { - for (int j = ref_lod[i]; j < ref_lod[i + 1]; j++) { - memcpy( - output_data + j * width, x_data + i * width, sizeof(float) * width); - } - } - (output->mutable_lod())->push_back(ref_lod); - } else { - std::vector out_lod; - out_lod.push_back(0); - uint64_t out_offset = 0; - uint64_t len = 0; - for (int i = 0; i < ref_lod.size() - 1; i++) { - auto x_seq_len = x_lod[0][i + 1] - x_lod[0][i]; - for (int j = ref_lod[i]; j < ref_lod[i + 1]; j++) { - memcpy(output_data + out_offset * width, - x_data + len * width, - width * sizeof(float) * x_seq_len); - out_offset += x_seq_len; - out_lod.push_back(out_offset); - } - len += x_seq_len; - } - (output->mutable_lod())->push_back(out_lod); - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/sequence_expand.h b/lite/backends/arm/math/sequence_expand.h deleted file mode 100644 index d3b19a4c62..0000000000 --- a/lite/backends/arm/math/sequence_expand.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "lite/core/tensor.h" - -#pragma once - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -void SequenceExpandImpl(const T* x_data, - const LoD& x_lod, - int width, - const std::vector& ref_lod, - lite::Tensor* output); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/sequence_pool.cc b/lite/backends/arm/math/sequence_pool.cc deleted file mode 100644 index b8f9ab0a1a..0000000000 --- a/lite/backends/arm/math/sequence_pool.cc +++ /dev/null @@ -1,224 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
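Note: in `SequenceExpandImpl` above, when the input carries no LoD each input row `i` is copied once for every output position in `[ref_lod[i], ref_lod[i+1])`; with an input LoD, whole sub-sequences are repeated instead. A sketch of the simpler branch (standalone, assuming the usual `uint64_t` LoD offsets):

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Sketch: row i of x is replicated (ref_lod[i+1] - ref_lod[i]) times
// into the output buffer, each row being `width` floats.
void expand_rows(const float* x, int width,
                 const std::vector<uint64_t>& ref_lod, float* out) {
  for (size_t i = 0; i + 1 < ref_lod.size(); ++i)
    for (uint64_t j = ref_lod[i]; j < ref_lod[i + 1]; ++j)
      std::memcpy(out + j * width, x + i * width, sizeof(float) * width);
}
```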
- -#include "lite/backends/arm/math/sequence_pool.h" -#include -#include -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void seq_pool_sum(const float* din, - float* dout, - const std::vector lod, - int64_t width) { - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - const float* din_ptr = din + lod[i] * width; - float* dout_ptr = dout + i * width; - int64_t height = static_cast(lod[i + 1] - lod[i]); - if (width == 1) { - float sum = 0.f; - for (int h = 0; h < height; ++h) { - sum += din_ptr[h]; - } - *dout_ptr = sum; - } else { - memcpy(dout_ptr, din_ptr, width * sizeof(float)); - din_ptr += width; - height = height - 1; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; ++w) { - dout_ptr[w] += din_ptr[w]; - } - din_ptr += width; - } - } - } -} - -template <> -void seq_pool_average(const float* din, - float* dout, - const std::vector lod, - int64_t width) { - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - const float* din_ptr = din + lod[i] * width; - float* dout_ptr = dout + i * width; - int64_t height = static_cast(lod[i + 1] - lod[i]); - if (height > 0) { - if (width == 1) { - float sum = 0.f; - for (int h = 0; h < height; ++h) { - sum += din_ptr[h]; - } - *dout_ptr = sum / height; - } else { - memcpy(dout_ptr, din_ptr, width * sizeof(float)); - din_ptr += width; - int remain_h = height - 1; - for (int h = 0; h < remain_h; h++) { - for (int w = 0; w < width; ++w) { - dout_ptr[w] += din_ptr[w]; - } - din_ptr += width; - } - for (int w = 0; w < width; ++w) { - dout_ptr[w] /= height; - } - } - } - } -} - -template <> -void seq_pool_sqrt(const float* din, - float* dout, - const std::vector lod, - int64_t width) { - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - const float* din_ptr = din + lod[i] * width; - float* dout_ptr = dout + i * width; - int64_t height = static_cast(lod[i + 1] - lod[i]); - if (height > 0) { - float sqrt_len = sqrtf(height); - if (width == 1) { - float sum = 0.f; - for (int h = 0; h < height; ++h) { - sum += din_ptr[h]; - } - *dout_ptr = sum / sqrt_len; - } else { - memcpy(dout_ptr, din_ptr, width * sizeof(float)); - din_ptr += width; - int remain_h = height - 1; - for (int h = 0; h < remain_h; h++) { - for (int w = 0; w < width; ++w) { - dout_ptr[w] += din_ptr[w]; - } - din_ptr += width; - } - for (int w = 0; w < width; ++w) { - dout_ptr[w] /= sqrt_len; - } - } - } - } -} - -template <> -void seq_pool_max(const float* din, - float* dout, - const std::vector lod, - int64_t width) { - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - const float* din_ptr = din + lod[i] * width; - float* dout_ptr = dout + i * width; - int64_t height = static_cast(lod[i + 1] - lod[i]); - if (height > 0) { - if (width == 1) { - float max = -std::numeric_limits::max(); - for (int h = 0; h < height; ++h) { - max = std::max(max, din_ptr[h]); - } - *dout_ptr = max; - } else { - memcpy(dout_ptr, din_ptr, width * sizeof(float)); - din_ptr += width; - int remain_h = height - 1; - for (int h = 0; h < remain_h; h++) { - for (int w = 0; w < width; w++) { - dout_ptr[w] = std::max(dout_ptr[w], din_ptr[w]); - } - din_ptr += width; - } - } - } - } -} - -template <> -void seq_pool_min(const float* din, - float* dout, - const std::vector lod, - int64_t width) { - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - const float* din_ptr = din 
+ lod[i] * width; - float* dout_ptr = dout + i * width; - int64_t height = static_cast(lod[i + 1] - lod[i]); - if (height > 0) { - if (width == 1) { - float min = std::numeric_limits::max(); - for (int h = 0; h < height; ++h) { - min = std::min(min, din_ptr[h]); - } - *dout_ptr = min; - } else { - memcpy(dout_ptr, din_ptr, width * sizeof(float)); - din_ptr += width; - int remain_h = height - 1; - for (int h = 0; h < remain_h; h++) { - for (int w = 0; w < width; w++) { - dout_ptr[w] = std::min(dout_ptr[w], din_ptr[w]); - } - din_ptr += width; - } - } - } - } -} - -template <> -void seq_pool_first(const float* din, - float* dout, - const std::vector lod, - int64_t width) { - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - int64_t height = lod[i + 1] - lod[i]; - const float* din_ptr = din + width * lod[i]; - float* dout_ptr = dout + i * width; - if (height > 0) { - memcpy(dout_ptr, din_ptr, width * sizeof(float)); - } - } -} - -template <> -void seq_pool_last(const float* din, - float* dout, - const std::vector lod, - int64_t width) { - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - int64_t height = lod[i + 1] - lod[i]; - int64_t seq_len = static_cast(lod[i + 1] - lod[0]); - const float* din_ptr = din + width * seq_len; - float* dout_ptr = dout + i * width; - if (height > 0) { - memcpy(dout_ptr, din_ptr - width, width * sizeof(float)); - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/sequence_pool.h b/lite/backends/arm/math/sequence_pool.h deleted file mode 100644 index 6cbcd7d6d6..0000000000 --- a/lite/backends/arm/math/sequence_pool.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -void seq_pool_sum(const T* din, - T* dout, - const std::vector lod, - int64_t width); - -template -void seq_pool_average(const T* din, - T* dout, - const std::vector lod, - int64_t width); - -template -void seq_pool_sqrt(const T* din, - T* dout, - const std::vector lod, - int64_t width); - -template -void seq_pool_max(const T* din, - T* dout, - const std::vector lod, - int64_t width); - -template -void seq_pool_min(const T* din, - T* dout, - const std::vector lod, - int64_t width); - -template -void seq_pool_first(const T* din, - T* dout, - const std::vector lod, - int64_t width); - -template -void seq_pool_last(const T* din, - T* dout, - const std::vector lod, - int64_t width); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/sequence_softmax.cc b/lite/backends/arm/math/sequence_softmax.cc deleted file mode 100644 index fcbb1a353d..0000000000 --- a/lite/backends/arm/math/sequence_softmax.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
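Note: every pooling mode in the hunk above walks the same structure — output row `i` aggregates input rows `[lod[i], lod[i+1])` of width `width`, with a fast path for `width == 1` and a memcpy of the first row to seed the accumulator. A scalar sketch of the average mode (illustrative, mirroring `seq_pool_average`):

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Sketch: copy the first row of each sequence, accumulate the remaining
// rows element-wise, then divide by the sequence height.
void seq_avg_ref(const float* din, float* dout,
                 const std::vector<uint64_t>& lod, int64_t width) {
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    const float* in = din + lod[i] * width;
    float* out = dout + i * width;
    int64_t height = lod[i + 1] - lod[i];
    if (height <= 0) continue;
    std::memcpy(out, in, width * sizeof(float));
    for (int64_t h = 1; h < height; ++h)
      for (int64_t w = 0; w < width; ++w) out[w] += in[h * width + w];
    for (int64_t w = 0; w < width; ++w) out[w] /= height;
  }
}
```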
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/sequence_softmax.h"
-#include <algorithm>
-#include <cmath>
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-bool sequence_softmax(const float* input,
-                      const std::vector<int>& seq_offset,
-                      float* out,
-                      Context<TARGET(kARM)>* ctx) {
-  int seq_num = seq_offset.size() - 1;
-  for (int i = 0; i < seq_num; i++) {
-    float seq_max = input[seq_offset[i]];
-    float exp_sum = 0.f;
-    for (int j = seq_offset[i]; j < seq_offset[i + 1]; j++) {
-      seq_max = std::max(seq_max, input[j]);
-    }
-    for (int j = seq_offset[i]; j < seq_offset[i + 1]; j++) {
-      exp_sum += expf(input[j] - seq_max);
-    }
-    for (int j = seq_offset[i]; j < seq_offset[i + 1]; j++) {
-      out[j] = expf(input[j] - seq_max) / exp_sum;
-    }
-  }
-  return true;
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/sequence_softmax.h b/lite/backends/arm/math/sequence_softmax.h
deleted file mode 100644
index 2923039b0c..0000000000
--- a/lite/backends/arm/math/sequence_softmax.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <vector>
-#include "lite/core/context.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-bool sequence_softmax(const float* input,
-                      const std::vector<int>& seq_offset,
-                      float* out,
-                      Context<TARGET(kARM)>* ctx);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/sgemm.cc b/lite/backends/arm/math/sgemm.cc
deleted file mode 100644
index 93f64445e2..0000000000
--- a/lite/backends/arm/math/sgemm.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
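Note: `sequence_softmax` above subtracts the per-sequence max before exponentiating — the standard trick to keep `expf` in range. A standalone reference of the same three passes (hypothetical helper name):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Sketch: per-sequence softmax in three passes — max, exp-sum, normalise.
void seq_softmax_ref(const float* in, const std::vector<uint64_t>& offset,
                     float* out) {
  for (size_t i = 0; i + 1 < offset.size(); ++i) {
    float mx = in[offset[i]];
    for (uint64_t j = offset[i]; j < offset[i + 1]; ++j)
      mx = std::max(mx, in[j]);
    float sum = 0.f;
    for (uint64_t j = offset[i]; j < offset[i + 1]; ++j)
      sum += std::exp(in[j] - mx);
    for (uint64_t j = offset[i]; j < offset[i + 1]; ++j)
      out[j] = std::exp(in[j] - mx) / sum;
  }
}
```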
- -#include "lite/backends/arm/math/sgemm.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void sgemm(bool is_transA, - bool is_transB, - int M, - int N, - int K, - float alpha, - const float* A, - int lda, - const float* B, - int ldb, - float beta, - float* C, - int ldc, - const float* bias, - bool is_bias, - bool is_relu, - ARMContext* ctx) { - auto arch = ctx->arch(); - int hblock = get_hblock(arch); - int m_roundup = hblock * ((M + hblock - 1) / hblock); - - auto packed_A = static_cast( - TargetMalloc(TargetType::kARM, m_roundup * K * sizeof(float))); - - prepackA(packed_A, A, alpha, lda, 0, M, 0, K, is_transA, ctx); - - sgemm_prepack(is_transB, - M, - N, - K, - packed_A, - B, - ldb, - beta, - C, - ldc, - bias, - is_bias, - is_relu, - ctx); - TargetFree(TargetType::kARM, packed_A); -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/sgemm.h b/lite/backends/arm/math/sgemm.h deleted file mode 100644 index 08f68fb3d4..0000000000 --- a/lite/backends/arm/math/sgemm.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "lite/backends/arm/math/packed_sgemm.h" -#include "lite/core/context.h" -#include "lite/core/device_info.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void sgemm(bool is_transA, - bool is_transB, - int M, - int N, - int K, - float alpha, - const float* A, - int lda, - const float* B, - int ldb, - float beta, - float* C, - int ldc, - const float* bias, - bool is_bias, - bool is_relu, - ARMContext* ctx); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/sgemv.cc b/lite/backends/arm/math/sgemv.cc deleted file mode 100644 index 506451932d..0000000000 --- a/lite/backends/arm/math/sgemv.cc +++ /dev/null @@ -1,1054 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
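Note: `sgemm` above is a thin wrapper — it rounds `M` up to the architecture's packing block (`hblock`), packs `A` once with `prepackA`, runs `sgemm_prepack`, and frees the buffer. The round-up is plain ceiling division:

```cpp
#include <cassert>

// Sketch: the m_roundup computation in sgemm — round M up to a multiple
// of the packing block so prepackA can fill whole row panels.
int round_up(int m, int hblock) { return hblock * ((m + hblock - 1) / hblock); }

int main() {
  assert(round_up(13, 8) == 16);  // 13 rows padded to two 8-row panels
  assert(round_up(16, 8) == 16);  // already aligned
}
```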
- -#include "lite/backends/arm/math/sgemv.h" -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void sgemv(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y); - -void sgemv_relu(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y); - -void sgemv_bias(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y, - const float *bias); - -void sgemv_bias_relu(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y, - const float *bias); - -bool sgemv(const float *A, - const float *x, - float *y, - bool transA, - int M, - int N, - bool is_bias, - const float *bias, - bool is_relu) { - if (transA) { - LOG(ERROR) << " sgemv, transA is not supported now"; - return false; - } - if (is_bias) { - //! with bias - if (is_relu) { - //! with relu - sgemv_bias_relu(transA, M, N, A, x, y, bias); - } else { - //! without relu - sgemv_bias(transA, M, N, A, x, y, bias); - } - } else { - //! without bias - if (is_relu) { - //! with relu - sgemv_relu(transA, M, N, A, x, y); - } else { - //! without relu - sgemv(transA, M, N, A, x, y); - } - } - return true; -} - -//! define compute kernel -#ifdef __aarch64__ -#define SGEMV_IN_8 \ - "prfm pldl1keep, [%[in]] \n" /* preload din */ \ - "prfm pldl1keep, [%[w0]] \n" /* preload w0 */ \ - "prfm pldl1keep, [%[w1]] \n" /* preload w1 */ \ - "prfm pldl1keep, [%[w2]] \n" /* preload w2 */ \ - "prfm pldl1keep, [%[w3]] \n" /* preload w3 */ \ - "prfm pldl1keep, [%[w4]] \n" /* preload w4 */ \ - "prfm pldl1keep, [%[w5]] \n" /* preload w5 */ \ - "prfm pldl1keep, [%[w6]] \n" /* preload w6 */ \ - "prfm pldl1keep, [%[w7]] \n" /* preload w7 */ \ - "movi v0.4s, #0 \n" /* set out0 to 0 */ \ - "movi v1.4s, #0 \n" /* set out1 to 0 */ \ - "movi v2.4s, #0 \n" /* set out2 to 0 */ \ - "movi v3.4s, #0 \n" /* set out3 to 0 */ \ - "movi v4.4s, #0 \n" /* set out4 to 0 */ \ - "movi v5.4s, #0 \n" /* set out5 to 0 */ \ - "movi v6.4s, #0 \n" /* set out6 to 0 */ \ - "movi v7.4s, #0 \n" /* set out7 to 0 */ - -#define SGEMV_IN_8_BIAS \ - "ldp q8, q9, [%[bias_ptr]]\n" /* load bias to q8, q9*/ \ - "prfm pldl1keep, [%[in]] \n" /* preload din */ \ - "prfm pldl1keep, [%[w0]] \n" /* preload w0 */ \ - "prfm pldl1keep, [%[w1]] \n" /* preload w1 */ \ - "prfm pldl1keep, [%[w2]] \n" /* preload w2 */ \ - "prfm pldl1keep, [%[w3]] \n" /* preload w3 */ \ - "prfm pldl1keep, [%[w4]] \n" /* preload w4 */ \ - "prfm pldl1keep, [%[w5]] \n" /* preload w5 */ \ - "prfm pldl1keep, [%[w6]] \n" /* preload w6 */ \ - "prfm pldl1keep, [%[w7]] \n" /* preload w7 */ \ - "movi v0.4s, #0 \n" /* set out0 to 0 */ \ - "movi v1.4s, #0 \n" /* set out1 to 0 */ \ - "movi v2.4s, #0 \n" /* set out2 to 0 */ \ - "movi v3.4s, #0 \n" /* set out3 to 0 */ \ - "movi v4.4s, #0 \n" /* set out4 to 0 */ \ - "movi v5.4s, #0 \n" /* set out5 to 0 */ \ - "movi v6.4s, #0 \n" /* set out6 to 0 */ \ - "movi v7.4s, #0 \n" /* set out7 to 0 */ \ - "ins v0.s[0], v8.s[0] \n" /* out0 = bias0 */ \ - "ins v1.s[0], v8.s[1] \n" /* out1 = bias1 */ \ - "ins v2.s[0], v8.s[2] \n" /* out2 = bias2 */ \ - "ins v3.s[0], v8.s[3] \n" /* out3 = bias3 */ \ - "ins v4.s[0], v9.s[0] \n" /* out4 = bias4 */ \ - "ins v5.s[0], v9.s[1] \n" /* out5 = bias5 */ \ - "ins v6.s[0], v9.s[2] \n" /* out6 = bias6 */ \ - "ins v7.s[0], v9.s[3] \n" /* out7 = bias7 */ - -#define SGEMV_IN_1 \ - "prfm pldl1keep, [%[in]] \n" /* preload din */ \ - "prfm pldl1keep, [%[w0]] \n" 
/* preload w0 */ \ - "movi v0.4s, #0 \n" /* set out0 to 0 */ \ - "movi v1.4s, #0 \n" /* set out0 to 0 */ - -#define SGEMV_IN_1_BIAS \ - "prfm pldl1keep, [%[in]] \n" /* preload din */ \ - "prfm pldl1keep, [%[w0]] \n" /* preload w0 */ \ - "movi v0.4s, #0 \n" /* set out0 to 0 */ \ - "movi v1.4s, #0 \n" /* set out0 to 0 */ \ - "fmov s0, %w[bias0] \n" /* set out0 = bias0 */ - -#define SGEMV_KERNEL_8 \ - /* check main loop */ \ - "cmp %w[cnt], #1 \n" /* check whether has main loop */ \ - "blt 2f \n" /* jump to tail */ /* main loop */ \ - "1: \n" /* main loop */ \ - "ldp q8, q9, [%[in]], #32 \n" /* load input 8 float */ \ - "ldp q10, q11, [%[w0]], #32 \n" /* load w0 8 float */ \ - "ldp q12, q13, [%[w1]], #32 \n" /* load w1 8 float */ \ - "ldp q14, q15, [%[w2]], #32 \n" /* load w2 8 float */ \ - "ldp q16, q17, [%[w3]], #32 \n" /* load w3 8 float */ \ - "ldp q18, q19, [%[w4]], #32 \n" /* load w4 8 float */ \ - "ldp q20, q21, [%[w5]], #32 \n" /* load w5 8 float */ \ - "ldp q22, q23, [%[w6]], #32 \n" /* load w6 8 float */ \ - "ldp q24, q25, [%[w7]], #32 \n" /* load w7 8 float */ \ - "fmla v0.4s, v8.4s, v10.4s \n" /* mul + add*/ \ - "fmla v1.4s, v8.4s, v12.4s \n" /* mul + add*/ \ - "fmla v2.4s, v8.4s, v14.4s \n" /* mul + add*/ \ - "fmla v3.4s, v8.4s, v16.4s \n" /* mul + add*/ \ - "fmla v4.4s, v8.4s, v18.4s \n" /* mul + add*/ \ - "fmla v5.4s, v8.4s, v20.4s \n" /* mul + add*/ \ - "fmla v6.4s, v8.4s, v22.4s \n" /* mul + add*/ \ - "fmla v7.4s, v8.4s, v24.4s \n" /* mul + add*/ \ - "subs %w[cnt], %w[cnt], #1 \n" /* sub main loop count */ \ - "fmla v0.4s, v9.4s, v11.4s \n" /* mul + add*/ \ - "fmla v1.4s, v9.4s, v13.4s \n" /* mul + add*/ \ - "fmla v2.4s, v9.4s, v15.4s \n" /* mul + add*/ \ - "fmla v3.4s, v9.4s, v17.4s \n" /* mul + add*/ \ - "fmla v4.4s, v9.4s, v19.4s \n" /* mul + add*/ \ - "fmla v5.4s, v9.4s, v21.4s \n" /* mul + add*/ \ - "fmla v6.4s, v9.4s, v23.4s \n" /* mul + add*/ \ - "fmla v7.4s, v9.4s, v25.4s \n" /* mul + add*/ \ - "bne 1b \n" /* jump to main loop */ /* pair add to final \ - result */ \ - "2: \n" /* reduce to scale */ \ - "faddp v16.4s, v0.4s, v0.4s\n" /* pair add to vector */ \ - "faddp s8, v16.2s \n" /* pair add to scale */ \ - "faddp v17.4s, v1.4s, v1.4s\n" /* pair add to vector */ \ - "faddp s9, v17.2s \n" /* pair add to scale */ \ - "faddp v18.4s, v2.4s, v2.4s\n" /* pair add to vector */ \ - "faddp s10, v18.2s \n" /* pair add to scale */ \ - "faddp v19.4s, v3.4s, v3.4s\n" /* pair add to vector */ \ - "faddp s11, v19.2s \n" /* pair add to scale */ \ - "faddp v20.4s, v4.4s, v4.4s\n" /* pair add to vector */ \ - "faddp s12, v20.2s \n" /* pair add to scale */ \ - "faddp v21.4s, v5.4s, v5.4s\n" /* pair add to vector */ \ - "faddp s13, v21.2s \n" /* pair add to scale */ \ - "faddp v22.4s, v6.4s, v6.4s\n" /* pair add to vector */ \ - "faddp s14, v22.2s \n" /* pair add to scale */ \ - "faddp v23.4s, v7.4s, v7.4s\n" /* pair add to vector */ \ - "faddp s15, v23.2s \n" /* pair add to scale */ \ - "cmp %w[tail], #1 \n" /* check whether has tail */ \ - "blt 4f \n" /* jump to end */ \ - "3: \n" /* tail loop */ \ - "ldr s16, [%[in]], #4 \n" /* load in, 1 float */ \ - "ldr s17, [%[w0]], #4 \n" /* load w0, 1 float */ \ - "ldr s18, [%[w1]], #4 \n" /* load w1, 1 float */ \ - "ldr s19, [%[w2]], #4 \n" /* load w2, 1 float */ \ - "ldr s20, [%[w3]], #4 \n" /* load w3, 1 float */ \ - "ldr s21, [%[w4]], #4 \n" /* load w4, 1 float */ \ - "ldr s22, [%[w5]], #4 \n" /* load w5, 1 float */ \ - "ldr s23, [%[w6]], #4 \n" /* load w6, 1 float */ \ - "ldr s24, [%[w7]], #4 \n" /* load w7, 1 float */ \ - "fmadd s8, s16, 
s17, s8 \n" /* mul + add */ \ - "fmadd s9, s16, s18, s9 \n" /* mul + add */ \ - "fmadd s10, s16, s19, s10 \n" /* mul + add */ \ - "fmadd s11, s16, s20, s11 \n" /* mul + add */ \ - "fmadd s12, s16, s21, s12 \n" /* mul + add */ \ - "fmadd s13, s16, s22, s13 \n" /* mul + add */ \ - "fmadd s14, s16, s23, s14 \n" /* mul + add */ \ - "fmadd s15, s16, s24, s15 \n" /* mul + add */ \ - "subs %w[tail], %w[tail], #1\n" /* sub tail loop count */ \ - "bne 3b \n" /* jump to tail loop */ - -#define SGEMV_KERNEL_1 \ - /* check main loop */ \ - "cmp %w[cnt], #1 \n" /* check whether has main loop */ \ - "blt 2f \n" /* jump to tail */ /* main loop */ \ - "1: \n" /* main loop */ \ - "ldp q8, q9, [%[in]], #32 \n" /* load input 8 float */ \ - "ldp q10, q11, [%[w0]], #32 \n" /* load w0 8 float */ \ - "fmla v0.4s, v8.4s, v10.4s \n" /* mul + add*/ \ - "subs %w[cnt], %w[cnt], #1 \n" /* sub main loop count */ \ - "fmla v1.4s, v9.4s, v11.4s \n" /* mul + add*/ \ - "bne 1b \n" /* jump to main loop */ /* pair add to final \ - result */ \ - "2: \n" /* reduce to scale */ \ - "fadd v9.4s, v0.4s, v1.4s \n" /* add 2 vector */ \ - "faddp v10.4s, v9.4s, v9.4s\n" /* pair add to vector */ \ - "faddp s8, v10.2s \n" /* pair add to scale */ /* check tails */ \ - "cmp %w[tail], #1 \n" /* check whether has tail */ \ - "blt 4f \n" /* jump to end */ \ - "3: \n" /* tail loop */ \ - "ldr s16, [%[in]], #4 \n" /* load in, 1 float */ \ - "ldr s17, [%[w0]], #4 \n" /* load w0, 1 float */ \ - "fmadd s8, s16, s17, s8 \n" /* mul + add */ \ - "subs %w[tail], %w[tail], #1\n" /* sub tail loop count */ \ - "bne 3b \n" /* jump to tail loop */ - -#define SGEMV_OUT_8 \ - /* end */ \ - "4: \n" /* end */ \ - "stp s8, s9, [%[out]] \n" /* save result */ \ - "stp s10, s11, [%[out], #8] \n" /* save result */ \ - "stp s12, s13, [%[out], #16]\n" /* save result */ \ - "stp s14, s15, [%[out], #24]\n" /* save result */ - -#define SGEMV_OUT_8_RELU \ - /* end */ \ - "4: \n" /* end */ \ - "movi d0, #0 \n" /* zero data for relu */ \ - "fmax s8, s8, s0 \n" /* relu */ \ - "fmax s9, s9, s0 \n" /* relu */ \ - "fmax s10, s10, s0 \n" /* relu */ \ - "fmax s11, s11, s0 \n" /* relu */ \ - "fmax s12, s12, s0 \n" /* relu */ \ - "fmax s13, s13, s0 \n" /* relu */ \ - "fmax s14, s14, s0 \n" /* relu */ \ - "fmax s15, s15, s0 \n" /* relu */ \ - "stp s8, s9, [%[out]] \n" /* save result */ \ - "stp s10, s11, [%[out], #8] \n" /* save result */ \ - "stp s12, s13, [%[out], #16]\n" /* save result */ \ - "stp s14, s15, [%[out], #24]\n" /* save result */ - -#define SGEMV_OUT_1 \ - /* end */ \ - "4: \n" /* end */ \ - "str s8, [%[out]] \n" /* save result */ - -#define SGEMV_OUT_1_RELU \ - /* end */ \ - "4: \n" /* end */ \ - "movi d0, #0 \n" /* zero data for relu */ \ - "fmax s8, s8, s0 \n" /* relu */ \ - "str s8, [%[out]] \n" /* save result */ - -#else //__aarch64__ - -#define SGEMV_IN_4 \ - "pld [%[in]] @ preload cache line, input\n" \ - "pld [%[w0]] @ preload cache line, weights r0\n" \ - "pld [%[w1]] @ preload cache line, weights r1\n" \ - "pld [%[w2]] @ preload cache line, weights r2\n" \ - "pld [%[w3]] @ preload cache line, weights r3\n" \ - "vmov.u32 q0, #0 @ set q0 to 0\n" \ - "vmov.u32 q1, #0 @ set q1 to 0\n" \ - "vmov.u32 q2, #0 @ set q2 to 0\n" \ - "vmov.u32 q3, #0 @ set q3 to 0\n" \ - "pld [%[w0], #64] @ preload cache line, weights r0\n" \ - "pld [%[w1], #64] @ preload cache line, weights r1\n" \ - "pld [%[w2], #64] @ preload cache line, weights r2\n" \ - "pld [%[w3], #64] @ preload cache line, weights r3\n" - -#define SGEMV_IN_4_BIAS \ - "pld [%[in]] @ preload cache line, input\n" 
\ - "pld [%[w0]] @ preload cache line, weights r0\n" \ - "pld [%[w1]] @ preload cache line, weights r1\n" \ - "pld [%[w2]] @ preload cache line, weights r2\n" \ - "pld [%[w3]] @ preload cache line, weights r3\n" \ - "vmov.u32 q0, #0 @ set q0 to 0\n" \ - "vmov.u32 q1, #0 @ set q1 to 0\n" \ - "vmov.u32 q2, #0 @ set q2 to 0\n" \ - "vmov.u32 q3, #0 @ set q3 to 0\n" \ - "vmov s0, %[bias0] @ set q0 to bias0\n" \ - "vmov s4, %[bias1] @ set q1 to bias1\n" \ - "vmov s8, %[bias2] @ set q2 to bias2\n" \ - "vmov s12,%[bias3] @ set q3 to bias3\n" \ - "pld [%[w0], #64] @ preload cache line, weights r0\n" \ - "pld [%[w1], #64] @ preload cache line, weights r1\n" \ - "pld [%[w2], #64] @ preload cache line, weights r2\n" \ - "pld [%[w3], #64] @ preload cache line, weights r3\n" - -#define SGEMV_IN_1 \ - "pld [%[in]] @ preload cache line, input\n" \ - "pld [%[w0]] @ preload cache line, weights r0\n" \ - "vmov.u32 q0, #0 @ set q0 to 0\n" - -#define SGEMV_IN_1_BIAS \ - "pld [%[in]] @ preload cache line, input\n" \ - "pld [%[w0]] @ preload cache line, weights r0\n" \ - "vmov.u32 q0, #0 @ set q0 to 0\n" \ - "vmov s0, %[bias0] @ set q0 to 0\n" - -#define SGEMV_KERNEL_4 \ - /* check main loop */ \ - "cmp %[cnt], #1 @ check whether has main loop\n" \ - "blt 2f @ jump to tail\n" \ - "1: @ main loop\n" \ - "vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" \ - "vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" \ - "vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" \ - "vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \ - "vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \ - "vmla.f32 q0, q4, q6 @ mul add\n" \ - "vmla.f32 q1, q4, q8 @ mul add\n" \ - "vmla.f32 q2, q4, q10 @ mul add\n" \ - "vmla.f32 q3, q4, q12 @ mul add\n" \ - "subs %[cnt], #1 @ sub loop count \n" \ - "vmla.f32 q0, q5, q7 @ mul add\n" \ - "vmla.f32 q1, q5, q9 @ mul add\n" \ - "vmla.f32 q2, q5, q11 @ mul add\n" \ - "vmla.f32 q3, q5, q13 @ mul add\n" \ - "bne 1b @ jump to main loop\n" /* pair add to final \ - result */ \ - "2: @ pair add \n" \ - "vpadd.f32 d8, d0, d1 @ pair add, first step\n" \ - "vpadd.f32 d9, d2, d3 @ pair add, first step\n" \ - "vpadd.f32 d10, d4, d5 @ pair add, first step\n" \ - "vpadd.f32 d11, d6, d7 @ pair add, first step\n" \ - "vpadd.f32 d0, d8, d9 @ pair add, second step\n" \ - "vpadd.f32 d1, d10, d11 @ pair add, second step\n" /* check tails */ \ - "cmp %[tail], #1 @ check whether has tail\n" \ - "blt 4f @ jump to end\n" \ - "3: @ tail loop\n" \ - "vldm %[in]!, {s16} @ load 1 float\n" \ - "vldm %[w0]!, {s17} @ load 1 float\n" \ - "vldm %[w1]!, {s18} @ load 1 float\n" \ - "vldm %[w2]!, {s19} @ load 1 float\n" \ - "vldm %[w3]!, {s20} @ load 1 float\n" \ - "vmla.f32 s0, s16, s17 @ mul + add\n" \ - "vmla.f32 s1, s16, s18 @ mul + add\n" \ - "vmla.f32 s2, s16, s19 @ mul + add\n" \ - "vmla.f32 s3, s16, s20 @ mul + add\n" \ - "subs %[tail], #1 @ sub loop count \n" \ - "bne 3b @ jump to tail loop\n" - -#define SGEMV_KERNEL_1 \ - "cmp %[cnt], #1 @ check whether has main loop\n" \ - "blt 2f @ jump to tail\n" \ - "1: @ main loop\n" \ - "vld1.32 {d24-d27}, [%[in]]! @ load input, q12,q13\n" \ - "vld1.32 {d28-d31}, [%[w0]]! 
@ load weights r0, q14, q15\n" \ - "vmla.f32 q0, q12, q14 @ mul add\n" \ - "vmla.f32 q0, q13, q15 @ mul add\n" \ - "subs %[cnt] , #1 @ sub loop count \n" \ - "bne 1b @ jump to main loop\n" /* pair add to \ - final result \ - */ \ - "2: @ end processing\n" \ - "vpadd.f32 d2, d0, d1 @ pair add, first step\n" \ - "vpadd.f32 d0, d2, d2 @ pair add, final step\n" /* check tails \ - */ \ - "cmp %[tail], #1 @ check whether has mid cols\n" \ - "blt 4f @ jump to end\n" \ - "3: @ tail loop\n" \ - "vldm %[in]!, {s16} @ load 1 float\n" \ - "vldm %[w0]!, {s17} @ load 1 float\n" \ - "vmla.f32 s0, s16, s17 @ mul + add\n" \ - "subs %[tail], #1 @ sub loop count \n" \ - "bne 3b @ jump to tail loop\n" - -#define SGEMV_OUT_4 \ - /* end */ \ - "4: @ end\n" \ - "vst1.32 {d0-d1}, [%[out]] @ save result\n" - -#define SGEMV_OUT_4_RELU \ - /* end */ \ - "4: @ end\n" \ - "vmov.i32 q1, #0 @ zero for relu\n" \ - "vmax.f32 q0, q0, q1 @ relu\n" \ - "vst1.32 {d0-d1}, [%[out]] @ save result\n" - -#define SGEMV_OUT_1 \ - /* end */ \ - "4: @ end\n" \ - "vst1.32 {d0[0]}, [%[out]] @ save result\n" - -#define SGEMV_OUT_1_RELU \ - /* end */ \ - "4: @ end\n" \ - "vmov.i32 d1, #0 @ zero for relu\n" \ - "vmax.f32 d0, d0, d1 @ relu\n" \ - "vst1.32 {d0[0]}, [%[out]] @ save result\n" -#endif - -void sgemv(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y) { - float *data_out = y; - const float *data_in = x; - const float *weights_ptr = A; - - int cnt = N >> 3; - int tail = N & 7; - -#ifdef __aarch64__ - int out_cnt = M >> 3; - -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 8; - float *ptr_out = data_out + out_idx; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * out_idx); - const float *ptr_w1 = ptr_w0 + N; - const float *ptr_w2 = ptr_w1 + N; - const float *ptr_w3 = ptr_w2 + N; - const float *ptr_w4 = ptr_w3 + N; - const float *ptr_w5 = ptr_w4 + N; - const float *ptr_w6 = ptr_w5 + N; - const float *ptr_w7 = ptr_w6 + N; - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_8 SGEMV_KERNEL_8 SGEMV_OUT_8 - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [w4] "+r"(ptr_w4), - [w5] "+r"(ptr_w5), - [w6] "+r"(ptr_w6), - [w7] "+r"(ptr_w7), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "cc", - "memory"); - } -//! 
deal with remains -#pragma omp parallel for - for (int j = out_cnt * 8; j < M; ++j) { - float *ptr_out = data_out + j; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int tail_loop = tail; - float tmp[4]; - float tmp1[4]; - float tmp2[4]; - float tmp3[4]; - float tmp4[4]; - asm volatile( - SGEMV_IN_1 SGEMV_KERNEL_1 SGEMV_OUT_1 - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), - [tmp] "r"(tmp), - [tmp1] "r"(tmp1), - [tmp2] "r"(tmp2), - [tmp3] "r"(tmp3), - [tmp4] "r"(tmp4) - : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); - } -#else //__aarch64__ - int out_cnt = M >> 2; -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 4; - float *ptr_out = data_out + out_idx; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * out_idx); - const float *ptr_w1 = ptr_w0 + N; - const float *ptr_w2 = ptr_w1 + N; - const float *ptr_w3 = ptr_w2 + N; - - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_4 SGEMV_KERNEL_4 SGEMV_OUT_4 - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "cc", - "memory"); - } -//! deal with remains -#pragma omp parallel for - for (int j = out_cnt * 4; j < M; ++j) { - float *ptr_out = data_out + j; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_1 SGEMV_KERNEL_1 SGEMV_OUT_1 - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) - : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); - } -#endif //__aarch64__ -} - -void sgemv_relu(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y) { - float *data_out = y; - const float *data_in = x; - const float *weights_ptr = A; - - int cnt = N >> 3; - int tail = N & 7; - -#ifdef __aarch64__ - int out_cnt = M >> 3; -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 8; - float *ptr_out = data_out + out_idx; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * out_idx); - const float *ptr_w1 = ptr_w0 + N; - const float *ptr_w2 = ptr_w1 + N; - const float *ptr_w3 = ptr_w2 + N; - const float *ptr_w4 = ptr_w3 + N; - const float *ptr_w5 = ptr_w4 + N; - const float *ptr_w6 = ptr_w5 + N; - const float *ptr_w7 = ptr_w6 + N; - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_8 SGEMV_KERNEL_8 SGEMV_OUT_8_RELU - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [w4] "+r"(ptr_w4), - [w5] "+r"(ptr_w5), - [w6] "+r"(ptr_w6), - [w7] "+r"(ptr_w7), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "cc", - "memory"); - } -//! 
deal with remains -#pragma omp parallel for - for (int j = out_cnt * 8; j < M; ++j) { - float *ptr_out = data_out + j; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile( - SGEMV_IN_1 SGEMV_KERNEL_1 SGEMV_OUT_1_RELU - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) - : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); - } -#else //__aarch64__ - int out_cnt = M >> 2; -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 4; - float *ptr_out = data_out + out_idx; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * out_idx); - const float *ptr_w1 = ptr_w0 + N; - const float *ptr_w2 = ptr_w1 + N; - const float *ptr_w3 = ptr_w2 + N; - - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_4 SGEMV_KERNEL_4 SGEMV_OUT_4_RELU - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "cc", - "memory"); - } -//! deal with remains -#pragma omp parallel for - for (int j = out_cnt * 4; j < M; ++j) { - float *ptr_out = data_out + j; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_1 SGEMV_KERNEL_1 SGEMV_OUT_1_RELU - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) - : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); - } -#endif //__aarch64__ -} - -void sgemv_bias(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y, - const float *bias) { - float *data_out = y; - const float *data_in = x; - const float *weights_ptr = A; - - int cnt = N >> 3; - int tail = N & 7; - -#ifdef __aarch64__ - int out_cnt = M >> 3; -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 8; - float *ptr_out = data_out + out_idx; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * out_idx); - const float *ptr_w1 = ptr_w0 + N; - const float *ptr_w2 = ptr_w1 + N; - const float *ptr_w3 = ptr_w2 + N; - const float *ptr_w4 = ptr_w3 + N; - const float *ptr_w5 = ptr_w4 + N; - const float *ptr_w6 = ptr_w5 + N; - const float *ptr_w7 = ptr_w6 + N; - const float *bias_ptr = bias + out_idx; - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_8_BIAS SGEMV_KERNEL_8 SGEMV_OUT_8 - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [w4] "+r"(ptr_w4), - [w5] "+r"(ptr_w5), - [w6] "+r"(ptr_w6), - [w7] "+r"(ptr_w7), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), [bias_ptr] "r"(bias_ptr) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "cc", - "memory"); - } -//! 
deal with remains -#pragma omp parallel for - for (int j = out_cnt * 8; j < M; ++j) { - float *ptr_out = data_out + j; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int tail_loop = tail; - float bias0 = bias[j]; - asm volatile( - SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1 - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), [bias0] "r"(bias0) - : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); - } -#else //__aarch64__ - int out_cnt = M >> 2; -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 4; - float *ptr_out = data_out + out_idx; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * out_idx); - const float *ptr_w1 = ptr_w0 + N; - const float *ptr_w2 = ptr_w1 + N; - const float *ptr_w3 = ptr_w2 + N; - float bias0 = bias[out_idx]; - float bias1 = bias[out_idx + 1]; - float bias2 = bias[out_idx + 2]; - float bias3 = bias[out_idx + 3]; - - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_4_BIAS SGEMV_KERNEL_4 SGEMV_OUT_4 - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), - [bias0] "r"(bias0), - [bias1] "r"(bias1), - [bias2] "r"(bias2), - [bias3] "r"(bias3) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "cc", - "memory"); - } -//! deal with remains -#pragma omp parallel for - for (int j = out_cnt * 4; j < M; ++j) { - float *ptr_out = data_out + j; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int tail_loop = tail; - float bias0 = bias[j]; - asm volatile(SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1 - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), [bias0] "r"(bias0) - : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); - } -#endif //__aarch64__ -} - -void sgemv_bias_relu(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y, - const float *bias) { - float *data_out = y; - const float *data_in = x; - const float *weights_ptr = A; - int cnt = N >> 3; - int tail = N & 7; -#ifdef __aarch64__ - int out_cnt = M >> 3; -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 8; - float *ptr_out = data_out + out_idx; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * out_idx); - const float *ptr_w1 = ptr_w0 + N; - const float *ptr_w2 = ptr_w1 + N; - const float *ptr_w3 = ptr_w2 + N; - const float *ptr_w4 = ptr_w3 + N; - const float *ptr_w5 = ptr_w4 + N; - const float *ptr_w6 = ptr_w5 + N; - const float *ptr_w7 = ptr_w6 + N; - const float *bias_ptr = bias + out_idx; - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_8_BIAS SGEMV_KERNEL_8 SGEMV_OUT_8_RELU - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [w4] "+r"(ptr_w4), - [w5] "+r"(ptr_w5), - [w6] "+r"(ptr_w6), - [w7] "+r"(ptr_w7), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), [bias_ptr] "r"(bias_ptr) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - 
"v25", - "cc", - "memory"); - } -//! deal with remains -#pragma omp parallel for - for (int j = out_cnt * 8; j < M; ++j) { - float *ptr_out = data_out + j; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int tail_loop = tail; - float bias0 = bias[j]; - asm volatile( - SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1_RELU - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), [bias0] "r"(bias0) - : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); - } -#else //__aarch64__ - int out_cnt = M >> 2; -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 4; - float *ptr_out = data_out + out_idx; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * out_idx); - const float *ptr_w1 = ptr_w0 + N; - const float *ptr_w2 = ptr_w1 + N; - const float *ptr_w3 = ptr_w2 + N; - float bias0 = bias[out_idx]; - float bias1 = bias[out_idx + 1]; - float bias2 = bias[out_idx + 2]; - float bias3 = bias[out_idx + 3]; - - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_4_BIAS SGEMV_KERNEL_4 SGEMV_OUT_4_RELU - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), - [bias0] "r"(bias0), - [bias1] "r"(bias1), - [bias2] "r"(bias2), - [bias3] "r"(bias3) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "cc", - "memory"); - } -//! deal with remains -#pragma omp parallel for - for (int j = out_cnt * 4; j < M; ++j) { - float *ptr_out = data_out + j; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int tail_loop = tail; - float bias0 = bias[j]; - asm volatile(SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1_RELU - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), [bias0] "r"(bias0) - : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); - } -#endif //__aarch64__ -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/sgemv.h b/lite/backends/arm/math/sgemv.h deleted file mode 100644 index 4d74006f93..0000000000 --- a/lite/backends/arm/math/sgemv.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-
-#pragma once
-
-#include
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-// TODO(xxx): fixme now only support transA = false
-bool sgemv(const float* A,
-           const float* x,
-           float* y,
-           bool transA,
-           int M,
-           int N,
-           bool is_bias = false,
-           const float* bias = nullptr,
-           bool is_relu = false);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/shuffle_channel.cc b/lite/backends/arm/math/shuffle_channel.cc
deleted file mode 100644
index 7c4564aa00..0000000000
--- a/lite/backends/arm/math/shuffle_channel.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/shuffle_channel.h"
-#include <string.h>
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename Dtype>
-void shuffle_kernel(
-    Dtype* output, const Dtype* input, int group_row, int group_col, int len) {
-  for (int i = 0; i < group_row; ++i) {
-    for (int j = 0; j < group_col; ++j) {
-      const Dtype* p_i = input + (i * group_col + j) * len;
-      Dtype* p_o = output + (j * group_row + i) * len;
-      memcpy(p_o, p_i, len * sizeof(Dtype));
-    }
-  }
-}
-
-template <>
-void shuffle_channel<float>(const float* inputs,
-                            float* outputs,
-                            int group,
-                            int num,
-                            int channel,
-                            int height,
-                            int width) {
-  int fea_size = channel * height * width;
-  int spatial_size = height * width;
-  int group_row = group;
-  int group_col = channel / group;
-  for (int i = 0; i < num; ++i) {
-    shuffle_kernel(outputs + i * fea_size,
-                   inputs + i * fea_size,
-                   group_row,
-                   group_col,
-                   spatial_size);
-  }
-}
-
-template <>
-void shuffle_channel<char>(const char* inputs,
-                           char* outputs,
-                           int group,
-                           int num,
-                           int channel,
-                           int height,
-                           int width) {
-  int fea_size = channel * height * width;
-  int spatial_size = height * width;
-  int group_row = group;
-  int group_col = channel / group;
-  for (int i = 0; i < num; ++i) {
-    shuffle_kernel(outputs + i * fea_size,
-                   inputs + i * fea_size,
-                   group_row,
-                   group_col,
-                   spatial_size);
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/shuffle_channel.h b/lite/backends/arm/math/shuffle_channel.h
deleted file mode 100644
index d0c8b7b81e..0000000000
--- a/lite/backends/arm/math/shuffle_channel.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename T>
-void shuffle_channel(const T* inputs,
-                     T* outputs,
-                     int group,
-                     int num,
-                     int channel,
-                     int height,
-                     int width);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/slice.cc b/lite/backends/arm/math/slice.cc
deleted file mode 100644
index 8b9a769050..0000000000
--- a/lite/backends/arm/math/slice.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/slice.h"
-#include
-#include
-#include
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename Dtype>
-void slice(const Dtype* input,
-           std::vector<int64_t> in_dims,
-           std::vector<int> axes,
-           std::vector<int> starts,
-           std::vector<int> ends,
-           Dtype* out,
-           Context<TARGET(kARM)>* ctx) {
-  auto out_dims = in_dims;
-  std::vector<int> real_starts(in_dims.size(), 0);
-  std::vector<int> real_ends(in_dims.size(), 0);
-  std::vector<int> real_step(in_dims.size(), 0);
-  for (int i = 0; i < in_dims.size(); i++) {
-    real_ends[i] = in_dims[i];
-  }
-  for (int i = 0; i < axes.size(); i++) {
-    int dim_value = in_dims[axes[i]];
-    if (dim_value > 0) {
-      int start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i];
-      int end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i];
-      start = std::max(start, 0);
-      end = std::max(end, 0);
-      end = std::min(end, dim_value);
-      out_dims[axes[i]] = end - start;
-      real_starts[axes[i]] = start;
-      real_ends[axes[i]] = end;
-    }
-  }
-  const int LEN = in_dims.size();
-  int dst_step[LEN];
-  for (int i = 0; i < in_dims.size(); ++i) {
-    dst_step[i] = 1;
-  }
-  int src_step[LEN];
-  for (int i = 0; i < in_dims.size(); ++i) {
-    src_step[i] = 1;
-  }
-  int out_num = out_dims[in_dims.size() - 1];
-  for (int i = in_dims.size() - 2; i >= 0; i--) {
-    dst_step[i] = out_dims[i + 1] * dst_step[i + 1];
-    src_step[i] = in_dims[i + 1] * src_step[i + 1];
-    out_num *= out_dims[i];
-  }
-
-  for (int dst_id = 0; dst_id < out_num; dst_id++) {
-    int src_id = 0;
-    int index_id = dst_id;
-    for (int j = 0; j < out_dims.size(); j++) {
-      int cur_id = index_id / dst_step[j];
-      index_id = index_id % dst_step[j];
-      src_id += (cur_id + real_starts[j]) * src_step[j];
-    }
-    out[dst_id] = input[src_id];
-  }
-}
-
-template void slice(const int* input,
-                    std::vector<int64_t> dims,
-                    std::vector<int> axes,
-                    std::vector<int> starts,
-                    std::vector<int> ends,
-                    int* out,
-                    Context<TARGET(kARM)>* ctx);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/slice.h b/lite/backends/arm/math/slice.h
deleted file mode 100644
index 86172d28a7..0000000000
--- a/lite/backends/arm/math/slice.h
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include
-#include <vector>
-#include "lite/core/context.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename Dtype>
-void slice(const Dtype* input,
-           std::vector<int64_t> dims,
-           std::vector<int> axes,
-           std::vector<int> starts,
-           std::vector<int> ends,
-           Dtype* out,
-           Context<TARGET(kARM)>* ctx);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/softmax.cc b/lite/backends/arm/math/softmax.cc
deleted file mode 100644
index 65d41b0491..0000000000
--- a/lite/backends/arm/math/softmax.cc
+++ /dev/null
@@ -1,616 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/softmax.h"
-#include
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <>
-void softmax_basic<float>(const float* din,
-                          float* dout,
-                          const int axis_size,
-                          const int inner_num,
-                          const int outer_num) {
-  int compute_size = inner_num * outer_num;
-#pragma omp parallel for
-  for (int i = 0; i < compute_size; ++i) {
-    int idx_inner = i % inner_num;
-    int idx_outer = (i / inner_num) * axis_size;
-    int real_index = idx_outer * inner_num + idx_inner;
-
-    float max_data = din[real_index];
-    // get max
-    for (int j = 1; j < axis_size; ++j) {
-      real_index += inner_num;
-      max_data = din[real_index] > max_data ?
din[real_index] : max_data; - } - - real_index = idx_outer * inner_num + idx_inner; - // sub, exp and sum - dout[real_index] = expf(din[real_index] - max_data); - float sum_data = dout[real_index]; - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - dout[real_index] = expf(din[real_index] - max_data); - sum_data += dout[real_index]; - } - - float sum_inv = 1.f / sum_data; - real_index = idx_outer * inner_num + idx_inner; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - dout[real_index] *= sum_inv; - real_index += inner_num; - } - } -} - -template <> -void softmax_inner8_axis4(const float* din, - float* dout, - const int axis_size, - const int inner_num, - const int outer_num) { - int compute_size = inner_num * outer_num; - int cmp_cnt = compute_size >> 3; - int remain = compute_size % 8; - float32x4_t vone = vdupq_n_f32(1.0f); - -#pragma omp parallel for - for (int c = 0; c < cmp_cnt; ++c) { - int i = c * 8; - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - // get max axis_size == 4 - const float* din_ptr = din + real_index; - const float* din_ptr1 = din_ptr + inner_num; - const float* din_ptr2 = din_ptr1 + inner_num; - const float* din_ptr3 = din_ptr2 + inner_num; - float32x4_t vdata0 = vld1q_f32(din_ptr); - float32x4_t vdata1 = vld1q_f32(din_ptr1); - float32x4_t vdata2 = vld1q_f32(din_ptr2); - float32x4_t vdata3 = vld1q_f32(din_ptr3); - - float32x4_t vdata01 = vld1q_f32(din_ptr + 4); - float32x4_t vdata11 = vld1q_f32(din_ptr1 + 4); - float32x4_t vdata21 = vld1q_f32(din_ptr2 + 4); - float32x4_t vdata31 = vld1q_f32(din_ptr3 + 4); - - float* dout_ptr0 = dout + real_index; - float* dout_ptr1 = dout_ptr0 + inner_num; - float32x4_t vmax1 = vmaxq_f32(vdata0, vdata1); - float32x4_t vmax2 = vmaxq_f32(vdata2, vdata3); - float32x4_t vmax11 = vmaxq_f32(vdata01, vdata11); - float32x4_t vmax21 = vmaxq_f32(vdata21, vdata31); - float* dout_ptr2 = dout_ptr1 + inner_num; - float* dout_ptr3 = dout_ptr2 + inner_num; - float32x4_t vmax = vmaxq_f32(vmax1, vmax2); - float32x4_t vmax_1 = vmaxq_f32(vmax11, vmax21); - - // sub, exp and sum - float32x4_t vsum0 = exp_ps(vsubq_f32(vdata0, vmax)); - float32x4_t vsum1 = exp_ps(vsubq_f32(vdata1, vmax)); - float32x4_t vsum2 = exp_ps(vsubq_f32(vdata2, vmax)); - float32x4_t vsum3 = exp_ps(vsubq_f32(vdata3, vmax)); - - float32x4_t vsum01 = exp_ps(vsubq_f32(vdata01, vmax_1)); - float32x4_t vsum11 = exp_ps(vsubq_f32(vdata11, vmax_1)); - float32x4_t vsum21 = exp_ps(vsubq_f32(vdata21, vmax_1)); - float32x4_t vsum31 = exp_ps(vsubq_f32(vdata31, vmax_1)); - - float32x4_t vsum_1 = vaddq_f32(vsum0, vsum1); - float32x4_t vsum_2 = vaddq_f32(vsum2, vsum3); - float32x4_t vsum_11 = vaddq_f32(vsum01, vsum11); - float32x4_t vsum_21 = vaddq_f32(vsum21, vsum31); - - float32x4_t vsum = vaddq_f32(vsum_1, vsum_2); - float32x4_t vsum111 = vaddq_f32(vsum_11, vsum_21); - - float32x4_t vinf = div_ps(vone, vsum); - float32x4_t vinf1 = div_ps(vone, vsum111); - - vsum0 = vmulq_f32(vsum0, vinf); - vsum1 = vmulq_f32(vsum1, vinf); - vsum2 = vmulq_f32(vsum2, vinf); - vsum3 = vmulq_f32(vsum3, vinf); - - vsum01 = vmulq_f32(vsum01, vinf1); - vsum11 = vmulq_f32(vsum11, vinf1); - vsum21 = vmulq_f32(vsum21, vinf1); - vsum31 = vmulq_f32(vsum31, vinf1); - - vst1q_f32(dout_ptr0, vsum0); - vst1q_f32(dout_ptr1, vsum1); - vst1q_f32(dout_ptr2, vsum2); - vst1q_f32(dout_ptr3, vsum3); - - vst1q_f32(dout_ptr0 + 4, vsum01); - vst1q_f32(dout_ptr1 + 4, vsum11); - vst1q_f32(dout_ptr2 + 4, vsum21); - 
vst1q_f32(dout_ptr3 + 4, vsum31); - } - - int i = cmp_cnt * 8; - - if (remain > 4) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - // get max axis_size == 4 - const float* din_ptr = din + real_index; - const float* din_ptr1 = din_ptr + inner_num; - const float* din_ptr2 = din_ptr1 + inner_num; - const float* din_ptr3 = din_ptr2 + inner_num; - float32x4_t vdata0 = vld1q_f32(din_ptr); - float32x4_t vdata1 = vld1q_f32(din_ptr1); - float32x4_t vdata2 = vld1q_f32(din_ptr2); - float32x4_t vdata3 = vld1q_f32(din_ptr3); - - float* dout_ptr0 = dout + real_index; - float* dout_ptr1 = dout_ptr0 + inner_num; - float32x4_t vmax1 = vmaxq_f32(vdata0, vdata1); - float32x4_t vmax2 = vmaxq_f32(vdata2, vdata3); - float* dout_ptr2 = dout_ptr1 + inner_num; - float* dout_ptr3 = dout_ptr2 + inner_num; - float32x4_t vmax = vmaxq_f32(vmax1, vmax2); - - // sub, exp and sum - float32x4_t vsum0 = exp_ps(vsubq_f32(vdata0, vmax)); - float32x4_t vsum1 = exp_ps(vsubq_f32(vdata1, vmax)); - float32x4_t vsum2 = exp_ps(vsubq_f32(vdata2, vmax)); - float32x4_t vsum3 = exp_ps(vsubq_f32(vdata3, vmax)); - - float32x4_t vsum_1 = vaddq_f32(vsum0, vsum1); - float32x4_t vsum_2 = vaddq_f32(vsum2, vsum3); - - float32x4_t vsum = vaddq_f32(vsum_1, vsum_2); - - float32x4_t vone = vdupq_n_f32(1.0f); - float32x4_t vinf = div_ps(vone, vsum); - - vsum0 = vmulq_f32(vsum0, vinf); - vsum1 = vmulq_f32(vsum1, vinf); - vsum2 = vmulq_f32(vsum2, vinf); - vsum3 = vmulq_f32(vsum3, vinf); - - vst1q_f32(dout_ptr0, vsum0); - vst1q_f32(dout_ptr1, vsum1); - vst1q_f32(dout_ptr2, vsum2); - vst1q_f32(dout_ptr3, vsum3); - - i += 4; - } - for (; i < compute_size; i++) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - float max_data = din[real_index]; - // get max - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - max_data = din[real_index] > max_data ? 
din[real_index] : max_data; - } - - real_index = idx_outer * inner_num + idx_inner; - // sub, exp and sum - dout[real_index] = expf(din[real_index] - max_data); - float sum_data = dout[real_index]; - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - dout[real_index] = expf(din[real_index] - max_data); - sum_data += dout[real_index]; - } - - float sum_inv = 1.f / sum_data; - real_index = idx_outer * inner_num + idx_inner; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - dout[real_index] *= sum_inv; - real_index += inner_num; - } - } -} - -template <> -void softmax_inner4_axis4(const float* din, - float* dout, - const int axis_size, - const int inner_num, - const int outer_num) { - int compute_size = inner_num * outer_num; - int cmp_cnt = compute_size >> 2; - int remain = compute_size % 4; - float32x4_t vone = vdupq_n_f32(1.0f); - -#pragma omp parallel for - for (int c = 0; c < cmp_cnt; ++c) { - int i = c * 4; - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - // get max axis_size == 4 - const float* din_ptr = din + real_index; - const float* din_ptr1 = din_ptr + inner_num; - const float* din_ptr2 = din_ptr1 + inner_num; - const float* din_ptr3 = din_ptr2 + inner_num; - float32x4_t vdata0 = vld1q_f32(din_ptr); - float32x4_t vdata1 = vld1q_f32(din_ptr1); - float32x4_t vdata2 = vld1q_f32(din_ptr2); - float32x4_t vdata3 = vld1q_f32(din_ptr3); - - float* dout_ptr0 = dout + real_index; - float* dout_ptr1 = dout_ptr0 + inner_num; - float32x4_t vmax1 = vmaxq_f32(vdata0, vdata1); - float32x4_t vmax2 = vmaxq_f32(vdata2, vdata3); - float* dout_ptr2 = dout_ptr1 + inner_num; - float* dout_ptr3 = dout_ptr2 + inner_num; - float32x4_t vmax = vmaxq_f32(vmax1, vmax2); - - // sub, exp and sum - float32x4_t vsum0 = exp_ps(vsubq_f32(vdata0, vmax)); - float32x4_t vsum1 = exp_ps(vsubq_f32(vdata1, vmax)); - float32x4_t vsum2 = exp_ps(vsubq_f32(vdata2, vmax)); - float32x4_t vsum3 = exp_ps(vsubq_f32(vdata3, vmax)); - - float32x4_t vsum_1 = vaddq_f32(vsum0, vsum1); - float32x4_t vsum_2 = vaddq_f32(vsum2, vsum3); - - float32x4_t vsum = vaddq_f32(vsum_1, vsum_2); - - float32x4_t vinf = div_ps(vone, vsum); - - vsum0 = vmulq_f32(vsum0, vinf); - vsum1 = vmulq_f32(vsum1, vinf); - vsum2 = vmulq_f32(vsum2, vinf); - vsum3 = vmulq_f32(vsum3, vinf); - - vst1q_f32(dout_ptr0, vsum0); - vst1q_f32(dout_ptr1, vsum1); - vst1q_f32(dout_ptr2, vsum2); - vst1q_f32(dout_ptr3, vsum3); - } - - int i = cmp_cnt * 8; - for (; i < compute_size; i++) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - float max_data = din[real_index]; - // get max - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - max_data = din[real_index] > max_data ? 
din[real_index] : max_data; - } - - real_index = idx_outer * inner_num + idx_inner; - // sub, exp and sum - dout[real_index] = expf(din[real_index] - max_data); - float sum_data = dout[real_index]; - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - dout[real_index] = expf(din[real_index] - max_data); - sum_data += dout[real_index]; - } - - float sum_inv = 1.f / sum_data; - real_index = idx_outer * inner_num + idx_inner; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - dout[real_index] *= sum_inv; - real_index += inner_num; - } - } -} - -template <> -void softmax_inner8(const float* din, - float* dout, - const int axis_size, - const int inner_num, - const int outer_num) { - int compute_size = inner_num * outer_num; - int cmp_cnt = compute_size >> 3; -#pragma omp parallel for - for (int c = 0; c < cmp_cnt; ++c) { - int i = c * 8; - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - const float* din_ptr = din + real_index; - float32x4_t vmax = vld1q_f32(din_ptr); - float32x4_t vmax2 = vld1q_f32(din_ptr + 4); - // get max - for (int j = 1; j < axis_size; ++j) { - din_ptr += inner_num; - float32x4_t vdata = vld1q_f32(din_ptr); - float32x4_t vdata2 = vld1q_f32(din_ptr + 4); - vmax = vmaxq_f32(vmax, vdata); - vmax2 = vmaxq_f32(vmax2, vdata2); - } - - // sub, exp and sum - din_ptr = din + real_index; - float* dout_ptr = dout + real_index; - float32x4_t vdata = vld1q_f32(din_ptr); - float32x4_t vdata2 = vld1q_f32(din_ptr + 4); - float32x4_t vsum = exp_ps(vsubq_f32(vdata, vmax)); - float32x4_t vsum2 = exp_ps(vsubq_f32(vdata2, vmax2)); - din_ptr += inner_num; - vst1q_f32(dout_ptr, vsum); - vst1q_f32(dout_ptr + 4, vsum2); - dout_ptr += inner_num; - for (int j = 1; j < axis_size; ++j) { - float32x4_t vdata0 = vld1q_f32(din_ptr); - float32x4_t vdata1 = vld1q_f32(din_ptr + 4); - vdata0 = exp_ps(vsubq_f32(vdata0, vmax)); - vdata1 = exp_ps(vsubq_f32(vdata1, vmax2)); - din_ptr += inner_num; - vsum = vaddq_f32(vsum, vdata0); - vsum2 = vaddq_f32(vsum2, vdata1); - vst1q_f32(dout_ptr, vdata0); - vst1q_f32(dout_ptr + 4, vdata1); - dout_ptr += inner_num; - } - - float32x4_t vone = vdupq_n_f32(1.0f); - float32x4_t vinf = div_ps(vone, vsum); - float32x4_t vinf2 = div_ps(vone, vsum2); - dout_ptr = dout + real_index; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - float32x4_t vdata0 = vld1q_f32(dout_ptr); - float32x4_t vdata1 = vld1q_f32(dout_ptr + 4); - vdata0 = vmulq_f32(vdata0, vinf); - vdata1 = vmulq_f32(vdata1, vinf2); - vst1q_f32(dout_ptr, vdata0); - vst1q_f32(dout_ptr + 4, vdata1); - dout_ptr += inner_num; - } - } - - for (int i = cmp_cnt * 8; i < compute_size; i++) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - float max_data = din[real_index]; - // get max - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - max_data = din[real_index] > max_data ? 
din[real_index] : max_data; - } - - real_index = idx_outer * inner_num + idx_inner; - // sub, exp and sum - dout[real_index] = expf(din[real_index] - max_data); - float sum_data = dout[real_index]; - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - dout[real_index] = expf(din[real_index] - max_data); - sum_data += dout[real_index]; - } - - float sum_inv = 1.f / sum_data; - real_index = idx_outer * inner_num + idx_inner; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - dout[real_index] *= sum_inv; - real_index += inner_num; - } - } -} - -template <> -void softmax_inner4(const float* din, - float* dout, - const int axis_size, - const int inner_num, - const int outer_num) { - int compute_size = inner_num * outer_num; - int cmp_cnt = compute_size >> 2; -#pragma omp parallel for - for (int c = 0; c < cmp_cnt; ++c) { - int i = c * 4; - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - // float max_data = din[real_index]; - const float* din_ptr = din + real_index; - float32x4_t vmax = vld1q_f32(din_ptr); - // get max - for (int j = 1; j < axis_size; ++j) { - din_ptr += inner_num; - float32x4_t vdata = vld1q_f32(din_ptr); - vmax = vmaxq_f32(vmax, vdata); - } - // sub, exp and sum - din_ptr = din + real_index; - float* dout_ptr = dout + real_index; - float32x4_t vdata = vld1q_f32(din_ptr); - float32x4_t vsum = exp_ps(vsubq_f32(vdata, vmax)); - din_ptr += inner_num; - vst1q_f32(dout_ptr, vsum); - dout_ptr += inner_num; - for (int j = 1; j < axis_size; ++j) { - // real_index += inner_num; - float32x4_t vdata0 = vld1q_f32(din_ptr); - vdata0 = exp_ps(vsubq_f32(vdata0, vmax)); - din_ptr += inner_num; - vsum = vaddq_f32(vsum, vdata0); - vst1q_f32(dout_ptr, vdata0); - dout_ptr += inner_num; - } - - float32x4_t vone = vdupq_n_f32(1.0f); - float32x4_t vinf = div_ps(vone, vsum); - dout_ptr = dout + real_index; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - float32x4_t vdata0 = vld1q_f32(dout_ptr); - vdata0 = vmulq_f32(vdata0, vinf); - vst1q_f32(dout_ptr, vdata0); - dout_ptr += inner_num; - } - } - - for (int i = cmp_cnt * 4; i < compute_size; i++) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - float max_data = din[real_index]; - // get max - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - max_data = din[real_index] > max_data ? 
din[real_index] : max_data; - } - - real_index = idx_outer * inner_num + idx_inner; - // sub, exp and sum - dout[real_index] = expf(din[real_index] - max_data); - float sum_data = dout[real_index]; - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - dout[real_index] = expf(din[real_index] - max_data); - sum_data += dout[real_index]; - } - - float sum_inv = 1.f / sum_data; - real_index = idx_outer * inner_num + idx_inner; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - dout[real_index] *= sum_inv; - real_index += inner_num; - } - } -} - -template <> -void softmax_inner1_large_axis(const float* din, - float* dout, - const int outer_size, - const int axis_size) { -#pragma omp parallel for - for (int i = 0; i < outer_size; ++i) { - const float* din_ptr = din + i * axis_size; - float* dout_ptr = dout + i * axis_size; - - const float* din_max_ptr = din_ptr; - int nn = axis_size >> 2; - - // get max - float32x4_t vmax = vld1q_f32(din_max_ptr); - din_max_ptr += 4; - int j = 1; - for (; j < nn; ++j) { - vmax = vmaxq_f32(vmax, vld1q_f32(din_max_ptr)); - din_max_ptr += 4; - } - float32x2_t vhmax = vmax_f32(vget_high_f32(vmax), vget_low_f32(vmax)); - float max_data = std::max(vget_lane_f32(vhmax, 0), vget_lane_f32(vhmax, 1)); - for (j = 4 * j; j < axis_size; ++j) { - max_data = std::max(max_data, din_max_ptr[0]); - din_max_ptr++; - } - - // sub, exp and sum - const float* din_sum_ptr = din_ptr; - float* dout_sum_ptr = dout_ptr; - vmax = vdupq_n_f32(max_data); - float32x4_t vsub_exp = exp_ps(vsubq_f32(vld1q_f32(din_sum_ptr), vmax)); - float32x4_t vsum = vsub_exp; - vst1q_f32(dout_sum_ptr, vsub_exp); - din_sum_ptr += 4; - dout_sum_ptr += 4; - - j = 1; - for (; j < nn; ++j) { - vsub_exp = exp_ps(vsubq_f32(vld1q_f32(din_sum_ptr), vmax)); - vst1q_f32(dout_sum_ptr, vsub_exp); - vsum = vaddq_f32(vsum, vsub_exp); - din_sum_ptr += 4; - dout_sum_ptr += 4; - } - float32x2_t vhsum = vadd_f32(vget_high_f32(vsum), vget_low_f32(vsum)); - float sum_data = vget_lane_f32(vhsum, 0) + vget_lane_f32(vhsum, 1); - - for (j = 4 * j; j < axis_size; ++j) { - dout_sum_ptr[0] = expf(din_sum_ptr[0] - max_data); - sum_data += dout_sum_ptr[0]; - din_sum_ptr++; - dout_sum_ptr++; - } - - float sum_inv = 1.f / sum_data; - float* dout_res_ptr = dout_ptr; - float32x4_t vinv = vdupq_n_f32(sum_inv); - // get softmax result - j = 0; - for (; j < nn; ++j) { - float32x4_t vout = vld1q_f32(dout_res_ptr); - float32x4_t vres = vmulq_f32(vout, vinv); - vst1q_f32(dout_res_ptr, vres); - dout_res_ptr += 4; - } - for (j = nn * 4; j < axis_size; ++j) { - dout_ptr[j] *= sum_inv; - } - } -} - -template <> -void softmax_inner1_small_axis(const float* din, - float* dout, - const int outer_size, - const int axis_size) { -#pragma omp parallel for - for (int i = 0; i < outer_size; ++i) { - const float* din_ptr = din + i * axis_size; - float* dout_ptr = dout + i * axis_size; - // get max - float max_data = din_ptr[0]; - for (int j = 1; j < axis_size; ++j) { - max_data = std::max(max_data, din_ptr[j]); - } - - // sub, exp and sum - float sum_data = 0.f; - for (int j = 0; j < axis_size; ++j) { - dout_ptr[j] = expf(din_ptr[j] - max_data); - sum_data += dout_ptr[j]; - } - - float sum_inv = 1.f / sum_data; - for (int j = 0; j < axis_size; ++j) { - dout_ptr[j] *= sum_inv; - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/softmax.h b/lite/backends/arm/math/softmax.h deleted file mode 100644 index cc1957a73e..0000000000 --- 
a/lite/backends/arm/math/softmax.h
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename T>
-void softmax_basic(const T* din,
-                   T* dout,
-                   const int axis_size,
-                   const int inner_num,
-                   const int outer_num);
-
-template <typename T>
-void softmax_inner8_axis4(const T* din,
-                          T* dout,
-                          const int axis_size,
-                          const int inner_num,
-                          const int outer_num);
-
-template <typename T>
-void softmax_inner4_axis4(const T* din,
-                          T* dout,
-                          const int axis_size,
-                          const int inner_num,
-                          const int outer_num);
-template <typename T>
-void softmax_inner8(const T* din,
-                    T* dout,
-                    const int axis_size,
-                    const int inner_num,
-                    const int outer_num);
-
-template <typename T>
-void softmax_inner4(const T* din,
-                    T* dout,
-                    const int axis_size,
-                    const int inner_num,
-                    const int outer_num);
-
-template <typename T>
-void softmax_inner1_large_axis(const T* din,
-                               T* dout,
-                               const int outer_size,
-                               const int axis_size);
-
-template <typename T>
-void softmax_inner1_small_axis(const T* din,
-                               T* dout,
-                               const int outer_size,
-                               const int axis_size);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/split.cc b/lite/backends/arm/math/split.cc
deleted file mode 100644
index 54ea7e62c2..0000000000
--- a/lite/backends/arm/math/split.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
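All of the softmax variants declared in softmax.h above share one indexing scheme: the softmax axis has axis_size elements spaced inner_num apart, and there are outer_num * inner_num independent lanes. The inner8/inner4 variants process 8 or 4 adjacent lanes per NEON iteration, and the inner1 variants specialize the contiguous (inner_num == 1) case. A scalar sketch of a single lane (hypothetical helper, equivalent to one iteration of softmax_basic shown earlier):

#include <algorithm>
#include <cmath>

// One softmax lane: elements at in[0], in[stride], ..., in[(axis_size-1)*stride].
static void softmax_lane(const float* in, float* out, int axis_size, int stride) {
  float max_v = in[0];
  for (int j = 1; j < axis_size; ++j)  // subtract the max for numerical stability
    max_v = std::max(max_v, in[j * stride]);
  float sum = 0.f;
  for (int j = 0; j < axis_size; ++j) {  // exponentiate and accumulate the sum
    out[j * stride] = std::exp(in[j * stride] - max_v);
    sum += out[j * stride];
  }
  float inv = 1.f / sum;  // one division, then scale (matches the div_ps/vinf trick)
  for (int j = 0; j < axis_size; ++j) out[j * stride] *= inv;
}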
- -#include "lite/backends/arm/math/split.h" -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void split_cpy(const float* din, float* dout, int num) { - int cnt = num >> 4; - int remain = num % 16; -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* din_ptr = din + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - } - if (remain > 0) { - const float* din_ptr = din + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - *dout_ptr = *din_ptr; - dout_ptr++; - din_ptr++; - } - } -} - -template <> -void split(const float* din, - const std::vector& dout, - const int axis, - const std::vector& in_strides) { - int input_offset = 0; - for (auto out : dout) { - auto out_dim = out->dims(); - std::vector out_strides(out_dim.size()); - out_strides[out_dim.size() - 1] = out_dim[out_dim.size() - 1]; - for (int i = out_dim.size() - 2; i >= 0; --i) { - out_strides[i] = out_strides[i + 1] * out_dim[i]; - } - - float* out_data = out->mutable_data(); - int before = out_strides[0] / out_strides[axis]; - int in_after = in_strides[axis]; - int out_after = out_strides[axis]; - - for (int i = 0; i < before; ++i) { - split_cpy(din + input_offset + i * in_after, - out_data + i * out_after, - out_after); - } - input_offset += out_strides[axis]; - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/split.h b/lite/backends/arm/math/split.h deleted file mode 100644 index 2c6f392cc5..0000000000 --- a/lite/backends/arm/math/split.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "lite/core/op_lite.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -void split_cpy(const T* din, T* dout, int num); - -template -void split(const T* din, - const std::vector& dout, - const int axis, - const std::vector& in_strides); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/stack.cc b/lite/backends/arm/math/stack.cc deleted file mode 100644 index e017a8d01e..0000000000 --- a/lite/backends/arm/math/stack.cc +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/stack.h"
-#include <cstring>
-#include <vector>
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-void stack(std::vector<lite::Tensor *> x, lite::Tensor *y, int axis) {
-  if (axis < 0) axis += (x[0]->dims().size() + 1);
-  int n = x.size();
-  auto *y_data = y->mutable_data<float>();
-  std::vector<const float *> x_datas(n);
-  for (int i = 0; i < n; i++) x_datas[i] = x[i]->data<float>();
-
-  int pre = 1, post = 1;
-  auto &dim = x[0]->dims();
-  for (auto i = 0; i < axis; ++i) pre *= dim[i];
-  for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
-
-  auto x_data_arr = x_datas.data();
-
-  size_t x_offset = 0;
-  size_t y_offset = 0;
-  for (int i = 0; i < pre; i++) {
-    for (int j = 0; j < n; j++) {
-      std::memcpy(
-          y_data + y_offset, x_data_arr[j] + x_offset, post * sizeof(float));
-      y_offset += post;
-    }
-    x_offset += post;
-  }
-}
-
-} /* namespace math */
-} /* namespace arm */
-} /* namespace lite */
-} /* namespace paddle */
diff --git a/lite/backends/arm/math/stack.h b/lite/backends/arm/math/stack.h
deleted file mode 100644
index 2000b3da60..0000000000
--- a/lite/backends/arm/math/stack.h
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include
-#include <vector>
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-void stack(std::vector<lite::Tensor*> x, lite::Tensor* out, int axis);
-
-} /* namespace math */
-} /* namespace arm */
-} /* namespace lite */
-} /* namespace paddle */
diff --git a/lite/backends/arm/math/topk.cc b/lite/backends/arm/math/topk.cc
deleted file mode 100644
index c9239134e1..0000000000
--- a/lite/backends/arm/math/topk.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
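The stack routine deleted above reduces to a two-level copy: each input tensor is viewed as a pre x post matrix, and rows of length post from the n inputs are interleaved into the output, which realizes concatenation along a newly inserted axis. A standalone sketch of the copy pattern on raw buffers (hypothetical names, float-only as in the deleted code):

#include <cstring>
#include <vector>

// Interleave n buffers, each of logical shape (pre, post), into y so that
// y[i][j][:] = x[j][i][:] -- the same memcpy loop as the deleted stack().
static void stack_rows(const std::vector<const float*>& x, float* y,
                       int pre, int post) {
  size_t x_off = 0, y_off = 0;
  for (int i = 0; i < pre; ++i) {
    for (size_t j = 0; j < x.size(); ++j) {
      std::memcpy(y + y_off, x[j] + x_off, post * sizeof(float));
      y_off += post;
    }
    x_off += post;  // advance to row i+1 of every input
  }
}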
- -#include "lite/backends/arm/math/topk.h" -#include -#include -#include "lite/backends/arm/math/funcs.h" -namespace paddle { -namespace lite { -namespace arm { -namespace math { -bool comp_func(std::pair a, std::pair b) { - return (a.first > b.first); -} - -void topk(const float* in_data, - float* out_val, - int* out_ind, - int m, - int n, - int k, - Context* ctx) { - for (int i = 0; i < m; i++) { - const float* in_tmp = in_data + i * n; - float* out_val_tmp = out_val + i * k; - int* out_ind_tmp = out_ind + i * k; - std::vector> vec; - for (int j = 0; j < n; j++) { - vec.push_back(std::make_pair(in_tmp[j], j)); - } - std::partial_sort(vec.begin(), vec.begin() + k, vec.end(), comp_func); - for (int q = 0; q < k; q++) { - out_val_tmp[q] = vec[q].first; - out_ind_tmp[q] = vec[q].second; - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/topk.h b/lite/backends/arm/math/topk.h deleted file mode 100644 index 5bf472e1af..0000000000 --- a/lite/backends/arm/math/topk.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "lite/core/context.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void topk(const float* din, - float* out_val, - int* out_ind, - int m, - int n, - int k, - Context* ctx); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/type_trans.cc b/lite/backends/arm/math/type_trans.cc deleted file mode 100644 index 6ded50e752..0000000000 --- a/lite/backends/arm/math/type_trans.cc +++ /dev/null @@ -1,919 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/backends/arm/math/type_trans.h" -#include -#include -#include -#include "lite/backends/arm/math/saturate.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -void int32_to_dtype(const int* din, - dtype* dout, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size); - -void fp32_to_int8(const float* din, - int8_t* dout, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - int cnt = inner_size / 16; - int remain = inner_size & 15; - int64_t loop_size = outer_size * axis_size; - -#pragma omp parallel for - for (int j = 0; j < loop_size; ++j) { - float inv_scale = 1.f / scale[j % axis_size]; - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t vscale = vdupq_n_f32(inv_scale); - float32x4_t vpoff = vdupq_n_f32(0.5f); - float32x4_t vnoff = vdupq_n_f32(-0.5f); - const float* din_c = din + j * inner_size; - signed char* dout_c = dout + j * inner_size; - if (cnt > 0) { - int cnt_loop = cnt; - const float* din_ptr = din_c; - signed char* dout_ptr = dout_c; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[in]], #32 \n" - "ldp q2, q3, [%[in]], #32 \n" - "0: \n" /* main loop */ - "fmul v4.4s, v0.4s, %[scale].4s \n" - "fmul v5.4s, v1.4s, %[scale].4s \n" - "fmul v6.4s, v2.4s, %[scale].4s \n" - "fmul v7.4s, v3.4s, %[scale].4s \n" - "ldp q0, q1, [%[in]], #32 \n" - "subs %[cnt], %[cnt], #1 \n" - "FCVTAS v8.4s, v4.4s \n" - "FCVTAS v9.4s, v5.4s \n" - "FCVTAS v10.4s, v6.4s \n" - "FCVTAS v11.4s, v7.4s \n" - "ldp q2, q3, [%[in]], #32 \n" - "sqxtn v4.4h, v8.4s \n" - "sqxtn2 v4.8h, v9.4s \n" - "sqxtn v5.4h, v10.4s \n" - "sqxtn2 v5.8h, v11.4s \n" - "sqxtn v8.8b, v4.8h \n" - "sqxtn2 v8.16b, v5.8h \n" - "str q8, [%[out]], #16 \n" - "bne 0b \n" - : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop) - : [scale] "w"(vscale) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" - "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" - "0: @ main loop\n" - "vand.i32 q4, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" - "vand.i32 q5, q4, q4 @ set offset, 0.5\n" - "vand.i32 q6, q4, q4 @ set offset, 0.5\n" - "vand.i32 q7, q4, q4 @ set offset, 0.5\n" - "vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n" - "vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2\n" - "vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3\n" - "vbif.f32 q4, %q[vnoff], q8 @ get right offset\n" - "vbif.f32 q5, %q[vnoff], q9 @ get right offset\n" - "vbif.f32 q6, %q[vnoff], q10 @ get right offset\n" - "vbif.f32 q7, %q[vnoff], q11 @ get right offset\n" - "vmla.f32 q4, q0, %q[vscale] @ mul scale\n" - "vmla.f32 q5, q1, %q[vscale] @ mul scale\n" - "vmla.f32 q6, q2, %q[vscale] @ mul scale\n" - "vmla.f32 q7, q3, %q[vscale] @ mul scale\n" - "vcvt.s32.f32 q0, q4 @ cvt to int32\n" - "vcvt.s32.f32 q1, q5 @ cvt to int32\n" - "vcvt.s32.f32 q2, q6 @ cvt to int32\n" - "vcvt.s32.f32 q3, q7 @ cvt to int32\n" - "vqmovn.s32 d8, q0 @ cnt to int16\n" - "vqmovn.s32 d9, q1 @ cnt to int16\n" - "vqmovn.s32 d10, q2 @ cnt to int16\n" - "vqmovn.s32 d11, q3 @ cnt to int16\n" - "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" - "vqmovn.s16 d12, q4 @ cnt to int8\n" - "vqmovn.s16 d13, q5 @ cnt to int8\n" - "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" - "vst1.32 {d12-d13}, [%[dout]]! 
@ write to output\n" - "subs %[cnt], #1 @ loop count -1\n" - "bne 0b @ to main loop\n" - - : [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) - : [vscale] "w"(vscale), - [vpoff] "w"(vpoff), - [vnoff] "w"(vnoff), - [vzero] "w"(vzero) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); -#endif - } - const float* din_r = din_c + 16 * cnt; - signed char* dout_r = dout_c + 16 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = saturate_cast(roundf(inv_scale * din_r[i])); - } - } -} - -void fp32_to_int16(const float* din, - int16_t* dout, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - int cnt = inner_size / 8; - int remain = inner_size & 7; - int64_t loop_size = outer_size * axis_size; - -#pragma omp parallel for - for (int j = 0; j < loop_size; ++j) { - float inv_scale = 1.f / scale[j % axis_size]; - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t vscale = vdupq_n_f32(inv_scale); - float32x4_t vpoff = vdupq_n_f32(0.5f); - float32x4_t vnoff = vdupq_n_f32(-0.5f); - const float* din_c = din + j * inner_size; - int16_t* dout_c = dout + j * inner_size; - if (cnt > 0) { - int cnt_loop = cnt; - const float* din_ptr = din_c; - int16_t* dout_ptr = dout_c; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[in]], #32 \n" - "0: \n" /* main loop */ - "fmul v4.4s, v0.4s, %[scale].4s \n" - "fmul v5.4s, v1.4s, %[scale].4s \n" - "ldp q0, q1, [%[in]], #32 \n" - "subs %[cnt], %[cnt], #1 \n" - "FCVTAS v8.4s, v4.4s \n" - "FCVTAS v9.4s, v5.4s \n" - "sqxtn v4.4h, v8.4s \n" - "sqxtn2 v4.8h, v9.4s \n" - "str q4, [%[out]], #16 \n" - "bne 0b \n" - : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop) - : [scale] "w"(vscale) - : "v0", "v1", "v4", "v5", "v8", "v9"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" - "0: @ main loop\n" - "vand.i32 q4, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" - "vand.i32 q5, q4, q4 @ set offset, 0.5\n" - "vand.i32 q6, q4, q4 @ set offset, 0.5\n" - "vand.i32 q7, q4, q4 @ set offset, 0.5\n" - "vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n" - "vbif.f32 q4, %q[vnoff], q8 @ get right offset\n" - "vbif.f32 q5, %q[vnoff], q9 @ get right offset\n" - "vmla.f32 q4, q0, %q[vscale] @ mul scale\n" - "vmla.f32 q5, q1, %q[vscale] @ mul scale\n" - "vcvt.s32.f32 q0, q4 @ cvt to int32\n" - "vcvt.s32.f32 q1, q5 @ cvt to int32\n" - "vqmovn.s32 d8, q0 @ cnt to int16\n" - "vqmovn.s32 d9, q1 @ cnt to int16\n" - "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" - "vst1.32 {d8-d9}, [%[dout]]! 
@ write to output\n" - "subs %[cnt], #1 @ loop count -1\n" - "bne 0b @ to main loop\n" - - : [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) - : [vscale] "w"(vscale), - [vpoff] "w"(vpoff), - [vnoff] "w"(vnoff), - [vzero] "w"(vzero) - : "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9"); -#endif - } - const float* din_r = din_c + 8 * cnt; - int16_t* dout_r = dout_c + 8 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = saturate_cast(roundf(inv_scale * din_r[i])); - } - } -} - -void int8_to_fp32(const int8_t* in, - float* out, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - int cnt = inner_size / 16; - int remain = inner_size & 15; - int64_t loop_size = axis_size * outer_size; -#pragma omp parallel for - for (int64_t n = 0; n < loop_size; ++n) { - float in_scale = scale[n % axis_size]; - const signed char* din_c = in + n * inner_size; - float* dout_c = out + n * inner_size; - float32x4_t vscale = vdupq_n_f32(in_scale); - if (cnt > 0) { - int loop = cnt; - const signed char* din_ptr = din_c; - float* dout_ptr = dout_c; -#ifdef __aarch64__ - asm volatile( - "ldp d0, d1, [%[in]], #16 \n" /* load 16 int8*/ - "0: \n" /* main loop */ - "sshll v2.8h, v0.8b, #0 \n" /* trans to int16*/ - "sshll v3.8h, v1.8b, #0 \n" /* trans to int16*/ - - "sshll v4.4s, v2.4h, #0 \n" /* trans to int32*/ - "sshll2 v5.4s, v2.8h, #0 \n" /* trans to int32*/ - "sshll v6.4s, v3.4h, #0 \n" /* trans to int32*/ - "sshll2 v7.4s, v3.8h, #0 \n" /* trans to int32*/ - - "ldp d0, d1, [%[in]], #16 \n" /* load 16 int8*/ - - "scvtf v8.4s, v4.4s \n" /* trans to fp32*/ - "scvtf v9.4s, v5.4s \n" /* trans to fp32*/ - "scvtf v10.4s, v6.4s \n" /* trans to fp32*/ - "scvtf v11.4s, v7.4s \n" /* trans to fp32*/ - - "subs %[loop], %[loop], #1 \n" - - "fmul v4.4s, v8.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v5.4s, v9.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v6.4s, v10.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v7.4s, v11.4s, %[scale].4s \n" /* mul with scale*/ - - "stp q4, q5, [%[out]], #32 \n" /* write to memory*/ - "stp q6, q7, [%[out]], #32 \n" /* write to memory*/ - - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11"); -#else - asm volatile( - "vld1.32 {d0-d1}, [%[in]]! @ load 16 int8\n" - "0: @ main loop\n" - "vmovl.s8 q2, d0 @ trans to int16\n" - "vmovl.s8 q3, d1 @ trans to int16\n" - "vmovl.s16 q4, d4 @ trans to int32\n" - "vmovl.s16 q5, d5 @ trans to int32\n" - "vmovl.s16 q6, d6 @ trans to int32\n" - "vmovl.s16 q7, d7 @ trans to int32\n" - "vcvt.f32.s32 q0, q4 @ trans to fp32\n" - "vcvt.f32.s32 q1, q5 @ trans to fp32\n" - "vcvt.f32.s32 q2, q6 @ trans to fp32\n" - "vcvt.f32.s32 q3, q7 @ trans to fp32\n" - "vmul.f32 q4, q0, %q[scale] @ mul with scale\n" - "vmul.f32 q5, q1, %q[scale] @ mul with scale\n" - "vmul.f32 q6, q2, %q[scale] @ mul with scale\n" - "vmul.f32 q7, q3, %q[scale] @ mul with scale\n" - - "vld1.32 {d0-d1}, [%[in]]! @ load 16 int8\n" - - "subs %[loop], #1 \n" - - "vst1.f32 {d8-d11}, [%[out]]! @ write to memory\n" - "vst1.f32 {d12-d15}, [%[out]]! 
@ write to memory\n" - - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); -#endif // __aarch64__ - } - const signed char* din_r = din_c + 16 * cnt; - float* dout_r = dout_c + 16 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = in_scale * din_r[i]; - } - } -} - -void int16_to_fp32(const int16_t* in, - float* out, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - int cnt = inner_size / 16; - int remain = inner_size & 15; - int64_t loop_size = axis_size * outer_size; -#pragma omp parallel for - for (int64_t n = 0; n < loop_size; ++n) { - float in_scale = scale[n % axis_size]; - const int16_t* din_c = in + n * inner_size; - float* dout_c = out + n * inner_size; - float32x4_t vscale = vdupq_n_f32(in_scale); - if (cnt > 0) { - int loop = cnt; - const int16_t* din_ptr = din_c; - float* dout_ptr = dout_c; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[in]], #32 \n" /* load 16 int16*/ - "0: \n" /* main loop */ - "sshll v4.4s, v0.4h, #0 \n" /* trans to int32*/ - "sshll2 v5.4s, v0.8h, #0 \n" /* trans to int32*/ - "sshll v6.4s, v1.4h, #0 \n" /* trans to int32*/ - "sshll2 v7.4s, v1.8h, #0 \n" /* trans to int32*/ - - "ldp q0, q1, [%[in]], #32 \n" /* load 16 int16*/ - - "scvtf v8.4s, v4.4s \n" /* trans to fp32*/ - "scvtf v9.4s, v5.4s \n" /* trans to fp32*/ - "scvtf v10.4s, v6.4s \n" /* trans to fp32*/ - "scvtf v11.4s, v7.4s \n" /* trans to fp32*/ - - "subs %[loop], %[loop], #1 \n" - - "fmul v4.4s, v8.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v5.4s, v9.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v6.4s, v10.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v7.4s, v11.4s, %[scale].4s \n" /* mul with scale*/ - - "stp q4, q5, [%[out]], #32 \n" /* write to memory*/ - "stp q6, q7, [%[out]], #32 \n" /* write to memory*/ - - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[in]]! @ load 16 int16\n" - "0: @ main loop\n" - "vmovl.s16 q4, d0 @ trans to int32\n" - "vmovl.s16 q5, d1 @ trans to int32\n" - "vmovl.s16 q6, d2 @ trans to int32\n" - "vmovl.s16 q7, d3 @ trans to int32\n" - "vcvt.f32.s32 q0, q4 @ trans to fp32\n" - "vcvt.f32.s32 q1, q5 @ trans to fp32\n" - "vcvt.f32.s32 q2, q6 @ trans to fp32\n" - "vcvt.f32.s32 q3, q7 @ trans to fp32\n" - "vmul.f32 q4, q0, %q[scale] @ mul with scale\n" - "vmul.f32 q5, q1, %q[scale] @ mul with scale\n" - "vmul.f32 q6, q2, %q[scale] @ mul with scale\n" - "vmul.f32 q7, q3, %q[scale] @ mul with scale\n" - - "vld1.32 {d0-d3}, [%[in]]! @ load 16 int8\n" - - "subs %[loop], #1 \n" - - "vst1.f32 {d8-d11}, [%[out]]! @ write to memory\n" - "vst1.f32 {d12-d15}, [%[out]]! 
@ write to memory\n" - - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); -#endif // __aarch64__ - } - const int16_t* din_r = din_c + 16 * cnt; - float* dout_r = dout_c + 16 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = in_scale * din_r[i]; - } - } -} - -void int32_to_fp32(const int* din, - float* dout, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - int cnt = inner_size / 16; - int remain = inner_size & 15; - int64_t loop_size = axis_size * outer_size; -#pragma omp parallel for - for (int64_t n = 0; n < loop_size; ++n) { - float in_scale = scale[n % axis_size]; - const int* din_c = din + n * inner_size; - float* dout_c = dout + n * inner_size; - float32x4_t vscale = vdupq_n_f32(in_scale); - if (cnt > 0) { - int loop = cnt; - const int* din_ptr = din_c; - float* dout_ptr = dout_c; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[in]], #32 \n" - "ldp q2, q3, [%[in]], #32 \n" - "0: \n" - "scvtf v4.4s, v0.4s \n" - "scvtf v5.4s, v1.4s \n" - "scvtf v6.4s, v2.4s \n" - "scvtf v7.4s, v3.4s \n" - "ldp q0, q1, [%[in]], #32 \n" - "fmul v8.4s, v4.4s, %[scale].4s \n" - "fmul v9.4s, v5.4s, %[scale].4s \n" - "fmul v10.4s, v6.4s, %[scale].4s \n" - "fmul v11.4s, v7.4s, %[scale].4s \n" - "ldp q2, q3, [%[in]], #32 \n" - "stp q8, q9, [%[out]], #32 \n" - "stp q10, q11, [%[out]], #32 \n" - "subs %[loop], %[loop], #1 \n" - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11"); -#else - asm volatile( - "vld1.s32 {d0-d3}, [%[in]]! \n" - "vld1.s32 {d4-d7}, [%[in]]! \n" - "0: \n" - "vcvt.f32.s32 q4, q0 \n" - "vcvt.f32.s32 q5, q1 \n" - "vcvt.f32.s32 q6, q2 \n" - "vcvt.f32.s32 q7, q3 \n" - "vld1.s32 {d0-d3}, [%[in]]! \n" - "vmul.f32 q8, q4, %q[scale] \n" - "vmul.f32 q9, q5, %q[scale] \n" - "vmul.f32 q10, q6, %q[scale] \n" - "vmul.f32 q11, q7, %q[scale] \n" - "vld1.s32 {d4-d7}, [%[in]]! \n" - "subs %[loop], #1 \n" - "vst1.f32 {d16-d19}, [%[out]]! \n" - "vst1.f32 {d20-d23}, [%[out]]! 
\n" - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); -#endif // __aarch64__ - } - const int* din_r = din_c + 16 * cnt; - float* dout_r = dout_c + 16 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = in_scale * din_r[i]; - } - } -} - -void int32_to_int8(const int* din, - int8_t* dout, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - int cnt = inner_size / 16; - int remain = inner_size & 15; - int64_t loop_size = outer_size * axis_size; -#pragma omp parallel for - for (int64_t n = 0; n < loop_size; ++n) { - float in_scale = scale[n % axis_size]; - const int* din_c = din + n * inner_size; - int8_t* dout_c = dout + n * inner_size; - float32x4_t vscale = vdupq_n_f32(in_scale); - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t vpoff = vdupq_n_f32(0.5f); - float32x4_t vnoff = vdupq_n_f32(-0.5f); - if (cnt > 0) { - int loop = cnt; - const int* din_ptr = din_c; - int8_t* dout_ptr = dout_c; -#ifdef __aarch64__ - asm volatile( - "0: \n" - "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" - "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" - - "scvtf v4.4s, v0.4s \n" - "scvtf v5.4s, v1.4s \n" - "scvtf v6.4s, v2.4s \n" - "scvtf v7.4s, v3.4s \n" - - "fmul v0.4s, v4.4s, %[scale].4s \n" - "fmul v1.4s, v5.4s, %[scale].4s \n" - "fmul v2.4s, v6.4s, %[scale].4s \n" - "fmul v3.4s, v7.4s, %[scale].4s \n" - - "fcvtas v4.4s, v0.4s \n" - "fcvtas v5.4s, v1.4s \n" - "fcvtas v6.4s, v2.4s \n" - "fcvtas v7.4s, v3.4s \n" - - "sqxtn v0.4h, v4.4s \n" - "sqxtn2 v0.8h, v5.4s \n" - "sqxtn v1.4h, v6.4s \n" - "sqxtn2 v1.8h, v7.4s \n" - - "sqxtn v2.8b, v0.8h \n" - "sqxtn2 v2.16b, v1.8h \n" - - "st1 {v2.16b}, [%[out]], #16 \n" - "subs %[loop], %[loop], #1 \n" - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" - "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" - "0: @ main loop\n" - "vcvt.f32.s32 q4, q0 @ cvt to float\n" - "vcvt.f32.s32 q5, q1 @ cvt to float\n" - "vcvt.f32.s32 q6, q2 @ cvt to float\n" - "vcvt.f32.s32 q7, q3 @ cvt to float\n" - "vand.i32 q0, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" - "vand.i32 q1, q0, q0 @ set offset, 0.5\n" - "vand.i32 q2, q0, q0 @ set offset, 0.5\n" - "vand.i32 q3, q0, q0 @ set offset, 0.5\n" - "vcgt.f32 q8, q4, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q9, q5, %q[vzero] @ get mask > 0, in1\n" - "vcgt.f32 q10, q6, %q[vzero] @ get mask > 0, in2\n" - "vcgt.f32 q11, q7, %q[vzero] @ get mask > 0, in3\n" - "vbif.f32 q0, %q[vnoff], q8 @ get right offset\n" - "vbif.f32 q1, %q[vnoff], q9 @ get right offset\n" - "vbif.f32 q2, %q[vnoff], q10 @ get right offset\n" - "vbif.f32 q3, %q[vnoff], q11 @ get right offset\n" - "vmla.f32 q0, q4, %q[vscale] @ mul scale\n" - "vmla.f32 q1, q5, %q[vscale] @ mul scale\n" - "vmla.f32 q2, q6, %q[vscale] @ mul scale\n" - "vmla.f32 q3, q7, %q[vscale] @ mul scale\n" - "vcvt.s32.f32 q4, q0 @ cvt to int32\n" - "vcvt.s32.f32 q5, q1 @ cvt to int32\n" - "vcvt.s32.f32 q6, q2 @ cvt to int32\n" - "vcvt.s32.f32 q7, q3 @ cvt to int32\n" - "vqmovn.s32 d16, q4 @ cnt to int16\n" - "vqmovn.s32 d17, q5 @ cnt to int16\n" - "vqmovn.s32 d18, q6 @ cnt to int16\n" - "vqmovn.s32 d19, q7 @ cnt to int16\n" - "vld1.32 {d0-d3}, [%[din]]! 
@ load in0~in7\n" - "vqmovn.s16 d8, q8 @ cnt to int8\n" - "vqmovn.s16 d9, q9 @ cnt to int8\n" - "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" - "vst1.32 {d8-d9}, [%[dout]]! @ write to output\n" - "subs %[loop], #1 @ loop count -1\n" - "bne 0b @ to main loop\n" - : [loop] "+r"(loop), [din] "+r"(din_ptr), [dout] "+r"(dout_ptr) - : [vscale] "w"(vscale), - [vzero] "w"(vzero), - [vnoff] "w"(vnoff), - [vpoff] "w"(vpoff) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); -#endif // __aarch64__ - } - const int* din_r = din_c + 16 * cnt; - int8_t* dout_r = dout_c + 16 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = saturate_cast(roundf(in_scale * din_r[i])); - } - } -} - -/******************************************/ -/******** kernel implement *********/ -/******************************************/ -float compute_max_kernel(const float* din, int64_t size) { - float max_value = 0.f; - int cnt = size / 16; - int remain = size & 15; - float32x4_t vmax_val = vdupq_n_f32(0.f); - const float* ptr_in = din; - if (cnt > 0) { - int loop_cnt = cnt; -#ifdef __aarch64__ - asm volatile( - "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" - "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" - "0: \n" - "fabs v4.4s, v0.4s \n" - "fabs v5.4s, v1.4s \n" - "fabs v6.4s, v2.4s \n" - "fabs v7.4s, v3.4s \n" - "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" - "fmax v2.4s, v4.4s, v5.4s \n" - "fmax v3.4s, v6.4s, v7.4s \n" - "fmax v4.4s, v2.4s, v3.4s \n" - "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" - "fmax %[max_val].4s, v4.4s, %[max_val].4s \n" - "subs %[cnt], %[cnt], #1 \n" - "bne 0b \n" - : [in] "+r"(ptr_in), [cnt] "+r"(loop_cnt), [max_val] "+w"(vmax_val) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[in]]! @ load 8 float\n" - "vld1.32 {d4-d7}, [%[in]]! @ load 8 float\n" - "0: @ main loop\n" - "vabs.f32 q4, q0 @ abs \n" - "vabs.f32 q5, q1 @ abs \n" - "vabs.f32 q6, q2 @ abs \n" - "vabs.f32 q7, q3 @ abs \n" - "vld1.32 {d0-d3}, [%[in]]! @ load 8 float\n" - "vmax.f32 q2, q4, q5 @ max \n" - "vmax.f32 q3, q6, q7 @ max \n" - "vmax.f32 q4, q2, q3 @ max \n" - "vld1.32 {d4-d7}, [%[in]]! @ load 8 float\n" - "vmax.f32 %q[max_val], q4, %q[max_val] @ max \n" - "subs %[cnt], #1 @ loop count -1\n" - "bne 0b @ jump to main loop\n" - - : [in] "+r"(ptr_in), [cnt] "+r"(loop_cnt), [max_val] "+w"(vmax_val) - : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); -#endif - float32x2_t vmax_p = - vpmax_f32(vget_high_f32(vmax_val), vget_low_f32(vmax_val)); - float max0 = vget_lane_f32(vmax_p, 0); - float max1 = vget_lane_f32(vmax_p, 1); - float max2 = max0 > max1 ? max0 : max1; - max_value = max_value > max2 ? 
max_value : max2; - } - ptr_in = din + 16 * cnt; - for (int i = 0; i < remain; ++i) { - float data = fabsf(*(ptr_in++)); - max_value = fmaxf(max_value, data); - } - return max_value; -} - -std::vector get_tensor_scale_n(const float* in_data, - int axis_size, - int64_t inner_size, - float scale_factor) { - std::vector scale_out(axis_size); -#pragma omp parallel for - for (int c = 0; c < axis_size; ++c) { // num - const float* ptr_in = in_data + c * inner_size; // channel*width*height - scale_out[c] = compute_max_kernel(ptr_in, inner_size) / scale_factor; - } - return scale_out; -} - -std::vector get_tensor_scale_chw(const float* in_data, - int axis_size, - int64_t outer_size, - int64_t inner_size, - float scale_factor) { - std::vector scale_out(axis_size); - int64_t inner_size_with_axis = axis_size * inner_size; -#pragma omp parallel for - for (int c = 0; c < axis_size; ++c) { - const float* din = in_data + c * inner_size; - float max_val = 0.f; - for (int j = 0; j < outer_size; ++j) { - const float* ptr_in = din + j * inner_size_with_axis; - max_val = fmaxf(compute_max_kernel(ptr_in, inner_size), max_val); - } - scale_out[c] = max_val / scale_factor; - } - return scale_out; -} - -void int32_to_int32(const int* din, - int* dout, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - int size_all = outer_size * axis_size * inner_size; - memmove(dout, din, size_all * sizeof(int)); -} - -template <> -void int32_to_dtype(const int* din, - float* dout, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - return int32_to_fp32(din, dout, scale, axis_size, outer_size, inner_size); -} - -template <> -void int32_to_dtype(const int* din, - signed char* dout, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - return int32_to_int8(din, dout, scale, axis_size, outer_size, inner_size); -} - -template <> -void int32_to_dtype(const int* din, - int* dout, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - return int32_to_int32(din, dout, scale, axis_size, outer_size, inner_size); -} - -bool trans_tensor_int32_to_int8(Tensor* tin, - Tensor* tout, - float input_scale, - float output_scale, - std::vector weights_scale, - int axis) { - tout->Resize(tin->dims()); - - // compute scale - std::vector scale(weights_scale.size()); - for (int i = 0; i < weights_scale.size(); ++i) { - scale[i] = input_scale * weights_scale[i] / output_scale; - } - - auto i_dims = tin->dims(); - int outer_size = i_dims.count(0, axis); - int axis_size = i_dims[axis]; - int inner_size = i_dims.count(axis + 1, i_dims.size()); - - const int* i_data = tin->data(); - int8_t* o_data = tout->mutable_data(); - int32_to_int8( - i_data, o_data, scale.data(), axis_size, outer_size, inner_size); - - return true; -} - -template <> -bool get_tensor_scale(const Tensor& tin, - std::vector* scale_out, - int axis, - float scale_factor) { - int axis_size = 1; - if (axis >= 0 && axis < tin.dims().size()) { - axis_size = tin.dims()[axis]; - } - int outer_size = 1; - if (axis >= 0) { - outer_size = tin.dims().count(0, axis); - } - int64_t inner_size = tin.dims().count(axis + 1, tin.dims().size()); - - const float* in_data = static_cast(tin.data()); - if (axis <= 0) { - *scale_out = - get_tensor_scale_n(in_data, axis_size, inner_size, scale_factor); - } else { - *scale_out = get_tensor_scale_chw( - in_data, axis_size, outer_size, inner_size, scale_factor); - } - return true; -} - -bool trans_tensor_int32_to_fp32(Tensor* tin, 
-                                Tensor* tout,
-                                float input_scale,
-                                std::vector<float> weights_scale,
-                                int axis) {
-  tout->Resize(tin->dims());
-
-  // compute scale
-  std::vector<float> scale(weights_scale.size());
-  for (int i = 0; i < weights_scale.size(); ++i) {
-    scale[i] = input_scale * weights_scale[i];
-  }
-
-  auto i_dims = tin->dims();
-  int outer_size = i_dims.count(0, axis);
-  int axis_size = i_dims[axis];
-  int inner_size = i_dims.count(axis + 1, i_dims.size());
-
-  const auto* i_data = tin->data<int>();
-  float* o_data = tout->mutable_data<float>();
-  //! convert to fp32
-  int32_to_fp32(
-      i_data, o_data, scale.data(), axis_size, outer_size, inner_size);
-  return true;
-}
-
-bool trans_tensor_fp32_to_int8(Tensor* tin, Tensor* tout, float input_scale) {
-  tout->Resize(tin->dims());
-
-  // compute scale
-  std::vector<float> scale({input_scale});
-  int inner_size = tin->dims().production();
-
-  const auto* i_data = tin->data<float>();
-  int8_t* o_data = tout->mutable_data<int8_t>();
-  fp32_to_int8(i_data, o_data, scale.data(), 1, 1, inner_size);
-  return true;
-}
-
-bool trans_fp32_bias_to_int32_basic(Tensor* tin,
-                                    Tensor* tout,
-                                    float in_scale,
-                                    std::vector<float> vector_weight_scale) {
-  tout->Resize(tin->dims());
-
-  const float* i_data = tin->data<float>();
-  int* o_data = tout->mutable_data<int>();
-  for (int i = 0; i < tin->dims().production(); ++i) {
-    o_data[i] =
-        static_cast<int>(roundf(i_data[i] / in_scale / vector_weight_scale[i]));
-  }
-  return true;
-}
-
-template <>
-bool trans_tensor_dtype<PRECISION(kInt32), PRECISION(kInt8)>(
-    Tensor* tin,
-    Tensor* tout,
-    float input_scale,
-    float output_scale,
-    std::vector<float> weights_scale) {
-  return trans_tensor_int32_to_int8(
-      tin, tout, input_scale, output_scale, weights_scale, 1);
-}
-
-template <>
-bool trans_tensor_dtype<PRECISION(kInt32), PRECISION(kFloat)>(
-    Tensor* tin,
-    Tensor* tout,
-    float input_scale,
-    float output_scale,
-    std::vector<float> weights_scale) {
-  return trans_tensor_int32_to_fp32(tin, tout, input_scale, weights_scale, 1);
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
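All of the trans_tensor_* helpers above collapse to one effective per-channel factor: an int32 accumulator represents (input_scale * weight_scale[c]) units, so dequantizing to fp32 multiplies by that product, and requantizing to int8 additionally divides by output_scale. A small worked example with made-up scales:

#include <cstdio>
#include <vector>

int main() {
  float input_scale = 0.05f, output_scale = 0.1f;    // illustrative values
  std::vector<float> weight_scale = {0.02f, 0.04f};  // per output channel
  for (size_t c = 0; c < weight_scale.size(); ++c) {
    float to_fp32 = input_scale * weight_scale[c];   // int32 -> fp32 factor
    float to_int8 = to_fp32 / output_scale;          // int32 -> int8 factor
    std::printf("channel %zu: fp32 scale %.4f, int8 rescale %.4f\n", c,
                to_fp32, to_int8);
  }
}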
diff --git a/lite/backends/arm/math/type_trans.h b/lite/backends/arm/math/type_trans.h
deleted file mode 100644
index e07d798b10..0000000000
--- a/lite/backends/arm/math/type_trans.h
+++ /dev/null
@@ -1,117 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "lite/core/target_wrapper.h"
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <PrecisionType PTypeIn, PrecisionType PTypeOut>
-bool trans_tensor_dtype(Tensor* tin,
-                        Tensor* tout,
-                        float input_scale,
-                        float output_scale,
-                        std::vector<float> weights_scale) {
-  LOG(FATAL) << "trans_tensor_dtype has no impl";
-  return false;
-}
-
-template <>
-bool trans_tensor_dtype<PRECISION(kInt32), PRECISION(kInt8)>(
-    Tensor* tin,
-    Tensor* tout,
-    float input_scale,
-    float output_scale,
-    std::vector<float> weights_scale);
-
-template <>
-bool trans_tensor_dtype<PRECISION(kInt32), PRECISION(kFloat)>(
-    Tensor* tin,
-    Tensor* tout,
-    float input_scale,
-    float output_scale,
-    std::vector<float> weights_scale);
-
-template <PrecisionType Ptype>
-bool get_tensor_scale(const Tensor& tin,
-                      std::vector<float>* scale_out,
-                      int axis,
-                      float scale_factor) {
-  return false;
-}
-
-std::vector<float> get_tensor_scale_n(const float* in_data,
-                                      int axis_size,
-                                      int64_t inner_size,
-                                      float scale_factor);
-
-bool trans_fp32_bias_to_int32_basic(Tensor* tin,
-                                    Tensor* tout,
-                                    float in_scale,
-                                    std::vector<float> vector_weight_scale);
-
-bool trans_tensor_int32_to_int8(Tensor* tin,
-                                Tensor* tout,
-                                float input_scale,
-                                float output_scale,
-                                std::vector<float> weights_scale,
-                                int axis = 1);
-
-bool trans_tensor_int32_to_fp32(Tensor* tin,
-                                Tensor* tout,
-                                float input_scale,
-                                std::vector<float> weights_scale,
-                                int axis = 1);
-
-bool trans_tensor_fp32_to_int8(Tensor* tin, Tensor* tout, float input_scale);
-
-template <>
-bool get_tensor_scale<PRECISION(kFloat)>(const Tensor& tin,
-                                         std::vector<float>* scale_out,
-                                         int axis,
-                                         float scale_factor);
-
-template <typename dtype>
-void int32_to_dtype(const int* din,
-                    dtype* dout,
-                    const float* scale,
-                    int axis_size,
-                    int64_t outer_size,
-                    int64_t inner_size);
-
-void fp32_to_int8(const float* din,
-                  int8_t* dout,
-                  const float* scale,
-                  int axis_size,
-                  int64_t outer_size,
-                  int64_t inner_size);
-
-void int8_to_fp32(const int8_t* in,
-                  float* out,
-                  const float* scale,
-                  int axis_size,
-                  int64_t outer_size,
-                  int64_t inner_size);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/yolo_box.cc b/lite/backends/arm/math/yolo_box.cc
deleted file mode 100644
index 72e67cf693..0000000000
--- a/lite/backends/arm/math/yolo_box.cc
+++ /dev/null
@@ -1,168 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
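The file below decodes YOLO detection heads on the CPU. The box math is compact enough to check by hand; this standalone sketch mirrors the get_yolo_box formula with made-up inputs (every constant here is illustrative):

#include <cmath>
#include <cstdio>

int main() {
  float tx = 0.2f, ty = -0.1f, tw = 0.3f, th = 0.5f;  // raw network outputs
  int i = 3, j = 5;                 // grid cell column / row
  int grid = 13;                    // feature-map width/height
  int input = 416;                  // network input size
  float anchor_w = 116, anchor_h = 90;
  int img_w = 1280, img_h = 720;    // original image size
  auto sigmoid = [](float x) { return 1.f / (1.f + std::exp(-x)); };
  // Cell offset + sigmoid for the center; anchor * exp for width/height;
  // everything rescaled from the grid to the original image.
  float cx = (i + sigmoid(tx)) * img_w / grid;
  float cy = (j + sigmoid(ty)) * img_h / grid;
  float w = std::exp(tw) * anchor_w * img_w / input;
  float h = std::exp(th) * anchor_h * img_h / input;
  std::printf("center (%.1f, %.1f), size %.1f x %.1f\n", cx, cy, w, h);
}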
- -#include "lite/backends/arm/math/yolo_box.h" -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -namespace { -inline float sigmoid(float x) { return 1.f / (1.f + expf(-x)); } - -inline void get_yolo_box(float* box, - const float* x, - const int* anchors, - int i, - int j, - int an_idx, - int grid_size, - int input_size, - int index, - int stride, - int img_height, - int img_width) { - box[0] = (i + sigmoid(x[index])) * img_width / grid_size; - box[1] = (j + sigmoid(x[index + stride])) * img_height / grid_size; - box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / - input_size; - box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * - img_height / input_size; -} - -inline int get_entry_index(int batch, - int an_idx, - int hw_idx, - int an_num, - int an_stride, - int stride, - int entry) { - return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; -} - -inline void calc_detection_box(float* boxes, - float* box, - const int box_idx, - const int img_height, - const int img_width) { - boxes[box_idx] = box[0] - box[2] / 2; - boxes[box_idx + 1] = box[1] - box[3] / 2; - boxes[box_idx + 2] = box[0] + box[2] / 2; - boxes[box_idx + 3] = box[1] + box[3] / 2; - - boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast(0); - boxes[box_idx + 1] = - boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast(0); - boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 - ? boxes[box_idx + 2] - : static_cast(img_width - 1); - boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 - ? boxes[box_idx + 3] - : static_cast(img_height - 1); -} - -inline void calc_label_score(float* scores, - const float* input, - const int label_idx, - const int score_idx, - const int class_num, - const float conf, - const int stride) { - for (int i = 0; i < class_num; i++) { - scores[score_idx + i] = conf * sigmoid(input[label_idx + i * stride]); - } -} -} // namespace - -void yolobox(lite::Tensor* X, - lite::Tensor* ImgSize, - lite::Tensor* Boxes, - lite::Tensor* Scores, - std::vector anchors, - int class_num, - float conf_thresh, - int downsample_ratio) { - const int n = X->dims()[0]; - const int h = X->dims()[2]; - const int w = X->dims()[3]; - const int b_num = Boxes->dims()[1]; - const int an_num = anchors.size() / 2; - int X_size = downsample_ratio * h; - - const int stride = h * w; - const int an_stride = (class_num + 5) * stride; - - auto anchors_data = anchors.data(); - - const float* X_data = X->data(); - float* ImgSize_data = ImgSize->mutable_data(); - - float* Boxes_data = Boxes->mutable_data(); - - float* Scores_data = Scores->mutable_data(); - - float box[4]; - for (int i = 0; i < n; i++) { - int img_height = static_cast(ImgSize_data[2 * i]); - int img_width = static_cast(ImgSize_data[2 * i + 1]); - - for (int j = 0; j < an_num; j++) { - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - int obj_idx = - get_entry_index(i, j, k * w + l, an_num, an_stride, stride, 4); - float conf = sigmoid(X_data[obj_idx]); - if (conf < conf_thresh) { - continue; - } - - int box_idx = - get_entry_index(i, j, k * w + l, an_num, an_stride, stride, 0); - get_yolo_box(box, - X_data, - anchors_data, - l, - k, - j, - h, - X_size, - box_idx, - stride, - img_height, - img_width); - box_idx = (i * b_num + j * stride + k * w + l) * 4; - calc_detection_box(Boxes_data, box, box_idx, img_height, img_width); - - int label_idx = - get_entry_index(i, j, k * w + l, an_num, an_stride, stride, 5); - 
int score_idx = (i * b_num + j * stride + k * w + l) * class_num; - calc_label_score(Scores_data, - X_data, - label_idx, - score_idx, - class_num, - conf, - stride); - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/yolo_box.h b/lite/backends/arm/math/yolo_box.h deleted file mode 100644 index e454308700..0000000000 --- a/lite/backends/arm/math/yolo_box.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void yolobox(lite::Tensor* X, - lite::Tensor* ImgSize, - lite::Tensor* Boxes, - lite::Tensor* Scores, - std::vector anchors, - int class_num, - float conf_thresh, - int downsample_ratio); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/CMakeLists.txt b/lite/backends/cuda/CMakeLists.txt deleted file mode 100644 index c0418f6b6a..0000000000 --- a/lite/backends/cuda/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -if(NOT LITE_WITH_CUDA) - return() -endif() - -nv_library(target_wrapper_cuda SRCS target_wrapper.cc) -nv_library(cuda_blas SRCS blas.cc) - -add_subdirectory(math) diff --git a/lite/backends/cuda/blas.cc b/lite/backends/cuda/blas.cc deleted file mode 100644 index c9d2d46cfe..0000000000 --- a/lite/backends/cuda/blas.cc +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/cuda/blas.h" - -namespace paddle { -namespace lite { -namespace cuda { - -template <> -class Blas : public BlasBase { - using T = float; - - void sgemm(cublasOperation_t transa, - cublasOperation_t transb, // - int m, - int n, - int k, // - const T* alpha, // - const T* A, - int lda, // - const T* B, - int ldb, // - const T* beta, // - T* C, - int ldc) const { - CUBLAS_CALL(cublasSgemm(handle(), - transa, - transb, - m, - n, - k, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc)); - } -}; - -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/blas.h b/lite/backends/cuda/blas.h deleted file mode 100644 index f73bb576b8..0000000000 --- a/lite/backends/cuda/blas.h +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include "lite/backends/cuda/cuda_utils.h" -#include "lite/utils/all.h" - -namespace paddle { -namespace lite { -namespace cuda { - -#define CUBLAS_CHECK(xxx) CHECK_EQ((xxx), CUBLAS_STATUS_SUCCESS); - -/* - * Some basic methods. - */ -struct BlasBase { - /* - BlasBase() { CUBLAS_CHECK(cublasCreate(&handle_)); } - ~BlasBase() { CUBLAS_CHECK(cublasDestroy(handle_)); } - */ - - void SetStream(cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle_, stream)); - } - - cudaStream_t GetStream() const { - cudaStream_t stream; - CUBLAS_CHECK(cublasGetStream_v2(handle_, &stream)); - return stream; - } - - int GetVersion() const { - int version{}; - CUBLAS_CHECK(cublasGetVersion_v2(handle_, &version)); - return version; - } - - cublasHandle_t& handle() const { return handle_; } - - protected: - // Not thread-safe, should created for each thread. - // According to cublas doc. - mutable cublasHandle_t handle_; -}; - -// T: Scalar type. -template -class Blas : public lite::cuda::BlasBase { - public: - void sgemm(cublasOperation_t transa, - cublasOperation_t transb, // - int m, - int n, - int k, // - const T* alpha, // - const T* A, - int lda, // - const T* B, - int ldb, // - const T* beta, // - T* C, - int ldc) const { - CHECK_EQ(CUBLAS_STATUS_SUCCESS, - cublasSgemm(handle_, // - CUBLAS_OP_N, - CUBLAS_OP_N, // - m, - n, - k, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc)); - } -}; - -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/cuda_utils.h b/lite/backends/cuda/cuda_utils.h deleted file mode 100644 index 13bf8190ef..0000000000 --- a/lite/backends/cuda/cuda_utils.h +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include -#include "lite/utils/cp_logging.h" - -/* - * This file contains some CUDA specific utils. - */ - -// For quickly implementing the prototype, some of the following code snippets -// are borrowed from project MXNet, great thanks for the original developers. 
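The macros that follow wrap every CUDA/cuBLAS/cuDNN call so a failure surfaces immediately with a readable status string. Here is a minimal self-contained version of the same pattern; DEMO_CUDA_CALL is a stand-in name, and the real macros below report through CHECK/CHECK_EQ rather than fprintf:

#include <cstdio>
#include <cuda_runtime.h>

#define DEMO_CUDA_CALL(expr)                                       \
  do {                                                             \
    cudaError_t e = (expr);                                        \
    if (e != cudaSuccess) {                                        \
      std::fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                   cudaGetErrorString(e), __FILE__, __LINE__);     \
    }                                                              \
  } while (0)

int main() {
  void* p = nullptr;
  DEMO_CUDA_CALL(cudaMalloc(&p, 1 << 20));  // any runtime call can be wrapped
  DEMO_CUDA_CALL(cudaFree(p));
}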
- -#define CHECK_CUDA_ERROR(msg) \ - { \ - auto e = cudaGetLastError(); \ - CHECK_EQ(e, cudaSuccess) << (msg) << " CUDA: " << cudaGetErrorString(e); \ - } - -#define CUDA_CALL(func) \ - { \ - auto e = (func); \ - CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ - << "CUDA: " << cudaGetErrorString(e); \ - } - -#define CUBLAS_CALL(func) \ - { \ - auto e = (func); \ - CHECK_EQ(e, CUBLAS_STATUS_SUCCESS) \ - << "cuBlas: " << paddle::lite::cuda::CublasErrorInfo(e); \ - } - -#define CUDNN_VERSION_MIN(major, minor, patch) \ - (CUDNN_VERSION >= (major * 1000 + minor * 100 + patch)) - -#define CUDNN_CHECK(condition) \ - { \ - cudnnStatus_t status = condition; \ - CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << CudnnGetErrorInfo(status); \ - } - -namespace paddle { -namespace lite { -namespace cuda { - -static const char* CublasErrorInfo(int error) { - switch (error) { -#define LITE_CUBLAS_ERROR_INFO(xx) \ - case xx: \ - return #xx; \ - break; - LITE_CUBLAS_ERROR_INFO(CUBLAS_STATUS_NOT_INITIALIZED); - LITE_CUBLAS_ERROR_INFO(CUBLAS_STATUS_ALLOC_FAILED); - LITE_CUBLAS_ERROR_INFO(CUBLAS_STATUS_INVALID_VALUE); - LITE_CUBLAS_ERROR_INFO(CUBLAS_STATUS_ARCH_MISMATCH); - LITE_CUBLAS_ERROR_INFO(CUBLAS_STATUS_MAPPING_ERROR); - LITE_CUBLAS_ERROR_INFO(CUBLAS_STATUS_EXECUTION_FAILED); - LITE_CUBLAS_ERROR_INFO(CUBLAS_STATUS_INTERNAL_ERROR); - LITE_CUBLAS_ERROR_INFO(CUBLAS_STATUS_NOT_SUPPORTED); - LITE_CUBLAS_ERROR_INFO(CUBLAS_STATUS_LICENSE_ERROR); -#undef LITE_CUBLAS_ERROR_INFO - default: - return "unknown error"; - } -} - -static const char* CudnnGetErrorInfo(cudnnStatus_t status) { - switch (status) { - case CUDNN_STATUS_SUCCESS: - return "CUDNN_STATUS_SUCCESS"; - case CUDNN_STATUS_NOT_INITIALIZED: - return "CUDNN_STATUS_NOT_INITIALIZED"; - case CUDNN_STATUS_ALLOC_FAILED: - return "CUDNN_STATUS_ALLOC_FAILED"; - case CUDNN_STATUS_BAD_PARAM: - return "CUDNN_STATUS_BAD_PARAM"; - case CUDNN_STATUS_INTERNAL_ERROR: - return "CUDNN_STATUS_INTERNAL_ERROR"; - case CUDNN_STATUS_INVALID_VALUE: - return "CUDNN_STATUS_INVALID_VALUE"; - case CUDNN_STATUS_ARCH_MISMATCH: - return "CUDNN_STATUS_ARCH_MISMATCH"; - case CUDNN_STATUS_MAPPING_ERROR: - return "CUDNN_STATUS_MAPPING_ERROR"; - case CUDNN_STATUS_EXECUTION_FAILED: - return "CUDNN_STATUS_EXECUTION_FAILED"; - case CUDNN_STATUS_NOT_SUPPORTED: - return "CUDNN_STATUS_NOT_SUPPORTED"; - case CUDNN_STATUS_LICENSE_ERROR: - return "CUDNN_STATUS_LICENSE_ERROR"; -#if CUDNN_VERSION_MIN(6, 0, 0) - case CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING: - return "CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING"; -#endif -#if CUDNN_VERSION_MIN(7, 0, 0) - case CUDNN_STATUS_RUNTIME_IN_PROGRESS: - return "CUDNN_STATUS_RUNTIME_IN_PROGRESS"; - case CUDNN_STATUS_RUNTIME_FP_OVERFLOW: - return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW"; -#endif - } - return "Unknown cudnn status"; -} - -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/math/CMakeLists.txt b/lite/backends/cuda/math/CMakeLists.txt deleted file mode 100644 index c49713fbfe..0000000000 --- a/lite/backends/cuda/math/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -if(NOT LITE_WITH_CUDA) - return() -endif() - -nv_library(cuda_activation SRCS activation.cu) -nv_library(cuda_scale SRCS scale.cu) -nv_library(cuda_type_trans SRCS type_trans.cu) -nv_library(cuda_transpose SRCS transpose.cu) -nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale -cuda_type_trans) - -set ( - math_cuda - cudnn_conv - cuda_activation - cuda_scale - cuda_type_trans - cuda_transpose -) - -set(math_cuda "${math_cuda}" 
CACHE GLOBAL "math cuda") diff --git a/lite/backends/cuda/math/activation.cu b/lite/backends/cuda/math/activation.cu deleted file mode 100644 index 0f50df8e60..0000000000 --- a/lite/backends/cuda/math/activation.cu +++ /dev/null @@ -1,285 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "lite/backends/cuda/math/activation.h" -#include "lite/backends/cuda/math/utils.h" - -namespace paddle { -namespace lite { -namespace cuda { -namespace math { - -template -__global__ void relu_kernel(const int num, - const T alpha, - const T* input, - T* output) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index < num) { -#if __CUDA_ARCH__ >= 350 - output[index] = __ldg(input + index) >= 0 ? __ldg(input + index) - : __ldg(input + index) * alpha; -#else - output[index] = input[index] >= 0 ? input[index] : input[index] * alpha; -#endif - } -} - -__global__ void bias_relu_int8_nhwc4_kernel(int num, - const float4* in, - const float4* bias, - float4* out, - int N, - int K, - int H, - int W, - const float4* scale, - float alpha) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < num) { - int bias_idx = tid % K; - const float4 bias_ptr = bias[bias_idx]; - const float4 scale_ptr = scale[bias_idx]; - const float4 in_ptr = in[tid]; - - float4 packed_val; - packed_val.x = in_ptr.x * scale_ptr.x + bias_ptr.x; - packed_val.x = fmaxf(packed_val.x * alpha, packed_val.x); - packed_val.y = in_ptr.y * scale_ptr.y + bias_ptr.y; - packed_val.y = fmaxf(packed_val.y * alpha, packed_val.y); - packed_val.z = in_ptr.z * scale_ptr.z + bias_ptr.z; - packed_val.z = fmaxf(packed_val.z * alpha, packed_val.z); - packed_val.w = in_ptr.w * scale_ptr.w + bias_ptr.w; - packed_val.w = fmaxf(packed_val.w * alpha, packed_val.w); - out[tid] = packed_val; - } -} - -__global__ void bias_relu_int8_nhwc4_kernel(int num, - const float4* in, - const float4* bias, - char4* out, - int N, - int K, - int H, - int W, - const float4* scale, - float alpha) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < num) { - int bias_idx = tid % K; - const float4 bias_ptr = bias[bias_idx]; - const float4 scale_ptr = scale[bias_idx]; - const float4 in_ptr = in[tid]; - - float4 packed_val; - char4 result_val; - packed_val.x = in_ptr.x * scale_ptr.x + bias_ptr.x; - result_val.x = - from_float(fmaxf(packed_val.x * alpha, packed_val.x)); - packed_val.y = in_ptr.y * scale_ptr.y + bias_ptr.y; - result_val.y = - from_float(fmaxf(packed_val.y * alpha, packed_val.y)); - packed_val.z = in_ptr.z * scale_ptr.z + bias_ptr.z; - result_val.z = - from_float(fmaxf(packed_val.z * alpha, packed_val.z)); - packed_val.w = in_ptr.w * scale_ptr.w + bias_ptr.w; - result_val.w = - from_float(fmaxf(packed_val.w * alpha, packed_val.w)); - - out[tid] = result_val; - } -} - -__global__ void relu_int8_nhwc4_kernel(int num, - const float4* in, - float4* out, - int N, - int K, - int H, - int W, - const float4* scale, - float alpha) { - int tid = 
blockIdx.x * blockDim.x + threadIdx.x; - if (tid < num) { - int scale_idx = tid % K; - const float4 scale_ptr = scale[scale_idx]; - const float4 in_ptr = in[tid]; - - float4 packed_val; - packed_val.x = in_ptr.x * scale_ptr.x; - packed_val.x = fmaxf(packed_val.x * alpha, packed_val.x); - packed_val.y = in_ptr.y * scale_ptr.y; - packed_val.y = fmaxf(packed_val.y * alpha, packed_val.y); - packed_val.z = in_ptr.z * scale_ptr.z; - packed_val.z = fmaxf(packed_val.z * alpha, packed_val.z); - packed_val.w = in_ptr.w * scale_ptr.w; - packed_val.w = fmaxf(packed_val.w * alpha, packed_val.w); - out[tid] = packed_val; - } -} - -__global__ void relu_int8_nhwc4_kernel(int num, - const float4* in, - char4* out, - int N, - int K, - int H, - int W, - const float4* scale, - float alpha) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < num) { - int scale_idx = tid % K; - const float4 scale_ptr = scale[scale_idx]; - const float4 in_ptr = in[tid]; - - float4 packed_val; - char4 result_val; - packed_val.x = in_ptr.x * scale_ptr.x; - result_val.x = - from_float(fmaxf(packed_val.x * alpha, packed_val.x)); - packed_val.y = in_ptr.y * scale_ptr.y; - result_val.y = - from_float(fmaxf(packed_val.y * alpha, packed_val.y)); - packed_val.z = in_ptr.z * scale_ptr.z; - result_val.z = - from_float(fmaxf(packed_val.z * alpha, packed_val.z)); - packed_val.w = in_ptr.w * scale_ptr.w; - result_val.w = - from_float(fmaxf(packed_val.w * alpha, packed_val.w)); - - out[tid] = result_val; - } -} - -template <> -void bias_relu_int8_nhwc4(int num, - const void* in, - const void* bias, - void* out, - int N, - int K, - int H, - int W, - const void* scale, - float alpha, - cudaStream_t stream) { - int thread = 256; - int block = (num + thread - 1) / thread; - bias_relu_int8_nhwc4_kernel<<>>( - num, - static_cast(in), - static_cast(bias), - static_cast(out), - N, - K, - H, - W, - static_cast(scale), - alpha); -} - -template <> -void bias_relu_int8_nhwc4(int num, - const void* in, - const void* bias, - void* out, - int N, - int K, - int H, - int W, - const void* scale, - float alpha, - cudaStream_t stream) { - int thread = 256; - int block = (num + thread - 1) / thread; - bias_relu_int8_nhwc4_kernel<<>>( - num, - static_cast(in), - static_cast(bias), - static_cast(out), - N, - K, - H, - W, - static_cast(scale), - alpha); -} - -template <> -void relu_int8_nhwc4(int num, - const void* in, - void* out, - int N, - int K, - int H, - int W, - const void* scale, - float alpha, - cudaStream_t stream) { - int thread = 256; - int block = (num + thread - 1) / thread; - relu_int8_nhwc4_kernel<<>>( - num, - static_cast(in), - static_cast(out), - N, - K, - H, - W, - static_cast(scale), - alpha); -} - -template <> -void relu_int8_nhwc4(int num, - const void* in, - void* out, - int N, - int K, - int H, - int W, - const void* scale, - float alpha, - cudaStream_t stream) { - int thread = 256; - int block = (num + thread - 1) / thread; - relu_int8_nhwc4_kernel<<>>( - num, - static_cast(in), - static_cast(out), - N, - K, - H, - W, - static_cast(scale), - alpha); -} - -template -void relu(int num, const T* din, T* dout, float alpha, cudaStream_t stream) { - int thread = 256; - int block = (num + thread - 1) / thread; - relu_kernel<<>>(num, alpha, din, dout); - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) std::cout << cudaGetErrorString(error); -} -template void relu(int, const float*, float*, float, cudaStream_t); - -} // namespace math -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git 
a/lite/backends/cuda/math/activation.h b/lite/backends/cuda/math/activation.h deleted file mode 100644 index 7bcb1efdba..0000000000 --- a/lite/backends/cuda/math/activation.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include - -namespace paddle { -namespace lite { -namespace cuda { -namespace math { - -// fp32 -template -void relu(int num, const T* din, T* dout, float alpha, cudaStream_t stream); - -// For int8 -template -void bias_relu_int8_nhwc4(int num, - const void* in, - const void* bias, - void* out, - int N, - int K, - int H, - int W, - const void* scale, - float alpha, - cudaStream_t stream); - -template -void relu_int8_nhwc4(int num, - const void* in, - void* out, - int N, - int K, - int H, - int W, - const void* scale, - float alpha, - cudaStream_t stream); - -} // namespace math -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/math/cudnn_conv.cc b/lite/backends/cuda/math/cudnn_conv.cc deleted file mode 100644 index ec7fac3187..0000000000 --- a/lite/backends/cuda/math/cudnn_conv.cc +++ /dev/null @@ -1,481 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
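The implementation below caches a single workspace allocation across create() calls and grows it only when the selected cuDNN algorithm needs more bytes. A stripped-down sketch of that pattern; WorkspaceCache is an illustrative name, not a type from this file:

#include <cstddef>
#include <cuda_runtime.h>

struct WorkspaceCache {
  void* data = nullptr;
  size_t bytes = 0;
  // Return a buffer of at least `needed` bytes, reallocating only on growth,
  // so repeated descriptor setups reuse one device allocation.
  void* require(size_t needed) {
    if (needed > bytes) {
      if (data != nullptr) cudaFree(data);
      cudaMalloc(&data, needed);
      bytes = needed;
    }
    return data;
  }
  ~WorkspaceCache() {
    if (data != nullptr) cudaFree(data);
  }
};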
- -#include "lite/backends/cuda/math/cudnn_conv.h" -#include "lite/backends/cuda/math/activation.h" -#include "lite/backends/cuda/math/scale.h" -#include "lite/backends/cuda/math/type_trans.h" - -namespace paddle { -namespace lite { -namespace cuda { -namespace math { - -template <> -bool CudnnConv2D::create(const operators::ConvParam& param, - Context* ctx) { - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - int batch = x_dims[0]; - - int iw = x_dims[3]; // nchw - int ih = x_dims[2]; - int ic = x_dims[1]; - int ow = o_dims[3]; - int oh = o_dims[2]; - int oc = o_dims[1]; - int kw = w_dims[3]; - int kh = w_dims[2]; - int sw = param.strides[1]; - int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; - - CHECK(ic % param.groups == 0) - << "The conv input channel shoud be divide group number."; - - CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->input_desc_, - CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, - batch, - ic, - ih, - iw)); - CUDNN_CHECK(cudnnSetFilter4dDescriptor(this->filter_desc_, - CUDNN_DATA_FLOAT, - CUDNN_TENSOR_NCHW, - oc, - ic / param.groups, - kh, - kw)); - CUDNN_CHECK(cudnnSetConvolution2dDescriptor(this->conv_desc_, - ph, - pw, - sh, - sw, - dh, - dw, - CUDNN_CROSS_CORRELATION, - CUDNN_DATA_FLOAT)); - CUDNN_CHECK(cudnnSetConvolutionGroupCount(this->conv_desc_, param.groups)); - CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->output_desc_, - CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, - batch, - oc, - oh, - ow)); - - if (param.activation_param.has_active && with_relu_act_) { - CUDNN_CHECK(cudnnSetActivationDescriptor( - this->act_desc_, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0)); - } - - if (ic == param.groups && ic == oc && ic != 1) { - this->fwd_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - } else { - CUDNN_CHECK( - cudnnGetConvolutionForwardAlgorithm(this->handle_, - this->input_desc_, - this->filter_desc_, - this->conv_desc_, - this->output_desc_, - this->preference_, - this->workspace_limit_bytes_, - &this->fwd_algo_)); - } - CUDNN_CHECK( - cudnnGetConvolutionForwardWorkspaceSize(this->handle_, - this->input_desc_, - this->filter_desc_, - this->conv_desc_, - this->output_desc_, - this->fwd_algo_, - &this->workspace_fwd_sizes_)); - if (this->workspace_fwd_sizes_ > this->workspace_size_inbytes_) { - this->workspace_size_inbytes_ = this->workspace_fwd_sizes_; - if (this->workspace_data_ != NULL) { - cudaFree(this->workspace_data_); - } - cudaMalloc(&this->workspace_data_, this->workspace_size_inbytes_); - this->workspace_ = reinterpret_cast(this->workspace_data_); - } - if (param.bias) { - int dim_bias[] = {1, oc, 1, 1}; - int stride_bias[] = {oc, 1, 1, 1}; - cudnnSetTensorNdDescriptor( - this->bias_desc_, CUDNN_DATA_FLOAT, 4, dim_bias, stride_bias); - } - return true; -} - -template <> -bool CudnnConv2D::init(const operators::ConvParam& param, - Context* ctx) { - this->workspace_size_inbytes_ = 0; - this->workspace_data_ = NULL; - this->workspace_fwd_sizes_ = 0; - - this->stream_ = ctx->exec_stream(); - CUDNN_CHECK(cudnnCreate(&this->handle_)); - CUDNN_CHECK(cudnnSetStream(this->handle_, this->stream_)); - - this->workspace_ = NULL; - - cudnnCreateTensorDescriptor(&this->input_desc_); - cudnnCreateTensorDescriptor(&this->output_desc_); - cudnnCreateFilterDescriptor(&this->filter_desc_); - cudnnCreateConvolutionDescriptor(&this->conv_desc_); - cudnnCreateTensorDescriptor(&this->bias_desc_); - - if 
(param.activation_param.has_active) { - if (param.activation_param.active_type == lite_api::ActivationType::kRelu) { - cudnnCreateActivationDescriptor(&this->act_desc_); - } else { - this->with_relu_act_ = false; - } - } - return create(param, ctx); -} - -template <> -bool CudnnConv2D::run(const operators::ConvParam& param) { - const auto* i_data = param.x->data(); - const auto* w_data = param.filter->data(); - const auto* b_data = param.bias ? param.bias->data() : nullptr; - auto* o_data = param.output->mutable_data(TARGET(kCUDA)); - - if (param.activation_param.has_active && with_relu_act_) { - if (b_data) { - float alpha = 1.0f; - float beta = 0.0f; - CUDNN_CHECK(cudnnConvolutionBiasActivationForward(handle_, - &alpha, - input_desc_, - i_data, - filter_desc_, - w_data, - conv_desc_, - fwd_algo_, - workspace_, - workspace_fwd_sizes_, - &beta, - output_desc_, - o_data, - bias_desc_, - b_data, - act_desc_, - output_desc_, - o_data)); - } else { - float alpha = 1.0f; - float beta = 0.0f; - CUDNN_CHECK(cudnnConvolutionForward(handle_, - &alpha, - input_desc_, - i_data, - filter_desc_, - w_data, - conv_desc_, - fwd_algo_, - workspace_, - workspace_fwd_sizes_, - &beta, - output_desc_, - o_data)); - - CUDNN_CHECK(cudnnActivationForward(handle_, - act_desc_, - &alpha, - output_desc_, - o_data, - &beta, - output_desc_, - o_data)); - } - } else { - float alpha = 1.0f; - float beta = 0.0f; - CUDNN_CHECK(cudnnConvolutionForward(handle_, - &alpha, - input_desc_, - i_data, - filter_desc_, - w_data, - conv_desc_, - fwd_algo_, - workspace_, - workspace_fwd_sizes_, - &beta, - output_desc_, - o_data)); - if (b_data) { - CUDNN_CHECK(cudnnAddTensor( - handle_, &alpha, bias_desc_, b_data, &alpha, output_desc_, o_data)); - } - } - - if (!with_relu_act_) { - CHECK(param.activation_param.active_type == - lite_api::ActivationType::kLeakyRelu) - << "Only support leaky relu now."; - auto out_dims = param.output->dims(); - int n = out_dims[0], c = out_dims[1], h = out_dims[2], w = out_dims[3]; - int num = n * h * w * c; - float alpha = param.activation_param.Leaky_relu_alpha; - - relu(num, o_data, o_data, alpha, this->stream_); - } - return true; -} - -template -bool CudnnConv2DInt8::create(const operators::ConvParam& param, - Context* ctx) { - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int batch = x_dims[0]; - - int iw = x_dims[2]; // nchw - int ih = x_dims[1]; - int ic = x_dims[3]; - int ow = o_dims[2]; - int oh = o_dims[1]; - int oc = o_dims[3]; - - int kw = w_dims[2]; - int kh = w_dims[1]; - - int sw = param.strides[1]; - int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; - - std::vector weight_scale = param.weight_scale; - float input_scale = param.input_scale; - float output_scale = param.output_scale; - CHECK(weight_scale.size() == oc) - << "the num of the weight_scale should be equals to the output channel."; - if (Ptype_out == PRECISION(kInt8)) { - this->temp_tensor_.Resize(o_dims); - this->temp_tensor_.template mutable_data(TARGET(kCUDA)); - for (int i = 0; i < weight_scale.size(); i++) { - weight_scale[i] = (weight_scale[i] * input_scale) / output_scale; - } - } else { - for (int i = 0; i < weight_scale.size(); i++) { - weight_scale[i] = (weight_scale[i] * input_scale); - } - } - this->scale_.Resize({oc}); - auto* scale_data = this->scale_.template mutable_data(TARGET(kCUDA)); - this->scale_.template Assign( - weight_scale.data(), 
-      this->scale_.dims());
-
-  CHECK(ic % param.groups == 0)
-      << "The conv input channel count should be divisible by the group "
-         "number.";
-  CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->input_desc_,
-                                         CUDNN_TENSOR_NHWC,
-                                         CUDNN_DATA_INT8,
-                                         batch,
-                                         ic,
-                                         ih,
-                                         iw));
-  CUDNN_CHECK(cudnnSetFilter4dDescriptor(this->filter_desc_,
-                                         CUDNN_DATA_INT8,
-                                         CUDNN_TENSOR_NHWC,
-                                         oc,
-                                         ic / param.groups,
-                                         kh,
-                                         kw));
-  CUDNN_CHECK(cudnnSetConvolution2dDescriptor(this->conv_desc_,
-                                              ph,
-                                              pw,
-                                              sh,
-                                              sw,
-                                              dh,
-                                              dw,
-                                              CUDNN_CROSS_CORRELATION,
-                                              CUDNN_DATA_INT32));
-
-  CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->output_desc_,
-                                         CUDNN_TENSOR_NHWC,
-                                         CUDNN_DATA_FLOAT,
-                                         batch,
-                                         oc,
-                                         oh,
-                                         ow));
-
-  this->fwd_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-  CUDNN_CHECK(
-      cudnnGetConvolutionForwardWorkspaceSize(this->handle_,
-                                              this->input_desc_,
-                                              this->filter_desc_,
-                                              this->conv_desc_,
-                                              this->output_desc_,
-                                              this->fwd_algo_,
-                                              &(this->workspace_fwd_sizes_)));
-
-  if (this->workspace_fwd_sizes_ > this->workspace_size_inbytes_) {
-    this->workspace_size_inbytes_ = this->workspace_fwd_sizes_;
-    if (this->workspace_data_ != NULL) {
-      cudaFree(this->workspace_data_);
-    }
-    cudaMalloc(&this->workspace_data_, this->workspace_size_inbytes_);
-    this->workspace_ = reinterpret_cast<char*>(this->workspace_data_);
-  }
-
-  return true;
-}
-
-template <PrecisionType Ptype_out>
-bool CudnnConv2DInt8<Ptype_out>::init(const operators::ConvParam& param,
-                                      Context<TARGET(kCUDA)>* ctx) {
-  this->workspace_size_inbytes_ = 0;
-  this->workspace_data_ = NULL;
-  this->workspace_fwd_sizes_ = 0;
-
-  this->stream_ = ctx->exec_stream();
-  CUDNN_CHECK(cudnnCreate(&this->handle_));
-  CUDNN_CHECK(cudnnSetStream(this->handle_, this->stream_));
-
-  this->workspace_ = NULL;
-
-  cudnnCreateTensorDescriptor(&this->input_desc_);
-  cudnnCreateTensorDescriptor(&this->output_desc_);
-  cudnnCreateFilterDescriptor(&this->filter_desc_);
-  cudnnCreateConvolutionDescriptor(&this->conv_desc_);
-  cudnnCreateTensorDescriptor(&this->bias_desc_);
-
-  if (param.activation_param.has_active) {
-    if (!(param.activation_param.active_type ==
-          lite_api::ActivationType::kRelu)) {
-      this->with_relu_act_ = false;
-    }
-  }
-  return create(param, ctx);
-}
-
-template <PrecisionType Ptype_out>
-bool CudnnConv2DInt8<Ptype_out>::run(const operators::ConvParam& param) {
-  const auto* i_data = param.x->data<int8_t>();
-  const auto* w_data = param.filter->data<int8_t>();
-  const auto* b_data = param.bias ? param.bias->data<float>() : nullptr;
-  float* temp_out;
-  float* scale = this->scale_.template mutable_data<float>(TARGET(kCUDA));
-  if (Ptype_out == PRECISION(kInt8)) {
-    temp_out = this->temp_tensor_.template mutable_data<float>(TARGET(kCUDA));
-  } else {
-    temp_out = param.output->mutable_data<float>(TARGET(kCUDA));
-  }
-
-  float alpha = 1.0f;
-  float beta = 0.0f;
-  CUDNN_CHECK(cudnnConvolutionForward(this->handle_,
-                                      &alpha,
-                                      this->input_desc_,
-                                      i_data,
-                                      this->filter_desc_,
-                                      w_data,
-                                      this->conv_desc_,
-                                      this->fwd_algo_,
-                                      this->workspace_,
-                                      this->workspace_fwd_sizes_,
-                                      &beta,
-                                      this->output_desc_,
-                                      temp_out));
-
-  auto out_dims = param.output->dims();
-  int n = out_dims[0], h = out_dims[1], w = out_dims[2], c = out_dims[3];
-  int num = n * h * w * c / 4;
-
-  if (!param.activation_param.has_active && !b_data) {
-    if (Ptype_out == PRECISION(kInt8)) {
-      auto* out = param.output->mutable_data<int8_t>(TARGET(kCUDA));
-      fp32_to_int8_nhwc4(num,
-                         static_cast<const void*>(temp_out),
-                         static_cast<void*>(out),
-                         static_cast<const void*>(scale),
-                         n,
-                         c / 4,
-                         h,
-                         w,
-                         this->stream_);
-    } else {
-      fp32_scale_nhwc4(num,
-                       static_cast<const void*>(temp_out),
-                       static_cast<void*>(temp_out),
-                       static_cast<const void*>(scale),
-                       n,
-                       c / 4,
-                       h,
-                       w,
-                       this->stream_);
-    }
-    return true;
-  }
-
-  if (b_data) {
-    if (param.activation_param.has_active) {
-      float alpha = 0.0;
-      if (!this->with_relu_act_)
-        alpha = param.activation_param.Leaky_relu_alpha;
-      if (Ptype_out == PRECISION(kInt8)) {
-        auto* out = param.output->mutable_data<int8_t>(TARGET(kCUDA));
-        bias_relu_int8_nhwc4(num,
-                             static_cast<const void*>(temp_out),
-                             static_cast<const void*>(b_data),
-                             static_cast<void*>(out),
-                             n,
-                             c / 4,
-                             h,
-                             w,
-                             static_cast<const void*>(scale),
-                             alpha,
-                             this->stream_);
-      } else {
-        bias_relu_int8_nhwc4(num,
-                             static_cast<const void*>(temp_out),
-                             static_cast<const void*>(b_data),
-                             static_cast<void*>(temp_out),
-                             n,
-                             c / 4,
-                             h,
-                             w,
-                             static_cast<const void*>(scale),
-                             alpha,
-                             this->stream_);
-      }
-      return true;
-    }
-  }
-
-  CHECK(false) << "Conv int8 only supports conv, conv + bias + relu, and "
-                  "conv + bias + leaky_relu.";
-}
-
-template class CudnnConv2DInt8<PRECISION(kInt8)>;
-template class CudnnConv2DInt8<PRECISION(kFloat)>;
-
-}  // namespace math
-}  // namespace cuda
-}  // namespace lite
-}  // namespace paddle
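Note how the int8 path folds quantization into a single per-channel multiplier before launch: for an int8 output the effective scale is weight_scale * input_scale / output_scale, while for a float output the division by output_scale is dropped. A host-side sketch of that folding, assuming nothing beyond what create() above shows (FoldConvScales is an illustrative name, not part of this patch):

#include <vector>

// Mirror of the scale preparation in CudnnConv2DInt8::create().
std::vector<float> FoldConvScales(const std::vector<float>& weight_scale,
                                  float input_scale,
                                  float output_scale,
                                  bool int8_output) {
  std::vector<float> folded(weight_scale.size());
  for (size_t i = 0; i < weight_scale.size(); ++i) {
    const float s = weight_scale[i] * input_scale;
    folded[i] = int8_output ? s / output_scale : s;
  }
  return folded;
}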
diff --git a/lite/backends/cuda/math/cudnn_conv.h b/lite/backends/cuda/math/cudnn_conv.h
deleted file mode 100644
index 03612a5e5a..0000000000
--- a/lite/backends/cuda/math/cudnn_conv.h
+++ /dev/null
@@ -1,132 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include
-#include
-#include
-#include "lite/api/paddle_place.h"
-#include "lite/backends/cuda/cuda_utils.h"
-#include "lite/core/context.h"
-#include "lite/core/target_wrapper.h"
-#include "lite/operators/op_params.h"
-
-namespace paddle {
-namespace lite {
-namespace cuda {
-namespace math {
-
-template <PrecisionType Ptype_out>
-class CudnnConv2DBase {
- public:
-  CudnnConv2DBase()
-      : handle_(NULL),
-        workspace_data_(NULL),
-        workspace_(NULL),
-        conv_desc_(NULL),
-        input_desc_(NULL),
-        output_desc_(NULL),
-        filter_desc_(NULL),
-        act_desc_(NULL),
-        bias_desc_(NULL),
-        workspace_fwd_sizes_(0),
-        workspace_size_inbytes_(0),
-        fwd_algo_((cudnnConvolutionFwdAlgo_t)0) {}
-
-  ~CudnnConv2DBase() {
-    if (conv_desc_) {
-      CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc_));
-    }
-    if (input_desc_) {
-      CUDNN_CHECK(cudnnDestroyTensorDescriptor(input_desc_));
-    }
-    if (output_desc_) {
-      CUDNN_CHECK(cudnnDestroyTensorDescriptor(output_desc_));
-    }
-    if (act_desc_) {
-      CUDNN_CHECK(cudnnDestroyActivationDescriptor(act_desc_));
-    }
-    if (bias_desc_) {
-      CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc_));
-    }
-    if (filter_desc_) {
-      CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc_));
-    }
-    if (handle_ != NULL) {
-      CUDNN_CHECK(cudnnDestroy(handle_));
-    }
-    if (workspace_data_ != NULL) {
-      cudaFree(workspace_data_);
-    }
-  }
-
- protected:
-  cudaStream_t stream_;
-  cudnnHandle_t handle_;
-  cudnnConvolutionFwdAlgo_t fwd_algo_;
-  cudnnTensorDescriptor_t input_desc_;
-  cudnnTensorDescriptor_t output_desc_;
-  cudnnTensorDescriptor_t bias_desc_;
-  cudnnFilterDescriptor_t filter_desc_;
-  cudnnConvolutionDescriptor_t conv_desc_;
-
-  // activation descriptor
-  cudnnActivationDescriptor_t act_desc_;
-  bool with_relu_act_{true};
-
-  size_t workspace_fwd_sizes_;
-  size_t workspace_size_inbytes_;  // size of underlying storage
-  void* workspace_data_;           // underlying storage
-  void* workspace_;                // aliases into workspace_data_
-
-  const bool use_tensor_core_ = true;
-  const size_t workspace_limit_bytes_ = 4 * 1024 * 1024;
-  const cudnnConvolutionFwdPreference_t preference_ =
-      CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
-
-  // For int8
-  Tensor temp_tensor_;
-  Tensor scale_;
-};
-
-template <PrecisionType Ptype_out>
-class CudnnConv2D : public CudnnConv2DBase<Ptype_out> {
- public:
-  CudnnConv2D() : CudnnConv2DBase<Ptype_out>() {}
-  virtual bool init(const operators::ConvParam& param,
-                    Context<TARGET(kCUDA)>* ctx);
-
-  virtual bool create(const operators::ConvParam& param,
-                      Context<TARGET(kCUDA)>* ctx);
-
-  virtual bool run(const operators::ConvParam& param);
-};
-
-template <PrecisionType Ptype_out>
-class CudnnConv2DInt8 : CudnnConv2DBase<Ptype_out> {
- public:
-  CudnnConv2DInt8() : CudnnConv2DBase<Ptype_out>() {}
-  virtual bool init(const operators::ConvParam& param,
-                    Context<TARGET(kCUDA)>* ctx);
-
-  virtual bool create(const operators::ConvParam& param,
-                      Context<TARGET(kCUDA)>* ctx);
-
-  virtual bool run(const operators::ConvParam& param);
-};
-
-}  // namespace math
-}  // namespace cuda
-}  // namespace lite
-}  // namespace paddle
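The base class above keeps a grow-only workspace: create() reallocates only when cuDNN requests more forward workspace than is currently held, and the destructor frees it once. A standalone sketch of that pattern (the Workspace name is illustrative, not part of this patch):

#include <cstddef>
#include <cuda_runtime.h>

// Grow-only device workspace, as managed by CudnnConv2DBase.
struct Workspace {
  void* data = nullptr;
  size_t capacity = 0;
  void* Require(size_t bytes) {
    if (bytes > capacity) {
      if (data != nullptr) cudaFree(data);
      cudaMalloc(&data, bytes);
      capacity = bytes;
    }
    return data;
  }
  ~Workspace() {
    if (data != nullptr) cudaFree(data);
  }
};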
diff --git a/lite/backends/cuda/math/cudnn_helper.h b/lite/backends/cuda/math/cudnn_helper.h
deleted file mode 100644
index b7f9b2cf69..0000000000
--- a/lite/backends/cuda/math/cudnn_helper.h
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <cudnn.h>
-
-namespace paddle {
-namespace lite {
-namespace cuda {
-namespace math {}  // namespace math
-}  // namespace cuda
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/cuda/math/scale.cu b/lite/backends/cuda/math/scale.cu
deleted file mode 100644
index cc49d0403d..0000000000
--- a/lite/backends/cuda/math/scale.cu
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <iostream>
-#include "lite/backends/cuda/math/scale.h"
-#include "lite/backends/cuda/math/utils.h"
-
-namespace paddle {
-namespace lite {
-namespace cuda {
-namespace math {
-
-__global__ void fp32_scale_nhwc4_kernel(int num,
-                                        const float4* in,
-                                        float4* out,
-                                        const float4* scale,
-                                        int N,
-                                        int K,
-                                        int H,
-                                        int W) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < num) {
-    int scale_idx = tid % K;
-    const float4 scale_ptr = scale[scale_idx];
-    const float4 in_ptr = in[tid];
-    float4 packed_val;
-
-    packed_val.x = in_ptr.x * scale_ptr.x;
-    packed_val.y = in_ptr.y * scale_ptr.y;
-    packed_val.z = in_ptr.z * scale_ptr.z;
-    packed_val.w = in_ptr.w * scale_ptr.w;
-    out[tid] = packed_val;
-  }
-}
-
-void fp32_scale_nhwc4(int num,
-                      const void* in,
-                      void* out,
-                      const void* scale,
-                      int N,
-                      int K,
-                      int H,
-                      int W,
-                      cudaStream_t stream) {
-  int thread = 256;
-  int block = (num + thread - 1) / thread;
-  fp32_scale_nhwc4_kernel<<<block, thread, 0, stream>>>(
-      num,
-      static_cast<const float4*>(in),
-      static_cast<float4*>(out),
-      static_cast<const float4*>(scale),
-      N,
-      K,
-      H,
-      W);
-  cudaError_t error = cudaGetLastError();
-  if (error != cudaSuccess) std::cout << cudaGetErrorString(error);
-}
-
-}  // namespace math
-}  // namespace cuda
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/cuda/math/scale.h b/lite/backends/cuda/math/scale.h
deleted file mode 100644
index e96b864c92..0000000000
--- a/lite/backends/cuda/math/scale.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
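fp32_scale_nhwc4 above treats the NHWC buffer as packed float4 vectors: num is the element count divided by 4 and K is C/4, so each thread rescales one group of four adjacent channels. A hedged usage sketch (ScaleNhwcInPlace and the pointer names are illustrative):

#include <cuda_runtime.h>
#include "lite/backends/cuda/math/scale.h"

// Rescale an NHWC fp32 tensor of shape {n, h, w, c} in place; requires
// c % 4 == 0. d_data and d_scale are device pointers; d_scale holds c floats.
void ScaleNhwcInPlace(float* d_data, const float* d_scale,
                      int n, int c, int h, int w, cudaStream_t stream) {
  const int num = n * h * w * c / 4;  // one float4 per thread
  paddle::lite::cuda::math::fp32_scale_nhwc4(
      num, d_data, d_data, d_scale, n, c / 4, h, w, stream);
}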
- -#pragma once -#include -#include - -namespace paddle { -namespace lite { -namespace cuda { -namespace math { - -void fp32_scale_nhwc4(int num, - const void* din, - void* dout, - const void* scale, - int N, - int K, - int H, - int W, - cudaStream_t stream); - -} // namespace math -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/math/transpose.cu b/lite/backends/cuda/math/transpose.cu deleted file mode 100644 index 6467f00307..0000000000 --- a/lite/backends/cuda/math/transpose.cu +++ /dev/null @@ -1,191 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/cuda/math/transpose.h" -#include "lite/backends/cuda/math/utils.h" - -namespace paddle { -namespace lite { -namespace cuda { -namespace math { - -constexpr int kTileDim = 32; -constexpr int kBlockRows = 8; -constexpr int CUDA_NUM_THREADS = 128; - -// Splits the original matrix into submatrices with size 32 * 32. -// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/ -template -__global__ void BatchTranspose2DCUDAKernel(const int N, - const int H, - const int W, - const int dh, - const int dw, - const T* input, - T* out) { - __shared__ T tile[kTileDim][kTileDim + 1]; // plus 1 to prevent bank confict. 
- const int n = blockIdx.x / (dh * dw); - const int k = blockIdx.x % (dh * dw); - const int r = k / dw; - const int c = k % dw; - const int offset = n * H * W; - int x = c * kTileDim + threadIdx.x; - int y = r * kTileDim + threadIdx.y; - if (x < W) { - for (int i = 0; threadIdx.y + i < kTileDim && y + i < H; i += kBlockRows) { -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) - tile[threadIdx.y + i][threadIdx.x] = - __ldg(input + offset + (y + i) * W + x); -#else - tile[threadIdx.y + i][threadIdx.x] = input[offset + (y + i) * W + x]; -#endif - } - } - __syncthreads(); - x = r * kTileDim + threadIdx.x; - y = c * kTileDim + threadIdx.y; - if (x < H) { - for (int i = 0; threadIdx.y + i < kTileDim && y + i < W; i += kBlockRows) { - out[offset + (y + i) * H + x] = tile[threadIdx.x][threadIdx.y + i]; - } - } -} - -template -void BatchTranspose2DCUDAImpl(const int N, - const int H, - const int W, - const T* input, - T* out, - CUDAContext* ctx) { - const int dh = (H + kTileDim - 1) / kTileDim; - const int dw = (W + kTileDim - 1) / kTileDim; - BatchTranspose2DCUDAKernel< - T><<exec_stream()>>>( - N, H, W, dh, dw, input, out); - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); -} - -#define TYPE_SPECIALIZED_CUDA_NCHW2NHWC(T) \ - template <> \ - void NCHW2NHWC(const int N, \ - const int C, \ - const int HxW, \ - const T* X, \ - T* Y, \ - CUDAContext* ctx) { \ - BatchTranspose2DCUDAImpl(N, C, HxW, X, Y, ctx); \ - } -TYPE_SPECIALIZED_CUDA_NCHW2NHWC(float) -#undef TYPE_SPECIALIZED_CUDA_NCHW2NHWC - -#define TYPE_SPECIALIZED_CUDA_NHWC2NCHW(T) \ - template <> \ - void NHWC2NCHW(const int N, \ - const int C, \ - const int HxW, \ - const T* X, \ - T* Y, \ - CUDAContext* ctx) { \ - BatchTranspose2DCUDAImpl(N, HxW, C, X, Y, ctx); \ - } -TYPE_SPECIALIZED_CUDA_NHWC2NCHW(float) -#undef TYPE_SPECIALIZED_CUDA_NHWC2NCHW - -template -__global__ void TransposeCUDAKernel(const int size, - const int ndim, - const int* X_strides, - const int* Y_dims, - const T* X, - T* Y) { - const int Y_index = blockIdx.x * CUDA_NUM_THREADS + threadIdx.x; - if (Y_index < size) { - int X_index = 0; - int v = Y_index; -#pragma unroll - for (int i = ndim - 1; i >= 0; --i) { - X_index += v % Y_dims[i] * X_strides[i]; - v /= Y_dims[i]; - } -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) - Y[Y_index] = __ldg(X + X_index); -#else - Y[Y_index] = X[X_index]; -#endif - } -} - -template -void TransposeCUDAImpl(const std::vector& X_dims, - const std::vector& axes, - const T* X, - T* Y, - CUDAContext* ctx) { - CHECK_EQ(X_dims.size(), axes.size()) << "dimension size should be equal"; - int ndim = X_dims.size(); - std::vector strides(ndim, 0); - std::vector Y_dims(ndim, 0); - std::vector buf(ndim, 0); - int cur_stride = 1; - for (int i = ndim - 1; i >= 0; --i) { - buf[i] = cur_stride; - cur_stride *= X_dims[i]; - } - for (int i = 0; i < ndim; ++i) { - strides[i] = buf[axes[i]]; - } - int size = 1; - for (int i = 0; i < ndim; ++i) { - Y_dims[i] = static_cast(X_dims[axes[i]]); - size *= X_dims[i]; - } - - lite::Tensor Y_dims_, strides_; - Y_dims_.Resize(std::vector({ndim})); - int* d_y_dims = Y_dims_.mutable_data(TARGET(kCUDA)); - CopySync( - d_y_dims, Y_dims.data(), sizeof(int) * Y_dims.size(), IoDirection::HtoD); - - strides_.Resize(std::vector({ndim})); - int* d_strides = strides_.mutable_data(TARGET(kCUDA)); - CopySync(d_strides, - strides.data(), - sizeof(int) * strides.size(), - IoDirection::HtoD); - - const int M = (size + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; - 
TransposeCUDAKernel<<exec_stream()>>>( - size, ndim, d_strides, d_y_dims, X, Y); - // cudaError_t error = cudaGetLastError(); - // if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); -} - -#define TYPE_SPECIALIZED_CUDA_TRANSPOSE(T) \ - template <> \ - void Transpose(const std::vector& X_dims, \ - const std::vector& axes, \ - const T* X, \ - T* Y, \ - CUDAContext* ctx) { \ - TransposeCUDAImpl(X_dims, axes, X, Y, ctx); \ - } -TYPE_SPECIALIZED_CUDA_TRANSPOSE(float) -#undef TYPE_SPECIALIZED_CUDA_TRANSPOSEF - -} // namespace math -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/math/transpose.h b/lite/backends/cuda/math/transpose.h deleted file mode 100644 index ba2464547b..0000000000 --- a/lite/backends/cuda/math/transpose.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include "lite/core/context.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace cuda { -namespace math { - -template -void NCHW2NHWC(int N, int C, int HxW, const T* X, T* Y, CUDAContext* context); - -template -void NHWC2NCHW(int N, int C, int HxW, const T* X, T* Y, CUDAContext* context); - -template -void Transpose(const std::vector& X_dims, - const std::vector& axes, - const T* X, - T* Y, - CUDAContext* ctx); - -} // namespace math -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/math/type_trans.cu b/lite/backends/cuda/math/type_trans.cu deleted file mode 100644 index 6636f98840..0000000000 --- a/lite/backends/cuda/math/type_trans.cu +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
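The generic transpose above precomputes, for every output axis, the stride of the corresponding input axis; TransposeCUDAKernel then decomposes each output linear index digit by digit and accumulates the matching input offset. A CPU reference of the same index math, assuming only what the kernel shows (TransposeReference is an illustrative name):

#include <vector>

template <typename T>
void TransposeReference(const std::vector<int>& x_dims,
                        const std::vector<int>& axes,
                        const T* x, T* y) {
  const int ndim = static_cast<int>(x_dims.size());
  std::vector<int> x_strides(ndim), y_dims(ndim), buf(ndim);
  int stride = 1, size = 1;
  for (int i = ndim - 1; i >= 0; --i) {
    buf[i] = stride;  // stride of input axis i
    stride *= x_dims[i];
  }
  for (int i = 0; i < ndim; ++i) {
    x_strides[i] = buf[axes[i]];  // stride of the source axis of output axis i
    y_dims[i] = x_dims[axes[i]];
    size *= x_dims[i];
  }
  for (int y_index = 0; y_index < size; ++y_index) {
    int x_index = 0, v = y_index;
    for (int i = ndim - 1; i >= 0; --i) {
      x_index += (v % y_dims[i]) * x_strides[i];
      v /= y_dims[i];
    }
    y[y_index] = x[x_index];
  }
}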
- -#include "lite/backends/cuda/math/type_trans.h" -#include "lite/backends/cuda/math/utils.h" - -namespace paddle { -namespace lite { -namespace cuda { -namespace math { - -__global__ void fp32_scale_nhwc4_kernel(int num, - const float4* in, - char4* out, - const float4* scale, - int N, - int K, - int H, - int W) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < num) { - int scale_idx = tid % K; - const float4 scale_ptr = scale[scale_idx]; - const float4 in_ptr = in[tid]; - char4 result_val; - - result_val.x = from_float(in_ptr.x * scale_ptr.x); - result_val.y = from_float(in_ptr.y * scale_ptr.y); - result_val.z = from_float(in_ptr.z * scale_ptr.z); - result_val.w = from_float(in_ptr.w * scale_ptr.w); - out[tid] = result_val; - } -} - -void fp32_to_int8_nhwc4(int num, - const void* in, - void* out, - const void* scale, - int N, - int K, - int H, - int W, - cudaStream_t stream) { - int thread = 256; - int block = (num + thread - 1) / thread; - fp32_scale_nhwc4_kernel<<>>( - num, - static_cast(in), - static_cast(out), - static_cast(scale), - N, - K, - H, - W); -} - -} // namespace math -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/math/type_trans.h b/lite/backends/cuda/math/type_trans.h deleted file mode 100644 index b83830f10a..0000000000 --- a/lite/backends/cuda/math/type_trans.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include - -namespace paddle { -namespace lite { -namespace cuda { -namespace math { - -void fp32_to_int8_nhwc4(int num, - const void* din, - void* dout, - const void* scale, - int N, - int K, - int H, - int W, - cudaStream_t stream); - -} // namespace math -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/math/utils.h b/lite/backends/cuda/math/utils.h deleted file mode 100644 index b4cd82fd8d..0000000000 --- a/lite/backends/cuda/math/utils.h +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include -#include -#include -#include -#include -#include - -namespace paddle { -namespace lite { -namespace cuda { -namespace math { - -template -__device__ T from_float(float x); - -template <> -__device__ __forceinline__ float from_float(float x) { - return x; -} - -template <> -__device__ __forceinline__ half from_float(float x) { - return __float2half(x); -} - -template <> -__device__ __forceinline__ int8_t from_float(float x) { - x = fmaxf(x, std::numeric_limits::min()); - x = fminf(x, std::numeric_limits::max()); - return __float2int_rn(x); -} - -} // namespace math -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/target_wrapper.cc b/lite/backends/cuda/target_wrapper.cc deleted file mode 100644 index b1aaadf027..0000000000 --- a/lite/backends/cuda/target_wrapper.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/cuda/target_wrapper.h" - -namespace paddle { -namespace lite { - -size_t TargetWrapperCuda::num_devices() { - int count = 0; - cudaGetDeviceCount(&count); - return count; -} - -void* TargetWrapperCuda::Malloc(size_t size) { - void* ptr{}; - CHECK_EQ(cudaSuccess, cudaMalloc(&ptr, size)); - return ptr; -} - -void TargetWrapperCuda::Free(void* ptr) { - CHECK_EQ(cudaSuccess, cudaFree(ptr)); -} - -void TargetWrapperCuda::MemcpySync(void* dst, - const void* src, - size_t size, - IoDirection dir) { - switch (dir) { - case IoDirection::DtoD: - CHECK(cudaSuccess == - cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice)); - break; - case IoDirection::HtoD: - CHECK(cudaSuccess == cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice)); - break; - case IoDirection::DtoH: - CHECK(cudaSuccess == cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost)); - break; - default: - LOG(FATAL) << "Unsupported IoDirection " << static_cast(dir); - } -} - -void TargetWrapperCuda::MemcpyAsync(void* dst, - const void* src, - size_t size, - IoDirection dir, - const stream_t& stream) { - switch (dir) { - case IoDirection::DtoD: - CHECK(cudaSuccess == - cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice, stream)); - break; - case IoDirection::HtoD: - CHECK(cudaSuccess == - cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, stream)); - break; - case IoDirection::DtoH: - CHECK(cudaSuccess == - cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToHost, stream)); - break; - default: - LOG(FATAL) << "Unsupported IoDirection " << static_cast(dir); - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/target_wrapper.h b/lite/backends/cuda/target_wrapper.h deleted file mode 100644 index 50063007ce..0000000000 --- a/lite/backends/cuda/target_wrapper.h +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include "lite/core/target_wrapper.h" - -namespace paddle { -namespace lite { - -using TargetWrapperCuda = TargetWrapper; - -template <> -class TargetWrapper { - public: - using stream_t = cudaStream_t; - using event_t = cudaEvent_t; - - static size_t num_devices(); - static size_t maximum_stream() { return 0; } - - static size_t GetCurDevice() { - int dev_id; - cudaGetDevice(&dev_id); - return dev_id; - } - static void CreateStream(stream_t* stream) {} - static void DestroyStream(const stream_t& stream) {} - - static void CreateEvent(event_t* event) {} - static void DestroyEvent(const event_t& event) {} - - static void RecordEvent(const event_t& event) {} - static void SyncEvent(const event_t& event) {} - - static void StreamSync(const stream_t& stream) {} - - static void* Malloc(size_t size); - static void Free(void* ptr); - - static void MemcpySync(void* dst, - const void* src, - size_t size, - IoDirection dir); - static void MemcpyAsync(void* dst, - const void* src, - size_t size, - IoDirection dir, - const stream_t& stream); -}; -} // namespace lite -} // namespace paddle diff --git a/lite/backends/fpga/CMakeLists.txt b/lite/backends/fpga/CMakeLists.txt deleted file mode 100644 index b12fd85caf..0000000000 --- a/lite/backends/fpga/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -if (NOT LITE_WITH_FPGA) - return() -endif() - -set(LITE_FPGA_KD_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD") -set(LITE_FPGA_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga") - -message("fpga_kd_path ${LITE_FPGA_KD_PATH}") -message("fpga_path ${LITE_FPGA_PATH}") -file(GLOB_RECURSE KD_CPP *.cpp *.cc) -file(GLOB FPGA_CPP "${LITE_FPGA_PATH}/*.cc") - -cc_library(kernel_fpga SRCS ${KD_CPP} ${FPGA_CPP}) -cc_library(lite_tensor_fpga SRCS lite_tensor.cc DEPS memory) -cc_library(fpga_target_wrapper SRCS ${LITE_FPGA_PATH}/target_wrapper.cc DEPS kernel_fpga) diff --git a/lite/backends/fpga/KD/alignment.h b/lite/backends/fpga/KD/alignment.h deleted file mode 100644 index 5cca79885c..0000000000 --- a/lite/backends/fpga/KD/alignment.h +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include - -#include "lite/backends/fpga/KD/llapi/zynqmp_api.h" - -namespace paddle { -namespace zynqmp { - -inline int align_image(int wc) { return align_to_x(wc, IMAGE_ALIGNMENT); } - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/context.hpp b/lite/backends/fpga/KD/context.hpp deleted file mode 100644 index 86109a4d1e..0000000000 --- a/lite/backends/fpga/KD/context.hpp +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pes/conv_pe.hpp" -#include "lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp" -#include "lite/backends/fpga/KD/pes/fully_connected_pe.hpp" -#include "lite/backends/fpga/KD/pes/input_pe.hpp" -#include "lite/backends/fpga/KD/pes/output_pe.hpp" -#include "lite/backends/fpga/KD/pes/pooling_pe.hpp" -#include "lite/backends/fpga/KD/pes/softmax_pe.hpp" - -namespace paddle { -namespace zynqmp { - -class Context { - public: - template - Ptype& pe() { - if (pe_ == nullptr) { - pe_ = new Ptype(); - } - return static_cast(*pe_); - } - - ~Context() { - if (pe_ != nullptr) { - delete pe_; - } - } - - private: - PE* pe_ = nullptr; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/dl_engine.cpp b/lite/backends/fpga/KD/dl_engine.cpp deleted file mode 100644 index 9849e4275b..0000000000 --- a/lite/backends/fpga/KD/dl_engine.cpp +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/fpga/KD/dl_engine.hpp" -namespace paddle { -namespace zynqmp { - -DLEngine::DLEngine() { - open_device(); - struct DeviceInfo info; - int ret = get_device_info(info); - filter::set_filter_capacity(info.filter_cap); -} - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/dl_engine.hpp b/lite/backends/fpga/KD/dl_engine.hpp deleted file mode 100644 index 829f41dfeb..0000000000 --- a/lite/backends/fpga/KD/dl_engine.hpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "lite/backends/fpga/KD/llapi/filter.h" -#include "lite/backends/fpga/KD/llapi/zynqmp_api.h" - -namespace paddle { -namespace zynqmp { - -class DLEngine { - public: - static DLEngine& get_instance() { - static DLEngine s_instance; - return s_instance; - } - - private: - DLEngine(); -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/float16.hpp b/lite/backends/fpga/KD/float16.hpp deleted file mode 100755 index 9f12317196..0000000000 --- a/lite/backends/fpga/KD/float16.hpp +++ /dev/null @@ -1,508 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -namespace paddle { -namespace zynqmp { - -typedef uint16_t float16; - -static const uint32_t mantissatable[2048] = { - 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000, - 0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, - 0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000, - 0x35900000, 0x35980000, 0x35a00000, 0x35a80000, 0x35b00000, 0x35b80000, - 0x35c00000, 0x35c80000, 0x35d00000, 0x35d80000, 0x35e00000, 0x35e80000, - 0x35f00000, 0x35f80000, 0x36000000, 0x36040000, 0x36080000, 0x360c0000, - 0x36100000, 0x36140000, 0x36180000, 0x361c0000, 0x36200000, 0x36240000, - 0x36280000, 0x362c0000, 0x36300000, 0x36340000, 0x36380000, 0x363c0000, - 0x36400000, 0x36440000, 0x36480000, 0x364c0000, 0x36500000, 0x36540000, - 0x36580000, 0x365c0000, 0x36600000, 0x36640000, 0x36680000, 0x366c0000, - 0x36700000, 0x36740000, 0x36780000, 0x367c0000, 0x36800000, 0x36820000, - 0x36840000, 0x36860000, 0x36880000, 0x368a0000, 0x368c0000, 0x368e0000, - 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369a0000, - 0x369c0000, 0x369e0000, 0x36a00000, 0x36a20000, 0x36a40000, 0x36a60000, - 0x36a80000, 0x36aa0000, 0x36ac0000, 0x36ae0000, 0x36b00000, 0x36b20000, - 0x36b40000, 0x36b60000, 0x36b80000, 0x36ba0000, 0x36bc0000, 0x36be0000, - 0x36c00000, 0x36c20000, 0x36c40000, 0x36c60000, 0x36c80000, 0x36ca0000, - 0x36cc0000, 0x36ce0000, 0x36d00000, 0x36d20000, 0x36d40000, 0x36d60000, - 0x36d80000, 0x36da0000, 0x36dc0000, 0x36de0000, 0x36e00000, 0x36e20000, - 0x36e40000, 0x36e60000, 0x36e80000, 0x36ea0000, 0x36ec0000, 0x36ee0000, - 0x36f00000, 0x36f20000, 0x36f40000, 0x36f60000, 0x36f80000, 0x36fa0000, - 0x36fc0000, 0x36fe0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, - 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, - 0x370a0000, 0x370b0000, 0x370c0000, 0x370d0000, 0x370e0000, 0x370f0000, - 0x37100000, 0x37110000, 0x37120000, 
0x37130000, 0x37140000, 0x37150000, - 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371a0000, 0x371b0000, - 0x371c0000, 0x371d0000, 0x371e0000, 0x371f0000, 0x37200000, 0x37210000, - 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, - 0x37280000, 0x37290000, 0x372a0000, 0x372b0000, 0x372c0000, 0x372d0000, - 0x372e0000, 0x372f0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000, - 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, - 0x373a0000, 0x373b0000, 0x373c0000, 0x373d0000, 0x373e0000, 0x373f0000, - 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, - 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374a0000, 0x374b0000, - 0x374c0000, 0x374d0000, 0x374e0000, 0x374f0000, 0x37500000, 0x37510000, - 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, - 0x37580000, 0x37590000, 0x375a0000, 0x375b0000, 0x375c0000, 0x375d0000, - 0x375e0000, 0x375f0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, - 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, - 0x376a0000, 0x376b0000, 0x376c0000, 0x376d0000, 0x376e0000, 0x376f0000, - 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, - 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377a0000, 0x377b0000, - 0x377c0000, 0x377d0000, 0x377e0000, 0x377f0000, 0x37800000, 0x37808000, - 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, - 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, - 0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000, - 0x378a0000, 0x378a8000, 0x378b0000, 0x378b8000, 0x378c0000, 0x378c8000, - 0x378d0000, 0x378d8000, 0x378e0000, 0x378e8000, 0x378f0000, 0x378f8000, - 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, - 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, - 0x37960000, 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000, - 0x37990000, 0x37998000, 0x379a0000, 0x379a8000, 0x379b0000, 0x379b8000, - 0x379c0000, 0x379c8000, 0x379d0000, 0x379d8000, 0x379e0000, 0x379e8000, - 0x379f0000, 0x379f8000, 0x37a00000, 0x37a08000, 0x37a10000, 0x37a18000, - 0x37a20000, 0x37a28000, 0x37a30000, 0x37a38000, 0x37a40000, 0x37a48000, - 0x37a50000, 0x37a58000, 0x37a60000, 0x37a68000, 0x37a70000, 0x37a78000, - 0x37a80000, 0x37a88000, 0x37a90000, 0x37a98000, 0x37aa0000, 0x37aa8000, - 0x37ab0000, 0x37ab8000, 0x37ac0000, 0x37ac8000, 0x37ad0000, 0x37ad8000, - 0x37ae0000, 0x37ae8000, 0x37af0000, 0x37af8000, 0x37b00000, 0x37b08000, - 0x37b10000, 0x37b18000, 0x37b20000, 0x37b28000, 0x37b30000, 0x37b38000, - 0x37b40000, 0x37b48000, 0x37b50000, 0x37b58000, 0x37b60000, 0x37b68000, - 0x37b70000, 0x37b78000, 0x37b80000, 0x37b88000, 0x37b90000, 0x37b98000, - 0x37ba0000, 0x37ba8000, 0x37bb0000, 0x37bb8000, 0x37bc0000, 0x37bc8000, - 0x37bd0000, 0x37bd8000, 0x37be0000, 0x37be8000, 0x37bf0000, 0x37bf8000, - 0x37c00000, 0x37c08000, 0x37c10000, 0x37c18000, 0x37c20000, 0x37c28000, - 0x37c30000, 0x37c38000, 0x37c40000, 0x37c48000, 0x37c50000, 0x37c58000, - 0x37c60000, 0x37c68000, 0x37c70000, 0x37c78000, 0x37c80000, 0x37c88000, - 0x37c90000, 0x37c98000, 0x37ca0000, 0x37ca8000, 0x37cb0000, 0x37cb8000, - 0x37cc0000, 0x37cc8000, 0x37cd0000, 0x37cd8000, 0x37ce0000, 0x37ce8000, - 0x37cf0000, 0x37cf8000, 0x37d00000, 0x37d08000, 0x37d10000, 0x37d18000, - 0x37d20000, 0x37d28000, 0x37d30000, 0x37d38000, 0x37d40000, 0x37d48000, - 0x37d50000, 0x37d58000, 0x37d60000, 0x37d68000, 0x37d70000, 0x37d78000, - 0x37d80000, 0x37d88000, 0x37d90000, 
0x37d98000, 0x37da0000, 0x37da8000, - 0x37db0000, 0x37db8000, 0x37dc0000, 0x37dc8000, 0x37dd0000, 0x37dd8000, - 0x37de0000, 0x37de8000, 0x37df0000, 0x37df8000, 0x37e00000, 0x37e08000, - 0x37e10000, 0x37e18000, 0x37e20000, 0x37e28000, 0x37e30000, 0x37e38000, - 0x37e40000, 0x37e48000, 0x37e50000, 0x37e58000, 0x37e60000, 0x37e68000, - 0x37e70000, 0x37e78000, 0x37e80000, 0x37e88000, 0x37e90000, 0x37e98000, - 0x37ea0000, 0x37ea8000, 0x37eb0000, 0x37eb8000, 0x37ec0000, 0x37ec8000, - 0x37ed0000, 0x37ed8000, 0x37ee0000, 0x37ee8000, 0x37ef0000, 0x37ef8000, - 0x37f00000, 0x37f08000, 0x37f10000, 0x37f18000, 0x37f20000, 0x37f28000, - 0x37f30000, 0x37f38000, 0x37f40000, 0x37f48000, 0x37f50000, 0x37f58000, - 0x37f60000, 0x37f68000, 0x37f70000, 0x37f78000, 0x37f80000, 0x37f88000, - 0x37f90000, 0x37f98000, 0x37fa0000, 0x37fa8000, 0x37fb0000, 0x37fb8000, - 0x37fc0000, 0x37fc8000, 0x37fd0000, 0x37fd8000, 0x37fe0000, 0x37fe8000, - 0x37ff0000, 0x37ff8000, 0x38000000, 0x38004000, 0x38008000, 0x3800c000, - 0x38010000, 0x38014000, 0x38018000, 0x3801c000, 0x38020000, 0x38024000, - 0x38028000, 0x3802c000, 0x38030000, 0x38034000, 0x38038000, 0x3803c000, - 0x38040000, 0x38044000, 0x38048000, 0x3804c000, 0x38050000, 0x38054000, - 0x38058000, 0x3805c000, 0x38060000, 0x38064000, 0x38068000, 0x3806c000, - 0x38070000, 0x38074000, 0x38078000, 0x3807c000, 0x38080000, 0x38084000, - 0x38088000, 0x3808c000, 0x38090000, 0x38094000, 0x38098000, 0x3809c000, - 0x380a0000, 0x380a4000, 0x380a8000, 0x380ac000, 0x380b0000, 0x380b4000, - 0x380b8000, 0x380bc000, 0x380c0000, 0x380c4000, 0x380c8000, 0x380cc000, - 0x380d0000, 0x380d4000, 0x380d8000, 0x380dc000, 0x380e0000, 0x380e4000, - 0x380e8000, 0x380ec000, 0x380f0000, 0x380f4000, 0x380f8000, 0x380fc000, - 0x38100000, 0x38104000, 0x38108000, 0x3810c000, 0x38110000, 0x38114000, - 0x38118000, 0x3811c000, 0x38120000, 0x38124000, 0x38128000, 0x3812c000, - 0x38130000, 0x38134000, 0x38138000, 0x3813c000, 0x38140000, 0x38144000, - 0x38148000, 0x3814c000, 0x38150000, 0x38154000, 0x38158000, 0x3815c000, - 0x38160000, 0x38164000, 0x38168000, 0x3816c000, 0x38170000, 0x38174000, - 0x38178000, 0x3817c000, 0x38180000, 0x38184000, 0x38188000, 0x3818c000, - 0x38190000, 0x38194000, 0x38198000, 0x3819c000, 0x381a0000, 0x381a4000, - 0x381a8000, 0x381ac000, 0x381b0000, 0x381b4000, 0x381b8000, 0x381bc000, - 0x381c0000, 0x381c4000, 0x381c8000, 0x381cc000, 0x381d0000, 0x381d4000, - 0x381d8000, 0x381dc000, 0x381e0000, 0x381e4000, 0x381e8000, 0x381ec000, - 0x381f0000, 0x381f4000, 0x381f8000, 0x381fc000, 0x38200000, 0x38204000, - 0x38208000, 0x3820c000, 0x38210000, 0x38214000, 0x38218000, 0x3821c000, - 0x38220000, 0x38224000, 0x38228000, 0x3822c000, 0x38230000, 0x38234000, - 0x38238000, 0x3823c000, 0x38240000, 0x38244000, 0x38248000, 0x3824c000, - 0x38250000, 0x38254000, 0x38258000, 0x3825c000, 0x38260000, 0x38264000, - 0x38268000, 0x3826c000, 0x38270000, 0x38274000, 0x38278000, 0x3827c000, - 0x38280000, 0x38284000, 0x38288000, 0x3828c000, 0x38290000, 0x38294000, - 0x38298000, 0x3829c000, 0x382a0000, 0x382a4000, 0x382a8000, 0x382ac000, - 0x382b0000, 0x382b4000, 0x382b8000, 0x382bc000, 0x382c0000, 0x382c4000, - 0x382c8000, 0x382cc000, 0x382d0000, 0x382d4000, 0x382d8000, 0x382dc000, - 0x382e0000, 0x382e4000, 0x382e8000, 0x382ec000, 0x382f0000, 0x382f4000, - 0x382f8000, 0x382fc000, 0x38300000, 0x38304000, 0x38308000, 0x3830c000, - 0x38310000, 0x38314000, 0x38318000, 0x3831c000, 0x38320000, 0x38324000, - 0x38328000, 0x3832c000, 0x38330000, 0x38334000, 0x38338000, 0x3833c000, - 0x38340000, 0x38344000, 0x38348000, 
0x3834c000, 0x38350000, 0x38354000, - 0x38358000, 0x3835c000, 0x38360000, 0x38364000, 0x38368000, 0x3836c000, - 0x38370000, 0x38374000, 0x38378000, 0x3837c000, 0x38380000, 0x38384000, - 0x38388000, 0x3838c000, 0x38390000, 0x38394000, 0x38398000, 0x3839c000, - 0x383a0000, 0x383a4000, 0x383a8000, 0x383ac000, 0x383b0000, 0x383b4000, - 0x383b8000, 0x383bc000, 0x383c0000, 0x383c4000, 0x383c8000, 0x383cc000, - 0x383d0000, 0x383d4000, 0x383d8000, 0x383dc000, 0x383e0000, 0x383e4000, - 0x383e8000, 0x383ec000, 0x383f0000, 0x383f4000, 0x383f8000, 0x383fc000, - 0x38400000, 0x38404000, 0x38408000, 0x3840c000, 0x38410000, 0x38414000, - 0x38418000, 0x3841c000, 0x38420000, 0x38424000, 0x38428000, 0x3842c000, - 0x38430000, 0x38434000, 0x38438000, 0x3843c000, 0x38440000, 0x38444000, - 0x38448000, 0x3844c000, 0x38450000, 0x38454000, 0x38458000, 0x3845c000, - 0x38460000, 0x38464000, 0x38468000, 0x3846c000, 0x38470000, 0x38474000, - 0x38478000, 0x3847c000, 0x38480000, 0x38484000, 0x38488000, 0x3848c000, - 0x38490000, 0x38494000, 0x38498000, 0x3849c000, 0x384a0000, 0x384a4000, - 0x384a8000, 0x384ac000, 0x384b0000, 0x384b4000, 0x384b8000, 0x384bc000, - 0x384c0000, 0x384c4000, 0x384c8000, 0x384cc000, 0x384d0000, 0x384d4000, - 0x384d8000, 0x384dc000, 0x384e0000, 0x384e4000, 0x384e8000, 0x384ec000, - 0x384f0000, 0x384f4000, 0x384f8000, 0x384fc000, 0x38500000, 0x38504000, - 0x38508000, 0x3850c000, 0x38510000, 0x38514000, 0x38518000, 0x3851c000, - 0x38520000, 0x38524000, 0x38528000, 0x3852c000, 0x38530000, 0x38534000, - 0x38538000, 0x3853c000, 0x38540000, 0x38544000, 0x38548000, 0x3854c000, - 0x38550000, 0x38554000, 0x38558000, 0x3855c000, 0x38560000, 0x38564000, - 0x38568000, 0x3856c000, 0x38570000, 0x38574000, 0x38578000, 0x3857c000, - 0x38580000, 0x38584000, 0x38588000, 0x3858c000, 0x38590000, 0x38594000, - 0x38598000, 0x3859c000, 0x385a0000, 0x385a4000, 0x385a8000, 0x385ac000, - 0x385b0000, 0x385b4000, 0x385b8000, 0x385bc000, 0x385c0000, 0x385c4000, - 0x385c8000, 0x385cc000, 0x385d0000, 0x385d4000, 0x385d8000, 0x385dc000, - 0x385e0000, 0x385e4000, 0x385e8000, 0x385ec000, 0x385f0000, 0x385f4000, - 0x385f8000, 0x385fc000, 0x38600000, 0x38604000, 0x38608000, 0x3860c000, - 0x38610000, 0x38614000, 0x38618000, 0x3861c000, 0x38620000, 0x38624000, - 0x38628000, 0x3862c000, 0x38630000, 0x38634000, 0x38638000, 0x3863c000, - 0x38640000, 0x38644000, 0x38648000, 0x3864c000, 0x38650000, 0x38654000, - 0x38658000, 0x3865c000, 0x38660000, 0x38664000, 0x38668000, 0x3866c000, - 0x38670000, 0x38674000, 0x38678000, 0x3867c000, 0x38680000, 0x38684000, - 0x38688000, 0x3868c000, 0x38690000, 0x38694000, 0x38698000, 0x3869c000, - 0x386a0000, 0x386a4000, 0x386a8000, 0x386ac000, 0x386b0000, 0x386b4000, - 0x386b8000, 0x386bc000, 0x386c0000, 0x386c4000, 0x386c8000, 0x386cc000, - 0x386d0000, 0x386d4000, 0x386d8000, 0x386dc000, 0x386e0000, 0x386e4000, - 0x386e8000, 0x386ec000, 0x386f0000, 0x386f4000, 0x386f8000, 0x386fc000, - 0x38700000, 0x38704000, 0x38708000, 0x3870c000, 0x38710000, 0x38714000, - 0x38718000, 0x3871c000, 0x38720000, 0x38724000, 0x38728000, 0x3872c000, - 0x38730000, 0x38734000, 0x38738000, 0x3873c000, 0x38740000, 0x38744000, - 0x38748000, 0x3874c000, 0x38750000, 0x38754000, 0x38758000, 0x3875c000, - 0x38760000, 0x38764000, 0x38768000, 0x3876c000, 0x38770000, 0x38774000, - 0x38778000, 0x3877c000, 0x38780000, 0x38784000, 0x38788000, 0x3878c000, - 0x38790000, 0x38794000, 0x38798000, 0x3879c000, 0x387a0000, 0x387a4000, - 0x387a8000, 0x387ac000, 0x387b0000, 0x387b4000, 0x387b8000, 0x387bc000, - 0x387c0000, 0x387c4000, 0x387c8000, 
0x387cc000, 0x387d0000, 0x387d4000, - 0x387d8000, 0x387dc000, 0x387e0000, 0x387e4000, 0x387e8000, 0x387ec000, - 0x387f0000, 0x387f4000, 0x387f8000, 0x387fc000, 0x38000000, 0x38002000, - 0x38004000, 0x38006000, 0x38008000, 0x3800a000, 0x3800c000, 0x3800e000, - 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801a000, - 0x3801c000, 0x3801e000, 0x38020000, 0x38022000, 0x38024000, 0x38026000, - 0x38028000, 0x3802a000, 0x3802c000, 0x3802e000, 0x38030000, 0x38032000, - 0x38034000, 0x38036000, 0x38038000, 0x3803a000, 0x3803c000, 0x3803e000, - 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804a000, - 0x3804c000, 0x3804e000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, - 0x38058000, 0x3805a000, 0x3805c000, 0x3805e000, 0x38060000, 0x38062000, - 0x38064000, 0x38066000, 0x38068000, 0x3806a000, 0x3806c000, 0x3806e000, - 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807a000, - 0x3807c000, 0x3807e000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, - 0x38088000, 0x3808a000, 0x3808c000, 0x3808e000, 0x38090000, 0x38092000, - 0x38094000, 0x38096000, 0x38098000, 0x3809a000, 0x3809c000, 0x3809e000, - 0x380a0000, 0x380a2000, 0x380a4000, 0x380a6000, 0x380a8000, 0x380aa000, - 0x380ac000, 0x380ae000, 0x380b0000, 0x380b2000, 0x380b4000, 0x380b6000, - 0x380b8000, 0x380ba000, 0x380bc000, 0x380be000, 0x380c0000, 0x380c2000, - 0x380c4000, 0x380c6000, 0x380c8000, 0x380ca000, 0x380cc000, 0x380ce000, - 0x380d0000, 0x380d2000, 0x380d4000, 0x380d6000, 0x380d8000, 0x380da000, - 0x380dc000, 0x380de000, 0x380e0000, 0x380e2000, 0x380e4000, 0x380e6000, - 0x380e8000, 0x380ea000, 0x380ec000, 0x380ee000, 0x380f0000, 0x380f2000, - 0x380f4000, 0x380f6000, 0x380f8000, 0x380fa000, 0x380fc000, 0x380fe000, - 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810a000, - 0x3810c000, 0x3810e000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, - 0x38118000, 0x3811a000, 0x3811c000, 0x3811e000, 0x38120000, 0x38122000, - 0x38124000, 0x38126000, 0x38128000, 0x3812a000, 0x3812c000, 0x3812e000, - 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813a000, - 0x3813c000, 0x3813e000, 0x38140000, 0x38142000, 0x38144000, 0x38146000, - 0x38148000, 0x3814a000, 0x3814c000, 0x3814e000, 0x38150000, 0x38152000, - 0x38154000, 0x38156000, 0x38158000, 0x3815a000, 0x3815c000, 0x3815e000, - 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816a000, - 0x3816c000, 0x3816e000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, - 0x38178000, 0x3817a000, 0x3817c000, 0x3817e000, 0x38180000, 0x38182000, - 0x38184000, 0x38186000, 0x38188000, 0x3818a000, 0x3818c000, 0x3818e000, - 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819a000, - 0x3819c000, 0x3819e000, 0x381a0000, 0x381a2000, 0x381a4000, 0x381a6000, - 0x381a8000, 0x381aa000, 0x381ac000, 0x381ae000, 0x381b0000, 0x381b2000, - 0x381b4000, 0x381b6000, 0x381b8000, 0x381ba000, 0x381bc000, 0x381be000, - 0x381c0000, 0x381c2000, 0x381c4000, 0x381c6000, 0x381c8000, 0x381ca000, - 0x381cc000, 0x381ce000, 0x381d0000, 0x381d2000, 0x381d4000, 0x381d6000, - 0x381d8000, 0x381da000, 0x381dc000, 0x381de000, 0x381e0000, 0x381e2000, - 0x381e4000, 0x381e6000, 0x381e8000, 0x381ea000, 0x381ec000, 0x381ee000, - 0x381f0000, 0x381f2000, 0x381f4000, 0x381f6000, 0x381f8000, 0x381fa000, - 0x381fc000, 0x381fe000, 0x38200000, 0x38202000, 0x38204000, 0x38206000, - 0x38208000, 0x3820a000, 0x3820c000, 0x3820e000, 0x38210000, 0x38212000, - 0x38214000, 0x38216000, 0x38218000, 0x3821a000, 0x3821c000, 0x3821e000, - 0x38220000, 0x38222000, 0x38224000, 
0x38226000, 0x38228000, 0x3822a000, - 0x3822c000, 0x3822e000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, - 0x38238000, 0x3823a000, 0x3823c000, 0x3823e000, 0x38240000, 0x38242000, - 0x38244000, 0x38246000, 0x38248000, 0x3824a000, 0x3824c000, 0x3824e000, - 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825a000, - 0x3825c000, 0x3825e000, 0x38260000, 0x38262000, 0x38264000, 0x38266000, - 0x38268000, 0x3826a000, 0x3826c000, 0x3826e000, 0x38270000, 0x38272000, - 0x38274000, 0x38276000, 0x38278000, 0x3827a000, 0x3827c000, 0x3827e000, - 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828a000, - 0x3828c000, 0x3828e000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, - 0x38298000, 0x3829a000, 0x3829c000, 0x3829e000, 0x382a0000, 0x382a2000, - 0x382a4000, 0x382a6000, 0x382a8000, 0x382aa000, 0x382ac000, 0x382ae000, - 0x382b0000, 0x382b2000, 0x382b4000, 0x382b6000, 0x382b8000, 0x382ba000, - 0x382bc000, 0x382be000, 0x382c0000, 0x382c2000, 0x382c4000, 0x382c6000, - 0x382c8000, 0x382ca000, 0x382cc000, 0x382ce000, 0x382d0000, 0x382d2000, - 0x382d4000, 0x382d6000, 0x382d8000, 0x382da000, 0x382dc000, 0x382de000, - 0x382e0000, 0x382e2000, 0x382e4000, 0x382e6000, 0x382e8000, 0x382ea000, - 0x382ec000, 0x382ee000, 0x382f0000, 0x382f2000, 0x382f4000, 0x382f6000, - 0x382f8000, 0x382fa000, 0x382fc000, 0x382fe000, 0x38300000, 0x38302000, - 0x38304000, 0x38306000, 0x38308000, 0x3830a000, 0x3830c000, 0x3830e000, - 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831a000, - 0x3831c000, 0x3831e000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, - 0x38328000, 0x3832a000, 0x3832c000, 0x3832e000, 0x38330000, 0x38332000, - 0x38334000, 0x38336000, 0x38338000, 0x3833a000, 0x3833c000, 0x3833e000, - 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834a000, - 0x3834c000, 0x3834e000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, - 0x38358000, 0x3835a000, 0x3835c000, 0x3835e000, 0x38360000, 0x38362000, - 0x38364000, 0x38366000, 0x38368000, 0x3836a000, 0x3836c000, 0x3836e000, - 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837a000, - 0x3837c000, 0x3837e000, 0x38380000, 0x38382000, 0x38384000, 0x38386000, - 0x38388000, 0x3838a000, 0x3838c000, 0x3838e000, 0x38390000, 0x38392000, - 0x38394000, 0x38396000, 0x38398000, 0x3839a000, 0x3839c000, 0x3839e000, - 0x383a0000, 0x383a2000, 0x383a4000, 0x383a6000, 0x383a8000, 0x383aa000, - 0x383ac000, 0x383ae000, 0x383b0000, 0x383b2000, 0x383b4000, 0x383b6000, - 0x383b8000, 0x383ba000, 0x383bc000, 0x383be000, 0x383c0000, 0x383c2000, - 0x383c4000, 0x383c6000, 0x383c8000, 0x383ca000, 0x383cc000, 0x383ce000, - 0x383d0000, 0x383d2000, 0x383d4000, 0x383d6000, 0x383d8000, 0x383da000, - 0x383dc000, 0x383de000, 0x383e0000, 0x383e2000, 0x383e4000, 0x383e6000, - 0x383e8000, 0x383ea000, 0x383ec000, 0x383ee000, 0x383f0000, 0x383f2000, - 0x383f4000, 0x383f6000, 0x383f8000, 0x383fa000, 0x383fc000, 0x383fe000, - 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840a000, - 0x3840c000, 0x3840e000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, - 0x38418000, 0x3841a000, 0x3841c000, 0x3841e000, 0x38420000, 0x38422000, - 0x38424000, 0x38426000, 0x38428000, 0x3842a000, 0x3842c000, 0x3842e000, - 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843a000, - 0x3843c000, 0x3843e000, 0x38440000, 0x38442000, 0x38444000, 0x38446000, - 0x38448000, 0x3844a000, 0x3844c000, 0x3844e000, 0x38450000, 0x38452000, - 0x38454000, 0x38456000, 0x38458000, 0x3845a000, 0x3845c000, 0x3845e000, - 0x38460000, 0x38462000, 0x38464000, 
0x38466000, 0x38468000, 0x3846a000, - 0x3846c000, 0x3846e000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, - 0x38478000, 0x3847a000, 0x3847c000, 0x3847e000, 0x38480000, 0x38482000, - 0x38484000, 0x38486000, 0x38488000, 0x3848a000, 0x3848c000, 0x3848e000, - 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849a000, - 0x3849c000, 0x3849e000, 0x384a0000, 0x384a2000, 0x384a4000, 0x384a6000, - 0x384a8000, 0x384aa000, 0x384ac000, 0x384ae000, 0x384b0000, 0x384b2000, - 0x384b4000, 0x384b6000, 0x384b8000, 0x384ba000, 0x384bc000, 0x384be000, - 0x384c0000, 0x384c2000, 0x384c4000, 0x384c6000, 0x384c8000, 0x384ca000, - 0x384cc000, 0x384ce000, 0x384d0000, 0x384d2000, 0x384d4000, 0x384d6000, - 0x384d8000, 0x384da000, 0x384dc000, 0x384de000, 0x384e0000, 0x384e2000, - 0x384e4000, 0x384e6000, 0x384e8000, 0x384ea000, 0x384ec000, 0x384ee000, - 0x384f0000, 0x384f2000, 0x384f4000, 0x384f6000, 0x384f8000, 0x384fa000, - 0x384fc000, 0x384fe000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, - 0x38508000, 0x3850a000, 0x3850c000, 0x3850e000, 0x38510000, 0x38512000, - 0x38514000, 0x38516000, 0x38518000, 0x3851a000, 0x3851c000, 0x3851e000, - 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852a000, - 0x3852c000, 0x3852e000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, - 0x38538000, 0x3853a000, 0x3853c000, 0x3853e000, 0x38540000, 0x38542000, - 0x38544000, 0x38546000, 0x38548000, 0x3854a000, 0x3854c000, 0x3854e000, - 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855a000, - 0x3855c000, 0x3855e000, 0x38560000, 0x38562000, 0x38564000, 0x38566000, - 0x38568000, 0x3856a000, 0x3856c000, 0x3856e000, 0x38570000, 0x38572000, - 0x38574000, 0x38576000, 0x38578000, 0x3857a000, 0x3857c000, 0x3857e000, - 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858a000, - 0x3858c000, 0x3858e000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, - 0x38598000, 0x3859a000, 0x3859c000, 0x3859e000, 0x385a0000, 0x385a2000, - 0x385a4000, 0x385a6000, 0x385a8000, 0x385aa000, 0x385ac000, 0x385ae000, - 0x385b0000, 0x385b2000, 0x385b4000, 0x385b6000, 0x385b8000, 0x385ba000, - 0x385bc000, 0x385be000, 0x385c0000, 0x385c2000, 0x385c4000, 0x385c6000, - 0x385c8000, 0x385ca000, 0x385cc000, 0x385ce000, 0x385d0000, 0x385d2000, - 0x385d4000, 0x385d6000, 0x385d8000, 0x385da000, 0x385dc000, 0x385de000, - 0x385e0000, 0x385e2000, 0x385e4000, 0x385e6000, 0x385e8000, 0x385ea000, - 0x385ec000, 0x385ee000, 0x385f0000, 0x385f2000, 0x385f4000, 0x385f6000, - 0x385f8000, 0x385fa000, 0x385fc000, 0x385fe000, 0x38600000, 0x38602000, - 0x38604000, 0x38606000, 0x38608000, 0x3860a000, 0x3860c000, 0x3860e000, - 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861a000, - 0x3861c000, 0x3861e000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, - 0x38628000, 0x3862a000, 0x3862c000, 0x3862e000, 0x38630000, 0x38632000, - 0x38634000, 0x38636000, 0x38638000, 0x3863a000, 0x3863c000, 0x3863e000, - 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864a000, - 0x3864c000, 0x3864e000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, - 0x38658000, 0x3865a000, 0x3865c000, 0x3865e000, 0x38660000, 0x38662000, - 0x38664000, 0x38666000, 0x38668000, 0x3866a000, 0x3866c000, 0x3866e000, - 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867a000, - 0x3867c000, 0x3867e000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, - 0x38688000, 0x3868a000, 0x3868c000, 0x3868e000, 0x38690000, 0x38692000, - 0x38694000, 0x38696000, 0x38698000, 0x3869a000, 0x3869c000, 0x3869e000, - 0x386a0000, 0x386a2000, 0x386a4000, 
0x386a6000, 0x386a8000, 0x386aa000, - 0x386ac000, 0x386ae000, 0x386b0000, 0x386b2000, 0x386b4000, 0x386b6000, - 0x386b8000, 0x386ba000, 0x386bc000, 0x386be000, 0x386c0000, 0x386c2000, - 0x386c4000, 0x386c6000, 0x386c8000, 0x386ca000, 0x386cc000, 0x386ce000, - 0x386d0000, 0x386d2000, 0x386d4000, 0x386d6000, 0x386d8000, 0x386da000, - 0x386dc000, 0x386de000, 0x386e0000, 0x386e2000, 0x386e4000, 0x386e6000, - 0x386e8000, 0x386ea000, 0x386ec000, 0x386ee000, 0x386f0000, 0x386f2000, - 0x386f4000, 0x386f6000, 0x386f8000, 0x386fa000, 0x386fc000, 0x386fe000, - 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870a000, - 0x3870c000, 0x3870e000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, - 0x38718000, 0x3871a000, 0x3871c000, 0x3871e000, 0x38720000, 0x38722000, - 0x38724000, 0x38726000, 0x38728000, 0x3872a000, 0x3872c000, 0x3872e000, - 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873a000, - 0x3873c000, 0x3873e000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, - 0x38748000, 0x3874a000, 0x3874c000, 0x3874e000, 0x38750000, 0x38752000, - 0x38754000, 0x38756000, 0x38758000, 0x3875a000, 0x3875c000, 0x3875e000, - 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876a000, - 0x3876c000, 0x3876e000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, - 0x38778000, 0x3877a000, 0x3877c000, 0x3877e000, 0x38780000, 0x38782000, - 0x38784000, 0x38786000, 0x38788000, 0x3878a000, 0x3878c000, 0x3878e000, - 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879a000, - 0x3879c000, 0x3879e000, 0x387a0000, 0x387a2000, 0x387a4000, 0x387a6000, - 0x387a8000, 0x387aa000, 0x387ac000, 0x387ae000, 0x387b0000, 0x387b2000, - 0x387b4000, 0x387b6000, 0x387b8000, 0x387ba000, 0x387bc000, 0x387be000, - 0x387c0000, 0x387c2000, 0x387c4000, 0x387c6000, 0x387c8000, 0x387ca000, - 0x387cc000, 0x387ce000, 0x387d0000, 0x387d2000, 0x387d4000, 0x387d6000, - 0x387d8000, 0x387da000, 0x387dc000, 0x387de000, 0x387e0000, 0x387e2000, - 0x387e4000, 0x387e6000, 0x387e8000, 0x387ea000, 0x387ec000, 0x387ee000, - 0x387f0000, 0x387f2000, 0x387f4000, 0x387f6000, 0x387f8000, 0x387fa000, - 0x387fc000, 0x387fe000}; - -static const uint16_t offsettable[64] = { - 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400}; - -static const uint32_t exponenttable[64] = { - 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, - 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, - 0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000, - 0x09000000, 0x09800000, 0x0a000000, 0x0a800000, 0x0b000000, 0x0b800000, - 0x0c000000, 0x0c800000, 0x0d000000, 0x0d800000, 0x0e000000, 0x0e800000, - 0x0f000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000, - 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, - 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, - 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8a000000, 0x8a800000, - 0x8b000000, 0x8b800000, 0x8c000000, 0x8c800000, 0x8d000000, 0x8d800000, - 0x8e000000, 0x8e800000, 0x8f000000, 0xc7800000}; - 
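The `mantissatable`/`offsettable`/`exponenttable` trio above (together with the `basetable`/`shifttable` pair below) implements the classic table-driven fp16/fp32 conversion: for a half-precision value `h`, the decoded float bit pattern is `mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + exponenttable[h >> 10]`, which is exactly what the `half_to_float` helper at the end of this header computes. As a cross-check only (not part of this patch), here is a self-contained bit-by-bit reference decoder; it assumes an IEEE-754 `float` and treats the half as a plain `uint16_t`:

```cpp
#include <cstdint>
#include <cstring>

// Reference half -> float decode, handling zero, subnormal, normal and
// Inf/NaN explicitly; the lookup tables above fold all of these cases
// into three table loads and one add.
float half_to_float_reference(uint16_t h) {
  uint32_t sign = static_cast<uint32_t>(h & 0x8000u) << 16;
  uint32_t exp = (h >> 10) & 0x1fu;
  uint32_t man = h & 0x3ffu;
  uint32_t bits;
  if (exp == 0) {
    if (man == 0) {
      bits = sign;  // signed zero
    } else {
      // Subnormal half: renormalize the mantissa into 1.xxx form.
      uint32_t e = 127 - 15 + 1;
      while ((man & 0x400u) == 0) {
        man <<= 1;
        --e;
      }
      bits = sign | (e << 23) | ((man & 0x3ffu) << 13);
    }
  } else if (exp == 0x1fu) {
    bits = sign | 0x7f800000u | (man << 13);  // Inf or NaN
  } else {
    bits = sign | ((exp - 15 + 127) << 23) | (man << 13);  // normal number
  }
  float f;
  std::memcpy(&f, &bits, sizeof(f));  // well-defined type pun
  return f;
}
```

Looping `h` over all 65,536 values and comparing this decoder bit-for-bit against the table-driven `half_to_float` below is a cheap unit test for the tables.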
-static const uint16_t basetable[512] = { - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, - 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0c00, 0x1000, - 0x1400, 0x1800, 0x1c00, 0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, - 0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x5000, 0x5400, 0x5800, - 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, - 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, - 0x8400, 0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400, - 0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, - 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00, - 0xf000, 0xf400, 0xf800, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 
0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00}; - -static const uint8_t shifttable[512] = { - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, - 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, - 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d}; - -inline float16 float_to_half(float f) { - uint32_t v = *reinterpret_cast<uint32_t *>(&f); - return basetable[(v >> 23) & 0x1ff] + - ((v & 0x007fffff) >> shifttable[(v >> 23) & 0x1ff]); -} - -inline float half_to_float(float16 h) { - uint32_t v = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + - exponenttable[h >> 10]; - return *reinterpret_cast<float *>(&v); -} - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/fpga_cv.cpp b/lite/backends/fpga/KD/fpga_cv.cpp deleted file mode 100644 index 15a20e368b..0000000000 --- a/lite/backends/fpga/KD/fpga_cv.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/fpga/KD/fpga_cv.hpp" - -using paddle::zynqmp::float16; - -void fpga_resize(float* input, - int input_width, - int input_height, - int input_channel, - uint8_t* output, - int output_width, - int output_height) { - paddle::zynqmp::InplaceArgs inplace_args = {0, 0, 0}; - paddle::zynqmp::config_inplace(inplace_args); - - paddle::zynqmp::ImageInputArgs input_args = {nullptr}; - input_args.address = nullptr; - input_args.scale_address = nullptr; - - float16* input_image_address = - reinterpret_cast<float16*>(paddle::zynqmp::fpga_malloc( - input_width * input_height * input_channel * sizeof(float16))); - int index = 0; - - for (int i = 0; i < input_width * input_height * input_channel; i++) { - input_image_address[i] = float16(1.0 * input[i]); - } - - paddle::zynqmp::ResizeArgs resize_args = {0}; - - resize_args.input_width = input_width; - resize_args.input_height = input_height; - resize_args.image_channel = input_channel; - resize_args.output_width = output_width; - resize_args.output_height = output_height; - float height_ratio = static_cast<float>(input_height) / - static_cast<float>(resize_args.output_height); - float width_ratio = static_cast<float>(input_width) / - static_cast<float>(resize_args.output_width); - resize_args.height_ratio = *reinterpret_cast<uint32_t*>(&height_ratio); - resize_args.width_ratio = *reinterpret_cast<uint32_t*>(&width_ratio); - - int output_size = - resize_args.output_width * resize_args.output_height * input_channel; - float16* fpga_output = reinterpret_cast<float16*>( - paddle::zynqmp::fpga_malloc(output_size * sizeof(float16))); - resize_args.input_image_address = input_image_address; - resize_args.output_image_address = fpga_output; - - memset(fpga_output, 0, output_size * sizeof(float16)); - paddle::zynqmp::fpga_flush( - input_image_address, - input_width * input_height * input_channel * sizeof(float16)); - paddle::zynqmp::fpga_flush(resize_args.output_image_address, - output_size * sizeof(float16)); - int ret = paddle::zynqmp::compute_fpga_resize(resize_args); - if (ret
== 0) { - paddle::zynqmp::fpga_invalidate(resize_args.output_image_address, - output_size * sizeof(float16)); - } - - for (int i = 0; i < output_size; i++) { - output[i] = fpga_output[i]; - } -} diff --git a/lite/backends/fpga/KD/fpga_cv.hpp b/lite/backends/fpga/KD/fpga_cv.hpp deleted file mode 100644 index 6aa52edfbb..0000000000 --- a/lite/backends/fpga/KD/fpga_cv.hpp +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include <stdint.h> -#include "lite/backends/fpga/KD/float16.hpp" -#include "lite/backends/fpga/KD/llapi/zynqmp_api.h" -#include "lite/backends/fpga/KD/pe.hpp" - -void fpga_resize(float* input, - int input_width, - int input_height, - int input_channel, - uint8_t* output, - int output_width, - int output_height); diff --git a/lite/backends/fpga/KD/layout.hpp b/lite/backends/fpga/KD/layout.hpp deleted file mode 100644 index 74819cd212..0000000000 --- a/lite/backends/fpga/KD/layout.hpp +++ /dev/null @@ -1,99 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#pragma once - -#include <vector> - -#include "lite/backends/fpga/KD/alignment.h" - -namespace paddle { -namespace zynqmp { - -enum LayoutType { - N, - NC, - NCHW, - NHWC, - NHW, -}; - -class Layout { - public: - virtual int numIndex() = 0; - virtual int channelIndex() { return -1; } - virtual int heightIndex() { return -1; } - virtual int widthIndex() { return -1; } - virtual int alignedElementCount(const std::vector<int>& dims) = 0; - virtual int elementCount(const std::vector<int>& dims) = 0; -}; - -struct NCHW : Layout { - int numIndex() { return 0; } - int channelIndex() { return 1; } - int heightIndex() { return 2; } - int widthIndex() { return 3; } - int alignedElementCount(const std::vector<int>& dims) { - return dims[0] * dims[2] * align_image(dims[1] * dims[3]); - } - virtual int elementCount(const std::vector<int>& dims) { - return dims[0] * dims[1] * dims[2] * dims[3]; - } -}; - -struct NHWC : Layout { - int numIndex() { return 0; } - int heightIndex() { return 1; } - int widthIndex() { return 2; } - int channelIndex() { return 3; } - int alignedElementCount(const std::vector<int>& dims) { - return dims[0] * dims[1] * align_image(dims[2] * dims[3]); - } - virtual int elementCount(const std::vector<int>& dims) { - return dims[0] * dims[1] * dims[2] * dims[3]; - } -}; - -struct NC : Layout { - int numIndex() { return 0; } - int channelIndex() { return 1; } - int alignedElementCount(const std::vector<int>& dims) { - return dims[0] * dims[1]; - } - virtual int elementCount(const std::vector<int>& dims) { - return dims[0] * dims[1]; - } -}; - -struct N : Layout { - int numIndex() { return 0; } - int alignedElementCount(const std::vector<int>& dims) { return dims[0]; } - virtual int elementCount(const std::vector<int>& dims) { return dims[0]; } -}; - -struct NHW : Layout { - int numIndex() { return 0; } - int heightIndex() { return 1; } - int widthIndex() { return 2; } - int alignedElementCount(const std::vector<int>& dims) { - // TODO(chonwhite) align it; - return dims[0] * dims[1] * dims[2]; - } - virtual int elementCount(const std::vector<int>& dims) { - return dims[0] * dims[1] * dims[2]; - } -}; - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/llapi/bias_scale.cpp b/lite/backends/fpga/KD/llapi/bias_scale.cpp deleted file mode 100644 index cd60f27f98..0000000000 --- a/lite/backends/fpga/KD/llapi/bias_scale.cpp +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#include <string.h> - -#include "lite/backends/fpga/KD/llapi/bias_scale.h" -#include "lite/backends/fpga/KD/llapi/zynqmp_api.h" - -namespace paddle { -namespace zynqmp { -namespace bias_scale { - -void align_element(float **data_in, int num_per_div_before_alignment, int num) { - int copynum = 0; - float *ptr_unaligned = *data_in; - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT); - int num_element = - 2 * div_num * num_per_div_after_alignment; // including bias & scale - float *ptr_aligned = - (float *)fpga_malloc(num_element * sizeof(float)); // NOLINT - - memset(ptr_aligned, 0, num_element * sizeof(float)); - for (int i = 0; i < div_num; i++) { - if (i == div_num - 1) { - copynum = (num_per_div_after_alignment * div_num > num) - ? (num % num_per_div_after_alignment) - : (num_per_div_before_alignment); - } else { - copynum = num_per_div_before_alignment; - } - - memcpy(ptr_aligned + i * num_per_div_after_alignment, - ptr_unaligned + num_per_div_before_alignment * i, - copynum * sizeof(float)); - memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment, - ptr_unaligned + num_per_div_before_alignment * i + num, - copynum * sizeof(float)); - } - fpga_free(ptr_unaligned); - *data_in = ptr_aligned; -} - -void interleave(float **data_in, int num_after_alignment) { - float *ptr_uninterleaved = *data_in; - float *ptr_interleaved = - (float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT - int num = num_after_alignment / 4; - for (int i = 0; i < num; i++) { - memcpy( - ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i, 4 * sizeof(float)); - memcpy(ptr_interleaved + 8 * i + 4, - ptr_uninterleaved + num_after_alignment + 4 * i, - 4 * sizeof(float)); - } - - fpga_free(ptr_uninterleaved); - *data_in = ptr_interleaved; -} - -void format_bias_scale_array(float **bias_scale_array, - int element_num_per_division, - int num) { - align_element(bias_scale_array, element_num_per_division, num); - int div_num = (num + element_num_per_division - 1) / element_num_per_division; - int element_num_after_division = - align_to_x(element_num_per_division, BS_NUM_ALIGNMENT); - interleave(bias_scale_array, div_num * element_num_after_division); - fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float)); -} -void format_bias_array(float **bias_array, int num) { - float *ptr_unaligned = *bias_array; - int num_before_align = num; - int num_after_align = align_to_x(num_before_align, BIAS_NUM_ALIGNMENT); - int16_t *ptr_aligned = - (int16_t *)fpga_malloc(num_after_align * sizeof(int16_t)); // NOLINT - - memset(ptr_aligned, 0, num_after_align * sizeof(int16_t)); - for (int i = 0; i < num_before_align; i++) { - ptr_aligned[i] = fp32_2_fp16(ptr_unaligned[i]); - } - *bias_array = (float *)ptr_aligned; // NOLINT - fpga_free(ptr_unaligned); -} - -} // namespace bias_scale -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/llapi/bias_scale.h b/lite/backends/fpga/KD/llapi/bias_scale.h deleted file mode 100644 index 83f30df18f..0000000000 --- a/lite/backends/fpga/KD/llapi/bias_scale.h +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -namespace paddle { -namespace zynqmp { -namespace bias_scale { - -void align_element(float** data_in, int num_per_div_before_alignment, int num); -void interleave(float** data_in, int num_after_alignment); -void format_bias_scale_array(float** bias_scale_array, - int element_num_per_division, - int num); -void format_bias_array(float** bias_array, int num); - -} // namespace bias_scale -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/llapi/config.h b/lite/backends/fpga/KD/llapi/config.h deleted file mode 100755 index acf8c8adf4..0000000000 --- a/lite/backends/fpga/KD/llapi/config.h +++ /dev/null @@ -1,19 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#define PADDLE_LITE_ZU5 -#define FPGA_PRINT_MODE -#define PADDLE_LITE_PROFILE diff --git a/lite/backends/fpga/KD/llapi/filter.cpp b/lite/backends/fpga/KD/llapi/filter.cpp deleted file mode 100644 index 0e41a204a8..0000000000 --- a/lite/backends/fpga/KD/llapi/filter.cpp +++ /dev/null @@ -1,317 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/fpga/KD/llapi/filter.h" -#include <algorithm> -#include <string.h> -#include "lite/backends/fpga/KD/float16.hpp" -#include "lite/backends/fpga/KD/llapi/zynqmp_api.h" - -namespace paddle { -namespace zynqmp { -namespace filter { - -static int FILTER_SIZE = 2048; - -void set_filter_capacity(uint32_t cap) { FILTER_SIZE = cap; } - -int calc_division_capacity(int chw) { - int n = FILTER_SIZE / ((chw + 15) / 16) * 32; - return n < FILTER_SIZE ?
n : FILTER_SIZE; -} - -int calc_split_num(int num, int division_capacity) { - return (num + division_capacity - 1) / division_capacity; -} - -int calc_division_number(int num, int group_num, int division_capacity) { - int split_num = calc_split_num(num, division_capacity); - return group_num * split_num; -} - -int calc_num_per_div(int num, int group_num, int division_capacity) { - if (group_num == 1) { - if (num > division_capacity) { - return division_capacity; - } else { - return num; - } - } else { - return (num + group_num - 1) / group_num; - } -} - -void convert_to_hwc( - char **data_in, int num, int channel, int height, int width) { - char *tmp = *data_in; - int chw = channel * height * width; - char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT - for (int n = 0; n < num; n++) { - int64_t amount_per_row = width * channel; - for (int c = 0; c < channel; c++) { - for (int h = 0; h < height; h++) { - int64_t offset_height = h * amount_per_row; - for (int w = 0; w < width; w++) { - *(data_tmp + n * chw + offset_height + w * channel + c) = - *((*data_in)++); - } - } - } - } - *data_in = data_tmp; - fpga_free(tmp); -} - -float find_max(float *data_in, int data_size) { - float max = 0.0; - for (int i = 0; i < data_size; ++i) { - float value = data_in[i]; - float abs = value > 0 ? value : -value; - max = std::max(max, abs); - } - return max; -} - -signed char float_to_int8(float fdata) { - if (fdata < 0.0) { - fdata -= 0.5; - } else { - fdata += 0.5; - } - return (signed char)fdata; -} - -void quantize(float **data_in, int data_size, float max) { - float *tmp = *data_in; - float fix_range = 127; - float scale = fix_range / max; - - signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char)); - for (int i = 0; i < data_size; i++) { - tmp_data[i] = float_to_int8( - (*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale); - } - *data_in = (float *)tmp_data; // NOLINT - fpga_free(tmp); -} - -void align_element(char **data_in, int num, int chw) { - int j = 0; - int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - if (align_chw != chw) { - char *tmp = *data_in; - char *data_tmp = - (char *)fpga_malloc(num * align_chw * sizeof(char)); // NOLINT - - memset(data_tmp, 0, num * align_chw); - for (j = 0; j < num; j++) { - memcpy(data_tmp + j * align_chw, (*data_in) + j * chw, chw); - } - *data_in = data_tmp; - fpga_free(tmp); - } -} - -void align_num(char **data_in, - int num_per_div_before_alignment, - int num, - int chw) { - int i = 0; - int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - - char *tmp = *data_in; - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int num_element = div_num * num_per_div_after_alignment * align_chw; - char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT - - memset(data_tmp, 0, num_element * sizeof(char)); - - for (i = 0; i < div_num - 1; i++) { - memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, - *data_in + num_per_div_before_alignment * align_chw * i, - num_per_div_before_alignment * align_chw); - } - - memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, - *data_in + num_per_div_before_alignment * align_chw * i, - (num - (div_num - 1) * num_per_div_before_alignment) * align_chw); - - *data_in = data_tmp; - fpga_free(tmp); -} - -void reorder(char **data_in, int num_after_alignment, int chw) { - int index = 0; - int 
new_index = 0; - - int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - - char *data_tmp = - (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT - sizeof(char)); - char *tmp = *data_in; - for (index = 0; index < num_after_alignment; index++) { - new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) + - (index / 16 % 2 * 4); - memcpy(data_tmp + index * chw_align, - *data_in + new_index * chw_align, - chw_align); - } - *data_in = data_tmp; - fpga_free(tmp); -} - -size_t interleave(char **data_in, int num_after_alignment, int chw) { - int i = 0; - int j = 0; - int k = 0; - int interleave_per_num = 16; - - int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - char *data_tmp = - (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT - sizeof(char)); - char *tmp = *data_in; - int interleave_num = chw_align * 2 / interleave_per_num; - for (i = 0; i < num_after_alignment; i += 2) { - for (j = 0, k = 0; j < interleave_num; j += 2, k++) { - memcpy(data_tmp + i * chw_align + interleave_per_num * j, - *data_in + i * chw_align + interleave_per_num * k, - interleave_per_num); - memcpy(data_tmp + i * chw_align + interleave_per_num * (j + 1), - *data_in + (i + 1) * chw_align + interleave_per_num * k, - interleave_per_num); - } - } - *data_in = data_tmp; - fpga_free(tmp); - return chw_align * num_after_alignment; -} - -size_t format_filter(float **data_in, - int num, - int channel, - int height, - int width, - int group_num, - float max) { - int data_size = channel * height * width * num; - int chw = channel * height * width; - - int division_capacity = calc_division_capacity(chw); - int num_per_div_before_alignment = - calc_num_per_div(num, group_num, division_capacity); - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int residual = num % num_per_div_before_alignment; - int num_after_alignment = num_per_div_after_alignment * - ((residual == 0) ? 
div_num : (div_num - 1)) + - align_to_x(residual, FILTER_NUM_ALIGNMENT); - quantize(data_in, data_size, max); - char **quantize_data = (char **)data_in; // NOLINT - convert_to_hwc(quantize_data, num, channel, height, width); - align_element(quantize_data, num, chw); - if (num_after_alignment != num) { - align_num(quantize_data, num_per_div_before_alignment, num, chw); - } - - reorder(quantize_data, num_after_alignment, chw); - size_t mem_size = interleave(quantize_data, num_after_alignment, chw); - fpga_flush(*quantize_data, - align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment * - sizeof(char)); - return mem_size; -} - -void convert_to_hwn(int16_t **data_in, int num, int height, int width) { - int16_t *tmp = *data_in; - int16_t *data_tmp = - (int16_t *)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT - for (int n = 0; n < num; n++) { - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - *(data_tmp + h * width * num + w * num + n) = *((*data_in)++); - } - } - } - *data_in = data_tmp; - fpga_free(tmp); -} - -size_t align_element_n(int16_t **data_in, int num, int height, int width) { - int unalign_n = num; - int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT); - int num_element = height * width * align_n; - if (unalign_n != align_n) { - int16_t *tmp = *data_in; - - int16_t *data_tmp = - (int16_t *)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT - - memset(data_tmp, 0, num_element * sizeof(int16_t)); - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - int offset_unalign = h * width * unalign_n + w * unalign_n; - int offset_align = h * width * align_n + w * align_n; - for (int n = 0; n < unalign_n; n++) { - data_tmp[offset_align + n] = *((*data_in) + offset_unalign + n); - } - } - } - *data_in = data_tmp; - fpga_free(tmp); // the buffer came from fpga_malloc, so it must be released through fpga_free - } - return num_element * sizeof(int16_t); -} - -void quantize_to_fp16( - float **data_in, int num, int height, int width, float *scale_ptr) { - float *tmp = *data_in; - int size = num * height * width; - - float16 *tmp_data = (float16 *)fpga_malloc(size * sizeof(float16)); // NOLINT - for (int n = 0; n < num; n++) { - float scale_val = scale_ptr[n]; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - int index = n * height * width + h * width + w; - float value = tmp[index] * scale_val; - tmp_data[index] = float_to_half(value); - } - } - } - fpga_flush(tmp_data, size * sizeof(int16_t)); - *data_in = (float *)tmp_data; // NOLINT - fpga_free(tmp); -} -size_t format_dwconv_filter( - float **data_in, int num, int height, int width, float *scale_ptr) { - quantize_to_fp16(data_in, num, height, width, scale_ptr); - int16_t **quantize_data = (int16_t **)data_in; // NOLINT - convert_to_hwn(quantize_data, num, height, width); - size_t size = align_element_n(quantize_data, num, height, width); - fpga_flush(*quantize_data, - align_to_x(num, FILTER_ELEMENT_ALIGNMENT) * height * width * - sizeof(int16_t)); - return size; -} -} // namespace filter
} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/llapi/filter.h b/lite/backends/fpga/KD/llapi/filter.h deleted file mode 100644 index 7d9c6c2e01..0000000000 --- a/lite/backends/fpga/KD/llapi/filter.h +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include <stddef.h> -#include <stdint.h> - -namespace paddle { -namespace zynqmp { -namespace filter { - -void set_filter_capacity(uint32_t cap); -int calc_division_capacity(int chw); -int calc_split_num(int num, int division_capacity); -int calc_division_number(int num, int group_num, int division_capacity); -int calc_num_per_div(int num, int group_num, int division_capacity); -void convert_to_hwc( - char** data_in, int num, int channel, int height, int width); -float find_max(float* data_in, int data_size); -void quantize(float** data_in, int data_size, float max); -void align_element(char** data_in, int num, int chw); -void align_num(char** data_in, - int num_per_div_before_alignment, - int num, - int chw); -void reorder(char** data_in, int num_after_alignment, int chw); -size_t interleave(char** data_in, int num_after_alignment, int chw); -size_t format_filter(float** data_in, - int num, - int channel, - int height, - int width, - int group_num, - float max); - -void convert_to_hwn(int16_t** data_in, int num, int height, int width); -size_t align_element_n(int16_t** data_in, int num, int height, int width); -void quantize_to_fp16( - float** data_in, int num, int height, int width, float* scale_ptr); -size_t format_dwconv_filter( - float** data_in, int num, int height, int width, float* scale_ptr); - -} // namespace filter -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp deleted file mode 100644 index 1f1226ead3..0000000000 --- a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp +++ /dev/null @@ -1,327 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#include <fcntl.h> -#include <sys/ioctl.h> -#include <sys/mman.h> -#include <unistd.h> -#include <algorithm> -#include <cstdlib> -#include <cstring> -#include <iostream> -#include <map> -#include <utility> - -#include "lite/backends/fpga/KD/llapi/config.h" -#include "lite/backends/fpga/KD/llapi/zynqmp_api.h" - -namespace paddle { -namespace zynqmp { - -#define PADDLE_LITE_OS_LINUX - -static int fd = -1; -static const char *device_path = "/dev/fpgadrv0"; -static std::map<void *, size_t> memory_map; - -static size_t memory_size_max = 0; -static size_t memory_size = 0; - -static inline int do_ioctl(uint64_t req, const void *arg) { - int ret = -1; -#ifdef PADDLE_LITE_OS_LINUX - ret = ioctl(fd, req, arg); - if (ret != 0) { - throw -1; - } - return ret; -#else - return ret; -#endif -} - -int open_device() { - if (fd == -1) { - fd = open(device_path, O_RDWR); - } - return fd; -} - -void close_device() { close(fd); } - -void reset_device() { - FpgaResetArgs args; - do_ioctl(IOCTL_FPGA_RESET, &args); -} - -// memory management; -void *fpga_malloc(size_t size) { -#ifdef PADDLE_LITE_OS_LINUX - void *ptr = reinterpret_cast<void *>( - mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)); - if (ptr == MAP_FAILED) { // mmap64 reports failure with MAP_FAILED, not NULL - std::cout << "not enough memory !"; - exit(-1); - } - memory_map.insert(std::make_pair(ptr, size)); - memory_size += size; - if (memory_size > memory_size_max) { - memory_size_max = memory_size; - } - return ptr; -#else - return malloc(size); -#endif -} - -size_t fpga_get_memory_size(void *ptr) { return memory_map[ptr]; } - -size_t fpga_get_memory_size_max() { return memory_size_max; } - -size_t fpga_diagnose_memory(int detailed) { - size_t total = 0; - auto iter = memory_map.begin(); // std::map<void *, size_t>::iterator - while (iter != memory_map.end()) { - total += iter->second; - iter++; - } - return total; -} - -void fpga_free(void *ptr) { - size_t size = 0; - auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator - if (iter != memory_map.end()) { - size = iter->second; - memory_map.erase(iter); - } - - memory_size -= size; - -#ifdef PADDLE_LITE_OS_LINUX - - munmap(ptr, size); -#else - free(ptr); -#endif -} - -void fpga_copy(void *dst, const void *src, int size) { memcpy(dst, src, size); } - -int fpga_flush(void *address, size_t size) { - struct MemoryCacheArgs args; - args.address = address; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args); -} - -int fpga_invalidate(void *address, size_t size) { - struct MemoryCacheArgs args; - args.address = address; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); -} - -int invalidate_cache(void *addr, int size) { - struct MemoryCacheArgs args; - args.address = addr; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); -} - -int flush_cache(void *addr, int size) { - struct MemoryCacheArgs args; - args.address = addr; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args); -} - -void fpga_copy(void *dest, const void *src, size_t num) { - memcpy(dest, src, num); -} - -int ioctl_conv(const struct ConvArgs &args) { - return do_ioctl(IOCTL_CONFIG_CONV, &args); -} - -int compute_fpga_conv_basic(const struct ConvArgs &args) { - return do_ioctl(IOCTL_CONFIG_CONV, &args); -} - -int compute_fpga_conv(const struct SplitConvArgs &args) { - int split_num = args.split_num; - int ret = -1; - for (int i = 0; i < split_num; i++) { - ret = compute_fpga_conv_basic(args.conv_arg[i]); - } - - if (split_num > 1) { - std::cout << "Split num > 1 !!!!!!!!!!!!!!!!!!"
<< std::endl; - exit(-1); - } - return ret; -} - -int compute_fpga_pool(const struct PoolingArgs &args) { - return do_ioctl(IOCTL_CONFIG_POOLING, &args); -} - -int compute_fpga_ewadd(const struct EWAddArgs &args) { - return do_ioctl(IOCTL_CONFIG_EW, &args); -} - -int get_device_info(const struct DeviceInfo &args) { - int ret = do_ioctl(IOCTL_DEVICE_INFO, &args); - return ret; -} - -int perform_bypass(const struct BypassArgs &args) { - int size = args.image.channels * args.image.width * args.image.height; - int max_size = 1 << 21; - - float times = 1.0 * size / max_size; - int count = static_cast<int>(times); - - void *input_address = args.image.address; - int type_size = - args.input_data_type == DATA_TYPE_FP32 ? sizeof(float) : sizeof(int16_t); - - void *output_address = args.output.address; - int out_type_size = - args.output_data_type == DATA_TYPE_FP32 ? sizeof(float) : sizeof(int16_t); - - float scales[2]; - struct BypassArgs bypassArgs = args; - bypassArgs.image.width = 1; - bypassArgs.image.height = 1; - bypassArgs.output.scale_address = scales; - - float scale = 0; - for (int i = 0; i < count; ++i) { - bypassArgs.image.channels = max_size; - bypassArgs.image.address = - reinterpret_cast<char *>(input_address) + i * max_size * type_size; - bypassArgs.output.address = - reinterpret_cast<char *>(output_address) + i * max_size * out_type_size; - int ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs); - scale = std::max(scale, scales[0]); - - if (ret != 0) { - return ret; - } - } - - int remainder = size - max_size * count; - bypassArgs.image.channels = remainder; - bypassArgs.image.address = - reinterpret_cast<char *>(input_address) + count * max_size * type_size; - bypassArgs.output.address = - reinterpret_cast<char *>(output_address) + count * max_size * out_type_size; - int ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs); - scale = std::max(scale, scales[0]); - args.output.scale_address[0] = scale; - args.output.scale_address[1] = 1.0f / scale; - return ret; -} - -int compute_fpga_concat(const struct ConcatArgs &args) { return -1; } - -int compute_fpga_scale(const struct ScaleArgs &args) { -#ifdef ENABLE_DEBUG - std::cout << "======Compute Scale======"; - std::cout << "scale_address:" << args.scale_address << std::endl; - std::cout << "bias_address:" << args.bias_address << std::endl; - - std::cout << "wc_alignment:" << args.wc_alignment << std::endl; - std::cout << "channel_alignment:" << args.channel_alignment << std::endl; - - std::cout << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - - std::cout << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; - -#endif - return do_ioctl(IOCTL_CONFIG_SCALE, &args); -} - -int compute_fpga_dwconv(const struct DWconvArgs &args) { -#ifdef ENABLE_DEBUG - std::cout << "======Compute Basic Conv======"; - std::cout << " relu_enabled:" << args.relu_enabled - << " filter_address:" << args.filter_address; - std::cout << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - std::cout << " kernel_height:" <<
args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - std::cout << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; - -#endif - return do_ioctl(IOCTL_CONFIG_DWCONV, &args); -} - -int config_inplace(const struct InplaceArgs &args) { - return do_ioctl(IOCTL_CONFIG_INPLACE, &args); -} - -int config_norm_param(const struct NormalizeParameterArgs &args) { - return do_ioctl(IOCTL_CONFIG_NORMALIZE_PARAMETER, &args); -} - -int compute_norm(const struct NormalizeArgs &args) { - return do_ioctl(IOCTL_CONFIG_NORMALIZE, &args); -} - -int compute_fpga_resize(const struct ResizeArgs &args) { - return do_ioctl(IOCTL_CONFIG_RESIZE, &args); -} - -int16_t fp32_2_fp16(float fp32_num) { - uint32_t tmp = *(uint32_t *)(&fp32_num); // NOLINT; a 32-bit read matches the width of float - auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | - (((tmp & 0x7f800000) >> 13) - (112 << 10))); - if (tmp & 0x1000) { - t++; // roundoff - } - return t; -} - -float fp16_2_fp32(int16_t fp16_num) { - if (0 == fp16_num) { - return 0; - } - int frac = (fp16_num & 0x3ff); - int exp = ((fp16_num & 0x7c00) >> 10) + 112; - int s = fp16_num & 0x8000; - int tmp = 0; - float fp32_num = 0; - tmp = s << 16 | exp << 23 | frac << 13; - fp32_num = *(float *)&tmp; // NOLINT - return fp32_num; -} - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.h b/lite/backends/fpga/KD/llapi/zynqmp_api.h deleted file mode 100644 index 7d22de95a2..0000000000 --- a/lite/backends/fpga/KD/llapi/zynqmp_api.h +++ /dev/null @@ -1,347 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#pragma once - -#include <stddef.h> -#include <stdint.h> -#include <stdlib.h> -#include <sys/ioctl.h> - -namespace paddle { -namespace zynqmp { - -typedef int16_t half; - -#define IMAGE_ALIGNMENT 16 // Aligned to 16 -#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32 -#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16 -#define BS_NUM_ALIGNMENT 8 -#define BIAS_NUM_ALIGNMENT 16 - -enum DDataType { - DATA_TYPE_FP32 = 1, - DATA_TYPE_FP16 = 0, -}; - -enum DLayoutType { - LAYOUT_CHW = 1, - LAYOUT_HWC = 0, -}; - -struct VersionArgs { - void* buffer; -}; - -struct DeviceInfo { - uint32_t filter_cap; - uint32_t version; - uint16_t device_type; - uint32_t reserved0; - uint32_t reserved1; - uint32_t reserved2; - uint32_t reserved3; - uint32_t reserved4; - uint32_t reserved5; - uint32_t reserved6; -}; - -struct MemoryCopyArgs { - void* src; - void* dest; - size_t size; -}; - -struct MemoryCacheArgs { - void* address; - size_t size; -}; - -struct MemoryBarrierArgs {}; - -struct BNArgs { - bool enabled; - void* bias_address; - void* scale_address; -}; - -/** -Conv and Pooling kernel -*/ -struct KernelArgs { - uint32_t width; - uint32_t height; - uint32_t stride_w; - uint32_t stride_h; -}; - -struct ImageInputArgs { - void* address; // input featuremap virtual address - void* scale_address; // input scale address; - uint32_t channels; - uint32_t width; // featuremap width - uint32_t height; - uint32_t pad_width; // padding width; - uint32_t pad_height; -}; - -struct ImageOutputArgs { - void* address; // output result address; - float* scale_address; // output scale address; -}; - -struct ConvArgs { - bool relu_enabled; - void* sb_address; // scale and bias are interlaced; - void* filter_address; - void* filter_scale_address; - uint32_t filter_num; - uint32_t group_num; - - struct KernelArgs kernel; - struct ImageInputArgs image; // input image; - struct ImageOutputArgs output; -}; - -struct DWconvArgs { - bool relu_enabled; - void* bias_address; - void* filter_address; - struct KernelArgs kernel; - struct ImageInputArgs image; - struct ImageOutputArgs output; - uint16_t out_width; - uint16_t out_height; - uint16_t sub_conv_num; -}; - -struct PoolingArgs { - uint16_t mode; - uint16_t kernel_reciprocal; - struct KernelArgs kernel; - struct ImageInputArgs image; // input image; - struct ImageOutputArgs output; - uint16_t out_width; - uint16_t out_height; -}; - -// elementwise add arguments -struct EWAddArgs { - bool relu_enabled; - - uint32_t const0; // output0 = const0 x input0 + const1 x input1; - uint32_t const1; - struct ImageInputArgs image0; - struct ImageInputArgs image1; - struct ImageOutputArgs output; -}; - -struct BypassArgs { - enum DDataType input_data_type; - enum DDataType output_data_type; - enum DLayoutType input_layout_type; - enum DLayoutType output_layout_type; - struct ImageInputArgs image; - struct ImageOutputArgs output; -}; - -struct ScaleArgs { - void* scale_address; - void* bias_address; - uint32_t wc_alignment; - uint32_t channel_alignment; - - struct ImageInputArgs image; - struct ImageOutputArgs output; -}; - -struct NormalizeArgs { - void* input_image_address; - void* output_image_address; - uint32_t image_width; - uint32_t image_height; - uint32_t image_channel; - uint32_t* output_scale_address; -}; - -struct ResizeArgs { - void* input_image_address; - void* output_image_address; - uint32_t input_width; - uint32_t input_height; - uint32_t image_channel; - uint32_t output_width; - uint32_t output_height; - uint32_t height_ratio; - uint32_t width_ratio; - uint32_t*
output_scale_address; -}; - -struct PowerParameterArgs { - uint16_t shift; - uint16_t scale; - uint16_t power; -}; - -struct NormalizeParameterArgs { - uint32_t channel; - uint32_t hight_width; -}; - -struct InplaceArgs { - bool leaky_relu_enable; - bool relu_enable; - bool power_enable; - bool normalize_enable; -}; - -struct FpgaRegWriteArgs { - uint64_t address; // - uint64_t value; -}; - -struct FpgaRegReadArgs { - uint64_t address; - uint64_t value; -}; - -struct FpgaResetArgs {}; - -#define IOCTL_FPGA_MAGIC (('F' + 'P' + 'G' + 'A') / 4) - -#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs) -#define IOCTL_DEVICE_INFO _IOW(IOCTL_FPGA_MAGIC, 100, struct DeviceInfo) - -#define IOCTL_SEPARATOR_0 10 - -#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs) -#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) -#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) -#define IOCTL_MEMORY_BARRIER \ - _IOW(IOCTL_FPGA_MAGIC, 14, struct MemoryBarrierArgs) - -#define IOCTL_SEPARATOR_1 20 - -#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs) -#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs) -#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs) -#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs) -#define IOCTL_CONFIG_SCALE _IOW(IOCTL_FPGA_MAGIC, 25, struct ScaleArgs) -#define IOCTL_CONFIG_NORMALIZE _IOW(IOCTL_FPGA_MAGIC, 26, struct NormalizeArgs) -#define IOCTL_CONFIG_RESIZE _IOW(IOCTL_FPGA_MAGIC, 30, struct ResizeArgs) - -#define IOCTL_CONFIG_DWCONV _IOW(IOCTL_FPGA_MAGIC, 31, struct DWconvArgs) - -#define IOCTL_CONFIG_INPLACE _IOW(IOCTL_FPGA_MAGIC, 40, struct InplaceArgs) -#define IOCTL_CONFIG_POWER_PARAMETER \ - _IOW(IOCTL_FPGA_MAGIC, 41, struct PowerParameterArgs) -#define IOCTL_CONFIG_NORMALIZE_PARAMETER \ - _IOW(IOCTL_FPGA_MAGIC, 42, struct NormalizeParameterArgs) -#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 50, struct FpgaRegReadArgs) -#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 51, struct FpgaRegWriteArgs) -#define IOCTL_FPGA_RESET _IOW(IOCTL_FPGA_MAGIC, 52, struct FpgaResetArgs) - -//============================== API ============================= - -struct DeconvArgs { - uint32_t sub_conv_num; - uint32_t group_num; - uint32_t filter_num; - uint32_t omit_size; - uint32_t sub_output_width; - uint32_t sub_output_height; - struct ImageOutputArgs output; - struct SplitConvArgs* split_conv_args; -}; - -struct SplitArgs { - uint32_t image_num; - int16_t* image_in; - float* scale_in; - void** images_out; - float** scales_out; - uint32_t* out_channel_nums; - uint32_t height; - uint32_t width; -}; - -struct ConcatArgs { - uint32_t image_num; - half** images_in; - float** scales_in; - void* image_out; - float* scale_out; - uint32_t* channel_num; - uint32_t height; - uint32_t width; -}; - -struct SplitConvArgs { - uint32_t split_num; - uint32_t group_num; - uint32_t filter_num; - struct ImageOutputArgs output; - struct ConvArgs* conv_arg; - struct ConcatArgs concat_arg; -}; - -struct GroupConvArgs { - uint32_t group_num; - uint32_t filter_num; - struct ImageOutputArgs output; - struct SplitConvArgs* conv_args; - struct ConcatArgs concat_arg; -}; - -inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } -int open_device(); -void close_device(); -void reset_device(); - -void* fpga_malloc(size_t size); -void fpga_free(void* ptr); -size_t fpga_get_memory_size(void* ptr); -size_t 
fpga_get_memory_size_max(); -size_t fpga_diagnose_memory(int detailed); - -void fpga_copy(void* dst, const void* src, int size); - -int fpga_flush(void* address, size_t size); -int fpga_invalidate(void* address, size_t size); - -int get_device_info(const struct DeviceInfo& args); - -int perform_bypass(const struct BypassArgs& args); -int compute_fpga_conv_basic(const struct ConvArgs& args); -int compute_fpga_conv(const struct SplitConvArgs& args); -int compute_fpga_pool(const struct PoolingArgs& args); -int compute_fpga_ewadd(const struct EWAddArgs& args); -int compute_fpga_scale(const struct ScaleArgs& args); -int compute_fpga_concat(const struct ConcatArgs& args); -int compute_fpga_resize(const struct ResizeArgs& args); - -int config_power(const struct PowerArgs& args); -int compute_fpga_dwconv(const struct DWconvArgs& args); -int config_norm_param(const struct NormalizeParameterArgs& args); -int compute_norm(const struct NormalizeArgs& args); - -int config_inplace(const struct InplaceArgs& args); - -int flush_cache(void* addr, int size); -int invalidate_cache(void* addr, int size); - -int16_t fp32_2_fp16(float fp32_num); -float fp16_2_fp32(int16_t fp16_num); -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pe.hpp b/lite/backends/fpga/KD/pe.hpp deleted file mode 100644 index d1dc3c4caa..0000000000 --- a/lite/backends/fpga/KD/pe.hpp +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include <string> -#include <vector> -#include "lite/backends/fpga/KD/pe_params.hpp" -#include "lite/backends/fpga/KD/tensor_util.hpp" - -namespace paddle { -namespace zynqmp { - -class PE { - public: - virtual bool init() { return false; } - - virtual void apply() {} - - virtual bool dispatch() { return false; } - - virtual ~PE() {} -}; - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pe_params.hpp b/lite/backends/fpga/KD/pe_params.hpp deleted file mode 100644 index 709f04d399..0000000000 --- a/lite/backends/fpga/KD/pe_params.hpp +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#pragma once - -#include -#include - -#include "lite/backends/fpga/KD/llapi/zynqmp_api.h" -#include "lite/backends/fpga/KD/tensor.hpp" - -namespace paddle { -namespace zynqmp { - -struct ReLUParam { - public: - bool enabled = false; -}; - -struct PEParam { - ReLUParam relu; -}; - -struct InputParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; -}; - -struct OutputParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; -}; - -struct BatchnormParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; - - Tensor* bias = nullptr; - Tensor* scale = nullptr; - Tensor* mean = nullptr; - Tensor* variance = nullptr; - float epsilon = 0; -}; - -struct BasicConvParam { - Tensor input; - Tensor output; - Tensor filter; - Tensor scaleBias; - ConvArgs args; -}; - -struct ConvParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; - Tensor* filter = nullptr; - - int groups = 1; - std::vector strides; - std::vector paddings; - std::vector kernelSize; - std::vector dilations; - - Tensor* scale() { return scale_; } - - Tensor* bias() { return bias_; } - - std::vector& splitParams() { return splitParams_; } - - protected: - std::vector splitParams_; - Tensor* scale_ = new Tensor(); - Tensor* bias_ = new Tensor(); -}; - -struct DepthwiseConvParam : ConvParam { - public: - Tensor* quantizedFilter() { return quantizedFilter_; } - - DWconvArgs args; - - protected: - Tensor* quantizedFilter_ = new Tensor(); -}; - -enum PoolingType : int { - MAX = 0, - AVERAGE = 1, -}; - -struct PoolingParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; - - PoolingType type = PoolingType::MAX; - bool globalPooling = false; - std::vector kernelSize; - std::vector strides; - std::vector paddings; - - PoolingArgs poolingArgs = {0}; -}; - -struct ConcatParam : PEParam { - public: - std::vector inputs; - Tensor* output; - int axis = 0; -}; - -struct ElementwiseAddParam : PEParam { - public: - std::vector inputs; - Tensor* output = nullptr; - int axis = 0; - - EWAddArgs ewargs; -}; - -struct FullyConnectedParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* filter = nullptr; - Tensor* bias = nullptr; - Tensor* output = nullptr; - - Tensor* quantizedFilter() { return quantizedFilter_; } - - Tensor* biasScale() { return biasScale_; } - - protected: - Tensor* quantizedFilter_ = new Tensor(); - Tensor* biasScale_ = new Tensor(); -}; - -struct SoftmaxParam : PEParam { - public: - Tensor* input = nullptr; - - Tensor* output = nullptr; - - private: - Tensor* floatInput = nullptr; -}; - -struct SplitParam : PEParam { - public: - Tensor* input = nullptr; - std::vector outputs; - int axis = 1; - int num = 1; -}; - -struct NormParam : PEParam { - public: - Tensor* input = nullptr; - - Tensor* output = nullptr; - float epsilon = 0; - - private: - Tensor* floatInput = nullptr; -}; - -struct PriorBoxParam : PEParam { - Tensor* input; - Tensor* image; - Tensor* outputBoxes; - Tensor* outputVariances; - - std::vector minSizes; - std::vector maxSizes; - std::vector aspectRatios; - std::vector variances; - - bool minMaxAspectRatiosOrder; - bool flip; - bool clip; - float stepW; - float stepH; - float offset; -}; - -struct ScaleParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; - Tensor* scale = nullptr; - Tensor* bias = nullptr; - - Tensor* alignedScale() { return alignedScale_; } - - Tensor* alignedBias() { return alignedBias_; } - - ScaleArgs args = {0}; - - 
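// NOTE: the *Param structs in this header share one convention: the public
// Tensor* fields are wired up by the caller, while the trailing-underscore
// members (alignedScale_, alignedBias_, quantizedFilter_, ...) hold the
// hardware-formatted copies that the owning PE fills in during apply().
// A hedged usage sketch, assuming the ScalePE defined in scale_pe.hpp
// (tensor setup is illustrative, not from this file):
//
//   ScalePE pe;
//   ScaleParam& p = pe.param();
//   p.input = &in;      // FP16 feature map
//   p.output = &out;
//   p.scale = &scale;   // FP32, one value per channel
//   p.bias = &bias;
//   pe.init();
//   pe.apply();         // formats alignedScale_/alignedBias_ for the device
//   pe.dispatch();      // submits the prepared ScaleArgs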
protected: - Tensor* alignedScale_ = new Tensor(); - Tensor* alignedBias_ = new Tensor(); -}; - -struct ResizeParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; -}; - -struct CropParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; - int axis = 2; - std::vector offsets; - std::vector shape; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/batchnorm_pe.hpp b/lite/backends/fpga/KD/pes/batchnorm_pe.hpp deleted file mode 100644 index a207875105..0000000000 --- a/lite/backends/fpga/KD/pes/batchnorm_pe.hpp +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" -#include "lite/backends/fpga/KD/pes/scale_pe.hpp" - -namespace paddle { -namespace zynqmp { -class BatchnormPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - - ScaleParam& scale_param = scalePE_.param(); - scale_param.input = param_.input; - scale_param.output = param_.output; - Tensor* scale = new Tensor(); - Tensor* bias = new Tensor(); - Shape shape(N, {output->shape().channel()}); - - auto mean_data = param_.mean->data(); - auto variance_data = param_.variance->data(); - auto scale_data = param_.scale->data(); - auto bias_data = param_.bias->data(); - auto new_scale_ptr = scale->mutableData(FP32, shape); - auto new_bias_ptr = bias->mutableData(FP32, shape); - - float epsilon = param_.epsilon; - - Shape& in_shape = param_.input->shape(); - bool match = in_shape.channel() == 128 && in_shape.height() == 128 && - in_shape.width() == 128; - - for (int c = 0; c < output->shape().channel(); c++) { - float var = variance_data[c]; - float inv_scale = 1.0 / (std::sqrt(var + epsilon)); - float scale_value = inv_scale * scale_data[c]; - float bias_value = bias_data[c] - scale_value * mean_data[c]; - new_scale_ptr[c] = scale_value; - new_bias_ptr[c] = bias_value; - } - - scale->flush(); - bias->flush(); - - scale_param.scale = scale; - scale_param.bias = bias; - scale_param.relu = param_.relu; - - scalePE_.init(); - - inplace_.relu_enable = param_.relu.enabled; - inplace_.relu_enable = true; - inplace_.power_enable = false; - inplace_.normalize_enable = false; - - return true; - } - - void apply() { scalePE_.apply(); } - - bool dispatch() { - if (inplace_.relu_enable) { - config_inplace(inplace_); - } - bool ret = scalePE_.dispatch(); - - inplace_.relu_enable = false; - config_inplace(inplace_); - return ret; - } - - BatchnormParam& param() { return param_; } - - ~BatchnormPE() { - scalePE_.param().input = nullptr; - scalePE_.param().output = nullptr; - } - - private: - BatchnormParam param_; - ScalePE scalePE_; - InplaceArgs inplace_; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/concat_pe.hpp 
b/lite/backends/fpga/KD/pes/concat_pe.hpp deleted file mode 100644 index 72b480ab88..0000000000 --- a/lite/backends/fpga/KD/pes/concat_pe.hpp +++ /dev/null @@ -1,135 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" - -namespace paddle { -namespace zynqmp { - -class ConcatPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(false); - output->setDataLocation(CPU); - return true; - } - - void apply() {} - - void concat2D() { - int offset = 0; - float16* out_data = param_.output->data(); - for (unsigned int n = 0; n < param_.inputs.size(); n++) { - Tensor* input = param_.inputs[n]; - Shape& input_shape = input->shape(); - - float16* src = input->data(); - memcpy(out_data + offset, src, input_shape.numel() * sizeof(float16)); - offset += input_shape.numel(); - } - Tensor* output = param_.output; - output->flush(); - } - - void concat3D() { - auto input = param_.inputs; - Tensor* output = param_.output; - int axis = param_.axis; - int num = input.size(); - int rows = 1; - auto dim_0 = input[0]->shape().dims(); - for (int i = 0; i < axis; ++i) { - rows *= dim_0[i]; - } - int out_rows = rows, out_cols = 0; - - std::vector input_cols(input.size()); - for (int i = 0; i < num; ++i) { - int t_cols = input[i]->shape().numel() / rows; - out_cols += t_cols; - input_cols[i] = t_cols; - } - - // computation - for (int k = 0; k < out_rows; ++k) { - float16* dst_ptr = output->data() + k * out_cols; - int col_idx = 0; - for (int j = 0; j < num; ++j) { - int col_len = input_cols[j]; - const float16* src_prt = input[j]->data() + k * col_len; - memcpy(dst_ptr + col_idx, src_prt, sizeof(float16) * col_len); - col_idx += col_len; - } - } - output->flush(); - } - - bool dispatch() { - Tensor* output = param_.output; - Shape& output_shape = output->shape(); - - float scale = 0; - for (unsigned int n = 0; n < param_.inputs.size(); n++) { - Tensor* input = param_.inputs[n]; - input->syncToCPU(); - input->unalignImage(); - scale = std::max(scale, input->scale()[0]); - } - output->scale()[0] = scale; - output->scale()[1] = 1.0f / scale; - - if (output_shape.dimSize() == 3) { - concat3D(); - return true; - } - - if (output_shape.dimSize() == 2) { - concat2D(); - return true; - } - - float16* out_data = param_.output->data(); - int channel_sum = 0; - int out_channel = output_shape.channel(); - for (unsigned int n = 0; n < param_.inputs.size(); n++) { - Tensor* input = param_.inputs[n]; - Shape& input_shape = input->shape(); - int wh = output_shape.width() * output_shape.height(); - for (int j = 0; j < wh; j++) { - float16* src = input->data() + j * input_shape.channel(); - memcpy(out_data + j * out_channel + channel_sum, - src, - input_shape.channel() * sizeof(float16)); - } - channel_sum += input_shape.channel(); - } - output->flush(); - return true; - } - - ConcatParam& param() { return param_; } - 
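  // NOTE: the channel-axis path in dispatch() above interleaves inputs per
  // pixel: out[j * out_channel + channel_sum + c] = in_n[j * c_n + c]. A
  // minimal standalone sketch of that layout for two HWC inputs; this helper
  // is illustrative, not part of the original class, and assumes memcpy is
  // in scope as it is for dispatch():
  static void concat_channels_hwc(const float16* a, int ca,
                                  const float16* b, int cb,
                                  float16* out, int pixels) {
    for (int j = 0; j < pixels; j++) {
      // copy a's channels, then b's, back to back for each pixel
      memcpy(out + j * (ca + cb), a + j * ca, ca * sizeof(float16));
      memcpy(out + j * (ca + cb) + ca, b + j * cb, cb * sizeof(float16));
    }
  }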
- private: - ConcatParam param_; -}; - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/conv_pe.hpp b/lite/backends/fpga/KD/pes/conv_pe.hpp deleted file mode 100644 index e897f82280..0000000000 --- a/lite/backends/fpga/KD/pes/conv_pe.hpp +++ /dev/null @@ -1,138 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" -#include "lite/backends/fpga/KD/pes/concat_pe.hpp" -#include "lite/backends/fpga/KD/pes/conv_pe.hpp" -#include "lite/backends/fpga/KD/pes/conv_process.hpp" -#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp" -#include "lite/backends/fpga/KD/pes/scale_pe.hpp" - -namespace paddle { -namespace zynqmp { - -class ConvPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - void apply() { - split_axis = fill_split_arg(param_); - - if (split_axis == 0 && param_.splitParams().size() > 1) { - ConcatParam& concat_param = concatPE_.param(); - for (auto conv_param : param_.splitParams()) { - concat_param.inputs.push_back(&conv_param->output); - } - concat_param.output = param_.output; - concatPE_.init(); - concatPE_.apply(); - } - } - void cpu_compute() { - Tensor* input = param_.input; - Tensor* output = param_.output; - input->syncToCPU(); - - Tensor float_input; - Tensor float_output; - float* image_addr = float_input.mutableData(FP32, input->shape()); - float_input.copyFrom(input); - float* out = float_output.mutableData(FP32, output->shape()); - - int out_channel = output->shape().channel(); - int in_channel = input->shape().channel(); - - float* filter_data = param_.filter->data(); - float* mi = new float[in_channel]; - - for (int i = 0; i < out_channel; i++) { - float* image = image_addr; - float* filter_ptr = filter_data + i * in_channel; - float* out_ptr = mi; -#pragma omp parallel for - for (int j = 0; j < in_channel; j++) { - float value = image_addr[j] * filter_ptr[j]; - mi[j] = value; - } - - float sum = 0; - for (int j = 0; j < in_channel; j++) { - sum += mi[j]; - } - out[i] = sum; - } - delete[] mi; - float_output.flush(); - output->copyFrom(&float_output); - } - - bool dispatch() { - inplace_.relu_enable = param_.relu.enabled; - inplace_.power_enable = false; - inplace_.normalize_enable = false; - - if (param_.relu.enabled) { - inplace_.relu_enable = param_.relu.enabled; - config_inplace(inplace_); - } - - std::vector& params = param_.splitParams(); - int ret = 0; - for (auto conv_param : params) { - ret |= compute_fpga_conv_basic(conv_param->args); - } - - if (param_.relu.enabled) { - inplace_.relu_enable = false; - config_inplace(inplace_); - } - - size_t size = params.size(); - if (split_axis == 0 && ret == 0 && size > 1) { - concatPE_.dispatch(); - } - if (split_axis == 1 && ret == 0 && size > 1) { - ElementwiseAddParam& add_param = addPE_.param(); - 
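// NOTE: split_axis == 0 means the filters (output channels) were split, so
// the partial outputs are disjoint channel ranges and are stitched back
// together with ConcatPE; split_axis == 1 means the *input* channels were
// split (see split_channel() in conv_process.hpp), so each partial output is
// a partial sum and the pieces are combined by the element-wise add below.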
add_param.inputs = {&params[0]->output, &params[1]->output}; - add_param.output = param_.output; - addPE_.init(); - addPE_.apply(); - addPE_.dispatch(); - } - return ret == 0; - } - - ConvParam& param() { return param_; } - - private: - ConvParam param_; - ConcatPE concatPE_; - ElementwiseAddPE addPE_; - int split_axis = 0; - InplaceArgs inplace_ = {0}; -}; - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp deleted file mode 100644 index fd17218d06..0000000000 --- a/lite/backends/fpga/KD/pes/conv_process.hpp +++ /dev/null @@ -1,418 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "lite/backends/fpga/KD/float16.hpp" -#include "lite/backends/fpga/KD/llapi/bias_scale.h" -#include "lite/backends/fpga/KD/llapi/filter.h" -#include "lite/backends/fpga/KD/pe_params.hpp" -#include "lite/backends/fpga/KD/tensor.hpp" -#include "lite/backends/fpga/KD/tensor_util.hpp" - -namespace paddle { -namespace zynqmp { - -inline int get_aligned_filter_element_num(int chw) { - return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); -} - -inline int get_filter_num_per_div(Tensor* filter, int group_num) { - auto chw = filter->shape().channel() * filter->shape().height() * - filter->shape().width(); - auto num = filter->shape().num(); - int div_capacity = filter::calc_division_capacity(chw); - return filter::calc_num_per_div(num, group_num, div_capacity); -} - -inline int get_split_num(Tensor* filter) { - auto chw = filter->shape().channel() * filter->shape().height() * - filter->shape().width(); - auto num = filter->shape().num(); - int div_capacity = filter::calc_division_capacity(chw); - return filter::calc_split_num(num, div_capacity); -} - -inline void fill_scale_bias_const(ConvParam* param_) { - int channel = param_->output->shape().channel(); - Shape sb_shape(N, {channel}); - float* new_scale_ptr = param_->scale()->mutableData(FP32, sb_shape); - float* new_bias_ptr = param_->bias()->mutableData(FP32, sb_shape); - for (int i = 0; i < channel; i++) { - new_scale_ptr[i] = 1.0f; - new_bias_ptr[i] = 0.0f; - } - param_->scale()->flush(); - param_->bias()->flush(); -} - -inline void combine_bn_params(BatchnormParam* bn, ConvParam* param_) { - int channel = param_->output->shape().channel(); - Shape sb_shape(N, {channel}); - float* new_scale_ptr = param_->scale()->mutableData(FP32, sb_shape); - float* new_bias_ptr = param_->bias()->mutableData(FP32, sb_shape); - float* bn_scale_ptr = bn->scale->data(); - float* bn_bias_ptr = bn->bias->data(); - float* bn_var_ptr = bn->variance->data(); - float* bn_mean_ptr = bn->mean->data(); - float epsilon = bn->epsilon; - for (int i = 0; i < channel; i++) { - float new_scale = bn_scale_ptr[i] / - static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); - new_scale_ptr[i] = new_scale; - new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; - } -} - -inline void 
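// NOTE: combine_bn_params() above folds batch-norm into the conv's
// per-channel scale/bias:
//   scale' = gamma / sqrt(var + eps)
//   bias'  = beta - mean * scale'
// so that scale' * x + bias' == gamma * (x - mean) / sqrt(var + eps) + beta
// for any conv output x, i.e. the BN layer disappears into the conv. A tiny
// numeric check with illustrative values: gamma = 2, beta = 1, mean = 0.5,
// var = 0.24, eps = 0.01 gives scale' = 4, bias' = -1, and x = 1 maps to 3
// on both sides of the identity.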
combine_add_bn_params(BatchnormParam* bn, - Tensor* bias, - ConvParam* param_) { - int channel = param_->output->shape().channel(); - Shape sb_shape(N, {channel}); - float* new_scale_ptr = param_->scale()->mutableData(FP32, sb_shape); - float* new_bias_ptr = param_->bias()->mutableData(FP32, sb_shape); - if (bn != nullptr) { - float* bn_scale_ptr = bn->scale->data(); - float* bn_bias_ptr = bn->bias->data(); - float* bn_var_ptr = bn->variance->data(); - float* bn_mean_ptr = bn->mean->data(); - float epsilon = bn->epsilon; - float* bias_data = bias->data(); - for (int i = 0; i < channel; i++) { - float new_scale = bn_scale_ptr[i] / - static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); - new_scale_ptr[i] = new_scale; - new_bias_ptr[i] = - bn_bias_ptr[i] + (bias_data[i] - bn_mean_ptr[i]) * new_scale_ptr[i]; - } - } else { - for (int i = 0; i < channel; i++) { - new_scale_ptr[i] = 1.0f; - new_bias_ptr[i] = 0.0f; - } - } - param_->scale()->flush(); - param_->bias()->flush(); - param_->scale()->setDataLocation(CPU); - param_->bias()->setDataLocation(CPU); -} - -inline void format_scale_bias(Tensor* scale, - Tensor* bias, - Tensor* filter, - Tensor* scale_bias, - int group) { - float* scale_data = nullptr; - float* bias_data = nullptr; - if (scale != nullptr) { - scale_data = scale->data(); - } - if (bias != nullptr) { - bias_data = bias->data(); - } - int channel = filter->shape().num(); - Shape bias_scale_shape(N, {2 * channel}); - float* bs_data = scale_bias->mutableData(FP32, bias_scale_shape); - for (int i = 0; i < channel; i++) { - float scale_value = scale_data == nullptr ? 1 : scale_data[i]; - float bias_value = bias_data == nullptr ? 0 : bias_data[i]; - bs_data[i + channel] = scale_value; - bs_data[i] = bias_value; - } - - int element_num_per_div = get_filter_num_per_div(filter, group); - bias_scale::format_bias_scale_array(&bs_data, element_num_per_div, channel); -} - -inline void format_filter(Tensor* filter, Tensor* quantized_filter, int group) { - float max_value = find_max(*filter); - Shape& filter_shape = filter->shape(); - quantized_filter->setAligned(true); - quantized_filter->mutableData(INT8, filter->shape()); - quantized_filter->scale()[0] = max_value / 127.0f; - quantized_filter->scale()[1] = 127.0f / max_value; - - auto memory_size = filter->shape().memorySize(sizeof(float)); - auto new_data = reinterpret_cast(fpga_malloc(memory_size)); - memcpy(new_data, filter->data(), memory_size); - size_t mem_size = filter::format_filter(&new_data, - filter_shape.num(), - filter_shape.channel(), - filter_shape.height(), - filter_shape.width(), - group, - max_value); - int8_t* src = quantized_filter->mutableData(INT8, filter->shape()); - memcpy(src, new_data, mem_size); - fpga_free(new_data); - quantized_filter->flush(); -} - -inline void format_dw_filter(Tensor* filter, - Tensor* quantized_filter, - float* scale) { - int num = filter->shape().num(); - int height = filter->shape().height(); - int width = filter->shape().width(); - auto memory_size = filter->shape().memorySize(sizeof(float)); - auto new_data = (float*)fpga_malloc(memory_size); // NOLINT - memcpy(new_data, filter->data(), memory_size); - - size_t size = - filter::format_dwconv_filter(&new_data, num, height, width, scale); - float16* src = quantized_filter->mutableData(FP16, filter->shape()); - - memcpy(src, new_data, size); - quantized_filter->flush(); - - fpga_free(new_data); -} - -inline void format_fc_filter(Tensor* filter, Tensor* quantized_filter) { - float max_value = find_max(*filter); - Shape& filter_shape = 
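// NOTE: as in format_filter() above, the filter is quantized symmetrically
// to int8: q = round(w * 127 / max), with scale()[0] = max / 127 kept for
// dequantization (w ~= q * max / 127) and scale()[1] = 127 / max as its
// reciprocal. E.g. max = 0.5 gives scale()[0] ~= 0.00394, and a weight of
// 0.25 quantizes to q = 64 (illustrative numbers, not from this file).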
filter->shape(); - quantized_filter->setAligned(true); - quantized_filter->mutableData(INT8, filter->shape()); - quantized_filter->scale()[0] = max_value / 127.0f; - quantized_filter->scale()[1] = 127.0f / max_value; - - size_t memory_size = filter->shape().memorySize(sizeof(float)); - auto new_data = (float*)fpga_malloc(memory_size); // NOLINT - memcpy(new_data, filter->data(), memory_size); - - int8_t* src = quantized_filter->mutableData(INT8, filter->shape()); - memcpy(src, new_data, quantized_filter->shape().memorySize(sizeof(int8_t))); - quantized_filter->flush(); - fpga_free(new_data); -} - -inline void split_filter_num(const ConvParam& c_param) { - ConvParam& param = const_cast(c_param); - Tensor* input = param.input; - Tensor* out = param.output; - Tensor* filter = param.filter; - auto channel = out->shape().channel(); - - int split_num = param.groups == 1 ? get_split_num(param.filter) : 1; - int filter_num_per_div = get_filter_num_per_div(filter, param.groups); - - Shape& out_shape = out->shape(); - for (int i = 0; i < split_num; i++) { - BasicConvParam* conv_param = new BasicConvParam(); - conv_param->output.setDataLocation(Device); - conv_param->output.setAligned(true); - - int filter_num = filter->shape().num(); - float16* out_address = nullptr; - float* out_scale_address = nullptr; - - ConvArgs& args = conv_param->args; - - if (split_num == 1) { - out_address = out->data(); - out_scale_address = out->scale(); - } - filter_num = i == split_num - 1 - ? channel - (split_num - 1) * filter_num_per_div // NOLINT - : filter_num_per_div; - - if (split_num != 1) { - Shape shape(NHWC, {1, out_shape.height(), out_shape.width(), filter_num}); - out_address = conv_param->output.mutableData(FP16, shape); - out_scale_address = conv_param->output.scale(); - } - Shape f_shape(NCHW, - {filter_num, - filter->shape().channel(), - filter->shape().height(), - filter->shape().width()}); - - Tensor new_filter; - float* new_filter_data = new_filter.mutableData(FP32, f_shape); - int filter_hwc = filter->shape().height() * filter->shape().width() * - filter->shape().channel(); - - memcpy(new_filter_data, - filter->data() + i * filter_num_per_div * filter_hwc, - filter_num * filter_hwc * sizeof(float)); - new_filter.flush(); - - conv_param->filter.mutableData(FP32, f_shape); - format_filter(&new_filter, &(conv_param->filter), param.groups); - - int sb_num = 2 * align_to_x(filter_num, BS_NUM_ALIGNMENT); - Tensor scale; - Tensor bias; - - int chnnnel_start = i * filter_num_per_div; - - Shape s_shape(N, {filter_num}); - float* scale_data = scale.mutableData(FP32, s_shape); - float* bias_data = bias.mutableData(FP32, s_shape); - for (int n = 0; n < filter_num; n++) { - scale_data[n] = param.scale()->data()[n + chnnnel_start]; - } - for (int n = 0; n < filter_num; n++) { - bias_data[n] = param.bias()->data()[n + chnnnel_start]; - } - Shape sb_shape(N, {sb_num}); - format_scale_bias(&scale, - &bias, - &conv_param->filter, - &conv_param->scaleBias, - param.groups); - conv_param->scaleBias.flush(); - - args.group_num = param.groups; - args.relu_enabled = param.relu.enabled; - args.sb_address = conv_param->scaleBias.data(); - args.kernel.stride_h = param.strides[1]; - args.kernel.stride_w = param.strides[0]; - args.kernel.height = new_filter.shape().height(); - args.kernel.width = new_filter.shape().width(); - - args.filter_address = conv_param->filter.data(); - args.filter_num = filter_num; - args.filter_scale_address = conv_param->filter.scale(); - args.image.address = input->data(); - args.image.scale_address 
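// NOTE: the filter split above carves `channel` output channels into
// `split_num` chunks of at most filter_num_per_div; only the last chunk
// carries the remainder: channel - (split_num - 1) * filter_num_per_div.
// E.g. channel = 300 with filter_num_per_div = 128 (illustrative numbers)
// gives split_num = 3 and chunk sizes 128, 128, 44. Each chunk gets its own
// BasicConvParam with per-chunk slices of the filter, scale and bias.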
= input->scale(); - args.image.channels = input->shape().channel(); - args.image.width = input->shape().width(); - args.image.height = input->shape().height(); - args.image.pad_width = param.paddings[1]; - args.image.pad_height = param.paddings[0]; - args.output.address = out_address; - args.output.scale_address = out_scale_address; - param.splitParams().push_back(conv_param); - } -} - -inline void split_channel(const ConvParam& c_param) { - ConvParam& param = const_cast(c_param); - Tensor* input = param.input; - Tensor* output = param.output; - input->syncToCPU(); - - int num = ceil(input->shape().channel() * 1.0f / 2047); - int channel = input->shape().channel() / num; - std::cout << "channel::" << channel << "num::" << num << std::endl; - Shape bs_shape(N, {channel}); - - for (int i = 0; i < num; i++) { - BasicConvParam* conv_param = new BasicConvParam(); - - // input && output; - Shape in_shape( - NCHW, {1, channel, input->shape().height(), input->shape().width()}); - conv_param->input.shareDataWith(input, in_shape, channel * i); - conv_param->output.mutableData(FP16, output->shape()); - - // filter transformation; - Shape f_shape(NCHW, {param.filter->shape().num(), channel, 1, 1}); - Tensor new_filter; - - float* dst = new_filter.mutableData(FP32, f_shape); - float* src = param.filter->data() + i * channel; - for (int n = 0; n < f_shape.num(); n++) { - memcpy(dst, src, channel * sizeof(float)); - dst += channel; - src += param.filter->shape().channel(); - } - new_filter.flush(); - format_filter(&new_filter, &(conv_param->filter), param.groups); - - Tensor bias; - Tensor scale; - - float* bias_data = bias.mutableData(FP32, bs_shape); - float* scale_data = scale.mutableData(FP32, bs_shape); - for (int c = 0; c < channel; c++) { - scale_data[c] = 1; - bias_data[c] = param.bias()->data()[c] / num; - } - scale.flush(); - bias.flush(); - format_scale_bias(&scale, - &bias, - &conv_param->filter, - &conv_param->scaleBias, - param.groups); - conv_param->scaleBias.flush(); - - ConvArgs& args = conv_param->args; - args.group_num = param.groups; - args.relu_enabled = param.relu.enabled; - args.sb_address = conv_param->scaleBias.data(); - args.kernel.stride_h = param.strides[1]; - args.kernel.stride_w = param.strides[0]; - args.kernel.height = new_filter.shape().height(); - args.kernel.width = new_filter.shape().width(); - - args.filter_address = conv_param->filter.data(); - args.filter_num = f_shape.num(); - args.filter_scale_address = conv_param->filter.scale(); - args.image.address = conv_param->input.mutableData(); - args.image.scale_address = conv_param->input.scale(); - - args.image.channels = conv_param->input.shape().channel(); - args.image.width = conv_param->input.shape().width(); - args.image.height = conv_param->input.shape().height(); - args.image.pad_width = param.paddings[1]; - args.image.pad_height = param.paddings[0]; - args.output.address = conv_param->output.mutableData(); - args.output.scale_address = conv_param->output.scale(); - param.splitParams().push_back(conv_param); - } -} - -inline int fill_split_arg(const ConvParam& c_param) { - ConvParam& param = const_cast(c_param); - Tensor* input = param.input; - Tensor* output = param.output; - if (output->shape().dimSize() == 4 && input->shape().channel() > 2047 && - input->shape().width() == 1) { - split_channel(c_param); - return 1; - } else { - split_filter_num(c_param); - return 0; - } -} - -inline bool compute_conv(const ConvParam& c_conv_params) { - ConvParam& conv_params = const_cast(c_conv_params); - std::vector& params = 
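// NOTE: split_channel() above handles inputs whose channel count exceeds the
// hardware limit of 2047: the input is cut into num = ceil(C / 2047) slices,
// each run as a separate 1x1 conv, and since the final result is the sum of
// the per-slice partial sums, every slice is given scale 1 and bias / num so
// the bias is counted exactly once overall. E.g. C = 4000 (illustrative)
// gives num = 2 slices of 2000 channels, each contributing bias / 2.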
conv_params.splitParams(); - int ret = 0; - for (auto conv_param : params) { - ret |= compute_fpga_conv_basic(conv_param->args); - } - size_t size = params.size(); - if (ret == 0 && size > 1) { - Tensor& img = params[0]->output; - for (int i = 0; i < 1; i++) { - for (int i = 0; i < img.shape().numel(); i++) { - float value = half_to_float(img.data()[i]); - std::cout << "value:" << value << std::endl; - } - } - } - return ret == 0; -} - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/crop_pe.cpp b/lite/backends/fpga/KD/pes/crop_pe.cpp deleted file mode 100644 index c29df623aa..0000000000 --- a/lite/backends/fpga/KD/pes/crop_pe.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/fpga/KD/pes/crop_pe.hpp" - -#include - -namespace paddle { -namespace zynqmp { - -bool CropPE::dispatch() { - Tensor* input = param_.input; - input->syncToCPU(); - const auto axis = param_.axis; - std::vector shape = param_.shape; - auto* out = param_.output; - - Shape out_shape = out->shape(); - float16* src_ptr = reinterpret_cast(input->data()); - float16* dst_ptr = reinterpret_cast( - out->mutableData(DataType::FP16, out_shape)); - - std::vector offsets = param_.offsets; - - int input_c = input->shape().channel(); - int input_h = input->shape().height(); - int input_w = input->shape().width(); - - int out_c = out->shape().channel(); - int out_h = out->shape().height(); - int out_w = out->shape().width(); - if (axis == 1) { - int index = 0; - - int offset_h = offsets[0]; - int offset_w = offsets[0]; - int offset_c = offsets[0]; - - if (offsets.size() == 3) { - offset_h = offsets[1]; - offset_w = offsets[2]; - offset_c = offsets[0]; - } - - for (int h = 0; h < out_h; h++) { - for (int w = 0; w < out_w; w++) { - float16* crop_start = src_ptr + (h + offset_h) * input_w * input_c + - (offset_w * input_c) + offset_c; - std::memcpy(dst_ptr + h * (out_w * out_c) + w * out_c, - crop_start, - out_c * sizeof(float16)); - } - } - } else if (axis == 2) { - int offset_h = offsets[0]; - int offset_w = offsets[0]; - - if (offsets.size() == 2) { - offset_h = offsets[0]; - offset_w = offsets[1]; - } - - for (int h = 0; h < out_h; h++) { - float16* crop_start = - src_ptr + (h + offset_h) * input_w * input_c + (offset_w * input_c); - std::memcpy(dst_ptr + h * out_w * input_c, - crop_start, - out_w * input_c * sizeof(float16)); - } - } - out->flush(); - out->copyScaleFrom(input); - return true; -} - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/crop_pe.hpp b/lite/backends/fpga/KD/pes/crop_pe.hpp deleted file mode 100755 index 6ebbcdb31f..0000000000 --- a/lite/backends/fpga/KD/pes/crop_pe.hpp +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "lite/backends/fpga/KD/float16.hpp" -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" - -namespace paddle { -namespace zynqmp { -class CropPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(CPU); - return true; - } - - void apply() {} - - bool dispatch(); - - CropParam& param() { return param_; } - - private: - CropParam param_; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp deleted file mode 100755 index 9d7b9b544b..0000000000 --- a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "lite/backends/fpga/KD/float16.hpp" -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" -#include "lite/backends/fpga/KD/pes/conv_process.hpp" - -namespace paddle { -namespace zynqmp { - -class DepthwiseConvPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - void apply() { - DepthwiseConvParam& param = param_; - Tensor* input = param.input; - Tensor* output = param.output; - int channel = output->shape().channel(); - - float* new_scale_data = param_.scale()->data(); - float* new_bias_data = param_.bias()->data(); - - float16* b_data = bias_.mutableData(FP16, param_.bias()->shape()); - for (int i = 0; i < channel; i++) { - b_data[i] = float_to_half(new_bias_data[i]); - } - bias_.flush(); - - Tensor* quantized_filter = param.quantizedFilter(); - quantized_filter->mutableData(FP16, param.filter->shape()); - format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data); - - DWconvArgs args = {0}; - args.bias_address = b_data; - args.filter_address = param.quantizedFilter()->data(); - args.kernel.width = param.filter->shape().height(); - args.kernel.height = param.filter->shape().width(); - args.kernel.stride_w = param.strides[0]; - args.kernel.stride_h = param.strides[1]; - args.image.address = input->data(); - args.image.channels = input->shape().channel(); - args.image.height = input->shape().height(); - args.image.width = input->shape().width(); - args.image.pad_width = param.paddings[0]; - args.image.pad_height = param.paddings[1]; - args.image.scale_address = input->scale(); - args.output.address = output->data(); - args.output.scale_address = output->scale(); - args.out_width = param.output->shape().width(); - args.out_height = param.output->shape().height(); - args.sub_conv_num = 1; - param.args = args; - - inplace_.relu_enable = param_.relu.enabled; - inplace_.power_enable = false; - inplace_.normalize_enable = false; - } - - bool dispatch() { - param_.input->syncToDevice(); - if (param_.relu.enabled) { - inplace_.relu_enable = param_.relu.enabled; - config_inplace(inplace_); - } - bool ret = compute_fpga_dwconv(param_.args) == 0; - if (param_.relu.enabled) { - inplace_.relu_enable = false; - config_inplace(inplace_); - } - return ret; - } - - DepthwiseConvParam& param() { return param_; } - - private: - DepthwiseConvParam param_; - Tensor bias_; - InplaceArgs inplace_ = {0}; -}; - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/elementwise_add_pe.hpp b/lite/backends/fpga/KD/pes/elementwise_add_pe.hpp deleted file mode 100755 index a498a2bde9..0000000000 --- a/lite/backends/fpga/KD/pes/elementwise_add_pe.hpp +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" - -namespace paddle { -namespace zynqmp { - -class ElementwiseAddPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - void apply() { - Tensor* input0 = param_.inputs[0]; - Tensor* input1 = param_.inputs[1]; - Tensor* output = param_.output; - EWAddArgs args = {0}; - args.const0 = 0x3c00; - args.const1 = 0x3c00; // =1 - args.image0.address = input0->data(); - args.image0.channels = input0->shape().channel(); - args.image0.scale_address = input0->scale(); - args.image0.height = input0->shape().height(); - args.image0.width = input0->shape().width(); - args.image0.pad_height = 0; - args.image0.pad_width = 0; - args.image1.address = input1->data(); - args.image1.channels = input1->shape().channel(); - args.image1.scale_address = input1->scale(); - args.image1.height = input1->shape().height(); - args.image1.width = input1->shape().width(); - args.image1.pad_height = 0; - args.image1.pad_width = 0; - args.output.scale_address = output->scale(); - args.output.address = output->data(); - param_.ewargs = args; - } - - bool dispatch() { - param_.inputs[0]->syncToDevice(); - param_.inputs[1]->syncToDevice(); - InplaceArgs inplace_args = {0}; - if (param_.relu.enabled) { - inplace_args.relu_enable = true; - config_inplace(inplace_args); - } - compute_fpga_ewadd(param_.ewargs); - if (param_.relu.enabled) { - inplace_args.relu_enable = false; - config_inplace(inplace_args); - } - return true; - } - - ElementwiseAddParam& param() { return param_; } - - private: - ElementwiseAddParam param_; -}; - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp b/lite/backends/fpga/KD/pes/fully_connected_pe.hpp deleted file mode 100644 index 2179a142ad..0000000000 --- a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" -#include "lite/backends/fpga/KD/pes/conv_pe.hpp" -#include "lite/backends/fpga/KD/pes/conv_process.hpp" - -namespace paddle { -namespace zynqmp { - -class FullyConnectedPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - void apply() { - ConvParam& convParam_ = convPE_.param(); - Tensor* input = param_.input; - convParam_.input = param_.input; - convParam_.output = param_.output; - convParam_.groups = 1; - convParam_.strides = {1, 1}; - convParam_.paddings = {0, 0}; - convParam_.kernelSize = {input->shape().width(), input->shape().height()}; - convParam_.dilations = {1, 1}; - - int num = param_.filter->shape().channel(); - int chw = param_.filter->shape().num(); - - int height = param_.input->shape().height(); - int width = param_.input->shape().width(); - int filter_channel = chw / height / width; - - int channel = param_.output->shape().channel(); - Shape shape(NCHW, {num, filter_channel, height, width}); - Tensor* conv_filter = new Tensor(); - float* new_filter_data = conv_filter->mutableData(FP32, shape); - float* filter_data = param_.filter->data(); - - for (int i = 0; i < num; i++) { - for (int j = 0; j < chw; j++) { - float scale = filter_data[j * num + i]; - new_filter_data[i * chw + j] = scale; - } - } - - conv_filter->flush(); - convParam_.filter = conv_filter; - - Shape sb_shape(N, {channel}); - float* scale_data = convParam_.scale()->mutableData(FP32, sb_shape); - float* bias_data = convParam_.bias()->mutableData(FP32, sb_shape); - - for (int i = 0; i < channel; i++) { - scale_data[i] = 1.0f; - bias_data[i] = param_.bias->data()[i]; - } - convParam_.scale()->flush(); - convParam_.bias()->flush(); - - convPE_.init(); - convPE_.apply(); - } - - bool dispatch() { return convPE_.dispatch(); } - - FullyConnectedParam& param() { return param_; } - - private: - FullyConnectedParam param_; - ConvPE convPE_; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/input_pe.hpp b/lite/backends/fpga/KD/pes/input_pe.hpp deleted file mode 100755 index 380c85e17e..0000000000 --- a/lite/backends/fpga/KD/pes/input_pe.hpp +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" -namespace paddle { -namespace zynqmp { - -class InputPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - bool dispatch() { - Tensor* input = param_.input; - Tensor* output = param_.output; - - Tensor* src = input; - input->flush(); - Tensor half_tensor; - if (input->dataType() == DataType::FP32) { - half_tensor.mutableData(DataType::FP16, input->shape()); - half_tensor.copyFrom(input); - src = &half_tensor; - } - output->mutableData(); - src->alignImage(output, true); - return true; - } - - InputParam& param() { return param_; } - - private: - InputParam param_; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/norm_pe.hpp b/lite/backends/fpga/KD/pes/norm_pe.hpp deleted file mode 100644 index 3e2fd80627..0000000000 --- a/lite/backends/fpga/KD/pes/norm_pe.hpp +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "lite/backends/fpga/KD/float16.hpp" -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" - -namespace paddle { -namespace zynqmp { -class NormPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - void apply() { - inplace_args_.relu_enable = false; - inplace_args_.power_enable = false; - inplace_args_.normalize_enable = true; - - Shape& input_shape = param_.input->shape(); - - norm_param_args_.channel = input_shape.channel(); - norm_param_args_.hight_width = input_shape.height() * input_shape.width(); - - float16* mid_data = - mid_out_.mutableData(FP16, param_.output->shape()); - - bypass_args_.input_data_type = DATA_TYPE_FP16; - bypass_args_.output_data_type = DATA_TYPE_FP16; - bypass_args_.input_layout_type = LAYOUT_HWC; - bypass_args_.output_layout_type = LAYOUT_HWC; - bypass_args_.image.address = param_.input->data(); - bypass_args_.image.scale_address = param_.input->scale(); - bypass_args_.image.channels = input_shape.channel(); - bypass_args_.image.height = input_shape.height(); - bypass_args_.image.width = input_shape.width(); - bypass_args_.output.address = mid_out_.data(); - bypass_args_.output.scale_address = mid_out_.scale(); - - norm_args_.input_image_address = mid_data; - norm_args_.image_width = input_shape.width(); - norm_args_.image_height = input_shape.height(); - norm_args_.image_channel = input_shape.channel(); - norm_args_.output_image_address = param_.output->data(); - norm_args_.output_scale_address = - reinterpret_cast(param_.output->scale()); - } - - void cpuCompute() { - Tensor input_float; - Tensor float_out; - input_float.mutableData(FP32, param_.input->shape()); - float_out.mutableData(FP32, param_.output->shape()); - - 
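// NOTE: the CPU fallback below computes a per-pixel L2 normalization across
// channels: out[i][c] = in[i][c] / sqrt(epsilon + sum_c in[i][c]^2). The
// FPGA route (bypass into normalize, configured in apply() above) is set up
// but dispatch() currently takes this CPU path.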
input_float.copyFrom(param_.input); - input_float.syncToCPU(); - - int channel = input_float.shape().channel(); - int height = input_float.shape().height(); - int width = input_float.shape().width(); - int cw = channel * width; - - Tensor* input = &input_float; - float* input_ptr = input->data(); - float* out_ptr = float_out.data(); - - int loop = height * width; - for (int i = 0; i < loop; i++) { - float sum = param_.epsilon; - for (int c = 0; c < channel; c++) { - float value = input_ptr[i * channel + c]; - sum += value * value; - } - float norm = sqrtf(sum); -#pragma omp parallel for - for (int c = 0; c < channel; c++) { - out_ptr[i * channel + c] = input_ptr[i * channel + c] / norm; - } - } - float_out.flush(); - param_.output->copyFrom(&float_out); - } - - bool dispatch() { - cpuCompute(); - return true; - } - - NormParam& param() { return param_; } - - private: - NormParam param_; - Tensor mid_out_; - InplaceArgs inplace_args_ = {0}; - NormalizeParameterArgs norm_param_args_ = {0}; - BypassArgs bypass_args_; - - NormalizeArgs norm_args_ = {0}; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/output_pe.hpp b/lite/backends/fpga/KD/pes/output_pe.hpp deleted file mode 100644 index 1c99386ab1..0000000000 --- a/lite/backends/fpga/KD/pes/output_pe.hpp +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" - -namespace paddle { -namespace zynqmp { - -class OutputPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(false); - return true; - } - - bool dispatch() { - Tensor* input = param_.input; - Tensor* output = param_.output; - if (input->aligned()) { - Tensor tmp; - tmp.setAligned(true); - tmp.mutableData(FP16, input->shape()); - tmp.copyFrom(input); - tmp.unalignImage(); - output->copyFrom(&tmp); - } else { - output->copyFrom(input); - } - return true; - } - - OutputParam& param() { return param_; } - - private: - OutputParam param_; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/pooling_pe.hpp b/lite/backends/fpga/KD/pes/pooling_pe.hpp deleted file mode 100644 index fd3be1f463..0000000000 --- a/lite/backends/fpga/KD/pes/pooling_pe.hpp +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" - -namespace paddle { -namespace zynqmp { - -class PoolingPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - void apply() { - Tensor* input = param_.input; - Tensor* output = param_.output; - - uint32_t k_width = param_.kernelSize[0]; - uint32_t k_height = param_.kernelSize[1]; - - if (param_.globalPooling) { - k_width = input->shape().width(); - k_height = input->shape().height(); - } - - PoolingArgs args = {0}; - args.mode = param_.type; - args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height)); - args.image.address = input->data(); - args.image.channels = input->shape().channel(); - args.image.height = input->shape().height(); - args.image.width = input->shape().width(); - args.image.pad_height = param_.paddings[0]; - args.image.pad_width = param_.paddings[1]; - args.image.scale_address = input->scale(); - args.output.address = output->mutableData(); - args.output.scale_address = output->scale(); - args.kernel.height = k_height; - args.kernel.width = k_width; - args.kernel.stride_h = param_.strides[0]; - args.kernel.stride_w = param_.strides[1]; - args.out_height = output->shape().height(); - args.out_width = output->shape().width(); - param_.poolingArgs = args; - - use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 && - (k_width > 7 || k_height > 7); - } - - void compute() { - Tensor* input = param_.input; - Tensor* output = param_.output; - input->syncToCPU(); - - Tensor float_input; - float* image_addr = float_input.mutableData(FP32, input->shape()); - float_input.copyFrom(input); - float16* data_out = output->data(); - - int image_height = input->shape().height(); - int image_width = input->shape().width(); - int image_channels = input->shape().channel(); - int image_pad_h = param_.paddings[0]; - int image_pad_w = param_.paddings[1]; - int kernel_height = param_.kernelSize[1]; - int kernel_width = param_.kernelSize[0]; - int kernel_step_h = param_.strides[0]; - int kernel_step_w = param_.strides[1]; - - int pooled_height_ = output->shape().height(); - int pooled_width_ = output->shape().width(); - - int kernel = kernel_height * kernel_width; - - float max = 0; - - for (int ph = 0; ph < pooled_height_; ++ph) { - for (int pw = 0; pw < pooled_width_; ++pw) { - int hstart = ph * kernel_step_h - image_pad_h; - int wstart = pw * kernel_step_w - image_pad_w; - int hend = std::min(hstart + kernel_height, image_height); - int wend = std::min(wstart + kernel_width, image_width); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - - kernel = (hend - hstart) * (wend - wstart); - for (int c = 0; c < image_channels; ++c) { - const int pool_index = (ph * pooled_width_ + pw) * image_channels + c; - float sum = 0; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - const int index = (h * image_width + w) * image_channels + c; - float value = image_addr[index]; - sum += value; - } - } - float value = sum / kernel; - if (value > max) { - max = value; - } - data_out[pool_index] = float_to_half(value); - } - } - } - output->scale()[0] = max / 127.0f; - output->scale()[1] = 127.0f / max; - output->flush(); - } - - void cpu_compute() { - Tensor* input = param_.input; - Tensor* output = param_.output; - input->syncToCPU(); - - Tensor float_input; - float_input.mutableData(FP32, input->shape()); - 
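// NOTE: compute() above implements pooling with edge-clamped windows: each
// output's window [hstart, hend) x [wstart, wend) is clipped to the image
// and the divisor `kernel` is recomputed as the clipped area, so border
// outputs average only the valid pixels. The output's fp16 scale then tracks
// the running maximum: scale[0] = max / 127, scale[1] = 127 / max.
// cpu_compute() below is the global-average special case, reducing one
// kernel-sized window per channel.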
float_input.copyFrom(input); - float16* data_out = output->data(); - - int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1]; - - float scale_max = 0; - for (int i = 0; i < output->shape().channel(); i++) { - float sum = 0; - for (int j = 0; j < kernel_hw; j++) { - float value = half_to_float(input->data()[i * kernel_hw + j]); - sum += value; - } - float value = sum / kernel_hw; - data_out[i] = float_to_half(value); - scale_max = std::max(scale_max, std::abs(value)); - } - output->scale()[0] = scale_max / 127.0f; - output->scale()[1] = 127.0f / scale_max; - std::cout << "pool scale:" << scale_max / 127.0f << std::endl; - output->flush(); - } - - bool dispatch() { - if (use_cpu_) { - compute(); - return true; - } - param_.input->syncToDevice(); - return compute_fpga_pool(param_.poolingArgs) == 0; - } - - PoolingParam& param() { return param_; } - - private: - PoolingParam param_; - bool use_cpu_; -}; - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/prior_box_pe.cpp b/lite/backends/fpga/KD/pes/prior_box_pe.cpp deleted file mode 100644 index d6a503a31d..0000000000 --- a/lite/backends/fpga/KD/pes/prior_box_pe.cpp +++ /dev/null @@ -1,273 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include - -#include "lite/backends/fpga/KD/pes/prior_box_pe.hpp" - -namespace paddle { -namespace zynqmp { - -struct Transform { - template - void operator()(InputIter first, - InputIter last, - OutputIter result, - UnaryOperation op) { - std::transform(first, last, result, op); - } - - template - void operator()(InputIter1 first1, - InputIter1 last1, - InputIter2 first2, - OutputIter result, - BinaryOperation op) { - std::transform(first1, last1, first2, result, op); - } -}; - -inline void ExpandAspectRatios(const std::vector &input_aspect_ratior, - bool flip, - std::vector *output_aspect_ratior) { - constexpr float epsilon = 1e-6; - output_aspect_ratior->clear(); - output_aspect_ratior->push_back(1.0f); - for (size_t i = 0; i < input_aspect_ratior.size(); ++i) { - float ar = input_aspect_ratior[i]; - bool already_exist = false; - for (size_t j = 0; j < output_aspect_ratior->size(); ++j) { - if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) { - already_exist = true; - break; - } - } - if (!already_exist) { - output_aspect_ratior->push_back(ar); - if (flip) { - output_aspect_ratior->push_back(1.0f / ar); - } - } - } -} - -template -struct ClipFunctor { - inline T operator()(T in) const { - return std::min(std::max(in, 0.), 1.); - } -}; - -void PriorBoxPE::compute_prior_box() { - PriorBoxParam ¶m = param_; - Tensor *input = param.input; - Shape &input_shape = input->shape(); - - Tensor *input_image = param.image; - Shape &image_shape = input_image->shape(); - - const auto &min_sizes = param.minSizes; - const auto &max_sizes = param.maxSizes; - const auto &input_aspect_ratio = param.aspectRatios; - const bool &flip = param.flip; - const bool &clip = param.clip; - const float &step_w = param.stepW; - const float &step_h = param.stepH; - const float &offset = param.offset; - - Tensor *output_boxes = this->cachedBoxes_; - Tensor *output_variances = this->cachedVariances_; - - Tensor boxes; - Tensor variances; - - float *output_boxes_dataptr = - boxes.mutableData(FP32, output_boxes->shape()); - memset(output_boxes_dataptr, 0, boxes.memorySize()); - float *output_variances_dataptr = - variances.mutableData(FP32, output_boxes->shape()); - - std::vector aspect_ratios; - ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); - - auto img_width = image_shape.width(); - auto img_height = image_shape.height(); - auto feature_width = input_shape.width(); - auto feature_height = input_shape.height(); - - auto stride0 = output_boxes->shape().channel() * - output_boxes->shape().height() * output_boxes->shape().width(); - auto stride1 = output_boxes->shape().height() * output_boxes->shape().width(); - auto stride2 = output_boxes->shape().width(); - - float step_width = step_w; - float step_height = step_h; - if (step_w == 0 || step_h == 0) { - step_width = static_cast(img_width) / feature_width; - step_height = static_cast(img_height) / feature_height; - } - - int num_priors = aspect_ratios.size() * min_sizes.size(); - if (!max_sizes.empty()) { - num_priors += max_sizes.size(); - } - - for (int h = 0; h < feature_height; ++h) { - for (int w = 0; w < feature_width; ++w) { - /// map origin image - float center_x = (w + offset) * step_width; - float center_y = (h + offset) * step_height; - float box_width, box_height; - int idx = 0; - for (size_t s = 0; s < min_sizes.size(); ++s) { - auto min_size = min_sizes[s]; - if (param.minMaxAspectRatiosOrder) { - box_width = box_height = min_size / 2.; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 
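// NOTE: each prior below is stored as normalized corners
//   (cx - bw) / img_w, (cy - bh) / img_h, (cx + bw) / img_w, (cy + bh) / img_h
// where bw/bh are *half* widths/heights (hence the "/ 2." factors), and
// num_priors = expanded_aspect_ratios * min_sizes + max_sizes per location.
// E.g. 2 min sizes, ratios {1, 2, 1/2} after flip expansion, and 2 max sizes
// (illustrative counts) give 2 * 3 + 2 = 8 priors per feature-map cell.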
+ 0] = - (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 1] = - (center_y - box_height) / img_height; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 2] = - (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 3] = - (center_y + box_height) / img_height; - idx++; - - if (max_sizes.size() > 0) { - auto max_size = max_sizes[s]; - // square prior with size sqrt(minSize * maxSize) - box_width = box_height = sqrt(min_size * max_size) / 2.; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 0] = (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 1] = (center_y - box_height) / img_height; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 2] = (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 3] = (center_y + box_height) / img_height; - - idx++; - } - - // priors with different aspect ratios - for (float ar : aspect_ratios) { - if (fabs(ar - 1.) < 1e-6) { - continue; - } - box_width = min_size * sqrt(ar) / 2.; - box_height = min_size / sqrt(ar) / 2.; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 0] = (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 1] = (center_y - box_height) / img_height; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 2] = (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 3] = (center_y + box_height) / img_height; - - idx++; - } - - } else { - // priors with different aspect ratios - for (float ar : aspect_ratios) { - box_width = min_size * sqrt(ar) / 2.; - box_height = min_size / sqrt(ar) / 2.; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 0] = (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 1] = (center_y - box_height) / img_height; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 2] = (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 3] = (center_y + box_height) / img_height; - idx++; - } - if (!max_sizes.empty()) { - auto max_size = max_sizes[s]; - // square prior with size sqrt(minSize * maxSize) - box_width = box_height = sqrt(min_size * max_size) / 2.; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 0] = (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 1] = (center_y - box_height) / img_height; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 2] = (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 3] = (center_y + box_height) / img_height; - idx++; - } - } - } - } - } - if (clip) { - for (int i = 0; i < output_boxes->shape().numel(); i++) { - float value = output_boxes_dataptr[i]; - value = std::min(std::max(0.0f, value), 1.0f); - output_boxes_dataptr[i] = value; - } - } - - if ((param.variances.size() != 4)) { - // TODO(chonwhite) throw error; - } - - int64_t box_num = feature_height * feature_width * num_priors; - - for (int i = 0; i < box_num; i++) { - output_variances_dataptr[4 * i] = param.variances[0]; - output_variances_dataptr[4 * i + 1] = param.variances[1]; - output_variances_dataptr[4 * i + 2] = param.variances[2]; - output_variances_dataptr[4 
* i + 3] = param.variances[3]; - } - - boxes.flush(); - boxes.syncToCPU(); - variances.flush(); - output_boxes->copyFrom(&boxes); - output_variances->copyFrom(&variances); -} - -void PriorBoxPE::apply() {} - -bool PriorBoxPE::dispatch() { - if (cachedBoxes_ == nullptr) { - cachedBoxes_ = new Tensor(); - cachedVariances_ = new Tensor(); - cachedBoxes_->mutableData(FP16, param_.outputBoxes->shape()); - cachedVariances_->mutableData(FP16, - param_.outputVariances->shape()); - cachedBoxes_->setDataLocation(CPU); - cachedVariances_->setDataLocation(CPU); - compute_prior_box(); - } - - param_.outputBoxes->copyFrom(this->cachedBoxes_); - - param_.outputVariances->copyFrom(this->cachedVariances_); - param_.outputBoxes->flush(); - param_.outputBoxes->syncToCPU(); - param_.outputVariances->flush(); -} - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/prior_box_pe.hpp b/lite/backends/fpga/KD/pes/prior_box_pe.hpp deleted file mode 100755 index 8afe40dd30..0000000000 --- a/lite/backends/fpga/KD/pes/prior_box_pe.hpp +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" -namespace paddle { -namespace zynqmp { - -class PriorBoxPE : public PE { - public: - bool init() { - param_.outputBoxes->setAligned(false); - param_.outputVariances->setAligned(false); - param_.outputBoxes->setDataLocation(CPU); - param_.outputVariances->setDataLocation(CPU); - return true; - } - - bool dispatch(); - - void apply(); - - PriorBoxParam& param() { return param_; } - - private: - PriorBoxParam param_; - Tensor* cachedBoxes_ = nullptr; - Tensor* cachedVariances_ = nullptr; - - void compute_prior_box(); -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/relu_pe.hpp b/lite/backends/fpga/KD/pes/relu_pe.hpp deleted file mode 100755 index 5c125010c2..0000000000 --- a/lite/backends/fpga/KD/pes/relu_pe.hpp +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" -namespace paddle { -namespace zynqmp { - -class ReluPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - void apply() { - Tensor* src = param_.input; - - args_.input_data_type = DATA_TYPE_FP16; - args_.output_data_type = DATA_TYPE_FP16; - args_.input_layout_type = LAYOUT_HWC; - args_.output_layout_type = LAYOUT_HWC; - args_.image = {.address = src->data(), - .scale_address = src->scale(), - .channels = (uint32_t)src->shape().channel(), - .width = (uint32_t)src->shape().width(), - .height = (uint32_t)src->shape().height(), - .pad_width = 0u, - .pad_height = 0u}; - args_.output = { - .address = param_.output->data(), - .scale_address = param_.output->scale(), - }; - - inplace_.relu_enable = false; - inplace_.power_enable = false; - inplace_.normalize_enable = false; - } - - bool dispatch() { - inplace_.relu_enable = true; - config_inplace(inplace_); - param_.input->syncToDevice(); - param_.output->copyFrom(param_.input); - param_.output->invalidate(); - inplace_.relu_enable = false; - config_inplace(inplace_); - return true; - } - - InputParam& param() { return param_; } - - private: - InputParam param_; - BypassArgs args_; - InplaceArgs inplace_; -}; - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/resize.hpp b/lite/backends/fpga/KD/pes/resize.hpp deleted file mode 100644 index f83896d2c7..0000000000 --- a/lite/backends/fpga/KD/pes/resize.hpp +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" - -namespace paddle { -namespace zynqmp { -class ResizePE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - void apply() { - Tensor* input = param_.input; - Tensor* output = param_.output; - ResizeArgs& args = args_; - - int input_width = input->shape().width(); - int input_height = input->shape().height(); - int input_channel = input->shape().channel(); - - int output_width = output->shape().width(); - int output_height = output->shape().height(); - - args.input_width = input_width; - args.input_height = input_height; - args.image_channel = input_channel; - args.output_width = output_width; - args.output_height = output_height; - float height_ratio = static_cast(input_height) / - static_cast(args.output_height); - float width_ratio = - static_cast(input_width) / static_cast(args.output_width); - args.height_ratio = *reinterpret_cast(&height_ratio); - args.width_ratio = *reinterpret_cast(&width_ratio); - - args.input_image_address = input->mutableData(); - args.output_image_address = output->mutableData(); - args.output_scale_address = reinterpret_cast(output->scale()); - } - - void compute_scale(Tensor* src, float* scale) { - float16* data = src->data(); - src->invalidate(); - float max = 0; - for (int i = 0; i < src->shape().numel(); i++) { - float value = half_to_float(data[i]); - if (value < 0) { - value = -value; - } - if (value > max) { - max = value; - } - } - scale[0] = max / 127.0; - scale[1] = 127.0 / max; - } - - bool dispatch() { - bool ret = compute_fpga_resize(args_) == 0; - return true; - } - - ResizeParam& param() { return param_; } - - private: - ResizeParam param_; - ResizeArgs args_; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp deleted file mode 100755 index d5e16615d9..0000000000 --- a/lite/backends/fpga/KD/pes/scale_pe.hpp +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" - -namespace paddle { -namespace zynqmp { -class ScalePE : public PE { - public: - inline int gcd(int a, int b) { - while (b) { - int temp = a; - a = b; - b = temp % b; - } - return a; - } - - inline int lcm(int a, int b) { return a * b / gcd(a, b); } - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - void apply() { - Tensor* input = param_.input; - Tensor* output = param_.output; - Shape& input_shape = input->shape(); - int channel = input_shape.channel(); - int repeat = 1; - int alignment = 16; - int length = channel; - - if (channel % alignment != 0 || channel < alignment) { - int c_lcm = lcm(channel, alignment); - repeat = c_lcm / (channel); - } - Shape shape(N, {channel * repeat}); - param_.alignedBias()->mutableData(FP16, shape); - param_.alignedScale()->mutableData(FP16, shape); - - float16* bias_data = param_.alignedBias()->data(); - float16* scale_data = param_.alignedScale()->data(); - - if (param_.bias != nullptr) { - float* bias_data_float = param_.bias->data(); - for (int i = 0; i < repeat; i++) { - for (int j = 0; j < length; j++) { - float16 value = float_to_half(bias_data_float[j]); - bias_data[i * length + j] = value; - } - } - } else { - float16 zero = float_to_half(0.0f); - for (int i = 0; i < repeat; i++) { - for (int j = 0; j < length; j++) { - bias_data[i * length + j] = zero; - } - } - } - - float* scale_data_float = param_.scale->data(); - for (int i = 0; i < repeat; i++) { - for (int j = 0; j < length; j++) { - float16 value = float_to_half(scale_data_float[j]); - scale_data[i * length + j] = value; - } - } - - param_.alignedScale()->flush(); - param_.alignedBias()->flush(); - - int wc = input_shape.width() * input_shape.channel(); - int wc_aligned = align_image(wc); - - ScaleArgs& args = param_.args; - args.scale_address = param_.alignedScale()->data(); - args.bias_address = param_.alignedBias()->data(); - args.wc_alignment = wc_aligned; - args.channel_alignment = channel * repeat; - - args.image.address = input->data(); - args.image.scale_address = input->scale(); - args.image.channels = channel; - args.image.height = input_shape.height(); - args.image.width = input_shape.width(); - args.image.pad_width = 0; - args.image.pad_height = 0; - args.output.address = output->data(); - args.output.scale_address = output->scale(); - } - - bool dispatch() { - param_.input->syncToDevice(); - return compute_fpga_scale(param_.args) == 0; - } - - ScaleParam& param() { return param_; } - - private: - ScaleParam param_; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/softmax_pe.cpp b/lite/backends/fpga/KD/pes/softmax_pe.cpp deleted file mode 100755 index 099ed20b8f..0000000000 --- a/lite/backends/fpga/KD/pes/softmax_pe.cpp +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "lite/backends/fpga/KD/pes/softmax_pe.hpp" - -#include - -namespace paddle { -namespace zynqmp { - -#if defined(__ARM_NEON) || defined(__ARM_NEON__) -#ifndef __aarch64__ -static inline float32_t vmaxvq_f32(const float32x4_t &r) { - float32x2_t v = vmax_f32(vget_high_f32(r), vget_low_f32(r)); - return vget_lane_f32(vpmax_f32(v, v), 0); -} - -static inline float32_t vaddvq_f32(const float32x4_t &r) { - float32x2_t v = vadd_f32(vget_high_f32(r), vget_low_f32(r)); - return vget_lane_f32(vpadd_f32(v, v), 0); -} -#endif // __aarch64__ -#endif // __ARM_NEON__ - -static float find_max(const float *input, const int num_classes) { - int remain = num_classes; - float max = -std::numeric_limits::max(); -#if defined(__ARM_NEON) || defined(__ARM_NEON__) - int loop = num_classes >> 3; - remain = num_classes & 0x7; - float32x4_t __max = vdupq_n_f32(max); - for (int i = 0; i < loop; ++i, input += 8) { - float32x4_t x0 = vld1q_f32(input); - float32x4_t x1 = vld1q_f32(input + 4); - __max = vmaxq_f32(x0, __max); - __max = vmaxq_f32(x1, __max); - } - max = vmaxvq_f32(__max); -#endif - for (int i = 0; i < remain; ++i) { - max = std::max(max, input[i]); - } - return max; -} - -static void softmax(Tensor *X, Tensor *Y) { - std::vector dims = X->shape().dims(); - int batch_size = X->shape().num(); - int num_classes = dims[X->shape().dimSize() - 1]; - int channels = X->shape().numel() / batch_size / num_classes; - float *x = X->data(); - float *y = Y->mutableData(); - -#pragma omp parallel for collapse(2) - for (int batch = 0; batch < batch_size; ++batch) { - for (int channel = 0; channel < channels; ++channel) { - size_t offset = (batch * channels + channel) * num_classes; - const float *input = x + offset; - float *output = y + offset; - // find max - float max = find_max(input, num_classes); - - // exp(x - max) - int remain = num_classes; -#if defined(__ARM_NEON) || defined(__ARM_NEON__) - int loop = num_classes >> 3; - remain = num_classes & 0x7; - float32x4_t __max = vdupq_n_f32(max); - for (int i = 0; i < loop; ++i, input += 8, output += 8) { - float32x4_t x0 = vld1q_f32(input); - float32x4_t x1 = vld1q_f32(input + 4); - x0 = vsubq_f32(x0, __max); - x1 = vsubq_f32(x1, __max); - x0 = lite::arm::math::exp_ps(x0); - x1 = lite::arm::math::exp_ps(x1); - vst1q_f32(output, x0); - vst1q_f32(output + 4, x1); - } -#endif // __ARM_NEON__ - for (int i = 0; i < remain; ++i) { - output[i] = expf(input[i] - max); - } - - // sum(exp(x - max)) - float sum = 0.f; - output = y + offset; -#if defined(__ARM_NEON) || defined(__ARM_NEON__) - float32x4_t __sum = vdupq_n_f32(0.f); - for (int i = 0; i < loop; ++i, output += 8) { - float32x4_t x0 = vld1q_f32(output); - float32x4_t x1 = vld1q_f32(output + 4); - __sum = vaddq_f32(x0, __sum); - __sum = vaddq_f32(x1, __sum); - } - sum += vaddvq_f32(__sum); -#endif // __ARM_NEON__ - for (int i = 0; i < remain; ++i) { - sum += output[i]; - } - - // exp(x - max) / sum - float inv_sum = 1.f / sum; - output = y + offset; -#if defined(__ARM_NEON) || defined(__ARM_NEON__) - float32x4_t __inv_sum = vdupq_n_f32(inv_sum); - for (int i = 0; i < loop; ++i, output += 8) { - float32x4_t x0 = vld1q_f32(output); - float32x4_t x1 = vld1q_f32(output + 4); - x0 = vmulq_f32(x0, __inv_sum); - x1 = vmulq_f32(x1, __inv_sum); - vst1q_f32(output, x0); - vst1q_f32(output + 4, x1); - } -#endif - for (int i = 0; i < remain; ++i) { - output[i] *= inv_sum; - } - } - } -} - -bool SoftmaxPE::init() { - Tensor *output = param_.output; - output->setAligned(false); - output->setDataLocation(CPU); - return 
true; -} - -bool SoftmaxPE::dispatch() { - Tensor *input = param_.input; - Tensor *output = param_.output; - input->syncToCPU(); - - Tensor float_input; - Tensor float_output; - float_input.mutableData(DataType::FP32, input->shape()); - float_input.copyFrom(input); - - float *out_data = - float_output.mutableData(DataType::FP32, input->shape()); - - softmax(&float_input, &float_output); - float_output.flush(); - - output->copyFrom(&float_output); - return true; -} - -SoftmaxParam &SoftmaxPE::param() { return param_; } -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/softmax_pe.hpp b/lite/backends/fpga/KD/pes/softmax_pe.hpp deleted file mode 100644 index 5733f873a4..0000000000 --- a/lite/backends/fpga/KD/pes/softmax_pe.hpp +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#if defined(__ARM_NEON) || defined(__ARM_NEON__) -#include -#include "lite/backends/arm/math/funcs.h" -#endif - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" - -namespace paddle { -namespace zynqmp { - -class SoftmaxPE : public PE { - public: - bool init(); - bool dispatch(); - - SoftmaxParam& param(); - - private: - SoftmaxParam param_; -}; - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/split_pe.hpp b/lite/backends/fpga/KD/pes/split_pe.hpp deleted file mode 100644 index 26598a4c87..0000000000 --- a/lite/backends/fpga/KD/pes/split_pe.hpp +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" -namespace paddle { -namespace zynqmp { - -class SplitPE : public PE { - public: - bool init() { - std::vector outputs = param_.outputs; - for (size_t i = 0; i < outputs.size(); i++) { - Tensor* out = outputs[i]; - out->setAligned(false); - out->setDataLocation(CPU); - } - return true; - } - - std::vector stride_numel(std::vector ddim) { - std::vector strides(ddim.size()); - strides[ddim.size() - 1] = ddim[ddim.size() - 1]; - for (int i = ddim.size() - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * ddim[i]; - } - return strides; - } - - template - inline void StridedNumelCopyWithAxis(int64_t axis, - T* dst, - const std::vector& dst_stride_numel, - T* src, - const std::vector& src_stride_numel, - int64_t size) { - int64_t before = dst_stride_numel[0] / dst_stride_numel[axis]; - int64_t src_after = src_stride_numel[axis]; - int64_t dst_after = dst_stride_numel[axis]; - - for (int64_t i = 0; i < axis; ++i) { - if (i < axis) { - } else if (i == axis) { - continue; - } else { - } - } - - for (int64_t i = 0; i < before; ++i) { - memory::Copy(dst + i * dst_after, src + i * src_after, sizeof(T) * size); - } - } - - void split3D() { int axis = param_.axis; } - - bool dispatch() { - Tensor* input = param_.input; - input->syncToCPU(); - if (input->shape().dimSize() <= 3) { - auto in_stride = stride_numel(input->shape().dims()); - int64_t axis = param_.axis; - size_t input_offset = 0; - float16* in_data = input->data(); - - for (auto& out : param_.outputs) { - float16* out_data = out->mutableData(); - auto out_stride = stride_numel(out->shape().dims()); - - StridedNumelCopyWithAxis(axis, - out_data, - out_stride, - in_data + input_offset, - in_stride, - out_stride[axis]); - input_offset += out_stride[axis]; - } - return true; - } - - std::vector outputs = param_.outputs; - - int in_channel = input->shape().channel(); - int split_channel = input->shape().channel() / param_.num; - int hw = input->shape().height() * input->shape().width(); - - float16* in_data = input->data(); - for (int i = 0; i < hw; i++) { - for (int n = 0; n < outputs.size(); n++) { - Tensor* out = outputs[n]; - float16* out_data = out->data(); - memcpy(out_data + i * split_channel, - in_data + i * in_channel + n * split_channel, - split_channel * sizeof(float16)); - } - } - for (int n = 0; n < outputs.size(); n++) { - Tensor* out = outputs[n]; - out->copyScaleFrom(input); - } - return true; - } - - SplitParam& param() { return param_; } - - private: - SplitParam param_; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/shape.hpp b/lite/backends/fpga/KD/shape.hpp deleted file mode 100755 index 566ad8e6ff..0000000000 --- a/lite/backends/fpga/KD/shape.hpp +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#pragma once
-
-#include <vector>
-#include
-
-#include "lite/backends/fpga/KD/alignment.h"
-#include "lite/backends/fpga/KD/layout.hpp"
-
-namespace paddle {
-namespace zynqmp {
-
-static struct NCHW nchw_;
-static struct NHWC nhwc_;
-static struct NC nc_;
-static struct NHW nhw_;
-static struct N n_;
-
-class Shape {
- public:
-  explicit Shape(std::vector<int> dims) { dims_ = dims; }
-
-  Shape(LayoutType type, std::vector<int> dims) {
-    dims_ = dims;
-    setLayoutType(type);
-  }
-
-  Shape(const Shape& src) {
-    dims_ = src.dims_;
-    setLayoutType(src.layoutType_);
-  }
-
-  bool shouldAlign() {
-    return layout_->alignedElementCount(dims_) != layout_->elementCount(dims_);
-  }
-
-  int num() {
-    int index = layout_->numIndex();
-    return index == -1 ? 1 : dims_[index];
-  }
-
-  int channel() {
-    int index = layout_->channelIndex();
-    return index == -1 ? 1 : dims_[index];
-  }
-
-  int height() {
-    int index = layout_->heightIndex();
-    return index == -1 ? 1 : dims_[index];
-  }
-
-  int width() {
-    int index = layout_->widthIndex();
-    return index == -1 ? 1 : dims_[index];
-  }
-
-  int dimSize() { return dims_.size(); }
-
-  std::vector<int> dims() { return dims_; }
-
-  size_t memorySize(int cellSize) {
-    return layout_->alignedElementCount(dims_) * cellSize;
-  }
-
-  int numel() { return layout_->elementCount(dims_); }
-
-  int alignedElementCount() { return layout_->alignedElementCount(dims_); }
-
-  void setLayoutType(LayoutType layout) {
-    this->layoutType_ = layout;
-    switch (layout) {
-      case NCHW:
-        layout_ = &nchw_;
-        break;
-      case NHWC:
-        layout_ = &nhwc_;
-        break;
-      case NC:
-        layout_ = &nc_;
-        break;
-      case NHW:
-        layout_ = &nhw_;
-        break;
-      case N:
-        layout_ = &n_;
-        break;
-      default:
-        break;
-    }
-  }
-
-  void print() {}
-
-  int operator[](int index) { return dims_[index]; }
-
- private:
-  LayoutType layoutType_;
-  Layout* layout_ = &nhwc_;
-  std::vector<int> dims_;
-};
-
-} // namespace zynqmp
-} // namespace paddle
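Illustrative usage sketch (not part of the deleted sources): how the zynqmp::Shape above resolves per-layout axes through the Layout indirection, and how the aligned element count can exceed the logical one. Names follow the deleted headers; the exact NHWC padding rule lives in layout.hpp, and this only compiles inside the pre-deletion Paddle-Lite tree.

    #include "lite/backends/fpga/KD/shape.hpp"

    int main() {
      // NHWC shape for one 3-channel 224x224 image.
      paddle::zynqmp::Shape s(paddle::zynqmp::NHWC, {1, 224, 224, 3});
      int n = s.num();       // 1, via Layout::numIndex()
      int c = s.channel();   // 3, NHWC keeps channel last
      int logical = s.numel();               // 1 * 224 * 224 * 3
      int padded = s.alignedElementCount();  // >= numel() when W*C is padded
      return (padded >= logical && n == 1 && c == 3) ? 0 : 1;
    }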
diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp
deleted file mode 100644
index f003ded33e..0000000000
--- a/lite/backends/fpga/KD/tensor.hpp
+++ /dev/null
@@ -1,456 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <cmath>
-#include <cstdint>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <memory>
-#include <string>
-#include <vector>
-
-// #include "lite/core/tensor.h"
-
-#include "lite/backends/fpga/KD/dl_engine.hpp"
-#include "lite/backends/fpga/KD/float16.hpp"
-#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
-#include "lite/backends/fpga/KD/shape.hpp"
-// #include "lite/backends/fpga/KD/types.hpp"
-
-namespace paddle {
-namespace zynqmp {
-
-enum DataType : int {
-  FP32 = 0,
-  FP16 = 1,
-  INT8 = 2,
-  INT32 = 3,
-};
-
-enum DataSyncStatus : int {
-  Synched = 0,
-  Device = 1,
-  CPU = 2,
-};
-
-typedef uint16_t float16;
-
-inline int CellSize(DataType type) {
-  switch (type) {
-    case FP32:
-      return sizeof(float);
-    case FP16:
-      return sizeof(float16);
-    case INT32:
-      return sizeof(int32_t);
-    case INT8:
-      return sizeof(int8_t);
-    default:
-      return 0;
-  }
-  return 0;
-}
-
-class PlaceHolder {
- public:
-  PlaceHolder() {}
-  explicit PlaceHolder(size_t size) {
-    size_ = size;
-    data_ = fpga_malloc(size_);
-  }
-
-  void* data() { return data_; }
-  void set_data(const void* ptr) { data_ = const_cast<void*>(ptr); }
-
-  size_t memorySize() { return size_; }
-  void set_size(size_t new_size) { size_ = new_size; }
-
-  ~PlaceHolder() { fpga_free(data_); }
-
-  float scale_[2];
-
- private:
-  void* data_ = nullptr;
-  size_t size_ = 0;
-};
-
-class Tensor {
- public:
-  Tensor() { DLEngine::get_instance(); }
-
-  int id() { return id_; }
-
-  template <typename Dtype>
-  Dtype* data() {
-    if (placeHolder_ == nullptr) {
-      return nullptr;
-    }
-    void* ptr = reinterpret_cast<char*>(this->placeHolder_->data()) +
-                offset * CellSize(dataType_);
-    return reinterpret_cast<Dtype*>(ptr);
-  }
-
-  template <typename Dtype>
-  Dtype* mutableData(DataType dataType, const Shape& shape) {
-    if (this->shape_ != nullptr) {
-      delete shape_;
-    }
-    this->shape_ = new Shape(shape);
-    this->dataType_ = dataType;
-    return mutableData<Dtype>();
-  }
-
-  template <typename Dtype>
-  Dtype* mutableData() {
-    size_t memorySize = shape_->memorySize(CellSize(dataType_));
-    if (placeHolder_ != nullptr) {
-      if (memorySize > placeHolder_->memorySize()) {
-        placeHolder_.reset(new PlaceHolder(memorySize));
-      }
-    } else {
-      placeHolder_.reset(new PlaceHolder(memorySize));
-    }
-    return data<Dtype>();
-  }
-
-  size_t memorySize() {
-    if (placeHolder_ == nullptr) {
-      return 0;
-    }
-    return placeHolder_->memorySize();
-  }
-
-  void setDataType(DataType dataType) { this->dataType_ = dataType; }
-
-  DataType dataType() { return this->dataType_; }
-
-  Shape& shape() { return *shape_; }
-
-  bool aligned() { return this->aligned_; }
-
-  void setAligned(bool aligned) { this->aligned_ = aligned; }
-
-  float* scale() { return placeHolder_->scale_; }
-
-  void alignImage(Tensor* dst = nullptr, bool copy = false) {
-    if (shape_->shouldAlign()) {
-      int cell_size = CellSize(this->dataType_);
-      char* dst_data = nullptr;
-      size_t mem_size = shape_->memorySize(cell_size);
-      if (dst == nullptr) {
-        dst_data = reinterpret_cast<char*>(fpga_malloc(mem_size));
-      } else {
-        dst_data = dst->data<char>();
-      }
-      int wc = shape_->width() * shape_->channel();
-      int wc_aligned = align_image(wc);
-      int remainder = wc_aligned - wc;
-
-      char* src_start = data<char>();
-      char* dst_start = dst_data;
-      for (int n = 0; n < shape_->num(); n++) {
-        for (int h = 0; h < shape_->height(); h++) {
-          memcpy(dst_start, src_start, wc * cell_size);
-          memset(dst_start + wc * cell_size, 0, remainder * cell_size);
-          src_start += wc * cell_size;
-          dst_start += wc_aligned * cell_size;
-        }
-      }
-      if (dst == nullptr) {
-        memcpy(data<char>(), dst_data, mem_size);
-        flush();
-        fpga_free(dst_data);
-      } else {
-        dst->flush();
-      }
-    }
else { - if (copy) { - dst->copyFrom(this); - } else { - // TODO(chonwhite) share data. - } - } - if (dst != nullptr) { - dst->copyScaleFrom(this); - } - } - - inline void copyScaleFrom(Tensor* src) { - placeHolder_->scale_[0] = src->placeHolder_->scale_[0]; - placeHolder_->scale_[1] = src->placeHolder_->scale_[1]; - } - - void unalignImage(Tensor* dst = nullptr, bool copy = false) { - Tensor* target = dst == nullptr ? this : dst; - if (!target->aligned_) { - if (copy && dst != nullptr) { - dst->copyFrom(this); - } - return; - } - target->syncToCPU(); - if (shape_->shouldAlign()) { - int cell_size = CellSize(this->dataType_); - char* dst_data = nullptr; - size_t mem_size = shape_->memorySize(cell_size); - if (dst == nullptr) { - dst_data = reinterpret_cast(fpga_malloc(mem_size)); - } else { - dst_data = dst->data(); - } - int wc = shape_->width() * shape_->channel(); - int wc_aligned = align_image(wc); - - char* src_start = data(); - char* dst_start = dst_data; - for (int n = 0; n < shape_->num(); n++) { - for (int h = 0; h < shape_->height(); h++) { - memcpy(dst_start, src_start, wc * cell_size); - src_start += wc_aligned * cell_size; - dst_start += wc * cell_size; - } - } - if (dst == nullptr) { - memcpy(data(), dst_data, mem_size); - flush(); - fpga_free(dst_data); - } else { - dst->flush(); - } - } else { - if (copy) { - dst->copyFrom(this); - } else { - // TODO(chonwhite) share data. - } - } - } - - void shareDataWith(Tensor* src) { shareDataWith(src, src->shape()); } - - void shareDataWith(Tensor* src, const Shape& shape, int offset = 0) { - if (shape_ != nullptr) { - delete shape_; - } - this->placeHolder_ = src->placeHolder_; - this->dataType_ = src->dataType_; - this->aligned_ = src->aligned_; - this->dateLocation_ = src->dateLocation_; - this->offset = offset; - shape_ = new Shape(const_cast(shape)); - } - - void copyFrom(Tensor* src) { - if (src->dataType_ == dataType_) { - src->syncToCPU(); - memcpy(data(), src->data(), memorySize()); - copyScaleFrom(src); - flush(); - return; - } - BypassArgs args; - args.input_data_type = - src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16; - args.output_data_type = dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16; - args.input_layout_type = LAYOUT_HWC; - args.output_layout_type = LAYOUT_HWC; - args.image = {.address = src->data(), - .scale_address = src->scale(), - .channels = (uint32_t)src->shape().numel(), - .width = 1, - .height = 1, - .pad_width = 0u, - .pad_height = 0u}; - args.output = { - .address = data(), .scale_address = scale(), - }; - src->syncToDevice(); - size_t aligned_remainder = src->shape().numel() % 16; - if (aligned_remainder > 0) { - size_t dtype_size = - src->dataType_ == FP32 ? 
sizeof(float) : sizeof(float16); - void* dst = src->data() + src->shape().numel() * dtype_size; - memset(dst, 0, aligned_remainder * dtype_size); - fpga_flush(dst, aligned_remainder * dtype_size); - } - src->syncToDevice(); - this->invalidate(); - perform_bypass(args); - this->invalidate(); - } - - void flush() { fpga_flush(placeHolder_->data(), placeHolder_->memorySize()); } - - void invalidate() { - fpga_invalidate(placeHolder_->data(), placeHolder_->memorySize()); - } - - void sync() { - switch (synchedStatus_) { - case CPU: - flush(); - break; - case Device: - invalidate(); - break; - default: - break; - } - } - - void syncToCPU() { - if (dateLocation_ == Device) { - invalidate(); - } - } - - void syncToDevice() { - if (dateLocation_ == CPU) { - flush(); - } - } - - DataSyncStatus synchedStatus() { return synchedStatus_; } - - void setSynchedStatus(DataSyncStatus status) { synchedStatus_ = status; } - - void setDataLocation(DataSyncStatus location) { dateLocation_ = location; } - - void print() {} - - void printScale() { - if (placeHolder_ == nullptr) { - return; - } - } - - std::string dimsFileName() { - return std::to_string(shape_->num()) + "_" + - std::to_string(shape_->channel()) + "_" + - std::to_string(shape_->height()) + "_" + - std::to_string(shape_->width()) + ".txt"; - } - - void saveToFile() { std::string path = dimsFileName(); } - - void saveToFile(std::string prefix, bool with_shape) { - std::string path = prefix; - if (with_shape) { - path = path + "_" + dimsFileName(); - } else { - path = path + ".txt"; - } - saveToFile(path); - } - - friend std::ostream& operator<<(std::ostream& os, Tensor& tensor) { - os << "tensor:" - << "\n"; - os << "dims: {"; - for (int i = 0; i < tensor.shape().dimSize(); ++i) { - os << tensor.shape()[i] << " "; - } - os << "}\n"; - for (int i = 0; i < tensor.shape().numel(); i++) { - float value = 0; - if (tensor.dataType() == FP32) { - value = tensor.data()[i]; - } else { - value = half_to_float(tensor.data()[i]); - } - os << value << " "; - } - os << "\n"; - return os; - } - - void saveToFile(std::string path) { - syncToCPU(); - std::ofstream ofs; - static int counter = 0; - std::string npath = std::to_string(counter) + "_" + path; - counter++; - save_file_with_name(npath); - } - - void save_file_with_name(std::string path) { - // return; - invalidate(); - std::ofstream ofs; - - ofs.open(path); - for (int i = 0; i < shape_->numel(); i++) { - float value = 0; - if (dataType_ == FP32) { - value = data()[i]; - } else { - value = half_to_float(data()[i]); - } - ofs << value << std::endl; - } - ofs.close(); - } - - void readFromFile(std::string path) { - std::ifstream file_stream; - file_stream.open(path); - if (!file_stream) { - return; - } - int num = shape_->numel(); - invalidate(); - float max = 0.0f; - float16* data = mutableData(); - for (int i = 0; i < num; ++i) { - float value = 0; - file_stream >> value; - max = std::max(std::abs(value), max); - data[i] = float_to_half(value); - } - flush(); - placeHolder_->scale_[0] = max / 127.0f; - placeHolder_->scale_[1] = 127.0f / max; - } - - ~Tensor() { - if (shape_ != nullptr) { - delete shape_; - shape_ = nullptr; - } - } - - private: - int offset = 0; - std::shared_ptr placeHolder_; - Shape* shape_ = nullptr; - DataType dataType_ = FP32; - bool aligned_ = false; - DataSyncStatus synchedStatus_ = Synched; - DataSyncStatus dateLocation_ = Device; - - static int generateID() { - static int sID = 0; - int id = sID++; - return id; - } - - int id_ = generateID(); -}; - -} // namespace zynqmp -} // 
namespace paddle diff --git a/lite/backends/fpga/KD/tensor_util.cpp b/lite/backends/fpga/KD/tensor_util.cpp deleted file mode 100644 index cbf5df15cd..0000000000 --- a/lite/backends/fpga/KD/tensor_util.cpp +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "lite/backends/fpga/KD/tensor_util.hpp" - -namespace paddle { -namespace zynqmp { -float find_max(const Tensor& tensor) { - float max = 0; - Tensor& t = const_cast(tensor); - float* data = t.data(); - for (int i = 0; i < t.shape().numel(); i++) { - float value = data[i] > 0 ? data[i] : -data[i]; - max = std::max(value, max); - } - return max; -} -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/tensor_util.hpp b/lite/backends/fpga/KD/tensor_util.hpp deleted file mode 100644 index 01f5757039..0000000000 --- a/lite/backends/fpga/KD/tensor_util.hpp +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "lite/backends/fpga/KD/tensor.hpp" - -namespace paddle { -namespace zynqmp { -float find_max(const Tensor& tensor); -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/lite_tensor.cc b/lite/backends/fpga/lite_tensor.cc deleted file mode 100644 index 43218173fd..0000000000 --- a/lite/backends/fpga/lite_tensor.cc +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
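The find_max above feeds the two-entry scale that FP16 tensors carry throughout this KD backend (compare Tensor::readFromFile and ResizePE::compute_scale earlier in the patch). A standalone sketch of that convention, assuming only the max/127 rule visible here, for illustration:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // scale[0] = max/127 (quantize step), scale[1] = 127/max (its inverse),
    // both derived from the absolute maximum of the buffer. Like the deleted
    // code, this does not guard against an all-zero input (max == 0).
    static void compute_scale(const std::vector<float>& data, float scale[2]) {
      float max = 0.0f;
      for (float v : data) max = std::max(max, std::fabs(v));
      scale[0] = max / 127.0f;
      scale[1] = 127.0f / max;
    }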
- -#include "lite/backends/fpga/lite_tensor.h" -#include - -namespace paddle { -namespace lite { - -using value_type = int64_t; - -value_type DDimLite::production() const { - value_type res = 1; - for (size_t i = 0; i < this->size(); i++) { - res *= (*this)[i]; - } - return res; -} - -value_type DDimLite::count(int start, int end) const { - if (start < 0) { - start = 0; - } - if (end > size()) { - end = size(); - } - if (end < start) { - end = start; - } - value_type sum = 1; - for (auto i = start; i < end; ++i) { - sum *= data_[i]; - } - return sum; -} - -DDimLite DDimLite::Slice(int start, int end) const { - std::vector vec; - for (int i = start; i < end; i++) { - vec.push_back((*this)[i]); - } - return DDimLite(vec); -} - -std::string DDimLite::repr() const { - std::stringstream ss; - if (empty()) { - ss << "{}"; - return ss.str(); - } - ss << "{"; - for (size_t i = 0; i < this->size() - 1; i++) { - ss << (*this)[i] << ","; - } - if (!this->empty()) ss << (*this)[size() - 1]; - ss << "}"; - return ss.str(); -} - -void TensorLite::ShareDataWith(const TensorLite &other) { - buffer_ = other.buffer_; - dims_ = other.dims_; - zynq_tensor_ = other.zynq_tensor_; - target_ = other.target_; - lod_ = other.lod_; - memory_size_ = other.memory_size_; - throw - 1; -} - -void *TensorLite::mutable_data(size_t memory_size) { - memory_size_ = memory_size; - buffer_->ResetLazy(target_, memory_size_); - // throw -1; - std::cout << memory_size << std::endl; - return buffer_->data(); -} - -void *TensorLite::mutable_data(TargetType target, size_t memory_size) { - target_ = target; - return mutable_data(memory_size); -} - -void TensorLite::CopyDataFrom(const TensorLite &other) { - dims_ = other.dims_; - target_ = other.target_; - lod_ = other.lod_; - // memory_size_ = other.memory_size_; - // buffer_->CopyDataFrom(*other.buffer_, memory_size_); - zynq_tensor_->mutableData(other.zynq_tensor_->dataType(), - other.zynq_tensor_->shape()); -} - -// template -// void TensorLite::mutable_data_internal() { - -// } - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/fpga/lite_tensor.h b/lite/backends/fpga/lite_tensor.h deleted file mode 100644 index 2f9df3abb0..0000000000 --- a/lite/backends/fpga/lite_tensor.h +++ /dev/null @@ -1,251 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
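A worked example of the DDimLite arithmetic implemented above (the class declaration follows in lite_tensor.h). Illustrative only; it assumes the surrounding pre-deletion Paddle-Lite tree for the include.

    #include "lite/backends/fpga/lite_tensor.h"

    int main() {
      paddle::lite::DDimLite d(std::vector<int64_t>({2, 3, 4, 5}));
      bool ok = d.production() == 120 &&              // 2*3*4*5
                d.count(1, 3) == 12 &&                // 3*4; start/end clamped
                d.Slice(1, 3).production() == 12 &&   // {3, 4}
                d.Flatten2D(2).production() == 120;   // {6, 20}
      return ok ? 0 : 1;
    }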
-
-#pragma once
-
-#include <algorithm>
-#include <functional>  // for multiplies
-#include <memory>
-#include <numeric>
-#include <string>
-#include <vector>
-
-#include "lite/backends/fpga/KD/tensor.hpp"
-#include "lite/core/memory.h"
-
-namespace paddle {
-namespace lite {
-
-class DDimLite;
-class TensorLite;
-
-using DDim = lite::DDimLite;
-using Tensor = lite::TensorLite;
-
-class DDimLite {
- public:
-  using value_type = int64_t;
-
-  DDimLite() = default;
-
-  explicit DDimLite(const std::vector<value_type> &x) { ConstructFrom(x); }
-
-  void ConstructFrom(const std::vector<value_type> &x) { data_ = x; }
-
-  value_type operator[](int offset) const { return data_[offset]; }
-  value_type &operator[](int offset) { return data_[offset]; }
-  std::vector<value_type> Vectorize() const { return data_; }
-
-  size_t size() const { return data_.size(); }
-  bool empty() const { return data_.empty(); }
-
-  value_type production() const;
-
-  const std::vector<value_type> &data() const { return data_; }
-  value_type count(int start, int end) const;
-
-  DDimLite Slice(int start, int end) const;
-
-  DDimLite Flatten2D(int col) const {
-    return DDimLite(std::vector<value_type>(
-        {Slice(0, col).production(), Slice(col, size()).production()}));
-  }
-
-  std::string repr() const;
-
-  friend std::ostream &operator<<(std::ostream &os, const DDimLite &dims) {
-    os << dims.repr();
-    return os;
-  }
-
-  friend bool operator==(const DDimLite &a, const DDimLite &b) {
-    if (a.size() != b.size()) return false;
-    for (size_t i = 0; i < a.size(); i++) {
-      if (a[i] != b[i]) return false;
-    }
-    return true;
-  }
-
-  friend bool operator!=(const DDimLite &a, const DDimLite &b) {
-    return !(a == b);
-  }
-
- private:
-  std::vector<value_type> data_;
-};
-
-using LoD = std::vector<std::vector<uint64_t>>;
-
-// A light-weight tensor implementation.
-class TensorLite {
- public:
-  TensorLite() : buffer_(std::make_shared<Buffer>()) {}
-
-  template <typename DType, typename DimT, TargetType Target>
-  void Assign(DType *data, const DimT &dim) {
-    Resize(dim);
-    auto *dst = mutable_data<DType>(Target);
-    CopySync<Target>(
-        dst, data, dim.production() * sizeof(DType), IoDirection::HtoD);
-  }
-
-  // T is the data type and R is the return type
-  // For OpenCL, the return type can be cl::Buffer
-  // and the data type can be float/int8_t.
-  // For other devices, T and R may be the same type.
-  template <typename T, typename R = T>
-  const R *data() const {
-    return zynq_tensor_->data<R>();
-  }
-
-  void Resize(const DDimLite &ddim) { dims_ = ddim; }
-  void Resize(const std::vector<int64_t> &x) { dims_ = DDimLite(x); }
-
-  const DDimLite &dims() const { return dims_; }
-  int64_t numel() const { return dims_.production(); }
-
-  const LoD &lod() const { return lod_; }
-  LoD *mutable_lod() { return &lod_; }
-
-  void set_lod(const LoD &lod) { lod_ = lod; }
-
-  PrecisionType precision() const { return precision_; }
-  void set_precision(PrecisionType precision) { precision_ = precision; }
-
-  bool persistable() const { return persistable_; }
-  void set_persistable(bool persistable) { persistable_ = persistable; }
-  // T is the data type and R is the return type
-  // For OpenCL, the return type can be cl::Buffer
-  // and the data type can be float/int8_t.
-  // For other devices, T and R may be the same type.
-  template <typename T, typename R = T>
-  R *mutable_data();
-
-  // T is the data type and R is the return type
-  // For OpenCL, the return type can be cl::Buffer
-  // and the data type can be float/int8_t.
-  // For other devices, T and R may be the same type.
-
-  template <typename T, typename R = T>
-  R *mutable_data(TargetType target);
-  void *mutable_data(size_t memory_size);
-  void *mutable_data(TargetType target, size_t memory_size);
-
-  const void *raw_data() const { return buffer_->data(); }
-
-  size_t data_size() const { return this->dims().production(); }
-
-  size_t memory_size() const { return zynq_tensor_->memorySize(); }
-
-  bool IsInitialized() const { return buffer_->data(); }
-
-  // Other share data to this.
-  void ShareDataWith(const TensorLite &other);
-
-  void CopyDataFrom(const TensorLite &other);
-
-  template <typename T>
-  TensorLite Slice(int64_t begin, int64_t end) const;
-
-  TargetType target() const { return target_; }
-
-  zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
-
-  friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
-    os << "Tensor:" << '\n';
-    os << "dim: " << tensor.dims() << '\n';
-    for (int i = 0; i < tensor.dims().production(); i++) {
-      os << tensor.template data<float>()[i] << " ";
-    }
-    os << "\n";
-    return os;
-  }
-
- private:
-  TargetType target_{TargetType::kHost};
-  DDimLite dims_;
-  std::shared_ptr<Buffer> buffer_;
-  LoD lod_;
-  size_t memory_size_{};
-
-  size_t offset_{0};
-
-  PrecisionType precision_{PrecisionType::kUnk};
-  bool persistable_{false};
-
-  zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor();
-
-  template <typename T>
-  void mutable_data_internal();
-};
-
-template <typename T, typename R>
-R *TensorLite::mutable_data() {
-  std::vector<int> v;
-  for (int i = 0; i < dims_.size(); i++) {
-    v.push_back(dims_[i]);
-  }
-  zynqmp::LayoutType layout_type = zynqmp::NCHW;
-  switch (v.size()) {
-    case 1:
-      layout_type = zynqmp::N;
-      break;
-    case 2:
-      layout_type = zynqmp::NC;
-      break;
-    case 3:
-      layout_type = zynqmp::NHW;
-      break;
-    case 4:
-      layout_type = zynqmp::NCHW;
-      break;
-  }
-  zynqmp::Shape input_shape(layout_type, v);
-
-  zynqmp::DataType data_type = zynqmp::FP32;
-  if (typeid(T) == typeid(float)) {
-    data_type = zynqmp::FP32;
-  }
-  if (typeid(T) == typeid(zynqmp::float16)) {
-    data_type = zynqmp::FP16;
-  }
-  return zynq_tensor_->mutableData<R>(data_type, input_shape);
-}
-
-template <typename T, typename R>
-R *TensorLite::mutable_data(TargetType target) {
-  target_ = target;
-  return mutable_data<T>();
-}
-
-template <typename TensorT>
-bool TensorCompareWith(const TensorT &a, const TensorT &b) {
-  if (a.dims() != b.dims()) return false;
-  if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false;
-  return true;
-}
-template <typename T>
-TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
-  int64_t base = numel() / dims_[0];
-
-  TensorLite dst;
-  dst.buffer_ = buffer_;
-  dst.target_ = target_;
-  auto dst_dims = dims_;
-  dst_dims[0] = end - begin;
-  dst.Resize(dst_dims);
-  dst.offset_ = offset_ + static_cast<size_t>(begin * base) * sizeof(T);
-  return dst;
-}
-} // namespace lite
-} // namespace paddle
diff --git a/lite/backends/fpga/target_wrapper.cc b/lite/backends/fpga/target_wrapper.cc
deleted file mode 100644
index 653384b061..0000000000
--- a/lite/backends/fpga/target_wrapper.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/target_wrapper.h" -#include "lite/backends/fpga/KD/llapi/zynqmp_api.h" -#include "lite/utils/all.h" -#ifdef LITE_WITH_FPGA -namespace paddle { -namespace lite { - -void* TargetWrapper::Malloc(size_t size) { - return zynqmp::fpga_malloc(size); -} - -void TargetWrapper::Free(void* ptr) { zynqmp::fpga_free(ptr); } - -void TargetWrapper::MemcpySync(void* dst, - const void* src, - size_t size, - IoDirection dir) { - memcpy(dst, src, size); -} - -} // namespace lite -} // namespace paddle -#endif diff --git a/lite/backends/host/CMakeLists.txt b/lite/backends/host/CMakeLists.txt deleted file mode 100644 index 8c22d8da75..0000000000 --- a/lite/backends/host/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -lite_cc_library(target_wrapper_host SRCS target_wrapper.cc) - - diff --git a/lite/backends/host/target_wrapper.cc b/lite/backends/host/target_wrapper.cc deleted file mode 100644 index 5f020662a9..0000000000 --- a/lite/backends/host/target_wrapper.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/target_wrapper.h" -#include -#include - -namespace paddle { -namespace lite { - -const int MALLOC_ALIGN = 64; - -void* TargetWrapper::Malloc(size_t size) { - size_t offset = sizeof(void*) + MALLOC_ALIGN - 1; - char* p = static_cast(malloc(offset + size)); - if (!p) { - return nullptr; - } - void* r = reinterpret_cast(reinterpret_cast(p + offset) & - (~(MALLOC_ALIGN - 1))); - static_cast(r)[-1] = p; - memset(r, 0, size); - return r; -} -void TargetWrapper::Free(void* ptr) { - if (ptr) { - free(static_cast(ptr)[-1]); - } -} -void TargetWrapper::MemcpySync(void* dst, - const void* src, - size_t size, - IoDirection dir) { - memcpy(dst, src, size); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/npu/CMakeLists.txt b/lite/backends/npu/CMakeLists.txt deleted file mode 100644 index abe567566b..0000000000 --- a/lite/backends/npu/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -if(NOT LITE_WITH_NPU) - return() -endif() - -lite_cc_library(npu_helper SRCS npu_helper.cc DEPS ${npu_ddk_libs}) -add_subdirectory(bridge) diff --git a/lite/backends/npu/bridge/CMakeLists.txt b/lite/backends/npu/bridge/CMakeLists.txt deleted file mode 100644 index cf3ad99055..0000000000 --- a/lite/backends/npu/bridge/CMakeLists.txt +++ /dev/null @@ -1,67 +0,0 @@ - -lite_cc_library(npu_bridge_registry SRCS registry.cc DEPS ${npu_ddk_libs}) -lite_cc_library(npu_bridge_utils SRCS utils.cc DEPS ${npu_ddk_libs} tensor op mir_node scope) - -set(npu_bridge_deps npu_bridge_registry npu_bridge_utils op) - -lite_cc_library(npu_bridge_fc_op SRCS fc_op.cc DEPS ${npu_bridge_deps}) -lite_cc_library(npu_bridge_conv_op SRCS conv_op.cc DEPS ${npu_bridge_deps}) -lite_cc_library(npu_bridge_mul_op SRCS mul_op.cc DEPS ${npu_bridge_deps}) -lite_cc_library(npu_bridge_act_op SRCS act_op.cc 
DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_scale_op SRCS scale_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_softmax_op SRCS softmax_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_pool_op SRCS pool_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_batch_norm_op SRCS batch_norm_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_elementwise_op SRCS elementwise_ops.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_reshape_op SRCS reshape_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_conv_transpose_op SRCS conv_transpose_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_interpolate_op SRCS interpolate_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_transpose_op SRCS transpose_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_split_op SRCS split_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_concat_op SRCS concat_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_shuffle_channel_op SRCS shuffle_channel_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_pad2d_op SRCS pad2d_op.cc DEPS ${npu_bridge_deps})
-
-set(npu_bridges
-  npu_bridge_registry
-  npu_bridge_utils
-  npu_bridge_fc_op
-  npu_bridge_conv_op
-  npu_bridge_mul_op
-  npu_bridge_act_op
-  npu_bridge_scale_op
-  npu_bridge_softmax_op
-  npu_bridge_pool_op
-  npu_bridge_batch_norm_op
-  npu_bridge_elementwise_op
-  npu_bridge_reshape_op
-  npu_bridge_conv_transpose_op
-  npu_bridge_interpolate_op
-  npu_bridge_transpose_op
-  npu_bridge_split_op
-  npu_bridge_concat_op
-  npu_bridge_shuffle_channel_op
-  npu_bridge_pad2d_op
-  CACHE INTERNAL "npu_bridges")
-
-lite_cc_library(npu_test_helper SRCS test_helper.cc DEPS npu_helper ${npu_ddk_libs} ${npu_bridges} ${npu_kernels} ${ops})
-
-lite_cc_test(test_npu_bridge_fc_op SRCS fc_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_conv_op SRCS conv_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_mul_op SRCS mul_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_act_op SRCS act_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_scale_op SRCS scale_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_softmax_op SRCS softmax_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_pool_op SRCS pool_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_batch_norm_op SRCS batch_norm_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_elementwise_op SRCS elementwise_ops_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_reshape_op SRCS reshape_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_conv_transpose_op SRCS conv_transpose_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_interpolate_op SRCS interpolate_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_transpose_op SRCS transpose_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_split_op SRCS split_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_concat_op SRCS concat_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_shuffle_channel_op SRCS shuffle_channel_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_pad2d_op SRCS pad2d_op_test.cc DEPS npu_test_helper)
-
-message(STATUS "+++++ npu_bridges: ${npu_bridges}")
diff --git a/lite/backends/npu/bridge/act_op.cc b/lite/backends/npu/bridge/act_op.cc
deleted file mode 100644
index 9573f7d7e9..0000000000
--- a/lite/backends/npu/bridge/act_op.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-// Copyright (c) 2019
PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" -#include "lite/operators/relu_op.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type ActConverter(const std::shared_ptr act_op, - const node_map_type& inputs_map) { - auto scope = act_op->scope(); - auto op_info = act_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - // create act node and set input node from inputs_map - auto x_var_name = op_info->Input("X").front(); - auto act_node = std::make_shared(unique_op_type); - CHECK(inputs_map.count(x_var_name)); - act_node->set_input_x(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(act_node); - - // parse and set activation type - int act_mode = 1; - if (op_type == "sigmod") { - act_mode = 0; - } else if (op_type == "relu") { - act_mode = 1; - } else if (op_type == "tanh") { - act_mode = 2; - } else if (op_type == "elu") { - act_mode = 4; - } else if (op_type == "abs") { - act_mode = 6; - } else if (op_type == "softsign") { - act_mode = 8; - } else if (op_type == "softplus") { - act_mode = 9; - } else if (op_type == "hardsigmoid") { - act_mode = 10; - } else { - // TODO(hong19860320) add more activation mode, and set the coef value - // clipped ReLU, LEAKY_RELU, relu1, threshold, selu and linear - LOG(FATAL) << "Unsupported activation type " << op_type; - } - act_node->set_attr_mode(act_mode); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = act_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(sigmod, paddle::lite::npu::bridge::ActConverter); -REGISTER_NPU_BRIDGE(relu, paddle::lite::npu::bridge::ActConverter); -REGISTER_NPU_BRIDGE(tanh, paddle::lite::npu::bridge::ActConverter); -REGISTER_NPU_BRIDGE(elu, paddle::lite::npu::bridge::ActConverter); -REGISTER_NPU_BRIDGE(abs, paddle::lite::npu::bridge::ActConverter); -REGISTER_NPU_BRIDGE(softsign, paddle::lite::npu::bridge::ActConverter); -REGISTER_NPU_BRIDGE(softplus, paddle::lite::npu::bridge::ActConverter); -REGISTER_NPU_BRIDGE(hardsigmoid, paddle::lite::npu::bridge::ActConverter); diff --git a/lite/backends/npu/bridge/act_op_test.cc b/lite/backends/npu/bridge/act_op_test.cc deleted file mode 100644 index edbfbb416f..0000000000 --- a/lite/backends/npu/bridge/act_op_test.cc +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
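One note on `ActConverter` above: both the dispatch chain and the registration spell the sigmoid op type as `sigmod`, which looks like a typo for Paddle's `sigmoid` op type, so that branch would never match a real sigmoid op. A compact, table-driven equivalent of the mode dispatch, reusing the mode codes that the converter feeds into `ge::op::Activation` (hypothetical helper, not part of the patch):

```cpp
#include <map>
#include <string>

// Sketch: table-driven replacement for the if/else chain in ActConverter.
// Mode codes follow the values used in act_op.cc above.
int ActTypeToNpuMode(const std::string& op_type) {
  static const std::map<std::string, int> mode_table{
      {"sigmoid", 0},  // spelled "sigmod" in the original, likely a typo
      {"relu", 1},
      {"tanh", 2},
      {"elu", 4},
      {"abs", 6},
      {"softsign", 8},
      {"softplus", 9},
      {"hardsigmoid", 10},
  };
  auto it = mode_table.find(op_type);
  return it == mode_table.end() ? -1 : it->second;  // -1: unsupported type
}
```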
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" -#include "lite/operators/relu_op.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -void relu_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_data = x->data(); - auto out_data = out->mutable_data(); - DDim x_dims = x->dims(); - DDim out_dims = out->dims(); - CHECK_EQ(x_dims.production(), out_dims.production()); - for (int i = 0; i < out_dims.production(); i++) { - out_data[i] = std::max(0.f, x_data[i]); - } -} - -void test_relu(int bs, int ic, int ih, int iw) { - // prepare input&output variables - Scope scope; - std::string x_var_name("x"); - std::string out_var_name("out"); - std::string out_ref_var_name("out_ref"); - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("relu"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - relu_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } -} - -TEST(NPUBridges, relu) { - for (auto bs : {1, 3}) { - for (auto ic : {3, 4}) { - for (auto ih : {2, 5}) { - for (auto iw : {5, 9}) { - VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih - << " iw: " << iw; - test_relu(bs, ic, ih, iw); - } - } - } - } -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(relu); -USE_NPU_BRIDGE(relu); diff --git a/lite/backends/npu/bridge/batch_norm_op.cc b/lite/backends/npu/bridge/batch_norm_op.cc deleted file mode 100644 index 76b4ac3d9b..0000000000 --- a/lite/backends/npu/bridge/batch_norm_op.cc +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
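The relu test above fixes the comparison pattern that every bridge test in this patch repeats: run the converted graph with `LauchOp`, snapshot the NPU output, recompute with a CPU reference, then compare elementwise (tolerances vary per op, e.g. 1e-5 for relu but 1e-2 for batch_norm). A sketch of that shared check, factored out (hypothetical helper, not in the patch):

```cpp
#include <gtest/gtest.h>
#include <cstdint>

// Elementwise comparison used implicitly by every *_op_test.cc in this patch.
void CheckTensorNear(const float* actual,
                     const float* expected,
                     int64_t numel,
                     float abs_tol) {
  for (int64_t i = 0; i < numel; ++i) {
    EXPECT_NEAR(actual[i], expected[i], abs_tol) << "mismatch at index " << i;
  }
}
```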
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/batch_norm_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type BatchNormConverter( - const std::shared_ptr batch_norm_op, - const node_map_type& inputs_map) { - auto scope = batch_norm_op->scope(); - auto op_info = batch_norm_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - std::shared_ptr batch_norm_node = - std::make_shared(unique_op_type); - auto x_var_name = op_info->Input("X").front(); - - auto scale_var_name = op_info->Input("Scale").front(); - lite::Tensor* scale = scope->FindVar(scale_var_name)->GetMutable(); - auto npu_scale = std::make_shared(scale_var_name); - npu_scale->set_attr_value(CvtFromLiteTensor(scale)); - OpList::Global().add(npu_scale); - - auto bias_var_name = op_info->Input("Bias").front(); - lite::Tensor* bias = scope->FindVar(bias_var_name)->GetMutable(); - auto npu_bias = std::make_shared(bias_var_name); - npu_bias->set_attr_value(CvtFromLiteTensor(bias)); - OpList::Global().add(npu_bias); - - auto mean_var_name = op_info->Input("Mean").front(); - lite::Tensor* mean = scope->FindVar(mean_var_name)->GetMutable(); - auto npu_mean = std::make_shared(mean_var_name); - npu_mean->set_attr_value(CvtFromLiteTensor(mean)); - OpList::Global().add(npu_mean); - - auto variance_var_name = op_info->Input("Variance").front(); - lite::Tensor* variance = - scope->FindVar(variance_var_name)->GetMutable(); - auto npu_variance = std::make_shared(variance_var_name); - npu_variance->set_attr_value(CvtFromLiteTensor(variance)); - OpList::Global().add(npu_variance); - - float npu_momentum = op_info->GetAttr("momentum"); - float npu_epsilon = op_info->GetAttr("epsilon"); - int npu_mode = 1; // bnScale, bnBias tensor dims are 1xCx1x1 - bool npu_use_global_stats = op_info->GetAttr("use_global_stats"); - - batch_norm_node->set_input_x(*inputs_map.at(x_var_name)); - batch_norm_node->set_input_scale(*npu_scale); - batch_norm_node->set_input_b(*npu_bias); - batch_norm_node->set_input_mean(*npu_mean); - batch_norm_node->set_input_variance(*npu_variance); - batch_norm_node->set_attr_momentum(npu_momentum); - batch_norm_node->set_attr_epsilon(npu_epsilon); - batch_norm_node->set_attr_mode(npu_mode); - batch_norm_node->set_attr_use_global_stats(npu_use_global_stats); - - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(batch_norm_node); - - node_map_type outputs_map; - outputs_map[op_info->Output("Y").front()] = batch_norm_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(batch_norm, 
paddle::lite::npu::bridge::BatchNormConverter); diff --git a/lite/backends/npu/bridge/batch_norm_op_test.cc b/lite/backends/npu/bridge/batch_norm_op_test.cc deleted file mode 100644 index ec5898f6c8..0000000000 --- a/lite/backends/npu/bridge/batch_norm_op_test.cc +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/batch_norm_op.h" -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -template -void batch_norm_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto y = scope->FindVar(op_info->Output("Y").front())->GetMutable(); - auto bias = - scope->FindVar(op_info->Input("Bias").front())->GetMutable(); - auto scale = - scope->FindVar(op_info->Input("Scale").front())->GetMutable(); - auto mean = - scope->FindVar(op_info->Input("Mean").front())->GetMutable(); - auto variance = - scope->FindVar(op_info->Input("Variance").front())->GetMutable(); - - auto x_data = x->data(); - auto y_data = y->mutable_data(); - auto scale_data = scale->mutable_data(); - auto bias_data = bias->mutable_data(); - auto mean_data = mean->mutable_data(); - auto variance_data = variance->mutable_data(); - DDim x_dims = x->dims(); - - float epsilon = op_info->GetAttr("epsilon"); - float momentum = op_info->GetAttr("momentum"); - auto data_layout = op_info->GetAttr("data_layout"); - - bool global_stats = op_info->GetAttr("use_global_stats"); - if (global_stats) { - int64_t outer_size = 0; - int64_t channel_size = 0; - int64_t inner_size = 0; - if (data_layout == "NCHW") { - outer_size = x_dims[0]; - channel_size = x_dims[1]; - inner_size = x_dims.Slice(2, x_dims.size()).production(); - } else { - LOG(FATAL) << "Unknown storage order: " << data_layout; - } - auto x_ptr = x_data; - auto y_ptr = y_data; - for (int o = 0; o < outer_size; o++) { - for (int c = 0; c < channel_size; c++) { - for (int i = 0; i < inner_size; i++) { - dtype norm_x = - (*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon); - *y_ptr = norm_x * scale_data[c] + bias_data[c]; - x_ptr++; - y_ptr++; - } - } - } - } -} - -void test_batch_norm( - int bs, int ic, int ih, int iw, float epsilon, float momentum) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - std::string scale_var_name = "scale"; - std::string bias_var_name = "bias"; - std::string mean_var_name = "mean"; - std::string variance_var_name = "variance"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* scale = scope.Var(scale_var_name)->GetMutable(); - auto* bias = scope.Var(bias_var_name)->GetMutable(); - auto* mean = 
scope.Var(mean_var_name)->GetMutable(); - auto* variance = scope.Var(variance_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - scale->Resize({ic}); - bias->Resize({ic}); - mean->Resize({ic}); - variance->Resize({ic}); - - // initialize input&output data - FillTensor(x); - FillTensor(scale); - FillTensor(bias); - FillTensor(mean); - // variance > 0 - FillTensor(variance, 1.f, 5.f); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("batch_norm"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetInput("Scale", {scale_var_name}); - opdesc.SetInput("Bias", {bias_var_name}); - opdesc.SetInput("Mean", {mean_var_name}); - opdesc.SetInput("Variance", {variance_var_name}); - opdesc.SetOutput("Y", {out_var_name}); - opdesc.SetAttr("is_test", 1); - opdesc.SetAttr("use_global_stats", true); - opdesc.SetAttr("epsilon", epsilon); - opdesc.SetAttr("momentum", momentum); - opdesc.SetAttr("data_layout", std::string("NCHW")); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - batch_norm_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, batch_norm) { - for (auto bs : {1, 4, 7}) { - for (auto ic : {1, 4, 7}) { - for (auto ih : {1, 4, 7}) { - for (auto iw : {1, 4, 7}) { - for (auto epsilon : {1e-4f, 1e-5f}) { - for (auto momentum : {0.9f, 0.99f}) { - test_batch_norm(bs, ic, ih, iw, epsilon, momentum); - } - } - } - } - } - } -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(batch_norm); -USE_NPU_BRIDGE(batch_norm); diff --git a/lite/backends/npu/bridge/concat_op.cc b/lite/backends/npu/bridge/concat_op.cc deleted file mode 100644 index 8548225181..0000000000 --- a/lite/backends/npu/bridge/concat_op.cc +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
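`batch_norm_ref` above evaluates `y = (x - mean) / sqrt(variance + epsilon) * scale + bias` per channel. At inference time those four per-channel tensors can be folded into a single affine transform, which is the standard optimization this formulation enables; a minimal sketch of that folding (my addition, not in the patch):

```cpp
#include <cmath>
#include <vector>

// Fold batch-norm stats into per-channel alpha/beta so that inference
// reduces to y = alpha[c] * x + beta[c].
void FoldBatchNorm(const std::vector<float>& scale,
                   const std::vector<float>& bias,
                   const std::vector<float>& mean,
                   const std::vector<float>& variance,
                   float epsilon,
                   std::vector<float>* alpha,
                   std::vector<float>* beta) {
  size_t channels = scale.size();
  alpha->resize(channels);
  beta->resize(channels);
  for (size_t c = 0; c < channels; ++c) {
    (*alpha)[c] = scale[c] / std::sqrt(variance[c] + epsilon);
    (*beta)[c] = bias[c] - (*alpha)[c] * mean[c];
  }
}
```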
- -#include "lite/operators/concat_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" -#include "lite/backends/npu/npu_helper.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type ConcatConverter(const std::shared_ptr concat_op, - const node_map_type& inputs_map) { - lite::Scope* scope = concat_op->scope(); - const lite::OpInfo* op_info = concat_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "converting " << op_type << " ... "; - - auto x_var_names = op_info->Input("X"); - auto axis = op_info->GetAttr("axis"); - int num = x_var_names.size(); - int index = 0; - - std::shared_ptr output_node = - std::make_shared(unique_op_type); - output_node->set_attr_axis(axis); - output_node->set_attr_N(num); - output_node->create_dynamic_input_x(num); - for (auto x_var_name : x_var_names) { - if (inputs_map.find(x_var_name) != inputs_map.end()) { - output_node->set_dynamic_input_x(index + 1, *inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - } else { - auto consty = std::make_shared(x_var_name); - auto* x = scope->FindVar(x_var_name)->GetMutable(); - consty->set_attr_value(CvtFromLiteTensor(x)); - output_node->set_dynamic_input_x(index + 1, *consty); - OpList::Global().add(consty); - } - index++; - } - OpList::Global().add(output_node); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = output_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(concat, paddle::lite::npu::bridge::ConcatConverter); diff --git a/lite/backends/npu/bridge/concat_op_test.cc b/lite/backends/npu/bridge/concat_op_test.cc deleted file mode 100644 index f1bf3101b2..0000000000 --- a/lite/backends/npu/bridge/concat_op_test.cc +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/concat_op.h" -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -std::vector stride_numel(const DDim& ddim) { - std::vector strides(ddim.size()); - strides[ddim.size() - 1] = ddim[ddim.size() - 1]; - for (int i = ddim.size() - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * ddim[i]; - } - return strides; -} - -void concat_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = op_info->Input("X"); - std::vector inputs; - for (auto var : x) { - inputs.push_back(scope->FindVar(var)->GetMutable()); - } - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - int axis = op_info->GetAttr("axis"); - std::vector inputs_concat(inputs.size()); - for (int j = 0; j < inputs.size(); ++j) { - inputs_concat[j] = inputs[j]; - } - size_t num = inputs.size(); - int rows = 1; - auto dim_0 = inputs[0]->dims(); - for (int i = 0; i < axis; ++i) { - rows *= dim_0[i]; - } - int out_rows = rows, out_cols = 0; - std::vector inputs_cols(inputs.size()); - for (int i = 0; i < num; ++i) { - int t_cols = inputs[i]->numel() / rows; - out_cols += t_cols; - inputs_cols[i] = t_cols; - } - for (int k = 0; k < out_rows; ++k) { - float* dst_ptr = out->mutable_data() + k * out_cols; - int col_idx = 0; - for (int j = 0; j < num; ++j) { - int col_len = inputs_cols[j]; - const float* src_prt = inputs[j]->data() + k * col_len; - std::memcpy(dst_ptr + col_idx, src_prt, sizeof(float) * col_len); - col_idx += col_len; - } - } -} - -void test_concat(std::vector> input, int axis) { - std::string x_var_name = "x"; - std::string y_var_name = "y"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - - // prepare input&output variables - Scope scope; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* y = scope.Var(y_var_name)->GetMutable(); - x->Resize(DDim(input[0])); - y->Resize(DDim(input[1])); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - CHECK_EQ(out->dims(), out_ref->dims()); - - // initialize input&output data - FillTensor(x); - FillTensor(y); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("concat"); - opdesc.SetInput("X", {x_var_name, y_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("axis", axis); - - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name, y_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - concat_ref(op); - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data[i], out_ref_data[i], 5e-4); - } -} - -TEST(NPUBridges, concat) { - test_concat({{3, 3, 5, 2}, {2, 3, 5, 2}}, 0); - test_concat({{3, 5, 5, 2}, {3, 1, 5, 2}}, 1); - test_concat({{3, 3, 2, 2}, {3, 3, 4, 2}}, 2); - test_concat({{3, 3, 5, 2}, {3, 3, 5, 6}}, 3); -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(concat); -USE_NPU_BRIDGE(concat); diff --git a/lite/backends/npu/bridge/conv_op.cc b/lite/backends/npu/bridge/conv_op.cc deleted file mode 100644 index 1be3d17cb6..0000000000 --- a/lite/backends/npu/bridge/conv_op.cc +++ /dev/null @@ -1,216 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/conv_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type ConvConverter(const std::shared_ptr conv_op, - const node_map_type& inputs_map) { - auto scope = conv_op->scope(); - auto op_info = conv_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " << op_type << "... "; - - // get input, filter and op attributes - auto input_var_name = op_info->Input("Input").front(); - auto input = scope->FindVar(input_var_name)->GetMutable(); - auto input_dims = input->dims(); - auto output_var_name = op_info->Output("Output").front(); - auto output = scope->FindVar(output_var_name)->GetMutable(); - auto output_dims = output->dims(); - auto filter_var_name = op_info->Input("Filter").front(); - auto filter = scope->FindVar(filter_var_name)->GetMutable(); - auto filter_dims = filter->dims(); - auto bs = input_dims[0]; - auto ic = input_dims[1]; - auto oc = filter_dims[0]; - CHECK_EQ(input_dims.size(), 4); - CHECK_EQ(output_dims.size(), 4); - CHECK_EQ(filter_dims.size(), 4); - CHECK_EQ(output_dims[0], bs); - CHECK_EQ(output_dims[1], oc); - auto strides = op_info->GetAttr>("strides"); - auto paddings = op_info->GetAttr>("paddings"); - auto groups = op_info->GetAttr("groups"); - auto dilations = op_info->GetAttr>("dilations"); - auto fuse_relu = op_info->GetAttr("fuse_relu"); - CHECK_EQ(strides.size(), 2); - CHECK_EQ(paddings.size(), 2); - CHECK_EQ(dilations.size(), 2); - - // check depthwise mode, and decide whether use ConvolutionDepthwise Op - bool use_depthwise_conv = - false; // whether use ge::op::ConvolutionDepthwise ? 
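```cpp
// Editorial note, not part of the original source: the check that follows
// treats a conv as depthwise when ic == groups == oc, but routes it to
// ge::op::ConvolutionDepthwise only when HiAI's plain Convolution cannot
// take it, i.e. when dilation != 1 or groups is in {2, 3, 4}.
```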
- bool is_depthwise_mode = ic == groups && oc == groups; - if (is_depthwise_mode && - !((groups == 1 || groups >= 5) && dilations[0] == 1 && - dilations[1] == 1)) { - use_depthwise_conv = true; - LOG(WARNING) << "For depthwise mode, dilation = 1 and groups >= 5 (or " - "groups = 1) is only supported in " - "Convolution Op, so force to use ConvolutionDepthwise Op, " - "but may lead poor performance."; - } - - // check input - CHECK(inputs_map.count(input_var_name)); - OpList::Global().add(inputs_map.at(input_var_name)); - - // create filter node - CHECK(!inputs_map.count(filter_var_name)); - auto filter_const_node = std::make_shared(filter_var_name); - filter_const_node->set_attr_value(CvtFromLiteTensor(filter)); - OpList::Global().add(filter_const_node); - - // create bias node if has bias - // supports the bias nodes with the following dimensions - // 0: {oc} - // 1: {1, oc, oh, ow} - // 2: {n, oc, oh, ow} - std::shared_ptr bias_node = nullptr; - bool is_channel_bias = false; - if (HasInputArg(op_info, scope, "Bias")) { - auto bias_var_name = op_info->Input("Bias").front(); - auto* bias = scope->FindVar(bias_var_name)->GetMutable(); - auto bias_dims = bias->dims(); - auto bias_data_size = bias_dims.production(); - auto output_data_size = output_dims.production(); - std::vector bias_shape; - if (bias_data_size == oc) { - // 0: {oc} - bias_shape = {1, oc, 1, 1}; - is_channel_bias = true; - } else if (bias_data_size == output_data_size / bs) { - // 1: {1, oc, oh, ow} - bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]}; - } else if (bias_data_size == output_data_size) { - // 2: {n, oc, oh, ow} - bias_shape = output_dims.Vectorize(); - } else { - LOG(ERROR) << "bias dimension " << bias_dims - << " isn't supported in conv2d Op when output dimension is " - << output_dims; - } - if (inputs_map.count(bias_var_name)) { - // bias node from input map - bias_node = inputs_map.at(bias_var_name); - } else { - // bias node with const data - auto bias_const_node = std::make_shared(bias_var_name); - bias_const_node->set_attr_value(CvtFromLiteTensor(bias, bias_shape)); - bias_node = bias_const_node; - } - OpList::Global().add(bias_node); - } - - // create conv node and set input, filter, bias nodes and attributes - std::shared_ptr conv_node = nullptr; - if (use_depthwise_conv && is_depthwise_mode) { - auto depthwise_conv_node = - std::make_shared(unique_op_type); - depthwise_conv_node->set_input_x(*inputs_map.at(input_var_name)); - depthwise_conv_node->set_input_filter(*filter_const_node); - depthwise_conv_node->set_attr_mode(1); - depthwise_conv_node->set_attr_algo(0); - depthwise_conv_node->set_attr_format(0); // NCHW - depthwise_conv_node->set_attr_pad_mode(5); // VALID - depthwise_conv_node->set_attr_group(groups); - depthwise_conv_node->set_attr_pad(ge::AttrValue::LIST_INT( - {paddings[0], paddings[0], paddings[1], paddings[1]})); - depthwise_conv_node->set_attr_dilation( - ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); - depthwise_conv_node->set_attr_stride( - ge::AttrValue::LIST_INT({strides[0], strides[1]})); - depthwise_conv_node->set_attr_kernel( - ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); - OpList::Global().add(depthwise_conv_node); - conv_node = depthwise_conv_node; - // ConvolutionDepthwise Op doesn't support bias, so append Add node to - // support bias - if (bias_node != nullptr) { - auto add_node = std::make_shared(unique_op_type + "/add"); - add_node->set_input_x1(*depthwise_conv_node); - add_node->set_input_x2(*bias_node); - 
OpList::Global().add(add_node); - conv_node = add_node; - } - } else { - auto common_conv_node = - std::make_shared(unique_op_type); - common_conv_node->set_input_x(*inputs_map.at(input_var_name)); - common_conv_node->set_input_w(*filter_const_node); - common_conv_node->set_attr_mode(1); - common_conv_node->set_attr_pad_mode(0); // NOTSET - common_conv_node->set_attr_group(groups); - common_conv_node->set_attr_pad(ge::AttrValue::LIST_INT( - {paddings[0], paddings[0], paddings[1], paddings[1]})); - common_conv_node->set_attr_dilation( - ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); - common_conv_node->set_attr_stride( - ge::AttrValue::LIST_INT({strides[0], strides[1]})); - common_conv_node->set_attr_kernel( - ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); - OpList::Global().add(common_conv_node); - conv_node = common_conv_node; - // Convolution Op only support bias with dimension {1, oc, 1, 1}, - // so append Add node if dimension is {1, oc, oh, ow} or (n, oc, oh, ow) - if (bias_node != nullptr) { - if (is_channel_bias) { - common_conv_node->set_input_b(*bias_node); - } else { - auto add_node = std::make_shared(unique_op_type + "/add"); - add_node->set_input_x1(*common_conv_node); - add_node->set_input_x2(*bias_node); - OpList::Global().add(add_node); - conv_node = add_node; - } - } - } - CHECK(conv_node); - - node_map_type outputs_map; - if (fuse_relu) { - // append relu node if fuse_relu is true - auto relu_node = - std::make_shared(unique_op_type + "/relu"); - relu_node->set_input_x(*conv_node); - relu_node->set_attr_mode(1); - OpList::Global().add(relu_node); - outputs_map[op_info->Output("Output").front()] = relu_node; - } else { - outputs_map[op_info->Output("Output").front()] = conv_node; - } - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(conv2d, paddle::lite::npu::bridge::ConvConverter); -REGISTER_NPU_BRIDGE(depthwise_conv2d, paddle::lite::npu::bridge::ConvConverter); diff --git a/lite/backends/npu/bridge/conv_op_test.cc b/lite/backends/npu/bridge/conv_op_test.cc deleted file mode 100644 index 27e1226eaf..0000000000 --- a/lite/backends/npu/bridge/conv_op_test.cc +++ /dev/null @@ -1,280 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
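Both `ConvConverter` above and the `test_conv` harness that follows assume the standard output-size relation for dilated convolution. Stated once as code (reference-only helper, not part of the patch):

```cpp
#include <cassert>

// Sketch: spatial output size of a (possibly dilated) convolution, matching
// the formula used in test_conv below.
inline int ConvOutSize(int in, int pad, int kernel, int stride, int dilation) {
  int dkernel = dilation * (kernel - 1) + 1;  // effective (dilated) kernel
  assert(in + 2 * pad >= dkernel);
  return (in + 2 * pad - dkernel) / stride + 1;
}
```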
- -#include "lite/operators/conv_op.h" -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -void conv_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto input = - scope->FindVar(op_info->Input("Input").front())->GetMutable(); - auto filter = - scope->FindVar(op_info->Input("Filter").front())->GetMutable(); - auto output = - scope->FindVar(op_info->Output("Output").front())->GetMutable(); - std::vector strides = - op_info->GetAttr>("strides"); - std::vector paddings = - op_info->GetAttr>("paddings"); - int32_t groups = op_info->GetAttr("groups"); - std::vector dilations = - op_info->GetAttr>("dilations"); - bool fuse_relu = op_info->GetAttr("fuse_relu"); - auto input_dims = input->dims(); - auto filter_dims = filter->dims(); - auto output_dims = output->dims(); - auto input_data = input->mutable_data(); - auto filter_data = filter->mutable_data(); - auto output_data = output->mutable_data(); - int kernel_w = filter_dims[3]; - int kernel_h = filter_dims[2]; - int stride_w = strides[1]; - int stride_h = strides[0]; - int dila_w = dilations[1]; - int dila_h = dilations[0]; - int pad_w = paddings[1]; - int pad_h = paddings[0]; - int batch_size = input_dims[0]; - int in_ch_size = input_dims[1]; - int in_h = input_dims[2]; - int in_w = input_dims[3]; - int out_ch_size = output_dims[1]; - int out_h = output_dims[2]; - int out_w = output_dims[3]; - int out_c_group = out_ch_size / groups; - int in_c_group = in_ch_size / groups; - Tensor* bias = nullptr; - float* bias_data = nullptr; - bool is_channel_bias = false; - if (op_info->HasInput("Bias")) { - auto bias_var_names = op_info->Input("Bias"); - if (bias_var_names.size() > 0) { - auto bias_var_name = bias_var_names.front(); - bias = scope->FindVar(bias_var_name)->GetMutable(); - auto bias_dims = bias->dims(); - is_channel_bias = bias_dims.production() == out_ch_size; - bias_data = bias->mutable_data(); - } - } - for (int n = 0; n < batch_size; ++n) { - for (int g = 0; g < groups; ++g) { - for (int oc = 0; oc < out_c_group; ++oc) { - for (int oh = 0; oh < out_h; ++oh) { - for (int ow = 0; ow < out_w; ++ow) { - int out_idx = n * groups * out_c_group * out_h * out_w + - g * out_c_group * out_h * out_w + oc * out_h * out_w + - oh * out_w + ow; - float out_value = - bias_data != nullptr - ? (is_channel_bias ? bias_data[g * out_c_group + oc] - : bias_data[out_idx]) - : 0; - // + out_value *= beta; - for (int ic = 0; ic < in_c_group; ++ic) { - for (int kh = 0; kh < kernel_h; ++kh) { - for (int kw = 0; kw < kernel_w; ++kw) { - int iw = ow * stride_w - pad_w + kw * (dila_w); - int ih = oh * stride_h - pad_h + kh * (dila_h); - if (iw < 0 || iw >= in_w) continue; - if (ih < 0 || ih >= in_h) continue; - int in_idx = n * in_ch_size * in_h * in_w + - g * in_c_group * in_h * in_w + ic * in_h * in_w + - ih * in_w + iw; - int filter_idx = - g * out_c_group * in_c_group * kernel_h * kernel_w + - oc * in_c_group * kernel_h * kernel_w + - ic * kernel_h * kernel_w + kh * kernel_w + kw; - out_value += input_data[in_idx] * filter_data[filter_idx]; - } - } - } - if (fuse_relu) { - out_value = out_value > 0 ? 
out_value : 0; - } - output_data[out_idx] = out_value; - } - } - } - } - } -} - -void test_conv(int bs, - int ic, - int oc, - int ih, - int iw, - bool has_bias, - bool is_channel_bias, - bool fuse_relu, - bool depthwise, - int dilation, - int stride, - int padding, - int kernel) { - // prepare input&output variables - Scope scope; - std::string input_var_name("input"); - std::string filter_var_name("filter"); - std::string bias_var_name("bias"); - std::string output_var_name("output"); - std::string output_ref_var_name("output_ref"); - auto* input = scope.Var(input_var_name)->GetMutable(); - auto* filter = scope.Var(filter_var_name)->GetMutable(); - auto* bias = scope.Var(bias_var_name)->GetMutable(); - auto* output = scope.Var(output_var_name)->GetMutable(); - auto* output_ref = scope.Var(output_ref_var_name)->GetMutable(); - - // get group size and input&filter shape - int groups = 1; - if (depthwise) { // depthwise convolution ? - groups = oc = ic; - } - std::vector input_shape = {bs, ic, ih, iw}; - std::vector filter_shape = {oc, ic / groups, kernel, kernel}; - std::vector output_shape({bs, oc}); - for (size_t i = 0; i < 2; i++) { - const int dkernel = dilation * (kernel - 1) + 1; - int output_size = (input_shape[i + 2] + 2 * padding - dkernel) / stride + 1; - output_shape.push_back(output_size); - } - input->Resize(input_shape); - filter->Resize(filter_shape); - - // initialize input&output data - FillTensor(input); - FillTensor(filter); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType(depthwise ? "depthwise_conv2d" : "conv2d"); - opdesc.SetInput("Input", {input_var_name}); - opdesc.SetInput("Filter", {filter_var_name}); - opdesc.SetOutput("Output", {output_var_name}); - opdesc.SetAttr("dilations", std::vector({dilation, dilation})); - opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); - opdesc.SetAttr("groups", groups); - opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); - if (has_bias) { - if (is_channel_bias) { - bias->Resize({1, oc, 1, 1}); - } else { - bias->Resize({output_shape}); - } - FillTensor(bias); - opdesc.SetInput("Bias", {bias_var_name}); - } - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {input_var_name}, {output_var_name}); - output_ref->CopyDataFrom(*output); - - // execute reference implementation and save to output tensor('out') - conv_ref(op); - - // compare results - auto* output_data = output->mutable_data(); - auto* output_ref_data = output_ref->mutable_data(); - for (int i = 0; i < output->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); - } -} - -TEST(NPUBridges, conv) { -#if 1 - for (auto bs : {1, 2}) { - for (auto ic : {3, 6}) { - for (auto oc : {6, 9}) { - for (auto ih : {14, 28}) { - for (auto iw : {14, 28}) { - for (auto has_bias : {false, true}) { - for (auto is_channel_bias : {false, true}) { - for (auto fuse_relu : {false, true}) { - for (auto depthwise : {false, true}) { - for (auto dilation : {1, 2}) { - for (auto stride : {1, 2}) { - for (auto kernel : {1, 3, 5}) { - std::vector paddings = {kernel / 2}; - if (kernel / 2 != 0) { - paddings.push_back(0); - } - for (auto padding : paddings) { - VLOG(3) << "bs: " << bs << " ic: " << ic - << " oc: " << oc << " ih: " << ih - << " iw: " << iw - << " has_bias: " << has_bias - << " is_channel_bias: " << is_channel_bias - << " fuse_relu: " << fuse_relu - << " depthwise: " << depthwise - << " dilation: " 
<< dilation - << " stride: " << stride - << " padding: " << padding - << " kernel: " << kernel; - test_conv(bs, - ic, - oc, - ih, - iw, - has_bias, - is_channel_bias, - fuse_relu, - depthwise, - dilation, - stride, - padding, - kernel); - } - } - } - } - } - } - } - } - } - } - } - } - } -#else - test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 1, 3); - test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 0, 3); - test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 2, 5); - test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 0, 5); -#endif -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(conv2d); -USE_NPU_BRIDGE(conv2d); - -USE_LITE_OP(depthwise_conv2d); -USE_NPU_BRIDGE(depthwise_conv2d); diff --git a/lite/backends/npu/bridge/conv_transpose_op.cc b/lite/backends/npu/bridge/conv_transpose_op.cc deleted file mode 100644 index e27132c216..0000000000 --- a/lite/backends/npu/bridge/conv_transpose_op.cc +++ /dev/null @@ -1,146 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/conv_transpose_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type ConvTransposeConverter( - const std::shared_ptr conv_transpose_op, - const node_map_type& inputs_map) { - auto scope = conv_transpose_op->scope(); - auto op_info = conv_transpose_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " << op_type << "... 
"; - - // get input, output and op attributes - auto input_var_name = op_info->Input("Input").front(); - auto input = scope->FindVar(input_var_name)->GetMutable(); - auto input_shape = input->dims().Vectorize(); - auto filter_var_name = op_info->Input("Filter").front(); - auto filter = scope->FindVar(filter_var_name)->GetMutable(); - auto filter_shape = filter->dims().Vectorize(); - CHECK_EQ(input_shape.size(), 4); - CHECK_EQ(filter_shape.size(), 4); - auto strides = op_info->GetAttr>("strides"); - auto paddings = op_info->GetAttr>("paddings"); - auto groups = op_info->GetAttr("groups"); - auto dilations = op_info->GetAttr>("dilations"); - auto fuse_relu = op_info->GetAttr("fuse_relu"); - CHECK_EQ(strides.size(), 2); - CHECK_EQ(paddings.size(), 2); - CHECK_EQ(dilations.size(), 2); - - // create deconv node - auto conv_transpose_node = - std::make_shared(unique_op_type); - - // create input sizes node to describe the dimensions of input tensor - std::vector output_shape; - output_shape.push_back(input_shape[0]); - output_shape.push_back(filter_shape[1] * groups); - for (int i = 0; i < strides.size(); i++) { - int kernel_ext = dilations[i] * (filter_shape[i + 2] - 1) + 1; - int output_size = - (input_shape[i + 2] - 1) * strides[i] + kernel_ext - 2 * paddings[i]; - output_shape.push_back(output_size); - } - auto input_sizes_const_node = - std::make_shared(unique_op_type + "/input_size"); - input_sizes_const_node->set_attr_value(CreateTensorAndFillData(output_shape)); - conv_transpose_node->set_input_input_sizes(*input_sizes_const_node); - OpList::Global().add(input_sizes_const_node); - - // create filter node - CHECK(!inputs_map.count(filter_var_name)); - auto filter_const_node = std::make_shared(filter_var_name); - filter_const_node->set_attr_value(CvtFromLiteTensor(filter)); - conv_transpose_node->set_input_filter(*filter_const_node); - OpList::Global().add(filter_const_node); - - // set input node - CHECK(inputs_map.count(input_var_name)); - conv_transpose_node->set_input_x(*inputs_map.at(input_var_name)); - OpList::Global().add(inputs_map.at(input_var_name)); - - // set attributes - conv_transpose_node->set_attr_mode(1); - conv_transpose_node->set_attr_format(0); // NCHW - conv_transpose_node->set_attr_pad_mode(0); // NOTSET - conv_transpose_node->set_attr_group(groups); - conv_transpose_node->set_attr_pad(ge::AttrValue::LIST_INT( - {paddings[0], paddings[0], paddings[1], paddings[1]})); - conv_transpose_node->set_attr_dilation( - ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); - conv_transpose_node->set_attr_stride( - ge::AttrValue::LIST_INT({strides[0], strides[1]})); - conv_transpose_node->set_attr_kernel( - ge::AttrValue::LIST_INT({filter_shape[2], filter_shape[3]})); - OpList::Global().add(conv_transpose_node); - - // append add node to add bias if has bias - std::shared_ptr output_node = conv_transpose_node; - if (HasInputArg(op_info, scope, "Bias")) { - // create bias node - auto bias_var_name = op_info->Input("Bias").front(); - CHECK(!inputs_map.count(bias_var_name)); - auto* bias = scope->FindVar(bias_var_name)->GetMutable(); - auto channel_size = bias->dims().production(); - CHECK_EQ(channel_size, filter_shape[1] * groups); - auto bias_const_node = std::make_shared(bias_var_name); - bias_const_node->set_attr_value( - CvtFromLiteTensor(bias, {1, channel_size, 1, 1})); - OpList::Global().add(bias_const_node); - // append add node to add bias node - auto add_node = std::make_shared(unique_op_type + "/add"); - add_node->set_input_x1(*conv_transpose_node); - 
add_node->set_input_x2(*bias_const_node); - OpList::Global().add(add_node); - output_node = add_node; - } - - node_map_type outputs_map; - if (fuse_relu) { - // append relu node if fuse_relu is true - auto relu_node = - std::make_shared(unique_op_type + "/relu"); - relu_node->set_input_x(*output_node); - relu_node->set_attr_mode(1); - OpList::Global().add(relu_node); - outputs_map[op_info->Output("Output").front()] = relu_node; - } else { - outputs_map[op_info->Output("Output").front()] = output_node; - } - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(conv2d_transpose, - paddle::lite::npu::bridge::ConvTransposeConverter); diff --git a/lite/backends/npu/bridge/conv_transpose_op_test.cc b/lite/backends/npu/bridge/conv_transpose_op_test.cc deleted file mode 100644 index 02e3c7a1ce..0000000000 --- a/lite/backends/npu/bridge/conv_transpose_op_test.cc +++ /dev/null @@ -1,369 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/conv_transpose_op.h" -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -template -void add_bias_with_relu(DType* data, - const DType* bias, - int channel_size, - int inner_size, - bool has_relu) { - for (int c = 0; c < channel_size; ++c) { - DType bias_val = bias != nullptr ? bias[c] : 0; - for (int i = 0; i < inner_size; i++) { - DType data_val = data[i]; - data_val += bias_val; - if (has_relu) { - data_val = data_val > 0 ? 
data_val : 0.f; - } - data[i] = data_val; - } - data += inner_size; - } -} - -template -void col2im(const DType* data_col, - const int channel_size, - const int height, - const int width, - const int kernel_h, - const int kernel_w, - const int pad_h, - const int pad_w, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, - DType* data_im) { - memset(data_im, 0, height * width * channel_size * sizeof(DType)); - const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; - const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - const int inner_size = height * width; - for (int c = channel_size; c--; data_im += inner_size) { - for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { - for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; - for (int output_rows = output_h; output_rows; output_rows--) { - if (input_row < 0 || input_row >= height) { - data_col += output_w; - } else { - int input_col = -pad_w + kernel_col * dilation_w; - for (int output_col = output_w; output_col; output_col--) { - if (input_col >= 0 && input_col < width) { - data_im[input_row * width + input_col] += *data_col; - } - data_col++; - input_col += stride_w; - } - } - input_row += stride_h; - } - } - } - } -} - -template -void gemm(int M, - int N, - int K, - const IType* A, - const IType* B, - OType* C, - OType alpha, - OType beta, - bool is_trans_A = false, - bool is_trans_B = false) { - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - OType sum = static_cast(0); - for (int k = 0; k < K; ++k) { - IType a; - IType b; - if (is_trans_A) { - a = A[k * M + m]; - } else { - a = A[m * K + k]; - } - if (is_trans_B) { - b = B[n * K + k]; - } else { - b = B[k * N + n]; - } - sum += a * b; - } - C[m * N + n] = alpha * sum + beta * C[m * N + n]; - } - } -} - -template -void conv_transpose_ref( - const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto input = - scope->FindVar(op_info->Input("Input").front())->GetMutable(); - auto filter = - scope->FindVar(op_info->Input("Filter").front())->GetMutable(); - auto output = - scope->FindVar(op_info->Output("Output").front())->GetMutable(); - std::vector strides = - op_info->GetAttr>("strides"); - std::vector paddings = - op_info->GetAttr>("paddings"); - int32_t groups = op_info->GetAttr("groups"); - std::vector dilations = - op_info->GetAttr>("dilations"); - bool fuse_relu = op_info->GetAttr("fuse_relu"); - Tensor* bias = nullptr; - OType* bias_data = nullptr; - if (op_info->HasInput("Bias")) { - auto bias_var_names = op_info->Input("Bias"); - if (bias_var_names.size() > 0) { - auto bias_var_name = bias_var_names.front(); - bias = scope->FindVar(bias_var_name)->GetMutable(); - bias_data = bias->mutable_data(); - } - } - auto input_dims = input->dims(); - auto filter_dims = filter->dims(); - auto output_dims = output->dims(); - auto input_data = input->mutable_data(); - auto filter_data = filter->mutable_data(); - auto output_data = output->mutable_data(); - int kernel_w = filter_dims[3]; - int kernel_h = filter_dims[2]; - int stride_w = strides[1]; - int stride_h = strides[0]; - int dila_w = dilations[1]; - int dila_h = dilations[0]; - int pad_w = paddings[1]; - int pad_h = paddings[0]; - int batch_size = input_dims[0]; - int in_ch_size = input_dims[1]; - int in_h = input_dims[2]; - int in_w = input_dims[3]; - int out_ch_size = 
output_dims[1]; - int out_h = output_dims[2]; - int out_w = output_dims[3]; - - int M = out_ch_size * kernel_w * kernel_h / groups; - int N = in_h * in_w; - int K = in_ch_size / groups; - - if (in_ch_size != out_ch_size || groups != in_ch_size) { - CHECK_EQ(in_ch_size % groups, 0); - CHECK_EQ(out_ch_size % groups, 0); - } - - auto workspace = std::vector(groups * M * N); - int group_input_size = in_w * in_h * in_ch_size / groups; - int group_output_size = out_w * out_h * out_ch_size / groups; - int group_col_size = M * N; - int group_filter_size = - in_ch_size * out_ch_size * kernel_w * kernel_h / (groups * groups); - bool flag_1x1s1p1 = (kernel_w == 1) && (kernel_h == 1) && (stride_h == 1) && - (stride_w == 1) && (pad_w == 1) && (pad_h == 1) && - (dila_w == 1) && (dila_h == 1); - for (int n = 0; n < batch_size; ++n) { - input_data += n * in_ch_size * in_h * in_w; - output_data += n * out_ch_size * out_h * out_w; - auto col_data = workspace.data(); - if (flag_1x1s1p1) { - col_data = output_data; - } - memset(col_data, 0, sizeof(OType) * group_col_size); - for (int g = 0; g < groups; ++g) { - auto input_group_data = input_data + g * group_input_size; - auto filter_group_data = filter_data + g * group_filter_size; - auto col_group_data = col_data + g * group_col_size; - gemm(M, - N, - K, - filter_group_data, - input_group_data, - col_group_data, - static_cast(1), - static_cast(0), - true, - false); - } - if (!flag_1x1s1p1) { - col2im(col_data, - out_ch_size, - out_h, - out_w, - kernel_h, - kernel_w, - pad_h, - pad_w, - stride_h, - stride_w, - dila_h, - dila_w, - output_data); - } - add_bias_with_relu( - output_data, bias_data, out_ch_size, out_w * out_h, fuse_relu); - } -} - -void test_conv_transpose(int bs, - int ic, - int ih, - int iw, - bool has_bias, - bool fuse_relu, - int filters, - int groups, - int dilation, - int stride, - int padding, - int kernel) { - // prepare input&output variables - Scope scope; - std::string input_var_name("input"); - std::string filter_var_name("filter"); - std::string bias_var_name("bias"); - std::string output_var_name("output"); - std::string output_ref_var_name("output_ref"); - auto* input = scope.Var(input_var_name)->GetMutable(); - auto* filter = scope.Var(filter_var_name)->GetMutable(); - auto* bias = scope.Var(bias_var_name)->GetMutable(); - auto* output = scope.Var(output_var_name)->GetMutable(); - auto* output_ref = scope.Var(output_ref_var_name)->GetMutable(); - - // get group size and input&filter shape - std::vector input_shape = {bs, ic, ih, iw}; - std::vector filter_shape = {ic, filters, kernel, kernel}; - input->Resize(input_shape); - filter->Resize(filter_shape); - - // initialize input&output data - FillTensor(input); - FillTensor(filter); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("conv2d_transpose"); - opdesc.SetInput("Input", {input_var_name}); - opdesc.SetInput("Filter", {filter_var_name}); - opdesc.SetOutput("Output", {output_var_name}); - opdesc.SetAttr("dilations", std::vector({dilation, dilation})); - opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); - opdesc.SetAttr("groups", groups); - opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); - if (has_bias) { - bias->Resize({1, filters * groups, 1, 1}); - FillTensor(bias); - opdesc.SetInput("Bias", {bias_var_name}); - } - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {input_var_name}, {output_var_name}); - 
output_ref->CopyDataFrom(*output); - - // execute reference implementation and save to output tensor('out') - conv_transpose_ref(op); - - // compare results - auto* output_data = output->mutable_data(); - auto* output_ref_data = output_ref->mutable_data(); - for (int i = 0; i < output->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); - } -} - -TEST(NPUBridges, conv_transpose) { -#if 1 - for (auto bs : {1, 2}) { - for (auto ic : {3, 6}) { - for (auto ih : {14, 28}) { - for (auto iw : {14, 28}) { - for (auto has_bias : {false, true}) { - for (auto fuse_relu : {false, true}) { - for (auto filters : {1, 2, 5}) { - for (auto groups : {1 /* , 2, 5*/}) { - for (auto dilation : {1, 2}) { - for (auto stride : {1, 2}) { - for (auto kernel : {1, 3, 5}) { - std::vector paddings = {kernel / 2}; - if (kernel / 2 != 0) { - paddings.push_back(0); - } - for (auto padding : paddings) { - VLOG(3) << "bs: " << bs << " ic: " << ic - << " ih: " << ih << " iw: " << iw - << " has_bias: " << has_bias - << " fuse_relu: " << fuse_relu - << " filters: " << filters - << " groups: " << groups - << " dilation: " << dilation - << " stride: " << stride - << " padding: " << padding - << " kernel: " << kernel; - test_conv_transpose(bs, - ic, - ih, - iw, - has_bias, - fuse_relu, - filters, - groups, - dilation, - stride, - padding, - kernel); - } - } - } - } - } - } - } - } - } - } - } - } -#else - test_conv_transpose(1, 6, 8, 8, false, false, 5, 2, 1, 1, 1, 3); -#endif -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(conv2d_transpose); -USE_NPU_BRIDGE(conv2d_transpose); diff --git a/lite/backends/npu/bridge/elementwise_ops.cc b/lite/backends/npu/bridge/elementwise_ops.cc deleted file mode 100644 index 5459d819bb..0000000000 --- a/lite/backends/npu/bridge/elementwise_ops.cc +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
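`conv_transpose_ref` above computes the deconvolution as a GEMM (transposed filter times input) followed by `col2im`, the exact adjoint of the usual im2col-plus-GEMM forward convolution. Correspondingly, `ConvTransposeConverter` derives its `input_sizes` const node from the inverse of the convolution size relation, `(in - 1) * stride + dilated_kernel - 2 * pad`. As a reference-only helper (not in the patch):

```cpp
// Sketch: transposed-conv output size matching ConvTransposeConverter above.
inline int DeconvOutSize(int in, int pad, int kernel, int stride, int dilation) {
  int dkernel = dilation * (kernel - 1) + 1;  // effective (dilated) kernel
  return (in - 1) * stride + dkernel - 2 * pad;
}
```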
- -#include "lite/operators/elementwise_ops.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type ElementwiseConverter( - const std::shared_ptr elementwise_op, - const node_map_type& inputs_map) { - auto scope = elementwise_op->scope(); - auto op_info = elementwise_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "converting elementwise..."; - - std::shared_ptr elementwise_node = - std::make_shared(unique_op_type); - - auto x_var_name = op_info->Input("X").front(); - auto y_var_name = op_info->Input("Y").front(); - - CHECK_EQ(op_info->GetAttr("axis"), -1) - << "npu elementwise only support inputs with same size"; - - CHECK(inputs_map.find(x_var_name) != inputs_map.end()); - elementwise_node->set_input_x1(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - - if (inputs_map.find(y_var_name) != inputs_map.end()) { - elementwise_node->set_input_x2(*inputs_map.at(y_var_name)); - OpList::Global().add(inputs_map.at(y_var_name)); - } else { - auto consty = std::make_shared(y_var_name); - auto* y = scope->FindVar(y_var_name)->GetMutable(); - consty->set_attr_value(CvtFromLiteTensor(y)); - elementwise_node->set_input_x2(*consty); - OpList::Global().add(consty); - } - - OpList::Global().add(elementwise_node); - - // paddlelite has sum only - elementwise_node->set_attr_mode(1); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = elementwise_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(elementwise_add, - paddle::lite::npu::bridge::ElementwiseConverter); diff --git a/lite/backends/npu/bridge/elementwise_ops_test.cc b/lite/backends/npu/bridge/elementwise_ops_test.cc deleted file mode 100644 index ff82daec10..0000000000 --- a/lite/backends/npu/bridge/elementwise_ops_test.cc +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/elementwise_ops.h" -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -template -void elementwise_add_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto y = scope->FindVar(op_info->Input("Y").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - - auto x_data = x->data(); - auto y_data = y->data(); - dtype* out_data = out->mutable_data(); - - auto x_dims = x->dims(); - auto y_dims = y->dims(); - int axis = op_info->GetAttr("axis"); - - if (axis < 0) { - axis = x_dims.size() - y_dims.size(); - } - int batch = 1; - int channels = 1; - int num = 1; - for (int i = 0; i < axis; ++i) { - batch *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - channels *= y_dims[i]; - } - for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { - num *= x_dims[i]; - } - // do elementwise add/sub/max... - std::string elt_type = "add"; - if (elt_type == "add") { - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const dtype* din_ptr = x_data + offset; - const dtype diny_data = y_data[j]; - dtype* dout_ptr = out_data + offset; - for (int k = 0; k < num; ++k) { - *dout_ptr = *din_ptr + diny_data; - dout_ptr++; - din_ptr++; - } - } - } - } else if (elt_type == "sub") { - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const dtype* din_ptr = x_data + offset; - const dtype diny_data = y_data[j]; - dtype* dout_ptr = out_data + offset; - for (int k = 0; k < num; ++k) { - *dout_ptr = *din_ptr - diny_data; - dout_ptr++; - din_ptr++; - } - } - } - } else if (elt_type == "mul") { - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const dtype* din_ptr = x_data + offset; - const dtype diny_data = y_data[j]; - dtype* dout_ptr = out_data + offset; - for (int k = 0; k < num; ++k) { - *dout_ptr = *din_ptr * diny_data; - dout_ptr++; - din_ptr++; - } - } - } - } else if (elt_type == "max") { - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const dtype* din_ptr = x_data + offset; - const dtype diny_data = y_data[j]; - dtype* dout_ptr = out_data + offset; - for (int k = 0; k < num; ++k) { - *dout_ptr = std::max(*din_ptr, diny_data); - dout_ptr++; - din_ptr++; - } - } - } - } else { - LOG(FATAL) << "unsupported Elementwise type: " << elt_type; - } -} - -void test_elementwise_add(int bs, int ic, int ih, int iw, int axis) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string y_var_name = "y"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* y = scope.Var(y_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - y->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - FillTensor(y); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("elementwise_add"); - opdesc.SetInput("X", {x_var_name}); 
- opdesc.SetInput("Y", {y_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("axis", axis); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - elementwise_add_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-1); - } -} - -TEST(NPUBridges, elementwise_add) { - for (auto bs : {1, 4, 7}) { - for (auto ic : {1, 4, 7}) { - for (auto ih : {1, 4, 7}) { - for (auto iw : {1, 4, 7}) { - for (auto axis : {-1}) test_elementwise_add(bs, ic, ih, iw, axis); - } - } - } - } -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(elementwise_add); -USE_NPU_BRIDGE(elementwise_add); diff --git a/lite/backends/npu/bridge/fc_op.cc b/lite/backends/npu/bridge/fc_op.cc deleted file mode 100644 index 1321498db6..0000000000 --- a/lite/backends/npu/bridge/fc_op.cc +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/fc_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type FCConverter(const std::shared_ptr fc_op, - const node_map_type& inputs_map) { - LOG(INFO) << "Converting fc..."; - lite::Scope* scope = fc_op->scope(); - const lite::OpInfo* op_info = fc_op->op_info(); - auto output_node = std::make_shared(UniqueName("fc")); - - auto x_var_name = op_info->Input("Input").front(); - auto w_var_name = op_info->Input("W").front(); - - int in_num_col_dims = op_info->GetAttr("in_num_col_dims"); - auto* xtensor = scope->FindVar(x_var_name)->GetMutable(); - auto* wtensor = scope->FindVar(w_var_name)->GetMutable(); - auto x_dims = xtensor->dims(); - auto w_dims = wtensor->dims(); - - CHECK_GE(x_dims.size(), 2UL); - CHECK_EQ(w_dims.size(), 2UL); - - int m = x_dims.Slice(0, in_num_col_dims).production(); - int k = x_dims.Slice(in_num_col_dims, x_dims.size()).production(); - int n = w_dims[1]; - - CHECK(inputs_map.count(x_var_name)); - CHECK(!inputs_map.count(w_var_name)); - - LOG(INFO) << "m:" << m << ",n:" << n << ",k:" << k; - LOG(INFO) << "x_var_name:" << x_var_name - << ", is data: " << inputs_map.count(x_var_name); - LOG(INFO) << "w_var_name:" << w_var_name - << ", is data: " << inputs_map.count(w_var_name); - - auto xsrc = inputs_map.at(x_var_name); - auto reshapex = std::make_shared(x_var_name + "_reshape"); - reshapex->set_input_tensor(*xsrc); - reshapex->set_attr_shape({m, k}); - reshapex->set_attr_axis(0); - OpList::Global().add(xsrc); - OpList::Global().add(reshapex); - output_node->set_input_x(*reshapex); - - auto wconst = std::make_shared(w_var_name); - ge::TensorDesc wdesc(ge::Shape({k, n}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = wdesc.GetShape().GetShapeSize(); - CHECK_EQ(size, w_dims.production()); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(wdesc); - auto* pdata = reinterpret_cast(wtensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - wconst->set_attr_value(ptensor); - OpList::Global().add(wconst); - output_node->set_input_w(*wconst); - - if (HasInputArg(op_info, scope, "Bias")) { - auto b_var_name = op_info->Input("Bias").front(); - auto* btensor = scope->FindVar(b_var_name)->GetMutable(); - - LOG(INFO) << "b_var_name:" << b_var_name - << ", is data: " << inputs_map.count(b_var_name); - CHECK(!inputs_map.count(b_var_name)); - CHECK_EQ(btensor->numel(), n); - - auto bconst = std::make_shared(b_var_name); - ge::TensorDesc bdesc( - ge::Shape({1, n, 1, 1}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = bdesc.GetShape().GetShapeSize(); - CHECK_EQ(size, n); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(bdesc); - auto* pdata = reinterpret_cast(btensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - bconst->set_attr_value(ptensor); - OpList::Global().add(bconst); - output_node->set_input_bias(*bconst); - output_node->set_attr_has_bias(ge::AttrValue::BOOL{true}); - } - - OpList::Global().add(output_node); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = output_node; - return outputs_map; -} - -} // namespace bridge -} // 
namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(fc, paddle::lite::npu::bridge::FCConverter); diff --git a/lite/backends/npu/bridge/fc_op_test.cc b/lite/backends/npu/bridge/fc_op_test.cc deleted file mode 100644 index 7bfee2034f..0000000000 --- a/lite/backends/npu/bridge/fc_op_test.cc +++ /dev/null @@ -1,146 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/fc_op.h" -#include <gtest/gtest.h> -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -void fc_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto input = - scope->FindVar(op_info->Input("Input").front())->GetMutable(); - auto w = scope->FindVar(op_info->Input("W").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - int32_t in_num_col_dims = op_info->GetAttr("in_num_col_dims"); - Tensor* bias = nullptr; - float* bias_data = nullptr; - if (op_info->HasInput("Bias")) { - auto bias_var_names = op_info->Input("Bias"); - if (bias_var_names.size() > 0) { - auto bias_var_name = bias_var_names.front(); - bias = scope->FindVar(bias_var_name)->GetMutable(); - bias_data = bias->mutable_data(); - } - } - auto input_data = input->data(); - auto w_data = w->mutable_data(); - auto out_data = out->mutable_data(); - auto in_mat_dims = input->dims().Flatten2D(in_num_col_dims); - int out_num_classes = w->dims()[1]; - const int M = in_mat_dims[0]; - const int K = in_mat_dims[1]; - const int N = out_num_classes; - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - out_data[m * N + n] = 0; - for (int k = 0; k < K; ++k) { - out_data[m * N + n] += input_data[m * K + k] * w_data[k * N + n]; - } - } - } - if (bias_data != nullptr) { - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - out_data[m * N + n] += bias_data[n]; - } - } - } -} - -void test_fc(const std::vector& x_shape, - const std::vector& w_shape, - int in_num_col_dims, - bool has_bias) { - CHECK_EQ(w_shape.size(), 2UL); - - const auto& bridges = lite::npu::bridge::Factory::Instance(); - const auto& supported_lists = bridges.AllFunctions(); - CHECK(bridges.HasType("fc")); - - Scope scope; - std::string x_var_name("Input"); - std::string w_var_name("W"); - std::string bias_var_name("Bias"); - std::string out_var_name("Out"); - std::string out_ref_var_name("out_ref"); - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* w = scope.Var(w_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize(x_shape); - 
w->Resize(w_shape); - - FillTensor(x); - FillTensor(w); - - // create fc op - cpp::OpDesc fc_op_desc; - fc_op_desc.SetType("fc"); - fc_op_desc.SetInput("Input", {x_var_name}); - fc_op_desc.SetInput("W", {w_var_name}); - fc_op_desc.SetOutput("Out", {out_var_name}); - fc_op_desc.SetAttr("in_num_col_dims", static_cast(in_num_col_dims)); - if (has_bias) { - auto* bias = scope.Var(bias_var_name)->GetMutable(); - bias->Resize({w_shape[1]}); - FillTensor(bias); - fc_op_desc.SetInput("Bias", {bias_var_name}); - } - - auto fc_op = CreateOp(fc_op_desc, &scope); - LauchOp(fc_op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // compare results - fc_ref(fc_op); - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } - - // model release - npu::OpList::Global().clear(); - npu::DeviceInfo::Global().Clear(); -} - -TEST(NPUBridges, fc) { - for (bool use_bias : {true, false}) { - test_fc({1, 8, 8, 1}, {8, 4}, 2, use_bias); - test_fc({1, 5, 5, 1}, {5, 7}, 2, use_bias); - test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias); - } -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(fc); -USE_NPU_BRIDGE(fc); diff --git a/lite/backends/npu/bridge/interpolate_op.cc b/lite/backends/npu/bridge/interpolate_op.cc deleted file mode 100644 index 83cae61e3f..0000000000 --- a/lite/backends/npu/bridge/interpolate_op.cc +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
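// Both interpolate converters resolve the target height/width with the
// priority OutSize (a runtime tensor) > scale attribute > out_h/out_w
// attributes. A sketch of just that resolution step (function name and
// signature are illustrative):
#include <utility>

std::pair<int, int> ResolveInterpSize(int in_h, int in_w, float scale,
                                      int attr_out_h, int attr_out_w,
                                      const int* out_size /* 2 ints or null */) {
  int out_h = attr_out_h;
  int out_w = attr_out_w;
  if (scale > 0.f) {  // scale overrides the plain attributes
    out_h = static_cast<int>(in_h * scale);
    out_w = static_cast<int>(in_w * scale);
  }
  if (out_size != nullptr) {  // an explicit OutSize tensor wins over both
    out_h = out_size[0];
    out_w = out_size[1];
  }
  return {out_h, out_w};
}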
- -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type InterpolateConverter( - const std::shared_ptr interpolate_op, - const node_map_type& inputs_map) { - auto scope = interpolate_op->scope(); - auto op_info = interpolate_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - // get input, output and attributes from lite op - auto x_var_name = op_info->Input("X").front(); - CHECK(inputs_map.count(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - - auto x = scope->FindVar(x_var_name)->GetMutable(); - auto x_dims = x->dims(); - auto x_h = x_dims[2]; - auto x_w = x_dims[3]; - CHECK_EQ(x_dims.size(), 4); - auto scale = op_info->GetAttr("scale"); - auto out_w = op_info->GetAttr("out_w"); - auto out_h = op_info->GetAttr("out_h"); - auto align_corners = op_info->GetAttr("align_corners"); - int align_mode = op_info->GetAttr("align_mode"); - CHECK(!(align_mode == 0 && !align_corners)) - << "align_mode = 0 && align_corners = false isn't supported in NPU DDK"; - - // priority: OutSize > scale > out_h/out_w - if (scale > 0) { - out_h = static_cast(x_h * scale); - out_w = static_cast(x_w * scale); - out_h = out_h > 0 ? out_h : -1; - out_w = out_w > 0 ? out_w : -1; - } - - // update out_h and out_w if has OutSize - bool inputs_map_has_w = false; - if (HasInputArg(op_info, scope, "OutSize")) { - auto out_size_var_name = op_info->Input("OutSize").front(); - if (inputs_map.count(out_size_var_name)) { - inputs_map_has_w = true; - } else { - auto out_size = - scope->FindVar(out_size_var_name)->GetMutable(); - CHECK_EQ(out_size->numel(), 2); - auto out_size_data = out_size->mutable_data(); - // update out_h and out_w if has OutSize - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } - } - - node_map_type outputs_map; - auto interp_method = op_info->GetAttr("interp_method"); - if (interp_method == "bilinear") { - auto interp_node = std::make_shared(unique_op_type); - OpList::Global().add(interp_node); - interp_node->set_input_x(*inputs_map.at(x_var_name)); - if (inputs_map_has_w) { - auto out_size_var_name = op_info->Input("OutSize").front(); - interp_node->set_input_w(*inputs_map.at(out_size_var_name)); - OpList::Global().add(inputs_map.at(out_size_var_name)); - } else { - const float largest_multiple = 7.0f; - float multiple = static_cast(x_h * x_w) / (out_h * out_w); - CHECK_LT(multiple, largest_multiple) - << "multiple=(ih*iw)/(oh*ow)=" << multiple - << " is too large, should not exceed " << largest_multiple - << " in NPU DDK"; - auto w_const_node = - std::make_shared(unique_op_type + "/w"); - w_const_node->set_attr_value( - CreateTensorAndFillData(std::vector({out_h, out_w}))); - interp_node->set_input_w(*w_const_node); - OpList::Global().add(w_const_node); - } - interp_node->set_attr_output_dim_mode( - 2); // 0: zoom_factor, 1: shrink_factor, 2: height/width - interp_node->set_attr_align_corners(align_corners); - outputs_map[op_info->Output("Out").front()] = interp_node; - } else if (interp_method == "nearest") { - auto interp_node = - std::make_shared(unique_op_type); - 
OpList::Global().add(interp_node); - interp_node->set_input_image(*inputs_map.at(x_var_name)); - if (inputs_map_has_w) { - auto out_size_var_name = op_info->Input("OutSize").front(); - interp_node->set_input_size(*inputs_map.at(out_size_var_name)); - OpList::Global().add(inputs_map.at(out_size_var_name)); - } else { - auto w_const_node = - std::make_shared(unique_op_type + "/w"); - w_const_node->set_attr_value( - CreateTensorAndFillData(std::vector({out_h, out_w}))); - interp_node->set_input_size(*w_const_node); - OpList::Global().add(w_const_node); - } - interp_node->set_attr_align_corners(align_corners); - outputs_map[op_info->Output("Out").front()] = interp_node; - } else { - LOG(FATAL) << "unsupported interpolate method: " << interp_method; - } - - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(bilinear_interp, - paddle::lite::npu::bridge::InterpolateConverter); -REGISTER_NPU_BRIDGE(nearest_interp, - paddle::lite::npu::bridge::InterpolateConverter); diff --git a/lite/backends/npu/bridge/interpolate_op_test.cc b/lite/backends/npu/bridge/interpolate_op_test.cc deleted file mode 100644 index 79dd612c59..0000000000 --- a/lite/backends/npu/bridge/interpolate_op_test.cc +++ /dev/null @@ -1,405 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
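// The bilinear reference below rests on two coordinate conventions: the
// ratio is (in - 1) / (out - 1) with align_corners and in / out without,
// and the source coordinate is ratio * (dst + 0.5) - 0.5 when
// align_mode == 0 with align_corners off, else ratio * dst, clamped at
// zero. That mapping in isolation (a sketch; the upper-neighbor clamp to
// in - 1 happens when the four surrounding pixels are gathered):
#include <algorithm>

float BilinearRatio(int in, int out, bool align_corners) {
  if (out <= 1) return 0.f;
  return align_corners ? static_cast<float>(in - 1) / (out - 1)
                       : static_cast<float>(in) / out;
}

float BilinearSrcCoord(int dst, float ratio, bool half_pixel /* align_flag */) {
  float src = half_pixel ? ratio * (dst + 0.5f) - 0.5f : ratio * dst;
  return std::max(src, 0.f);
}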
- -#include "lite/operators/interpolate_op.h" -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -template -void bilinear_interp_ref(const std::shared_ptr op) { - auto scope = op->scope(); - auto op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_dims = x->dims(); - int batch_size = x_dims[0]; - int channel_size = x_dims[1]; - auto x_h = x_dims[2]; - auto x_w = x_dims[3]; - CHECK_EQ(x_dims.size(), 4); - auto scale = op_info->GetAttr("scale"); - auto out_w = op_info->GetAttr("out_w"); - auto out_h = op_info->GetAttr("out_h"); - auto align_corners = op_info->GetAttr("align_corners"); - int align_mode = op_info->GetAttr("align_mode"); - auto interp_method = op_info->GetAttr("interp_method"); - - // calc real out_h and out_w - if (scale > 0) { - out_h = static_cast(x_h * scale); - out_w = static_cast(x_w * scale); - } - if (op_info->HasInput("OutSize")) { - auto out_size_var_names = op_info->Input("OutSize"); - if (out_size_var_names.size() > 0) { - auto out_size_var_name = out_size_var_names.front(); - auto out_size = - scope->FindVar(out_size_var_name)->GetMutable(); - auto out_size_dims = out_size->dims(); - CHECK_EQ(out_size_dims.size(), 1); - CHECK_EQ(out_size_dims.production(), 2); - auto out_size_data = out_size->mutable_data(); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } - } - CHECK_GT(out_h, 0); - CHECK_GT(out_w, 0); - out->Resize({batch_size, channel_size, out_h, out_w}); - - // copy from x if no change - if (x_h == out_h && x_w == out_w) { - out->CopyDataFrom(*x); - return; - } - - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_h > 1) { - ratio_h = (align_corners) ? static_cast(x_h - 1) / (out_h - 1) - : static_cast(x_h) / out_h; - } - if (out_w > 1) { - ratio_w = (align_corners) ? static_cast(x_w - 1) / (out_w - 1) - : static_cast(x_w) / out_w; - } - - // naive bilinear interpolation - auto x_data = x->mutable_data(); - auto out_data = out->mutable_data(); - bool align_flag = (align_mode == 0 && !align_corners); - - std::vector vy_n, vy_s; - std::vector vd_n, vd_s; - vy_n.reserve(out_h); - vy_s.reserve(out_h); - vd_n.reserve(out_h); - vd_s.reserve(out_h); - for (int k = 0; k < out_h; k++) { - int yn = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) - : static_cast(ratio_h * k); - yn = (yn > 0) ? yn : 0; - int ys = (yn + 1) < (x_h - 1) ? (yn + 1) : (x_h - 1); - float idx_src_y = ratio_h * (k + 0.5) - 0.5; - idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; - float dn = align_flag ? idx_src_y - yn : ratio_h * k - yn; - float ds = 1.f - dn; - { - vy_n[k] = yn; - vy_s[k] = ys; - vd_n[k] = dn; - vd_s[k] = ds; - } - } - - std::vector vx_w, vx_e; - std::vector vd_w, vd_e; - vx_w.reserve(out_w); - vx_e.reserve(out_w); - vd_w.reserve(out_w); - vd_e.reserve(out_w); - for (int l = 0; l < out_w; l++) { - int xw = (align_mode == 0 && !align_corners) - ? static_cast(ratio_w * (l + 0.5) - 0.5) - : static_cast(ratio_w * l); - xw = (xw > 0) ? xw : 0; - int xe = (xw + 1) < (x_w - 1) ? (xw + 1) : (x_w - 1); - float idx_src_x = ratio_w * (l + 0.5) - 0.5; - idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; - float dw = align_flag ? 
idx_src_x - xw : ratio_w * l - xw; - float de = 1.f - dw; - { - vx_w[l] = xw; - vx_e[l] = xe; - vd_w[l] = dw; - vd_e[l] = de; - } - } - - std::vector x_strides(x_dims.size(), 1); - for (int idx = x_strides.size() - 2; idx >= 0; idx--) { - x_strides[idx] = x_strides[idx + 1] * x_dims[idx + 1]; - } - for (int i = 0; i < batch_size; i++) { - for (int j = 0; j < channel_size; j++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - DType x0 = x_data[i * x_strides[0] + j * x_strides[1] + - vy_n[k] * x_strides[2] + vx_w[l] * x_strides[3]]; - DType x1 = x_data[i * x_strides[0] + j * x_strides[1] + - vy_s[k] * x_strides[2] + vx_w[l] * x_strides[3]]; - DType x2 = x_data[i * x_strides[0] + j * x_strides[1] + - vy_n[k] * x_strides[2] + vx_e[l] * x_strides[3]]; - DType x3 = x_data[i * x_strides[0] + j * x_strides[1] + - vy_s[k] * x_strides[2] + vx_e[l] * x_strides[3]]; - *out_data = x0 * vd_s[k] * vd_e[l] + x1 * vd_n[k] * vd_e[l] + - x2 * vd_s[k] * vd_w[l] + x3 * vd_n[k] * vd_w[l]; - out_data++; - } - } - } - } -} - -template -void nearest_interp_ref(const std::shared_ptr op) { - auto scope = op->scope(); - auto op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_dims = x->dims(); - CHECK_EQ(x_dims.size(), 4); - auto scale = op_info->GetAttr("scale"); - auto out_w = op_info->GetAttr("out_w"); - auto out_h = op_info->GetAttr("out_h"); - auto align_corners = op_info->GetAttr("align_corners"); - // int align_mode = op_info->GetAttr("align_mode"); - auto interp_method = op_info->GetAttr("interp_method"); - CHECK_EQ(interp_method, "nearest"); - - int x_h = x_dims[2]; - int x_w = x_dims[3]; - if (scale > 0) { - out_h = static_cast(x_h * scale); - out_w = static_cast(x_w * scale); - } - if (op_info->HasInput("OutSize")) { - auto out_size_var_names = op_info->Input("OutSize"); - if (out_size_var_names.size() > 0) { - auto out_size_var_name = out_size_var_names.front(); - auto out_size = - scope->FindVar(out_size_var_name)->GetMutable(); - CHECK_EQ(out_size->numel(), 2); - auto out_size_data = out_size->mutable_data(); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } - } - CHECK_GT(out_h, 0); - CHECK_GT(out_w, 0); - out->Resize({x_dims[0], x_dims[1], out_h, out_w}); - - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_h > 1) { - ratio_h = align_corners ? static_cast(x_h - 1.0) / (out_h - 1.0) - : static_cast(x_h) / out_h; - } - if (out_w > 1) { - ratio_w = align_corners ? 
static_cast(x_w - 1.0) / (out_w - 1.0) - : static_cast(x_w) / out_w; - } - - auto x_data = x->data(); - auto out_data = out->mutable_data(); - auto out_dims = out->dims(); - std::vector x_strides(x_dims.size(), 1); - for (int idx = x_strides.size() - 2; idx >= 0; idx--) { - x_strides[idx] = x_strides[idx + 1] * x_dims[idx + 1]; - } - - for (int n = 0; n < out_dims[0]; n++) { - for (int c = 0; c < out_dims[1]; c++) { - for (int h = 0; h < out_dims[2]; h++) { - for (int w = 0; w < out_dims[3]; w++) { - int in_i = ratio_h * h; - int in_j = ratio_w * w; - if (align_corners) { - in_i = ratio_h * h + 0.5; - in_j = ratio_w * w + 0.5; - } - *out_data = x_data[n * x_strides[0] + c * x_strides[1] + - in_i * x_strides[2] + in_j * x_strides[3]]; - out_data++; - } - } - } - } -} - -void test_interpolate(int bs, - int ic, - int ih, - int iw, - int oh, - int ow, - float scale, - int out_size_h, - int out_size_w, - bool align_corners, - int align_mode, - std::string interp_method) { - // prepare input&output variables - Scope scope; - std::string x_var_name("x"); - std::string out_size_var_name("out_size"); - std::string out_var_name("out"); - std::string out_ref_var_name("out_ref"); - auto x = scope.Var(x_var_name)->GetMutable(); - auto out_size = scope.Var(out_size_var_name)->GetMutable(); - auto out = scope.Var(out_var_name)->GetMutable(); - auto out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - out_size->Resize({2}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType(interp_method + "_interp"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("out_h", oh); - opdesc.SetAttr("out_w", ow); - opdesc.SetAttr("scale", scale); - opdesc.SetAttr("align_corners", static_cast(align_corners)); - opdesc.SetAttr("align_mode", static_cast(align_mode)); - opdesc.SetAttr("interp_method", interp_method); - if (out_size_h > 0 && out_size_w > 0) { - auto out_size_dims = out_size->dims(); - CHECK_EQ(out_size_dims.size(), 1); - CHECK_EQ(out_size_dims.production(), 2); - auto out_size_data = out_size->mutable_data(); - out_size_data[0] = out_size_h; - out_size_data[1] = out_size_w; - opdesc.SetInput("OutSize", {out_size_var_name}); - } - - // create op and execute reference implementation - auto op = CreateOp(opdesc, &scope); - if (interp_method == "bilinear") { - bilinear_interp_ref(op); - } else { - nearest_interp_ref(op); - } - out_ref->CopyDataFrom(*out); - - // convert op to NPU model, then run it on NPU - LauchOp(op, {x_var_name}, {out_var_name}); - - // compare results - auto out_dims = out->dims(); - auto out_ref_dims = out_ref->dims(); - CHECK_EQ(out_dims.size(), out_ref_dims.size()); - for (int i = 0; i < out_dims.size(); i++) { - CHECK_EQ(out_dims[i], out_ref_dims[i]); - } - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2f); - } -} - -TEST(NPUBridges, bilinear_interp) { -#if 1 - for (auto bs : {1, 3}) { - for (auto ic : {3, 4}) { - for (auto ih : {4, 5}) { - for (auto iw : {3, 6}) { - for (auto oh : {0, 3, 8}) { - for (auto ow : {0, 4, 9}) { - for (auto scale : {0.f, 0.5f, 0.6f, 2.0f, 2.2f}) { - for (auto out_size_h : {0, 3, 11}) { - for (auto out_size_w : {0, 2, 12}) { - for (auto align_corners : {true, false}) { - for (auto align_mode : {0, 1}) { - for (auto interp_method : {"bilinear", "nearest"}) { - int 
act_oh = 0, act_ow = 0; - if (out_size_h > 0 && out_size_w > 0) { - act_oh = out_size_h; - act_ow = out_size_w; - } else if (scale > 1e-5) { - act_oh = static_cast(ih * scale); - act_ow = static_cast(iw * scale); - } else if (oh > 0 && ow > 0) { - act_oh = oh; - act_ow = ow; - } - if (act_oh <= 0 || act_ow <= 0) { - continue; - } - // TODO(hong19860320) multiple=(ih*iw)/(oh*ow) - // should - // not exceed 7.0 in NPU DDK, delete the following - // lines - // if the limination is removed. - const float largest_multiple = 7.0f; - float multiple = - static_cast(ih * iw) / (act_oh * act_ow); - if (multiple > largest_multiple) { - continue; - } - if (align_mode == 0 && !align_corners) { - continue; - } - VLOG(3) << "bs: " << bs << " ic: " << ic - << " ih: " << ih << " iw: " << iw - << " oh: " << oh << " ow: " << ow - << " scale: " << scale - << " out_size: " << out_size_h << "," - << out_size_w - << " align_corners: " << align_corners - << " align_mode: " << align_mode; - test_interpolate(bs, - ic, - ih, - iw, - oh, - ow, - scale, - out_size_h, - out_size_w, - align_corners, - align_mode, - interp_method); - } - } - } - } - } - } - } - } - } - } - } - } -#else - test_interpolate(1, 1, 4, 3, 0, 0, 1.f, 3, 6, false, 1, "nearest"); -#endif -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(bilinear_interp); -USE_NPU_BRIDGE(bilinear_interp); - -USE_LITE_OP(nearest_interp); -USE_NPU_BRIDGE(nearest_interp); diff --git a/lite/backends/npu/bridge/mul_op.cc b/lite/backends/npu/bridge/mul_op.cc deleted file mode 100644 index 290f3d88f8..0000000000 --- a/lite/backends/npu/bridge/mul_op.cc +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
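// mul views X as an (m, k) matrix by flattening the dims before and after
// x_num_col_dims, and Y as (k, n) via y_num_col_dims; the converter checks
// that the two inner sizes agree. A sketch of that dim-splitting helper,
// mirroring the Flatten2D semantics used below (names are illustrative):
#include <cstdint>
#include <utility>
#include <vector>

std::pair<int64_t, int64_t> FlattenTo2D(const std::vector<int64_t>& dims,
                                        int num_col_dims) {
  int64_t rows = 1;
  int64_t cols = 1;
  for (int i = 0; i < num_col_dims; ++i) rows *= dims[i];
  for (int i = num_col_dims; i < static_cast<int>(dims.size()); ++i) {
    cols *= dims[i];
  }
  return {rows, cols};  // e.g. {1, 8, 8, 1} with num_col_dims = 2 -> (8, 8)
}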
- -#include "lite/operators/mul_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" -#include "lite/backends/npu/npu_helper.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -// Note: inputs_map the var_name contains only the data, the weight should be -// handle in this converter -node_map_type MulConverter(const std::shared_ptr mul_op, - const node_map_type& inputs_map) { - LOG(INFO) << "converting mul..."; - lite::Scope* scope = mul_op->scope(); - const lite::OpInfo* op_info = mul_op->op_info(); - auto output_node = std::make_shared(UniqueName("mul")); - - auto x_var_name = op_info->Input("X").front(); - auto y_var_name = op_info->Input("Y").front(); - int x_num_col_dims = op_info->GetAttr("x_num_col_dims"); - int y_num_col_dims = op_info->GetAttr("y_num_col_dims"); - auto* xtensor = scope->FindVar(x_var_name)->GetMutable(); - auto* ytensor = scope->FindVar(y_var_name)->GetMutable(); - - int m = xtensor->dims().Slice(0, x_num_col_dims).production(); - int x_w = xtensor->dims() - .Slice(x_num_col_dims, xtensor->dims().size()) - .production(); - int y_h = ytensor->dims().Slice(0, y_num_col_dims).production(); - int n = ytensor->dims() - .Slice(y_num_col_dims, ytensor->dims().size()) - .production(); - CHECK_EQ(x_w, y_h) << "x_w must be equal with y_h"; - int k = x_w; - LOG(INFO) << "m:" << m << ",n:" << n << ",k:" << k; - LOG(INFO) << "x_var_name:" << x_var_name - << ", is data: " << inputs_map.count(x_var_name); - LOG(INFO) << "y_var_name:" << y_var_name - << ", is data: " << inputs_map.count(y_var_name); - CHECK(inputs_map.count(x_var_name)) - << "[NPU] MatMul only support X is data, Y is const yet"; - if (inputs_map.count(x_var_name)) { - auto xsrc = inputs_map.at(x_var_name); - auto reshapex = std::make_shared(x_var_name + "_reshape"); - reshapex->set_input_tensor(*xsrc); - reshapex->set_attr_shape({m, k}); - reshapex->set_attr_axis(0); - OpList::Global().add(xsrc); - OpList::Global().add(reshapex); - output_node->set_input_x(*reshapex); - } else { - auto constx = std::make_shared(x_var_name); - ge::TensorDesc desc(ge::Shape({m, k}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = desc.GetShape().GetShapeSize(); - CHECK_EQ(size, xtensor->dims().production()); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(desc); - auto* pdata = reinterpret_cast(xtensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - constx->set_attr_value(ptensor); - OpList::Global().add(constx); - output_node->set_input_x(*constx); - } - - if (inputs_map.count(y_var_name)) { - auto ysrc = inputs_map.at(y_var_name); - auto reshapey = std::make_shared(y_var_name + "_reshape"); - reshapey->set_input_tensor(*ysrc); - reshapey->set_attr_shape({k, n}); - reshapey->set_attr_axis(0); - OpList::Global().add(ysrc); - OpList::Global().add(reshapey); - output_node->set_input_w(*reshapey); - } else { - auto consty = std::make_shared(y_var_name); - ge::TensorDesc desc(ge::Shape({k, n}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = desc.GetShape().GetShapeSize(); - CHECK_EQ(size, ytensor->dims().production()); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(desc); - auto* pdata = 
reinterpret_cast(ytensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - consty->set_attr_value(ptensor); - OpList::Global().add(consty); - output_node->set_input_w(*consty); - } - - OpList::Global().add(output_node); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = output_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(mul, paddle::lite::npu::bridge::MulConverter); diff --git a/lite/backends/npu/bridge/mul_op_test.cc b/lite/backends/npu/bridge/mul_op_test.cc deleted file mode 100644 index c28d0487cc..0000000000 --- a/lite/backends/npu/bridge/mul_op_test.cc +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/mul_op.h" -#include <gtest/gtest.h> -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -void mul_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto y = scope->FindVar(op_info->Input("Y").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - int32_t x_num_col_dims = op_info->GetAttr("x_num_col_dims"); - int32_t y_num_col_dims = op_info->GetAttr("y_num_col_dims"); - auto x_data = x->mutable_data(); - auto y_data = y->mutable_data(); - auto out_data = out->mutable_data(); - auto x_mat_dims = x->dims().Flatten2D(x_num_col_dims); - auto y_mat_dims = y->dims().Flatten2D(y_num_col_dims); - CHECK_EQ(x_mat_dims[1], y_mat_dims[0]); - const int M = x_mat_dims[0]; - const int K = x_mat_dims[1]; - const int N = y_mat_dims[1]; - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - out_data[m * N + n] = 0; - for (int k = 0; k < K; ++k) { - out_data[m * N + n] += x_data[m * K + k] * y_data[k * N + n]; - } - } - } -} - -void test_mul(const std::vector& x_shape, - const std::vector& y_shape, - int x_num_col_dims, - int y_num_col_dims) { - const auto& bridges = lite::npu::bridge::Factory::Instance(); - const auto& supported_lists = bridges.AllFunctions(); - CHECK(bridges.HasType("mul")); - - Scope scope; - std::string x_var_name("X"); - std::string y_var_name("Y"); - std::string out_var_name("Out"); - std::string out_ref_var_name("out_ref"); - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* y = scope.Var(y_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize(x_shape); - 
y->Resize(y_shape); - - FillTensor(x); - FillTensor(y); - - // create mul op - cpp::OpDesc mul_op_desc; - mul_op_desc.SetType("mul"); - mul_op_desc.SetInput("X", {x_var_name}); - mul_op_desc.SetInput("Y", {y_var_name}); - mul_op_desc.SetOutput("Out", {out_var_name}); - mul_op_desc.SetAttr("x_num_col_dims", static_cast(x_num_col_dims)); - mul_op_desc.SetAttr("y_num_col_dims", static_cast(y_num_col_dims)); - - auto mul_op = CreateOp(mul_op_desc, &scope); - LauchOp(mul_op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - mul_ref(mul_op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } - - // model release - npu::OpList::Global().clear(); - npu::DeviceInfo::Global().Clear(); -} - -TEST(NPUBridges, mul) { - test_mul({1, 8, 8, 1}, {1, 8, 2, 2}, 2, 2); - test_mul({1, 5, 5, 1}, {1, 5, 7, 7}, 2, 2); - test_mul({1, 4, 1, 1}, {4, 8}, 1, 1); -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(mul); -USE_NPU_BRIDGE(mul); diff --git a/lite/backends/npu/bridge/pad2d_op.cc b/lite/backends/npu/bridge/pad2d_op.cc deleted file mode 100644 index 2c67383c0c..0000000000 --- a/lite/backends/npu/bridge/pad2d_op.cc +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
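// Paddle's pad2d stores paddings as [top, bottom, left, right]; the NPU Pad
// op instead takes a rank x 2 table of (before, after) pairs per axis, with
// zeros for the batch and channel axes. The converter below builds that
// table by prepending zeros; a sketch of the expansion for an NCHW input:
#include <vector>

std::vector<int> ExpandPad2dForNCHW(const std::vector<int>& p /* t,b,l,r */) {
  // Rows of the 4 x 2 table: {N: 0,0}, {C: 0,0}, {H: top,bottom},
  // {W: left,right}, flattened row-major.
  return {0, 0, 0, 0, p[0], p[1], p[2], p[3]};
}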
- -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type Pad2dConverter(const std::shared_ptr pad2d_op, - const node_map_type& inputs_map) { - auto scope = pad2d_op->scope(); - auto op_info = pad2d_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - std::shared_ptr pad2d_node = - std::make_shared(unique_op_type); - auto x_var_name = op_info->Input("X").front(); - pad2d_node->set_input_x(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(pad2d_node); - - auto mode = op_info->GetAttr("mode"); - if (mode == "constant") { - pad2d_node->set_attr_mode(0); - } else if (mode == "reflect") { - LOG(FATAL) << "NPU doesn't support this pad mod: " << mode; - pad2d_node->set_attr_mode(1); - } else { - LOG(FATAL) << "NPU doesn't support this pad mod: " << mode; - } - - auto x_dims = scope->FindTensor(x_var_name)->dims(); - auto padding = op_info->GetAttr>("paddings"); - CHECK_EQ(padding.size(), 4); - int xds = x_dims.size(); - padding.insert(padding.begin(), xds * 2 - 4, 0); - auto npu_padding = - std::make_shared(unique_op_type + "/padding"); - npu_padding->set_attr_value(CreateTensorAndFillData(padding, {xds, 2})); - pad2d_node->set_input_padding(*npu_padding); - OpList::Global().add(npu_padding); - - if (mode == "constant") { - auto pad_value = op_info->GetAttr("pad_value"); - auto npu_pad_value = - std::make_shared(unique_op_type + "/pad_value"); - npu_pad_value->set_attr_value(CreateTensorAndFillData({pad_value})); - pad2d_node->set_input_constant_values(*npu_pad_value); - OpList::Global().add(npu_pad_value); - - pad2d_node->set_attr_T(0); // type of pad_value: 0:float 3:int32 - } - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = pad2d_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(pad2d, paddle::lite::npu::bridge::Pad2dConverter); diff --git a/lite/backends/npu/bridge/pad2d_op_test.cc b/lite/backends/npu/bridge/pad2d_op_test.cc deleted file mode 100644 index 7a10e0a559..0000000000 --- a/lite/backends/npu/bridge/pad2d_op_test.cc +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/pad2d_op.h" -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -template -void pad2d_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindMutableTensor(op_info->Input("X").front()); - auto out = scope->FindMutableTensor(op_info->Output("Out").front()); - - auto paddings = op_info->GetAttr>("paddings"); - int pad_top = paddings[0]; - int pad_bottom = paddings[1]; - int pad_left = paddings[2]; - int pad_right = paddings[3]; - - auto mode = op_info->GetAttr("mode"); - int pad_mode; - if (mode == "constant") { - pad_mode = 0; - } else if (mode == "reflect") { - pad_mode = 1; - } else if (mode == "edge") { - pad_mode = 2; - } else { - LOG(FATAL) << "Unknown mode type"; - } - float pad_value = op_info->GetAttr("pad_value"); - - auto out_dims = out->dims(); - int n = out_dims[0]; - int c = out_dims[1]; - int h = out_dims[2]; - int w = out_dims[3]; - - int in_w = w - pad_left - pad_right; - int in_h = h - pad_bottom - pad_top; - int spatial_size_out = w * h; - int spatial_size_in = in_w * in_h; - - auto x_data = x->data(); - auto out_data = out->mutable_data(); -#pragma omp parallel for - for (int i = 0; i < n * c; ++i) { - const float* din_batch = x_data + i * spatial_size_in; - float* dout_batch = out_data + i * spatial_size_out; - int in_y = 0; - int in_x = 0; - for (int y = 0; y < h; ++y) { - for (int x = 0; x < w; ++x) { - switch (pad_mode) { - case 0: - in_y = y - pad_top; - in_x = x - pad_left; - dout_batch[y * w + x] = - (in_x >= 0 && in_x < in_w) && (in_y >= 0 && in_y < in_h) - ? 
din_batch[in_y * in_w + in_x] - : pad_value; - break; - case 1: - in_x = - std::min(std::max(pad_left, x), in_w + pad_left - 1) - pad_left; - in_y = std::min(std::max(pad_top, y), in_h + pad_top - 1) - pad_top; - dout_batch[y * w + x] = din_batch[in_y * in_w + in_x]; - break; - case 2: - in_y = y - pad_top; - in_x = x - pad_left; - in_y = std::max(in_y, -in_y); - in_y = std::min(in_y, 2 * in_h - in_y - 2); - in_x = std::max(in_x, -in_x); - in_x = std::min(in_x, 2 * in_w - in_x - 2); - dout_batch[y * w + x] = din_batch[in_y * in_w + in_x]; - break; - default: - LOG(ERROR) << "ERROR: unknown pad mode:" << pad_mode; - } - } - } - } -} - -void test_pad2d(int bs, - int ic, - int ih, - int iw, - std::vector paddings, - float pad_value, - std::string mode) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.NewTensor(x_var_name); - auto* out = scope.NewTensor(out_var_name); - auto* out_ref = scope.NewTensor(out_ref_var_name); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - // FillTensor(x); - auto x_data = x->mutable_data(); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("pad2d"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("paddings", paddings); - opdesc.SetAttr("pad_value", pad_value); - opdesc.SetAttr("mode", mode); - opdesc.SetAttr("data_format", std::string("NCHW")); - - auto op = CreateOp(opdesc, &scope); - pad2d_ref(op); - out_ref->CopyDataFrom(*out); - - LauchOp(op, {x_var_name}, {out_var_name}); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->numel(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2) << "-----" << i; - } -} - -TEST(NPUBridges, pad2d) { -#if 1 - for (auto bs : {1, 4, 7}) { - for (auto ic : {1, 4, 7}) { - for (auto ih : {1, 4, 7}) { - for (auto iw : {1, 4, 7}) { - for (auto paddings : {/*std::vector{0, 0, 0, 0},*/ - std::vector{0, 0, 0, 1}, - std::vector{0, 1, 0, 2}, - std::vector{1, 2, 3, 4}}) { - // npu not support pad_value!=0 - for (auto pad_value : {0.f /*,1.f*/}) { - // npu only support constant - for (auto mode : {"constant" /*, "reflect", "edge"*/}) { - if (mode == "edge") continue; - VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih - << " iw: " << iw << " paddings: {" << paddings[0] - << "," << paddings[1] << "," << paddings[2] << "," - << paddings[3] << "}" - << " pad_value: " << pad_value << " mode: " << mode; - test_pad2d(bs, ic, ih, iw, paddings, pad_value, mode); - } - } - } - } - } - } - } -#else - test_pad2d(1, 1, 1, 1, {0, 0, 0, 1}, 0, "constant"); -#endif -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(pad2d); -USE_NPU_BRIDGE(pad2d); diff --git a/lite/backends/npu/bridge/paddle_use_npu_bridges.h b/lite/backends/npu/bridge/paddle_use_npu_bridges.h deleted file mode 100644 index 404d003954..0000000000 --- a/lite/backends/npu/bridge/paddle_use_npu_bridges.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "lite/backends/npu/bridge/registry.h" - -USE_NPU_BRIDGE(mul); -USE_NPU_BRIDGE(fc); -USE_NPU_BRIDGE(conv2d); -USE_NPU_BRIDGE(depthwise_conv2d); -USE_NPU_BRIDGE(pool2d); -USE_NPU_BRIDGE(relu); -USE_NPU_BRIDGE(elementwise_add); -USE_NPU_BRIDGE(scale); -USE_NPU_BRIDGE(softmax); -USE_NPU_BRIDGE(concat); -USE_NPU_BRIDGE(split); -USE_NPU_BRIDGE(transpose); -USE_NPU_BRIDGE(transpose2); -USE_NPU_BRIDGE(shuffle_channel); -USE_NPU_BRIDGE(batch_norm); -USE_NPU_BRIDGE(bilinear_interp); -USE_NPU_BRIDGE(conv2d_transpose); -USE_NPU_BRIDGE(reshape); -USE_NPU_BRIDGE(reshape2); diff --git a/lite/backends/npu/bridge/pool_op.cc b/lite/backends/npu/bridge/pool_op.cc deleted file mode 100644 index aebfd68856..0000000000 --- a/lite/backends/npu/bridge/pool_op.cc +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
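// The USE_NPU_BRIDGE block above pulls each converter's static registrar
// into the final link; REGISTER_NPU_BRIDGE pairs with it on the definition
// side. The underlying idiom is a singleton factory map plus a file-scope
// object whose constructor inserts the converter, compressed here into a
// hypothetical sketch (the real Factory/registry API differs in detail):
#include <functional>
#include <map>
#include <string>
#include <utility>

using ConverterFn = std::function<void()>;  // real one takes the op + inputs

std::map<std::string, ConverterFn>& BridgeRegistry() {
  static std::map<std::string, ConverterFn> registry;  // built on first use
  return registry;
}

struct BridgeRegistrar {
  BridgeRegistrar(const std::string& op_type, ConverterFn fn) {
    BridgeRegistry().emplace(op_type, std::move(fn));
  }
};

// A USE_* macro then only needs to reference a symbol from the registering
// file so the linker cannot drop that object file.
#define REGISTER_BRIDGE(op_type, fn) \
  static BridgeRegistrar op_type##_registrar(#op_type, fn)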
- -#include "lite/operators/pool_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type PoolConverter(const std::shared_ptr pool_op, - const node_map_type& inputs_map) { - auto scope = pool_op->scope(); - auto op_info = pool_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - std::shared_ptr pool_node = - std::make_shared(unique_op_type); - auto x_var_name = op_info->Input("X").front(); - auto pooling_type = op_info->GetAttr("pooling_type"); - int npu_mode = 0; - if (pooling_type == "max") { - npu_mode = 0; - } else if (pooling_type == "avg") { - npu_mode = 1; - CHECK(op_info->GetAttr("exclusive")) - << "exclusive must be true when use npu"; - } else { - LOG(FATAL) << "Unsupported pooling type: " << pooling_type; - } - bool npu_global_pooling = op_info->GetAttr("global_pooling"); - auto ksize = op_info->GetAttr>("ksize"); - auto npu_window = ge::AttrValue::LIST_INT(ksize.begin(), ksize.end()); - - auto padding = op_info->GetAttr>("paddings"); - auto npu_pad = - ge::AttrValue::LIST_INT{padding[0], padding[0], padding[1], padding[1]}; - auto strides = op_info->GetAttr>("strides"); - auto npu_stride = ge::AttrValue::LIST_INT(strides.begin(), strides.end()); - int npu_ceil_mode = 0; - if (op_info->HasAttr("ceil_mode")) { - npu_ceil_mode = op_info->GetAttr("ceil_mode") ? 1 : 0; - } - - pool_node->set_input_x(*inputs_map.at(x_var_name)); - pool_node->set_attr_mode(npu_mode); - pool_node->set_attr_pad_mode(0); - pool_node->set_attr_global_pooling(npu_global_pooling); - pool_node->set_attr_window(npu_window); - pool_node->set_attr_pad(npu_pad); - pool_node->set_attr_stride(npu_stride); - pool_node->set_attr_ceil_mode(npu_ceil_mode); - // output_node->set_attr_data_mode(npu_data_mode); - - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(pool_node); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = pool_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(pool2d, paddle::lite::npu::bridge::PoolConverter); diff --git a/lite/backends/npu/bridge/pool_op_test.cc b/lite/backends/npu/bridge/pool_op_test.cc deleted file mode 100644 index 86ad893084..0000000000 --- a/lite/backends/npu/bridge/pool_op_test.cc +++ /dev/null @@ -1,249 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/pool_op.h" -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -void pool_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto& in_dims = x->dims(); - auto& out_dims = out->dims(); - - const float* src_ptr = x->data(); - float* dst_ptr = out->mutable_data(); - - std::vector ksize = op_info->GetAttr>("ksize"); - std::vector strides = op_info->GetAttr>("strides"); - std::vector paddings = op_info->GetAttr>("paddings"); - bool exclusive = op_info->GetAttr("exclusive"); - std::string pooling_type = op_info->GetAttr("pooling_type"); - bool global_pooling = op_info->GetAttr("global_pooling"); - - int in_n = in_dims[0]; - int in_c = in_dims[1]; - int in_h = in_dims[2]; - int in_w = in_dims[3]; - int size_in_n = in_c * in_h * in_w; - int size_in_c = in_h * in_w; - - int out_h = out_dims[2]; - int out_w = out_dims[3]; - int size_out_n = in_c * out_h * out_w; - int size_out_c = out_h * out_w; - - int window_h = ksize[0]; - int window_w = ksize[1]; - int stride_h = strides[0]; - int stride_w = strides[1]; - int pad_h = paddings[0]; - int pad_w = paddings[1]; - - if (global_pooling == true) { - for (int n = 0; n < in_n; ++n) { - for (int c = 0; c < in_c; ++c) { - const float* src = src_ptr + n * size_in_n + c * size_in_c; - float res = src[0]; - if (pooling_type == "max") { - for (int i = 1; i < size_in_c; ++i) { - float cur_val = src[i]; - res = cur_val > res ? cur_val : res; - } - } else if (pooling_type == "avg") { - for (int i = 1; i < size_in_c; ++i) { - float cur_val = src[i]; - res += cur_val; - } - res /= size_in_c; - } - dst_ptr[n * size_out_n + c] = res; - } - } - } else { - for (int n = 0; n < in_n; ++n) { - for (int c = 0; c < in_c; ++c) { - for (int h = 0; h < out_h; ++h) { - int sh = h * stride_h; - int eh = sh + window_h; - sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; - eh = (eh - pad_h) > in_h ? in_h : eh - pad_h; - for (int w = 0; w < out_w; ++w) { - int sw = w * stride_w; - int ew = sw + window_w; - sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; - ew = (ew - pad_w) > in_w ? in_w : ew - pad_w; - int pooling_size = (ew - sw) * (eh - sh); - if (pooling_size == 0) continue; - float res = 0.f; - for (int kh = sh; kh < eh; ++kh) { - for (int kw = sw; kw < ew; ++kw) { - int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw; - if (kh == sh && kw == sw) { - res = src_ptr[src_idx]; - } else { - if (pooling_type == "max") { - res = res >= src_ptr[src_idx] ? 
res : src_ptr[src_idx]; - } - if (pooling_type == "avg") { - res += src_ptr[src_idx]; - } - } - } - } - if (pooling_type == "avg") { - if (exclusive) { - res /= pooling_size; - } else { - res /= window_h * window_w; - } - } - dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res; - } - } - } - } - } -} - -void test_pool(int bs, - int ic, - int ih, - int iw, - std::string pooling_type, - bool ceil_mode, - bool global_pooling, - bool exclusive, - int ksize, - int stride, - int padding) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("pool2d"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("pooling_type", pooling_type); - opdesc.SetAttr("ksize", std::vector({ksize, ksize})); - opdesc.SetAttr("global_pooling", global_pooling); - opdesc.SetAttr("exclusive", exclusive); - opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - pool_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, pool) { - for (auto pooling_type : {"max", "avg"}) { - for (auto ceil_mode : {true, false}) { - for (auto global_pooling : {/*true, */ false}) { - for (auto exclusive : {true /*, false*/}) { - for (auto ksize : {2, 3}) { - for (auto stride : {1, 2}) { - for (auto padding : {0, 1}) { - for (auto bs : {1, 3}) { - for (auto ic : {1, 3}) { - for (auto ih : {3, 7}) { - for (auto iw : {3, 7}) { - test_pool(bs, - ic, - ih, - iw, - pooling_type, - ceil_mode, - global_pooling, - exclusive, - ksize, - stride, - padding); - } - } - } - } - } - } - } - } - } - } - } - for (auto pooling_type : {"max", "avg"}) { - for (auto ceil_mode : {true, false}) { - bool global_pooling = true; - bool exclusive = true; - int ksize = 2; - int stride = 1; - int padding = 0; - int bs = 6; - int ic = 6; - int ih = 6; - int iw = 6; - test_pool(bs, - ic, - ih, - iw, - pooling_type, - ceil_mode, - global_pooling, - exclusive, - ksize, - stride, - padding); - } - } -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(pool2d); -USE_NPU_BRIDGE(pool2d); diff --git a/lite/backends/npu/bridge/registry.cc b/lite/backends/npu/bridge/registry.cc deleted file mode 100644 index 180e0aa46e..0000000000 --- a/lite/backends/npu/bridge/registry.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/npu/bridge/registry.h" -#include <utility> - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -Factory& Factory::Instance() { - static Factory g_npu_bridge; - return g_npu_bridge; -} - -bool Factory::HasType(const std::string& op_type) const { - return map_.count(op_type); -} - -void Factory::Insert(const std::string& op_type, const func_type& func_name) { - map_.insert(std::make_pair(op_type, func_name)); -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/npu/bridge/registry.h b/lite/backends/npu/bridge/registry.h deleted file mode 100644 index 979760c816..0000000000 --- a/lite/backends/npu/bridge/registry.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
- -#pragma once - -#include <functional> -#include <memory> -#include <string> -#include <type_traits> -#include <unordered_map> -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/core/op_lite.h" -#include "lite/utils/macros.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -// var_name -> npu node pointer -using node_map_type = - std::unordered_map<std::string, std::shared_ptr<ge::Operator>>; - -using func_type = std::function<node_map_type(const std::shared_ptr<lite::OpLite>, - const node_map_type&)>; -using cvt_map_type = std::unordered_map<std::string, func_type>; -class Factory { - public: - static Factory& Instance(); - - const cvt_map_type& AllFunctions() const { return map_; } - bool HasType(const std::string& op_type) const; - void Insert(const std::string& op_type, const func_type& func_name); - Factory() = default; - - private: - cvt_map_type map_; - DISALLOW_COPY_AND_ASSIGN(Factory); -}; - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -// some platform-independent definitions -#if defined(_WIN32) -#define UNUSED -#define __builtin_expect(EXP, C) (EXP) -#else -#define UNUSED __attribute__((unused)) -#endif - -#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ - struct __test_global_namespace_##uniq_name##__ {}; \ - static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ - __test_global_namespace_##uniq_name##__>::value, \ - msg) - -#define REGISTER_NPU_BRIDGE(op_type, cvt_func_name) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_npu_bridge_##op_type##__, \ - "REGISTER_NPU_BRIDGE must be called in global namespace only once!"); \ - int __reg_npu_bridge_##op_type##_Insert() { \ - paddle::lite::npu::bridge::Factory::Instance().Insert(#op_type, \ - cvt_func_name); \ - return 0; \ - } - -#define USE_NPU_BRIDGE(op_type) \ - extern int __reg_npu_bridge_##op_type##_Insert(); \ - static int __reg_npu_bridge_##op_type##_Insert_return UNUSED = \ - __reg_npu_bridge_##op_type##_Insert(); diff --git a/lite/backends/npu/bridge/reshape_op.cc b/lite/backends/npu/bridge/reshape_op.cc deleted file mode 100644 index af160f9c72..0000000000 --- a/lite/backends/npu/bridge/reshape_op.cc +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
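The REGISTER_NPU_BRIDGE and USE_NPU_BRIDGE macros in registry.h above split registration from linkage: the first defines a global function that inserts the converter into the Factory singleton, the second forces that function to run via a static initializer in whatever binary links the bridge. A minimal sketch of how a new converter would plug into this registry; MyOpConverter and "my_op" are hypothetical names, not part of this patch:

    #include "lite/backends/npu/bridge/registry.h"

    namespace paddle {
    namespace lite {
    namespace npu {
    namespace bridge {

    // Hypothetical converter: consumes the lite op and the already-converted
    // input nodes, returns the NPU output nodes keyed by output variable name.
    node_map_type MyOpConverter(const std::shared_ptr<lite::OpLite> op,
                                const node_map_type& inputs_map) {
      node_map_type outputs_map;
      // ... build ge::Operator nodes for the op here ...
      return outputs_map;
    }

    }  // namespace bridge
    }  // namespace npu
    }  // namespace lite
    }  // namespace paddle

    // Registers the converter under the key "my_op" at static-init time.
    REGISTER_NPU_BRIDGE(my_op, paddle::lite::npu::bridge::MyOpConverter);

    // A client (the NPU graph pass, or LauchOp in test_helper.cc below) then
    // looks the converter up again through the same singleton:
    //   const auto& bridges = paddle::lite::npu::bridge::Factory::Instance();
    //   CHECK(bridges.HasType("my_op"));
    //   auto outputs_map = bridges.AllFunctions().at("my_op")(op, inputs_map);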
- -#include "lite/operators/reshape_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type ReshapeConverter(const std::shared_ptr reshape_op, - const node_map_type& inputs_map) { - auto scope = reshape_op->scope(); - auto op_info = reshape_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - // get input, output and op attributes - auto x_var_name = op_info->Input("X").front(); - auto x = scope->FindVar(x_var_name)->GetMutable(); - auto x_dims = x->dims(); - - // create reshape node and set input node from inputs_map - auto reshape_node = std::make_shared(unique_op_type); - CHECK(inputs_map.count(x_var_name)); - reshape_node->set_input_tensor(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - - // read shape from actual shape tensor as input "w" if 'Shape' is found - if (HasInputArg(op_info, scope, "Shape")) { - auto actual_shape_var_name = op_info->Input("Shape").front(); - if (!inputs_map.count(actual_shape_var_name)) { - auto actual_shape = - scope->FindVar(actual_shape_var_name)->GetMutable(); - auto actual_shape_dims = actual_shape->dims(); - auto actual_shape_data = actual_shape->mutable_data(); - auto shape = - std::vector(actual_shape_data, - actual_shape_data + actual_shape_dims.production()); - auto out_dims = operators::ValidateShape(shape, x_dims); - auto out_shape = out_dims.Vectorize(); - if (out_shape.size() > 4) { - LOG(WARNING) - << "NPU DDK only supports less than 4 dimensions, but Shape has " - << out_shape.size(); - } - auto actual_shape_const_node = - std::make_shared(actual_shape_var_name); - actual_shape_const_node->set_attr_value(CreateTensorAndFillData( - std::vector(out_shape.begin(), out_shape.end()))); - reshape_node->set_input_w(*actual_shape_const_node); - OpList::Global().add(actual_shape_const_node); - } else { - reshape_node->set_input_w(*inputs_map.at(actual_shape_var_name)); - OpList::Global().add(inputs_map.at(actual_shape_var_name)); - } - } else { - auto shape = op_info->GetAttr>("shape"); - auto out_dims = operators::ValidateShape(shape, x_dims); - auto out_shape = out_dims.Vectorize(); - if (out_shape.size() > 4) { - LOG(WARNING) - << "NPU DDK only supports less than 4 dimensions, but shape has " - << out_shape.size(); - } - reshape_node->set_attr_shape( - ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end())); - } - OpList::Global().add(reshape_node); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = reshape_node; - if (op_type == "reshape2") { - // append an extra reshape node to calc XShape - std::vector xshape_dims(x_dims.size() + 1, 1); - for (size_t i = 0; i < x_dims.size(); i++) { - xshape_dims[i + 1] = x_dims[i]; - } - if (xshape_dims.size() > 4) { - LOG(WARNING) - << "NPU DDK only supports less than 4 dimensions, but XShape has " - << xshape_dims.size(); - } - auto xshape_node = - std::make_shared(unique_op_type + "/xshape"); - xshape_node->set_input_tensor(*inputs_map.at(x_var_name)); - xshape_node->set_attr_shape( - ge::AttrValue::LIST_INT(xshape_dims.begin(), 
xshape_dims.end())); - OpList::Global().add(xshape_node); - outputs_map[op_info->Output("XShape").front()] = xshape_node; - } - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(reshape, paddle::lite::npu::bridge::ReshapeConverter); -REGISTER_NPU_BRIDGE(reshape2, paddle::lite::npu::bridge::ReshapeConverter); diff --git a/lite/backends/npu/bridge/reshape_op_test.cc b/lite/backends/npu/bridge/reshape_op_test.cc deleted file mode 100644 index 4a75961fdf..0000000000 --- a/lite/backends/npu/bridge/reshape_op_test.cc +++ /dev/null @@ -1,202 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/reshape_op.h" -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -void reshape_ref(const std::shared_ptr op) { - auto scope = op->scope(); - auto op_info = op->op_info(); - auto op_type = op_info->Type(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_dims = x->dims(); - auto shape = op_info->GetAttr>("shape"); - auto inplace = op_info->GetAttr("inplace"); - if (op_info->HasInput("Shape")) { - auto actual_shape_var_names = op_info->Input("Shape"); - if (actual_shape_var_names.size() > 0) { - auto actual_shape = scope->FindVar(actual_shape_var_names.front()) - ->GetMutable(); - auto actual_shape_dims = actual_shape->dims(); - auto* actual_shape_data = actual_shape->data(); - shape = - std::vector(actual_shape_data, - actual_shape_data + actual_shape_dims.production()); - } - } - if (inplace) { - out->ShareDataWith(*x); - } else { - out->CopyDataFrom(*x); - } - auto out_dims = operators::ValidateShape(shape, x_dims); - out->Resize(out_dims); -} - -void test_reshape(const std::vector& x_shape, - const std::vector& shape, - const std::vector& act_shape, - bool inplace, - bool reshape2) { - // prepare input&output variables - Scope scope; - std::string x_var_name("x"); - std::string actual_shape_var_name("actual_shape"); - std::string out_var_name("out"); - std::string out_ref_var_name("out_ref"); - std::string xshape_var_name("xshape"); - std::string xshape_ref_var_name("xshape_ref"); - auto x = scope.Var(x_var_name)->GetMutable(); - auto actual_shape = scope.Var(actual_shape_var_name)->GetMutable(); - auto out = scope.Var(out_var_name)->GetMutable(); - auto out_ref = scope.Var(out_ref_var_name)->GetMutable(); - auto xshape = scope.Var(xshape_var_name)->GetMutable(); - auto xshape_ref = scope.Var(xshape_ref_var_name)->GetMutable(); - - x->Resize(x_shape); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType(reshape2 ? 
"reshape2" : "reshape"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("shape", shape); - opdesc.SetAttr("inplace", inplace); - if (!act_shape.empty()) { - int64_t act_shape_size = act_shape.size(); - actual_shape->Resize({act_shape_size}); - memcpy(actual_shape->mutable_data(), - act_shape.data(), - act_shape_size * sizeof(int)); - opdesc.SetInput("Shape", {actual_shape_var_name}); - } - if (reshape2) { - opdesc.SetOutput("XShape", {xshape_var_name}); - } - - // create op and execute reference implementation - auto op = reshape2 ? CreateOp(opdesc, &scope) - : CreateOp(opdesc, &scope); - reshape_ref(op); - out_ref->CopyDataFrom(*out); - if (reshape2) { - xshape_ref->CopyDataFrom(*xshape); - } - - // convert op to NPU model, then run it on NPU - LauchOp(op, - {x_var_name}, - {out_var_name}); // TODO(hong19860320) support XShape for reshape2 - - // compare results - auto out_dims = out->dims(); - auto out_ref_dims = out_ref->dims(); - CHECK_EQ(out_dims.size(), out_ref_dims.size()); - for (int i = 0; i < out_dims.size(); i++) { - CHECK_EQ(out_dims[i], out_ref_dims[i]); - } - auto out_data = out->mutable_data(); - auto out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } - // if (reshape2) { - // auto xshape_dims = xshape->dims(); - // auto xshape_ref_dims = xshape_ref->dims(); - // CHECK_EQ(xshape_dims.size(), xshape_ref_dims.size()); - // for (size_t i = 0; i < xshape_dims.size(); i++) { - // CHECK_EQ(xshape_dims[i], xshape_ref_dims[i]); - // } - // } -} - -TEST(NPUBridges, reshape) { -#if 1 - std::map, std::vector>> tests = { - {{1, 2, 4, 6}, - {{}, - {-1}, - {48}, - {-1, 48}, - {1, 48}, - {0, 48}, - {48, -1}, - {48, 1}, - {-1, 24}, - {2, 24}, - {24, 0}, - {-1, 0, 3, 2}, - {4, 2, 3, 2}, - {0, -1, 3, 2}, - {1, 8, 3, 2}}}}; - for (auto& i : tests) { - for (auto& shape : i.second) { - if (shape.empty()) { - continue; - } - for (auto& act_shape : i.second) { - for (auto& inplace : {true, false}) { - for (auto& reshape2 : {true, false}) { - std::stringstream ss; - ss << "x:{ "; - for (auto s : i.first) { - ss << s << " "; - } - ss << "} shape:{ "; - for (auto s : shape) { - ss << s << " "; - } - ss << "} act_shape:{ "; - for (auto s : act_shape) { - ss << s << " "; - } - VLOG(3) << ss.str() << "} inplace:" << inplace - << " reshape2:" << reshape2; - test_reshape(i.first, shape, act_shape, inplace, reshape2); - } - } - } - } - } -#else - test_reshape({2, 4, 6}, {-1, 0, 4, 3}, {}, true, true); - test_reshape({1, 232, 14, 14}, {-1, 2, 116, 14, 14}, {}, true, true); -#endif -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(reshape); -USE_NPU_BRIDGE(reshape); - -USE_LITE_OP(reshape2); -USE_NPU_BRIDGE(reshape2); diff --git a/lite/backends/npu/bridge/scale_op.cc b/lite/backends/npu/bridge/scale_op.cc deleted file mode 100644 index a884b34856..0000000000 --- a/lite/backends/npu/bridge/scale_op.cc +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/scale_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type ScaleConverter(const std::shared_ptr scale_op, - const node_map_type& inputs_map) { - auto scope = scale_op->scope(); - auto op_info = scale_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - // get input, output and op attributes - auto x_var_name = op_info->Input("X").front(); - auto x = scope->FindVar(x_var_name)->GetMutable(); - auto x_dims = x->dims().Vectorize(); - CHECK_GE(x_dims.size(), 2); - std::vector scale_bias_shape = {x_dims[1]}; - float scale = op_info->GetAttr("scale"); - float bias = op_info->GetAttr("bias"); - bool bias_after_scale = op_info->GetAttr("bias_after_scale"); - if (!bias_after_scale) { - bias *= scale; - } - - // create scale node and set input node from inputs_map - auto scale_node = std::make_shared(unique_op_type); - CHECK(inputs_map.count(x_var_name)); - scale_node->set_input_x(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(scale_node); - - // add filter node(fill with scale) - auto filter_const_node = - std::make_shared(unique_op_type + "/filter"); - filter_const_node->set_attr_value( - CreateTensorAndFillData(scale, scale_bias_shape)); - scale_node->set_input_filter(*filter_const_node); - OpList::Global().add(filter_const_node); - - // add bias node(fill with bias) - if (fabs(bias) > 1e-6f) { - auto bias_const_node = - std::make_shared(unique_op_type + "/bias"); - bias_const_node->set_attr_value( - CreateTensorAndFillData(bias, scale_bias_shape)); - scale_node->set_input_bias(*bias_const_node); - scale_node->set_attr_has_bias_value(true); - OpList::Global().add(bias_const_node); - } - - scale_node->set_attr_axis(1); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = scale_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(scale, paddle::lite::npu::bridge::ScaleConverter); diff --git a/lite/backends/npu/bridge/scale_op_test.cc b/lite/backends/npu/bridge/scale_op_test.cc deleted file mode 100644 index f4a241c8d9..0000000000 --- a/lite/backends/npu/bridge/scale_op_test.cc +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/scale_op.h" -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -void scale_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - float scale = op_info->GetAttr("scale"); - float bias = op_info->GetAttr("bias"); - bool bias_after_scale = op_info->GetAttr("bias_after_scale"); - if (!bias_after_scale) { - bias *= scale; - } - auto x_data = x->data(); - auto out_data = out->mutable_data(); - DDim x_dims = x->dims(); - DDim out_dims = out->dims(); - CHECK_EQ(x_dims.production(), out_dims.production()); - for (int i = 0; i < out_dims.production(); i++) { - out_data[i] = x_data[i] * scale + bias; - } -} - -void test_scale(int bs, - int ic, - int ih, - int iw, - bool bias_after_scale, - float scale, - float bias) { - // prepare input&output variables - Scope scope; - std::string x_var_name("x"); - std::string out_var_name("out"); - std::string out_ref_var_name("out_ref"); - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("scale"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("bias_after_scale", bias_after_scale); - opdesc.SetAttr("scale", scale); - opdesc.SetAttr("bias", bias); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor('out') - scale_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } -} - -TEST(NPUBridges, scale) { - for (auto bs : {1, 3}) { - for (auto ic : {1, 3}) { - for (auto ih : {3, 4}) { - for (auto iw : {4, 3}) { - for (auto bias_after_scale : {true, false}) { - for (auto scale : {-1.0f, 5.0f}) { - for (auto bias : {-2.0f, 30.0f}) { - VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih - << " iw: " << iw - << " bias_after_scale: " << bias_after_scale - << " scale: " << scale << " bias: " << bias; - test_scale(bs, ic, ih, iw, bias_after_scale, scale, bias); - } - } - } - } - } - } - } -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(scale); -USE_NPU_BRIDGE(scale); diff --git a/lite/backends/npu/bridge/shuffle_channel_op.cc b/lite/backends/npu/bridge/shuffle_channel_op.cc deleted file mode 100644 index 
ac4ae58d34..0000000000 --- a/lite/backends/npu/bridge/shuffle_channel_op.cc +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/shuffle_channel_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type ShuffleChannelConverter( - const std::shared_ptr<lite::OpLite> shuffle_channel_op, - const node_map_type& inputs_map) { - auto scope = shuffle_channel_op->scope(); - auto op_info = shuffle_channel_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - std::shared_ptr<ge::op::ShuffleChannel> shuffle_channel_node = - std::make_shared<ge::op::ShuffleChannel>(unique_op_type); - auto x_var_name = op_info->Input("X").front(); - - shuffle_channel_node->set_input_x(*inputs_map.at(x_var_name)); - shuffle_channel_node->set_attr_group(op_info->GetAttr<int>("group")); - - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(shuffle_channel_node); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = shuffle_channel_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(shuffle_channel, - paddle::lite::npu::bridge::ShuffleChannelConverter); diff --git a/lite/backends/npu/bridge/shuffle_channel_op_test.cc b/lite/backends/npu/bridge/shuffle_channel_op_test.cc deleted file mode 100644 index c37c97a3b4..0000000000 --- a/lite/backends/npu/bridge/shuffle_channel_op_test.cc +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
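The group attribute read by the converter above selects an interleaving that is equivalent to transposing a group x (C/group) view of the channel axis: with C = 6 channels and group = 2, channel order 0 1 2 3 4 5 becomes 0 3 1 4 2 5. A standalone sketch of just that index mapping (illustration only, consistent with the reference implementation in the test below):

    #include <cstdio>
    #include <vector>

    int main() {
      const int C = 6, group = 2, per_group = C / group;
      std::vector<int> shuffled(C);
      for (int i = 0; i < group; ++i) {
        for (int j = 0; j < per_group; ++j) {
          // output channel j * group + i reads from input channel
          // i * per_group + j, i.e. the group x per_group grid is transposed
          shuffled[j * group + i] = i * per_group + j;
        }
      }
      for (int c : shuffled) std::printf("%d ", c);  // prints: 0 3 1 4 2 5
      return 0;
    }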
- -#include "lite/operators/shuffle_channel_op.h" -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -void shuffle_channel_ref( - const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_data = x->mutable_data(); - auto out_data = out->mutable_data(); - int group = op_info->GetAttr("group"); - auto x_dims = x->dims(); - - int n_size = x_dims.production() / x_dims[0]; - int c_size = n_size / x_dims[1]; - for (int n = 0; n < x_dims[0]; n++) { - int g_num = x_dims[1] / group; - auto tmp_out_data = out_data; - for (int g = 0; g < g_num; g++) { - auto tmp_x_data = x_data + g * c_size; - for (int i = 0; i < group; i++) { - std::memcpy(tmp_out_data, - tmp_x_data + i * g_num * c_size, - c_size * sizeof(float)); - tmp_out_data += c_size; - } - } - x_data += n_size; - out_data += n_size; - } -} - -void test_shuffle_channel(int bs, int ic, int ih, int iw, int group) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("shuffle_channel"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("group", group); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - shuffle_channel_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, softmax) { - for (auto bs : {1, 4}) { - for (auto ic : {1, 24, 35}) { - for (auto ih : {1, 4}) { - for (auto iw : {1, 4}) { - for (auto group : {1, 3, 7, 24, 35}) { - if (ic % group != 0) continue; - test_shuffle_channel(bs, ic, ih, iw, group); - } - } - } - } - } -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(shuffle_channel); -USE_NPU_BRIDGE(shuffle_channel); diff --git a/lite/backends/npu/bridge/softmax_op.cc b/lite/backends/npu/bridge/softmax_op.cc deleted file mode 100644 index 6c556e6ca7..0000000000 --- a/lite/backends/npu/bridge/softmax_op.cc +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/softmax_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type SoftmaxConverter(const std::shared_ptr<lite::OpLite> softmax_op, - const node_map_type& inputs_map) { - auto scope = softmax_op->scope(); - auto op_info = softmax_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - std::shared_ptr<ge::op::Softmax> softmax_node = - std::make_shared<ge::op::Softmax>(unique_op_type); - auto x_var_name = op_info->Input("X").front(); - - auto x_dims = scope->FindVar(x_var_name)->GetMutable<Tensor>()->dims(); - auto axis = op_info->GetAttr<int>("axis"); - if (x_dims.size() > 3) { - CHECK(!(axis == 2 && x_dims[3] > 1)) - << "unsupported npu softmax params: axis = " << axis - << ", x_w = " << x_dims[3]; - } - - CHECK(inputs_map.count(x_var_name)); - softmax_node->set_input_x(*inputs_map.at(x_var_name)); - softmax_node->set_attr_axis(axis); - - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(softmax_node); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = softmax_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(softmax, paddle::lite::npu::bridge::SoftmaxConverter); diff --git a/lite/backends/npu/bridge/softmax_op_test.cc b/lite/backends/npu/bridge/softmax_op_test.cc deleted file mode 100644 index c3114f5360..0000000000 --- a/lite/backends/npu/bridge/softmax_op_test.cc +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
- -#include "lite/operators/softmax_op.h" -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -template -void softmax_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_data = x->data(); - auto out_data = out->mutable_data(); - DDim x_dims = x->dims(); - - auto x_rank = x_dims.size(); - int axis = op_info->GetAttr("axis"); - if (axis < 0) { - axis += x_rank; - } - int axis_size = x_dims[axis]; - int outer_num = x_dims.Slice(0, axis).production(); - int inner_num = x_dims.Slice(axis + 1, x_rank).production(); - int compute_size = outer_num * inner_num; - for (int i = 0; i < compute_size; i++) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int start = idx_outer * inner_num + idx_inner; - int offset; - - offset = start; - dtype max_data = std::numeric_limits::lowest(); - for (int j = 0; j < axis_size; j++) { - max_data = x_data[offset] > max_data ? x_data[offset] : max_data; - offset += inner_num; - } - - offset = start; - dtype sum_data = (dtype)0; - for (int j = 0; j < axis_size; j++) { - out_data[offset] = exp(x_data[offset] - max_data); - sum_data += out_data[offset]; - offset += inner_num; - } - - offset = start; - for (int j = 0; j < axis_size; j++) { - out_data[offset] /= sum_data; - offset += inner_num; - } - } -} - -void test_softmax(int bs, int ic, int ih, int iw, int axis) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("softmax"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("axis", axis); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - softmax_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, softmax) { - for (auto bs : {1, 4, 7}) { - for (auto ic : {1, 4, 7}) { - for (auto ih : {1, 4, 7}) { - for (auto iw : {1, 4, 7}) { - for (auto axis : {-3, -1, 0, 1, 2, 3}) { - // npu softmax exists bugs when axis is 2 and iw > 1 - if (axis == 2 && iw > 1) continue; - test_softmax(bs, ic, ih, iw, axis); - } - } - } - } - } -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(softmax); -USE_NPU_BRIDGE(softmax); diff --git a/lite/backends/npu/bridge/split_op.cc b/lite/backends/npu/bridge/split_op.cc deleted file mode 100644 index 86de45fedf..0000000000 --- a/lite/backends/npu/bridge/split_op.cc +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/split_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" -#include "lite/backends/npu/npu_helper.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { -node_map_type SplitConverter(const std::shared_ptr split_op, - const node_map_type& inputs_map) { - lite::Scope* scope = split_op->scope(); - const lite::OpInfo* op_info = split_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " << op_type << " ... "; - - auto x_var_name = op_info->Input("X").front(); - auto axis = op_info->GetAttr("axis"); - auto num = op_info->GetAttr("num"); - auto sections = op_info->GetAttr>("sections"); - int64_t sections_num = static_cast(sections.size()); - - std::shared_ptr output_node = - std::make_shared(unique_op_type); - CHECK(inputs_map.count(x_var_name)); - output_node->set_input_x(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - - output_node->set_attr_axis(static_cast(axis)); - if (num > 0) { - output_node->set_attr_output_num(static_cast(num)); - } else { - output_node->set_attr_output_num(sections_num); - auto size_split = ge::AttrValue::LIST_INT(sections.begin(), sections.end()); - output_node->set_attr_size_split(size_split); - } - - node_map_type outputs_map; - auto out_var_names = op_info->Output("Out"); - output_node->create_dynamic_output_y(out_var_names.size()); - int index = 1; - for (auto out_var_name : out_var_names) { - auto const_node = std::make_shared( - unique_op_type + "/const_zero" + std::to_string(index)); - const_node->set_attr_value(CreateTensorAndFillData(0)); - OpList::Global().add(const_node); - auto add_node = std::make_shared(unique_op_type + "/add" + - std::to_string(index)); - add_node->set_input_x1(*output_node, "y" + std::to_string(index)); - add_node->set_input_x2(*const_node); - outputs_map[out_var_name] = add_node; - OpList::Global().add(add_node); - index++; - } - - OpList::Global().add(output_node); - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(split, paddle::lite::npu::bridge::SplitConverter); diff --git a/lite/backends/npu/bridge/split_op_test.cc b/lite/backends/npu/bridge/split_op_test.cc deleted file mode 100644 index 91629a70fc..0000000000 --- a/lite/backends/npu/bridge/split_op_test.cc +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/split_op.h" -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -template -void split_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - int num = op_info->GetAttr("num"); - int axis = op_info->GetAttr("axis"); - std::vector sections = op_info->GetAttr>("sections"); - std::vector output_vec; - auto output = op_info->Output("Out"); - for (auto out_var : output) { - output_vec.push_back(scope->Var(out_var)->GetMutable()); - } - auto in_dims = x->dims(); - auto rank = in_dims.size(); - int outs_number = output_vec.size(); - std::vector outs_dims; - outs_dims.reserve(outs_number); - if (axis < 0) { - axis += rank; - } - if (num > 0) { - int out_axis_dim = in_dims[axis] / num; - for (int i = 0; i < outs_number; ++i) { - auto dim = in_dims; - dim[axis] = out_axis_dim; - outs_dims.push_back(dim); - } - } else if (sections.size() > 0) { - for (size_t i = 0; i < outs_number; ++i) { - auto dim = in_dims; - dim[axis] = sections[i]; - outs_dims.push_back(dim); - } - } - for (int j = 0; j < outs_dims.size(); ++j) { - output_vec[j]->Resize(outs_dims[j]); - } - - const dtype* din = x->mutable_data(); - std::vector in_strides(in_dims.size()); - in_strides[in_dims.size() - 1] = in_dims[in_dims.size() - 1]; - for (int i = in_dims.size() - 2; i >= 0; --i) { - in_strides[i] = in_strides[i + 1] * in_dims[i]; - } - - int input_offset = 0; - for (auto out : output_vec) { - auto out_dim = out->dims(); - std::vector out_strides(out_dim.size()); - out_strides[out_dim.size() - 1] = out_dim[out_dim.size() - 1]; - for (int i = out_dim.size() - 2; i >= 0; --i) { - out_strides[i] = out_strides[i + 1] * out_dim[i]; - } - - dtype* out_data = out->mutable_data(); - int before = out_strides[0] / out_strides[axis]; - int in_after = in_strides[axis]; - int out_after = out_strides[axis]; - - for (int i = 0; i < before; ++i) { - std::memcpy(out_data + i * out_after, - din + input_offset + i * in_after, - sizeof(dtype) * out_after); - } - input_offset += out_strides[axis]; - } -} - -void test_split(int bs, - int ic, - int ih, - int iw, - int axis, - int num, - std::vector sections) { - const auto& bridges = lite::npu::bridge::Factory::Instance(); - const auto& supported_lists = bridges.AllFunctions(); - CHECK(bridges.HasType("split")); - // prepare input&output variables - std::string x_var_name = "x"; - std::string out_var_name_1 = "out_1"; - std::string out_var_name_2 = "out_2"; - std::string out_ref_var_name_1 = "out_ref_1"; - std::string out_ref_var_name_2 = "out_ref_2"; - - Scope scope; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out_1 = scope.Var(out_var_name_1)->GetMutable(); - auto* out_2 = scope.Var(out_var_name_2)->GetMutable(); - 
auto* out_ref_1 = scope.Var(out_ref_var_name_1)->GetMutable(); - auto* out_ref_2 = scope.Var(out_ref_var_name_2)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("split"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name_1, out_var_name_2}); - opdesc.SetAttr("axis", axis); - opdesc.SetAttr("sections", sections); - opdesc.SetAttr("num", num); - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name_1, out_var_name_2}); - out_ref_1->CopyDataFrom(*out_1); - out_ref_2->CopyDataFrom(*out_2); - // execute reference implementation and save to output tensor - split_ref(op); - - // compare results - auto* out_data_1 = out_1->mutable_data(); - auto* out_data_2 = out_2->mutable_data(); - auto* out_ref_data_1 = out_ref_1->mutable_data(); - auto* out_ref_data_2 = out_ref_2->mutable_data(); - for (int i = 0; i < out_1->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data_1[i], out_ref_data_1[i], 5e-4); - } - for (int i = 0; i < out_2->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data_2[i], out_ref_data_2[i], 5e-4); - } -} - -TEST(NPUBridges, split) { - test_split(4, 2, 3, 1, 0, 2, {}); - test_split(4, 2, 3, 1, 0, 0, {3, 1}); - test_split(4, 6, 3, 1, 1, 2, {}); - test_split(4, 6, 3, 1, 1, 0, {2, 4}); - test_split(4, 2, 2, 1, 2, 2, {}); - test_split(4, 2, 6, 1, 2, 0, {3, 3}); - test_split(4, 2, 3, 4, 3, 2, {}); - test_split(4, 2, 3, 6, 3, 0, {5, 1}); -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(split); -USE_NPU_BRIDGE(split); diff --git a/lite/backends/npu/bridge/test_helper.cc b/lite/backends/npu/bridge/test_helper.cc deleted file mode 100644 index 3d6dc03481..0000000000 --- a/lite/backends/npu/bridge/test_helper.cc +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/backends/npu/bridge/test_helper.h" -#include -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" -#include "lite/core/op_registry.h" -#include "lite/operators/graph_op.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -void LauchOp(const std::shared_ptr op, - const std::vector& input_var_names, - const std::vector& output_var_names) { - auto scope = op->scope(); - auto op_type = op->op_info()->Type(); - - // convert op to IR graph - const auto& bridges = lite::npu::bridge::Factory::Instance(); - const auto& supported_lists = bridges.AllFunctions(); - CHECK(bridges.HasType(op_type)); - - node_map_type inputs_map; - for (auto input_var_name : input_var_names) { - auto input = scope->FindVar(input_var_name)->GetMutable(); - ge::TensorDesc input_desc( - ge::Shape(input->dims().Vectorize()), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto input_node = std::make_shared(input_var_name); - input_node->update_input_desc_x(input_desc); - npu::OpList::Global().add(input_node); - inputs_map[input_var_name] = input_node; - } - auto outputs_map = supported_lists.at(op_type)(op, inputs_map); - CHECK_GT(outputs_map.size(), 0); - - // compile IR graph to om model - std::vector graph_inputs; - for (auto input_var_name : input_var_names) { - graph_inputs.push_back(*inputs_map[input_var_name]); - } - std::vector graph_outputs; - for (auto output_var_name : output_var_names) { - graph_outputs.push_back(*outputs_map[output_var_name]); - } - std::string model_name(UniqueName("test_" + op_type) + ".om"); - CHECK(npu::BuildNPUClient(graph_inputs, graph_outputs, model_name)); - - // create graph op and set inputs and outputs - cpp::OpDesc graph_op_desc; - graph_op_desc.SetType("graph_op"); - graph_op_desc.SetInput("Inputs", input_var_names); - graph_op_desc.SetOutput("Outputs", output_var_names); - graph_op_desc.SetAttr("model_name", model_name); - - auto graph_op = - std::make_shared(graph_op_desc.Type()); - graph_op->SetValidPlaces({Place{TARGET(kNPU), PRECISION(kFloat)}}); - CHECK(graph_op->Attach(graph_op_desc, scope)); - CHECK(graph_op->CheckShape()); - CHECK(graph_op->InferShape()); - - // create graph op kernel and set NPU context - auto graph_kernels = - graph_op->CreateKernels({Place{TARGET(kNPU), PRECISION(kFloat)}}); - CHECK(!graph_kernels.empty()); - auto graph_kernel = - std::move(graph_kernels.front()); // use the first kernel by default - auto graph_ctx = ContextScheduler::Global().NewContext(TARGET(kNPU)); - graph_kernel->SetContext(std::move(graph_ctx)); - - // perform graph op kernel and store to output variables - graph_kernel->Launch(); - - // release all of resources of generated model - npu::OpList::Global().clear(); - npu::DeviceInfo::Global().Clear(); -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(graph_op); -USE_LITE_KERNEL(graph_op, kNPU, kFloat, kNCHW, def); diff --git a/lite/backends/npu/bridge/test_helper.h b/lite/backends/npu/bridge/test_helper.h deleted file mode 100644 index 537f737640..0000000000 --- a/lite/backends/npu/bridge/test_helper.h +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include <memory> -#include <random> -#include <string> -#include <vector> -#include "lite/core/op_lite.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -template <typename T> -std::shared_ptr<T> CreateOp(const cpp::OpDesc& opdesc, lite::Scope* scope) { - auto op = std::make_shared<T>(opdesc.Type()); - op->SetValidPlaces({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kNPU), PRECISION(kFloat)}}); - CHECK(op->Attach(opdesc, scope)); - CHECK(op->CheckShape()); - CHECK(op->InferShape()); - return op; -} - -// T is the target data type -// R is the range data type, e.g. int, half -template <typename T, typename R = T> -void FillTensor(Tensor* x, - T lower = static_cast<T>(-2), - T upper = static_cast<T>(2)) { - static unsigned int seed = 100; - std::mt19937 rng(seed++); - std::uniform_real_distribution<double> uniform_dist(0, 1); - - T* x_data = x->mutable_data<T>(); - for (int i = 0; i < x->dims().production(); ++i) { - auto r = uniform_dist(rng) * (upper - lower) + lower; - x_data[i] = static_cast<T>(static_cast<R>(r)); - } -} - -void LauchOp(const std::shared_ptr<lite::OpLite> op, - const std::vector<std::string>& input_var_names, - const std::vector<std::string>& output_var_names); - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/npu/bridge/transpose_op.cc b/lite/backends/npu/bridge/transpose_op.cc deleted file mode 100644 index ad00e599ce..0000000000 --- a/lite/backends/npu/bridge/transpose_op.cc +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
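Every bridge test combines the helpers declared in test_helper.h above in the same way: build the op from a cpp::OpDesc, convert and run it on the NPU via LauchOp, then diff the device output against a host reference. A condensed sketch of that pattern for a hypothetical relu test (the ReluOp class name and a registered relu bridge are assumptions here; the per-op *_op_test.cc files in this patch are the authoritative versions):

    #include <gtest/gtest.h>
    #include "lite/backends/npu/bridge/registry.h"
    #include "lite/backends/npu/bridge/test_helper.h"
    #include "lite/core/op_registry.h"
    #include "lite/operators/relu_op.h"

    namespace paddle {
    namespace lite {
    namespace npu {
    namespace bridge {

    TEST(NPUBridges, relu_sketch) {
      // prepare input/output variables
      Scope scope;
      auto* x = scope.Var("x")->GetMutable<Tensor>();
      auto* out = scope.Var("out")->GetMutable<Tensor>();
      auto* out_ref = scope.Var("out_ref")->GetMutable<Tensor>();
      x->Resize({1, 2, 3, 4});
      FillTensor<float>(x);

      // describe the op exactly as the optimizer would
      cpp::OpDesc opdesc;
      opdesc.SetType("relu");
      opdesc.SetInput("X", {"x"});
      opdesc.SetOutput("Out", {"out"});

      // create the op, convert it to an NPU model, then run it on the device
      auto op = CreateOp<operators::ReluOp>(opdesc, &scope);
      LauchOp(op, {"x"}, {"out"});
      out_ref->CopyDataFrom(*out);

      // a host-side relu reference would overwrite `out` here, after which
      // each element is compared, e.g.
      // EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
    }

    }  // namespace bridge
    }  // namespace npu
    }  // namespace lite
    }  // namespace paddle

    USE_LITE_OP(relu);
    USE_NPU_BRIDGE(relu);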
- -#include "lite/operators/transpose_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type TransposeConverter( - const std::shared_ptr transpose_op, - const node_map_type& inputs_map) { - auto scope = transpose_op->scope(); - auto op_info = transpose_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - std::shared_ptr transpose_node = - std::make_shared(unique_op_type); - auto x_var_name = op_info->Input("X").front(); - - // paddlelite doesn't have this input - // w must be set, but it does nothing - auto w_var_name = unique_op_type + "/w"; - auto* w = scope->Var(w_var_name)->GetMutable(); - w->Resize({1}); - auto* w_data = w->mutable_data(); - for (int i = 0; i < w->numel(); i++) { - w_data[i] = 1.f; - } - auto npu_w = std::make_shared(w_var_name); - npu_w->set_attr_value(CvtFromLiteTensor(w)); - OpList::Global().add(npu_w); - - auto axis = op_info->GetAttr>("axis"); - auto npu_axis = ge::AttrValue::LIST_INT(axis.begin(), axis.end()); - - CHECK(inputs_map.count(x_var_name)); - transpose_node->set_input_x(*inputs_map.at(x_var_name)); - transpose_node->set_input_w(*npu_w); - transpose_node->set_attr_order(npu_axis); - - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(transpose_node); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = transpose_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(transpose, paddle::lite::npu::bridge::TransposeConverter); -REGISTER_NPU_BRIDGE(transpose2, paddle::lite::npu::bridge::TransposeConverter); diff --git a/lite/backends/npu/bridge/transpose_op_test.cc b/lite/backends/npu/bridge/transpose_op_test.cc deleted file mode 100644 index 9bbfb11123..0000000000 --- a/lite/backends/npu/bridge/transpose_op_test.cc +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
diff --git a/lite/backends/npu/bridge/transpose_op_test.cc b/lite/backends/npu/bridge/transpose_op_test.cc
deleted file mode 100644
index 9bbfb11123..0000000000
--- a/lite/backends/npu/bridge/transpose_op_test.cc
+++ /dev/null
@@ -1,151 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/operators/transpose_op.h"
-#include <gtest/gtest.h>
-#include "lite/backends/npu/bridge/registry.h"
-#include "lite/backends/npu/bridge/test_helper.h"
-#include "lite/core/op_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace npu {
-namespace bridge {
-
-int data_index(std::vector<int> pos, DDimLite dims) {
-  int d1 = dims[1];
-  int d2 = dims[2];
-  int d3 = dims[3];
-  return pos[3] + pos[2] * d3 + pos[1] * d3 * d2 + pos[0] * d3 * d2 * d1;
-}
-
-std::vector<int> pos_trans(std::vector<int> in_pos, std::vector<int> axis) {
-  std::vector<int> out_pos(in_pos.size());
-  for (int i = 0; i < axis.size(); i++) {
-    out_pos[axis[i]] = in_pos[i];
-  }
-  return out_pos;
-}
-
-void transpose_ref(const std::shared_ptr<operators::TransposeOp> op) {
-  Scope* scope = op->scope();
-  const OpInfo* op_info = op->op_info();
-  auto input =
-      scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
-  auto output =
-      scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
-  auto x_dims = input->dims();
-  auto y_dims = output->dims();
-  auto axis = op_info->GetAttr<std::vector<int>>("axis");
-
-  auto* input_data = input->data<float>();
-  auto* output_data = output->mutable_data<float>();
-
-  int input_n = x_dims[0];
-  int input_c = x_dims[1];
-  int input_h = x_dims[2];
-  int input_w = x_dims[3];
-  int output_n = y_dims[0];
-  int output_c = y_dims[1];
-  int output_h = y_dims[2];
-  int output_w = y_dims[3];
-
-  for (int n = 0; n < input_n; ++n) {
-    for (int c = 0; c < input_c; ++c) {
-      for (int h = 0; h < input_h; ++h) {
-        for (int w = 0; w < input_w; ++w) {
-          std::vector<int> in_pos{n, c, h, w};
-          std::vector<int> out_pos = pos_trans(in_pos, axis);
-          int in_index = data_index(in_pos, x_dims);
-          int out_index = data_index(out_pos, y_dims);
-          output_data[out_index] = input_data[in_index];
-        }
-      }
-    }
-  }
-}
-
-void test_transpose(int bs, int ic, int ih, int iw, std::vector<int> axis) {
-  // prepare input&output variables
-  Scope scope;
-  std::string x_var_name = "x";
-  std::string out_var_name = "out";
-  std::string out_ref_var_name = "out_ref";
-  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
-  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
-  auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
-  x->Resize({bs, ic, ih, iw});
-
-  // initialize input&output data
-  FillTensor<float>(x);
-
-  // initialize op desc
-  cpp::OpDesc opdesc;
-  opdesc.SetType("transpose");
-  opdesc.SetInput("X", {x_var_name});
-  opdesc.SetOutput("Out", {out_var_name});
-  opdesc.SetAttr("axis", axis);
-
-  // create and convert op to NPU model, then run it on NPU
-  auto op = CreateOp<operators::TransposeOp>(opdesc, &scope);
-  LauchOp(op, {x_var_name}, {out_var_name});
-  out_ref->CopyDataFrom(*out);
-
-  // execute reference implementation and save to output tensor
-  transpose_ref(op);
-
-  // compare results
-  auto* out_data = out->mutable_data<float>();
-  auto* out_ref_data = out_ref->mutable_data<float>();
-  for (int i = 0; i < out->dims().production(); i++) {
-    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
-  }
-}
-
-TEST(NPUBridges, transpose) {
-#if 0
-  for (auto bs : {1, 4, 7}) {
-    for (auto ic : {1, 4, 7}) {
-      for (auto ih : {1, 4, 7}) {
-        for (auto iw : {1, 4, 7}) {
-          for (auto axis : {std::vector<int>{0, 1, 2, 3},
-                            std::vector<int>{0, 1, 3, 2},
-                            std::vector<int>{0, 3, 1, 2},
-                            std::vector<int>{1, 2, 3, 0},
-                            std::vector<int>{3, 2, 1, 0},
-                            std::vector<int>{2, 3, 1, 0}}) {
-            test_transpose(bs, ic, ih, iw, axis);
-          }
-        }
-      }
-    }
-  }
-#endif
-  test_transpose(2, 3, 4, 5, std::vector<int>{0, 1, 3, 2});
-  // test_transpose(2, 3, 4, 5, std::vector<int>{0, 1, 2, 3});
-  // test_transpose(2, 2, 2, 2, std::vector<int>{0,1,3,2});
-  // test_transpose(1, 1, 2, 2, std::vector<int>{0,1,3,2});
-  // test_transpose(1, 1, 1, 2, std::vector<int>{0,1,2,3});
-}
-
-}  // namespace bridge
-}  // namespace npu
-}  // namespace lite
-}  // namespace paddle
-
-USE_LITE_OP(transpose);
-USE_NPU_BRIDGE(transpose);
-
-USE_LITE_OP(transpose2);
-USE_NPU_BRIDGE(transpose2);
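Two remarks on the reference implementation above, plus a rank-generic sketch of its index helper. First, pos_trans writes out_pos[axis[i]] = in_pos[i], which is the inverse of the out_dims[i] = in_dims[axis[i]] convention; the one enabled case, axis = {0, 1, 3, 2}, is its own inverse, so the two conventions coincide there, which may be why the broader axis sweep is compiled out under #if 0. Second, data_index hard-codes 4-D strides; the same offset can be computed for any rank:

#include <vector>

// Row-major linear offset of `pos` in a tensor of shape `dims`; for 4-D
// shapes this matches data_index() above. Sketch only, not in the patch.
int LinearIndex(const std::vector<int>& pos, const std::vector<int>& dims) {
  int index = 0;
  for (size_t i = 0; i < dims.size(); ++i) {
    index = index * dims[i] + pos[i];
  }
  return index;
}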
diff --git a/lite/backends/npu/bridge/utils.cc b/lite/backends/npu/bridge/utils.cc
deleted file mode 100644
index 8abd7dbda4..0000000000
--- a/lite/backends/npu/bridge/utils.cc
+++ /dev/null
@@ -1,137 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/npu/bridge/utils.h"
-#include <algorithm>
-#include <mutex>  // NOLINT
-#include <string>
-#include <unordered_map>
-#include "ai_ddk_lib/include/graph/op/all_ops.h"  // for ge::op::Data
-#include "ai_ddk_lib/include/graph/tensor.h"      // for ge::TensorUtils
-#include "lite/core/op_lite.h"
-
-namespace paddle {
-namespace lite {
-namespace npu {
-namespace bridge {
-
-std::string UniqueName(const std::string& prefix) {
-  static std::mutex counter_mtx;
-  static std::unordered_map<std::string, int> counter_map;
-  std::unique_lock<std::mutex> counter_lck(counter_mtx);
-  int counter = 1;
-  auto it = counter_map.find(prefix);
-  if (it == counter_map.end()) {
-    counter_map[prefix] = counter;
-  } else {
-    counter = ++(it->second);
-  }
-  return prefix + "_" + std::to_string(counter);
-}
-
-ge::DataType PrecisionConverter(PrecisionType itype) {
-  ge::DataType otype = ge::DT_FLOAT;
-  switch (itype) {
-    case PRECISION(kFloat):
-      otype = ge::DT_FLOAT;
-      break;
-    case PRECISION(kInt8):
-      otype = ge::DT_INT8;
-      break;
-    case PRECISION(kInt32):
-      otype = ge::DT_INT32;
-      break;
-    default:
-      LOG(FATAL) << "Cannot convert precision type(" << PrecisionToStr(itype)
-                 << ") from Lite to NPU";
-      break;
-  }
-  return otype;
-}
-
-ge::Format DataLayoutConverter(DataLayoutType itype) {
-  ge::Format otype = ge::FORMAT_NCHW;
-  switch (itype) {
-    case DATALAYOUT(kNCHW):
-      otype = ge::FORMAT_NCHW;
-      break;
-    // TODO(hong19860320) support more data layout type
-    default:
-      LOG(FATAL) << "Cannot convert data layout type("
-                 << DataLayoutToStr(itype) << ") from Lite to NPU";
-      break;
-  }
-  return otype;
-}
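Illustration only, and it assumes a fresh process, since the counters above are static: UniqueName appends a per-prefix counter, so repeated conversions of the same op type get distinct NPU node names.

// Hypothetical demo of the counter behavior; not part of the sources.
void UniqueNameDemo() {
  using paddle::lite::npu::bridge::UniqueName;
  CHECK_EQ(UniqueName("transpose"), "transpose_1");  // first use of prefix
  CHECK_EQ(UniqueName("transpose"), "transpose_2");  // counter advances
  CHECK_EQ(UniqueName("conv2d"), "conv2d_1");        // independent prefix
}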
-
-ge::TensorPtr CvtFromLiteTensor(lite::Tensor* in_tensor,
-                                std::vector<int64_t> out_shape,
-                                PrecisionType in_ptype,
-                                DataLayoutType in_ltype) {
-  uint8_t* in_data = nullptr;
-  auto in_size = in_tensor->dims().production();
-  auto in_shape = in_tensor->dims().Vectorize();
-  if (out_shape.empty()) {
-    out_shape = in_shape;
-  }
-  int in_bytes;
-  if (in_ptype == PRECISION(kFloat)) {
-    in_data = reinterpret_cast<uint8_t*>(in_tensor->mutable_data<float>());
-    in_bytes = in_size * sizeof(float);
-  } else if (in_ptype == PRECISION(kInt32)) {
-    in_data = reinterpret_cast<uint8_t*>(in_tensor->mutable_data<int32_t>());
-    in_bytes = in_size * sizeof(int32_t);
-  } else if (in_ptype == PRECISION(kInt8)) {
-    in_data = reinterpret_cast<uint8_t*>(in_tensor->mutable_data<int8_t>());
-    in_bytes = in_size * sizeof(int8_t);
-  } else {
-    LOG(FATAL) << "Unknown precision type " << PrecisionToStr(in_ptype);
-  }
-  ge::DataType out_ptype = PrecisionConverter(in_ptype);
-  ge::Format out_ltype = DataLayoutConverter(in_ltype);
-
-  ge::TensorDesc out_desc(ge::Shape(out_shape), out_ltype, out_ptype);
-  CHECK_EQ(out_ltype, ge::FORMAT_NCHW);
-
-  auto out_size = out_desc.GetShape().GetShapeSize();
-  CHECK_EQ(out_size, in_size);
-
-  ge::TensorPtr out_tensor = std::make_shared<ge::Tensor>();
-  out_tensor->SetTensorDesc(out_desc);
-  out_tensor->SetData(in_data, in_bytes);
-  return out_tensor;
-}
-
-bool HasInputArg(const OpInfo* op_info,
-                 const Scope* scope,
-                 const std::string& argname) {
-  auto iarg_names = op_info->input_argnames();
-  if (std::find(iarg_names.begin(), iarg_names.end(), argname) !=
-      iarg_names.end()) {
-    auto inputs = op_info->Input(argname);
-    if (inputs.empty()) {
-      return false;
-    }
-    auto var_name = inputs.front();
-    auto var = scope->FindVar(var_name);
-    return var != nullptr;
-  } else {
-    return false;
-  }
-}
-
-}  // namespace bridge
-}  // namespace npu
-}  // namespace lite
-}  // namespace paddle
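A short usage sketch for the converter above; the shape and values are illustrative. The defaults declared in utils.h below are kFloat precision, NCHW layout, and an empty out_shape, which falls back to the tensor's own dims.

// Wrap a lite::Tensor as a ge::TensorPtr for the HiAI IR builder.
lite::Tensor t;
t.Resize({1, 3, 2, 2});
float* data = t.mutable_data<float>();
for (int i = 0; i < t.dims().production(); ++i) {
  data[i] = 0.5f * i;
}
ge::TensorPtr ge_tensor = paddle::lite::npu::bridge::CvtFromLiteTensor(&t);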
diff --git a/lite/backends/npu/bridge/utils.h b/lite/backends/npu/bridge/utils.h
deleted file mode 100644
index 169b7ca80c..0000000000
--- a/lite/backends/npu/bridge/utils.h
+++ /dev/null
@@ -1,94 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <typeinfo>
-#include <vector>
-#include "ai_ddk_lib/include/graph/operator_reg.h"
-#include "lite/core/mir/node.h"
-#include "lite/core/op_lite.h"
-#include "lite/core/target_wrapper.h"
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace npu {
-namespace bridge {
-
-std::string UniqueName(const std::string& prefix);
-
-ge::DataType PrecisionConverter(PrecisionType itype);
-
-ge::Format DataLayoutConverter(DataLayoutType itype);
-
-ge::TensorPtr CvtFromLiteTensor(Tensor* in_tensor,
-                                std::vector<int64_t> out_shape = {},
-                                PrecisionType in_ptype = PRECISION(kFloat),
-                                DataLayoutType in_ltype = DATALAYOUT(kNCHW));
-
-template <typename T>
-ge::TensorPtr CreateTensorAndFillData(std::vector<T> data,
-                                      std::vector<int64_t> shape = {},
-                                      ge::Format format = ge::FORMAT_NCHW) {
-  const std::type_info& info = typeid(T);
-  ge::DataType type = ge::DT_FLOAT;
-  if (info == typeid(float)) {
-    type = ge::DT_FLOAT;
-  } else if (info == typeid(int8_t)) {
-    type = ge::DT_INT8;
-  } else if (info == typeid(int32_t)) {
-    type = ge::DT_INT32;
-  } else {
-    LOG(FATAL) << "Unknown value type " << info.name();
-  }
-  if (shape.empty()) {
-    shape = {static_cast<int64_t>(data.size())};
-  } else {
-    int size = 1;
-    for (auto i : shape) {
-      size *= i;
-    }
-    CHECK_EQ(data.size(), size);
-  }
-  ge::TensorDesc desc(ge::Shape(shape), format, type);
-  ge::TensorPtr tensor = std::make_shared<ge::Tensor>();
-  tensor->SetTensorDesc(desc);
-  tensor->SetData(reinterpret_cast<uint8_t*>(data.data()),
-                  data.size() * sizeof(T));
-  return tensor;
-}
-
-template <typename T>
-ge::TensorPtr CreateTensorAndFillData(T value,
-                                      std::vector<int64_t> shape = {1},
-                                      ge::Format format = ge::FORMAT_NCHW) {
-  int64_t size = 1;
-  for (auto i : shape) {
-    size *= i;
-  }
-  std::vector<T> data(size, value);
-  return CreateTensorAndFillData(data, shape, format);
-}
-
-bool HasInputArg(const OpInfo* op_info,
-                 const Scope* scope,
-                 const std::string& argname);
-
-}  // namespace bridge
-}  // namespace npu
-}  // namespace lite
-}  // namespace paddle
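Usage sketch for the two overloads above; the names and shapes are illustrative.

using paddle::lite::npu::bridge::CreateTensorAndFillData;

// Scalar overload: broadcast one value over a shape, e.g. a per-channel
// bias constant of 0.1f for 8 channels.
ge::TensorPtr bias = CreateTensorAndFillData(0.1f, {1, 8, 1, 1});

// Vector overload: an empty shape defaults to the 1-D shape {data.size()}.
ge::TensorPtr table =
    CreateTensorAndFillData(std::vector<float>{1.f, 2.f, 3.f});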
diff --git a/lite/backends/npu/npu_helper.cc b/lite/backends/npu/npu_helper.cc
deleted file mode 100644
index 688c62c7f6..0000000000
--- a/lite/backends/npu/npu_helper.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/npu/npu_helper.h"
-#include <fstream>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-#include "ai_ddk_lib/include/HiAiModelManagerService.h"
-#include "ai_ddk_lib/include/graph/buffer.h"
-#include "ai_ddk_lib/include/graph/model.h"
-#include "ai_ddk_lib/include/hiai_ir_build.h"
-
-namespace paddle {
-namespace lite {
-namespace npu {
-
-bool SaveNPUModel(const void* om_model_data,
-                  const size_t om_model_size,
-                  const std::string& om_file_path) {
-  std::FILE* fp;
-  fp = std::fopen(om_file_path.c_str(), "wb");
-  if (fp == NULL) {
-    LOG(WARNING) << "[NPU] " << om_file_path << " open failed!";
-    return false;
-  }
-
-  size_t write_size = std::fwrite(om_model_data, 1, om_model_size, fp);
-  if (write_size != om_model_size) {
-    std::fclose(fp);
-    LOG(WARNING) << "[NPU] Write NPU model failed: " << om_file_path;
-    return false;
-  }
-  std::fclose(fp);
-  return true;
-}
-
-bool BuildNPUClient(const void* om_model_data,
-                    const size_t om_model_size,
-                    const std::string& name) {
-  std::unique_ptr<hiai::AiModelMngerClient> client(
-      new hiai::AiModelMngerClient);
-  int ret = client->Init(nullptr);
-  if (ret != hiai::AI_SUCCESS) {
-    LOG(WARNING) << "[NPU] Failed building NPU client " << name
-                 << ", ret: " << ret;
-    throw std::runtime_error("");
-    return false;
-  }
-
-  auto desc = std::make_shared<hiai::AiModelDescription>(
-      name,
-      DeviceInfo::Global().freq_level(),
-      DeviceInfo::Global().framework_type(),
-      DeviceInfo::Global().model_type(),
-      DeviceInfo::Global().device_type());
-  desc->SetModelBuffer(om_model_data, om_model_size);
-
-  std::vector<std::shared_ptr<hiai::AiModelDescription>> model_desc;
-  model_desc.push_back(desc);
-  if (client->Load(model_desc) != hiai::AI_SUCCESS) {
-    LOG(WARNING) << "[NPU] Model Load Failed: " << desc->GetName();
-    throw std::runtime_error("");
-    return false;
-  }
-
-  DeviceInfo::Global().Insert(name, std::move(client));
-  return true;
-}
-
-// If built from inputs and outputs, the npu offline model will also be saved.
-bool BuildNPUClient(std::vector<ge::Operator>& inputs,   // NOLINT
                    std::vector<ge::Operator>& outputs,  // NOLINT
-                    const std::string& name) {
-  LOG(INFO) << "[NPU] Building Client";
-  ge::Graph npu_subgraph("npu_subgraph" + name);
-  npu_subgraph.SetInputs(inputs).SetOutputs(outputs);
-
-  ge::Model npu_model("model", "npu_model" + name);
-  npu_model.SetGraph(npu_subgraph);
-
-  // compile IR graph and output om model to memory
-  domi::HiaiIrBuild ir_build;
-  domi::ModelBufferData om_model_buffer;
-  if (!ir_build.CreateModelBuff(npu_model, om_model_buffer)) {
-    LOG(WARNING) << "[NPU] Failed CreateModelBuff: " << npu_model.GetName();
-    return false;
-  }
-  if (!ir_build.BuildIRModel(npu_model, om_model_buffer)) {
-    LOG(WARNING) << "[NPU] Failed BuildIRModel: " << npu_model.GetName();
-    return false;
-  }
-
-  if (BuildNPUClient(om_model_buffer.data, om_model_buffer.length, name)) {
-    // save npu offline model
-    if (!SaveNPUModel(om_model_buffer.data, om_model_buffer.length, name)) {
-      LOG(WARNING) << "[NPU] Save model " << name << " failed.";
-    }
-    ir_build.ReleaseModelBuff(om_model_buffer);
-    return true;
-  }
-  return false;
-}
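A sketch of the intended caching round trip, built only from functions defined in this file; the "demo" client name, which doubles as the file path (as in the function above, where the client name is passed to SaveNPUModel), is illustrative.

#include <string>
#include "lite/backends/npu/npu_helper.h"

// Persist an already-compiled om blob, then rebuild a client from the file
// and look it up in the registry.
void CacheAndReload(const void* om_data, size_t om_size) {
  const std::string name = "demo";
  if (paddle::lite::npu::SaveNPUModel(om_data, om_size, name)) {
    // BuildNPUClient(path, name) loads the file and registers the client.
    paddle::lite::npu::BuildNPUClient(name, name);
    CHECK(paddle::lite::npu::DeviceInfo::Global().client(name) != nullptr);
  }
}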
-
-// If built from a file path, the npu offline model will not be saved again.
-bool BuildNPUClient(const std::string& om_model_file_path,
-                    const std::string& name) {
-  // load om model from file
-  std::ifstream file(om_model_file_path, std::ios::binary);
-  CHECK(file.is_open()) << "[NPU] Unable to open om model file: "
-                        << om_model_file_path;
-  const auto fbegin = file.tellg();
-  file.seekg(0, std::ios::end);
-  const auto fend = file.tellg();
-  size_t om_model_size = fend - fbegin;
-  VLOG(5) << "[NPU] om model file size: " << om_model_size;
-  file.seekg(0, std::ios::beg);
-  std::vector<char> om_model_data(om_model_size);
-  file.read(om_model_data.data(), om_model_size);
-
-  return BuildNPUClient(
-      reinterpret_cast<void*>(om_model_data.data()), om_model_size, name);
-}
-
-}  // namespace npu
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/npu/npu_helper.h b/lite/backends/npu/npu_helper.h
deleted file mode 100644
index 95c290315b..0000000000
--- a/lite/backends/npu/npu_helper.h
+++ /dev/null
@@ -1,110 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include "ai_ddk_lib/include/HiAiModelManagerService.h"
-#include "ai_ddk_lib/include/graph/graph.h"
-#include "ai_ddk_lib/include/graph/operator_reg.h"
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-namespace npu {
-
-class DeviceInfo {
- public:
-  static DeviceInfo& Global() {
-    static DeviceInfo x;
-    return x;
-  }
-  DeviceInfo() {}
-  void Insert(const std::string& name,
-              std::unique_ptr<hiai::AiModelMngerClient> client) {
-    if (clients_.find(name) != clients_.end()) {
-      LOG(WARNING) << "[NPU] Already inserted " << name;
-      return;
-    }
-    clients_.emplace(std::make_pair(name, std::move(client)));
-  }
-
-  void Clear() { clients_.clear(); }
-
-  hiai::AiModelMngerClient* client(const std::string& model_name) const {
-    if (clients_.find(model_name) != clients_.end()) {
-      return clients_.at(model_name).get();
-    } else {
-      return nullptr;
-    }
-  }
-  std::vector<std::string> AllClientNames() {
-    std::vector<std::string> names;
-    for (auto& i : clients_) {
-      names.push_back(i.first);
-    }
-    return names;
-  }
-
-  int freq_level() { return freq_level_; }
-  int framework_type() { return framework_type_; }
-  int model_type() { return model_type_; }
-  int device_type() { return device_type_; }
-
- private:
-  int freq_level_{3};
-  int framework_type_{0};
-  int model_type_{0};
-  int device_type_{0};
-  // TODO(TJ): find better place
-  std::unordered_map<std::string, std::unique_ptr<hiai::AiModelMngerClient>>
-      clients_;
-};
-
-class OpList {
- public:
-  static OpList& Global() {
-    static thread_local OpList x;
-    return x;
-  }
-  void clear() { lists_.clear(); }
-  void add(std::shared_ptr<ge::Operator> p) { lists_.push_back(p); }
-
- private:
-  std::vector<std::shared_ptr<ge::Operator>> lists_;
-};
-
-bool SaveNPUModel(const void* om_model_data,
-                  const size_t om_model_size,
-                  const std::string& om_file_path);
-
-// If built from inputs and outputs, the npu offline model will also be saved.
-bool BuildNPUClient(std::vector<ge::Operator>& inputs,   // NOLINT
-                    std::vector<ge::Operator>& outputs,  // NOLINT
-                    const std::string& name);
-
-// If built from a file path, the npu offline model will not be saved again.
-bool BuildNPUClient(const std::string& om_model_file_path,
-                    const std::string& name);
-
-bool BuildNPUClient(const void* om_model_data,
-                    const size_t om_model_size,
-                    const std::string& name);
-
-}  // namespace npu
-}  // namespace lite
-}  // namespace paddle
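Two properties of the registry above, shown as an assumed-usage sketch: DeviceInfo is a process-wide singleton keyed by client name, while OpList is thread_local, so each conversion thread keeps its own list of live ge::Operator references and can drop them once a subgraph has been compiled.

// Enumerate the cached clients, then release this thread's operator refs.
auto names = paddle::lite::npu::DeviceInfo::Global().AllClientNames();
for (const auto& n : names) {
  LOG(INFO) << "[NPU] cached client: " << n;
}
paddle::lite::npu::OpList::Global().clear();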
diff --git a/lite/backends/opencl/CMakeLists.txt b/lite/backends/opencl/CMakeLists.txt
deleted file mode 100644
index 1acb983218..0000000000
--- a/lite/backends/opencl/CMakeLists.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-if (NOT LITE_WITH_OPENCL)
-  return()
-endif()
-
-lite_cc_library(cl_wrapper SRCS cl_wrapper.cc)
-lite_cc_library(cl_utility SRCS cl_utility.cc DEPS cl_wrapper)
-lite_cc_library(cl_runtime SRCS cl_runtime.cc DEPS cl_utility)
-lite_cc_library(cl_context SRCS cl_context.cc DEPS cl_runtime)
-lite_cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS tensor)
-lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runtime)
-lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image)
-lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime)
-lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper
-    ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/backends/opencl)
-lite_cc_test(test_cl_im2col SRCS cl_im2col_test.cc DEPS tensor cl_context cl_wrapper cl_target_wrapper
-    ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/backends/opencl)
-
-add_dependencies(cl_wrapper opencl_clhpp)
diff --git a/lite/backends/opencl/cl_caller.cc b/lite/backends/opencl/cl_caller.cc
deleted file mode 100644
index ae755b756d..0000000000
--- a/lite/backends/opencl/cl_caller.cc
+++ /dev/null
@@ -1,169 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "lite/backends/opencl/cl_caller.h"
-#include <string>
-#include "lite/backends/opencl/cl_context.h"
-#include "lite/backends/opencl/cl_image.h"
-#include "lite/backends/opencl/cl_runtime.h"
-#include "lite/backends/opencl/cl_utility.h"
-#include "lite/core/tensor.h"
-#include "lite/utils/string.h"
-
-namespace paddle {
-namespace lite {
-static void CopyImageData(CLContext* context,
-                          const CLImage& cl_image,
-                          float* out) {
-  int width = cl_image.image_dims()[0];
-  int height = cl_image.image_dims()[1];
-
-  float* image_data = new float[height * width * 4];
-  cl::Image* image = cl_image.cl_image();
-  const std::array<size_t, 3> origin{0, 0, 0};
-  const std::array<size_t, 3> region{
-      static_cast<size_t>(width), static_cast<size_t>(height), 1};
-  cl_int err = context->GetCommandQueue().enqueueReadImage(
-      *image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
-  CL_CHECK_FATAL(err);
-
-  auto* converter = cl_image.image_converter();
-  converter->ImageToNCHW(
-      image_data, out, cl_image.image_dims(), cl_image.tensor_dims());
-
-  delete[] image_data;
-}
-
-bool InitOpenCLRuntime(std::string cl_path) {
-  auto* runtime = CLRuntime::Global();
-  runtime->set_cl_path(cl_path);
-  return runtime->IsInitSuccess();
-}
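Callers are expected to initialize the runtime with the directory holding the .cl kernel sources before using any helper in this file; /data/local/tmp/opencl is the default path used by the tests later in this patch.

// Must run once per process before AddKernel()/elementwise_add()/pool().
bool ok = paddle::lite::InitOpenCLRuntime("/data/local/tmp/opencl");
CHECK(ok) << "Failed to initialize the OpenCL runtime.";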
-
-void elementwise_add(CLContext* context,
-                     const float* in,
-                     const DDim& in_dim,
-                     const float* bias,
-                     const DDim& bias_dim,
-                     float* out,
-                     const DDim& out_dim) {
-  if (!(bias_dim.size() == 1 || bias_dim.size() == 4)) {
-    LOG(FATAL) << "Error: unsupported bias dims";
-    return;
-  }
-  auto kernel = bias_dim.size() == 1 ? context->GetKernel("channel_add")
-                                     : context->GetKernel("elementwise_add");
-  CLImage in_image;
-  in_image.set_tensor_data(in, in_dim);
-  in_image.InitNormalCLImage(context->GetContext());
-  VLOG(3) << " --- Input image: " << in_image << " --- ";
-  CLImage bias_image;
-  bias_image.set_tensor_data(bias, bias_dim);
-  bias_image.InitCLImage(context->GetContext());
-  VLOG(3) << " --- Bias image: " << bias_image << " --- ";
-  CLImage out_image;
-  out_image.InitEmptyImage(context->GetContext(), out_dim);
-  cl_int status;
-  status = kernel.setArg(0, *in_image.cl_image());
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(1, *bias_image.cl_image());
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(2, *out_image.cl_image());
-  CL_CHECK_FATAL(status);
-
-  if (bias_dim.size() == 1) {
-    int tensor_w = in_dim[3];
-    status = kernel.setArg(3, tensor_w);
-    CL_CHECK_FATAL(status);
-  }
-  size_t width = in_image.ImageWidth();
-  size_t height = in_image.ImageHeight();
-  auto global_work_size = cl::NDRange{width, height};
-  status = context->GetCommandQueue().enqueueNDRangeKernel(
-      kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr,
-      nullptr);
-  CL_CHECK_FATAL(status);
-
-  status = context->GetCommandQueue().finish();
-  CL_CHECK_FATAL(status);
-  VLOG(3) << " --- Out image: " << out_image << " --- ";
-  CopyImageData(context, out_image, out);
-}
-
-void pool(CLContext* context,
-          const std::string pooling_type,
-          const int pad_h,
-          const int pad_w,
-          const int stride_h,
-          const int stride_w,
-          const int ksize_h,
-          const int ksize_w,
-          const float* in,
-          const DDim& in_dim,
-          float* out,
-          const DDim& out_dim) {
-  auto kernel =
-      context->GetKernel(string_format("pool_%s", pooling_type.c_str()));
-  CLImage in_image;
-  in_image.set_tensor_data(in, in_dim);
-  in_image.InitNormalCLImage(context->GetContext());
-  VLOG(3) << " --- Input image: " << in_image << " --- ";
-  CLImage out_image;
-  out_image.InitEmptyImage(context->GetContext(), out_dim);
-  auto global_work_size = context->DefaultWorkSize(out_image);
-  auto* in_converter =
-      dynamic_cast<CLImageConverterNormal*>(in_image.image_converter());
-  auto* out_converter =
-      dynamic_cast<CLImageConverterNormal*>(out_image.image_converter());
-  const int in_height = in_converter->HeightOfOneBlock();
-  const int in_width = in_converter->WidthOfOneBlock();
-  const int out_height = out_converter->HeightOfOneBlock();
-  const int out_width = out_converter->WidthOfOneBlock();
-  cl_int status;
-  status = kernel.setArg(0, in_height);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(1, in_width);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(2, out_height);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(3, out_width);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(4, pad_h);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(5, pad_w);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(6, stride_h);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(7, stride_w);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(8, ksize_h);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(9, ksize_w);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(10, *in_image.cl_image());
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(11, *out_image.cl_image());
-  CL_CHECK_FATAL(status);
-
-  status = context->GetCommandQueue().enqueueNDRangeKernel(
-      kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr,
-      nullptr);
-  CL_CHECK_FATAL(status);
-
-  status = context->GetCommandQueue().finish();
-  CL_CHECK_FATAL(status);
-  VLOG(3) << " --- Out image: " << out_image << " --- ";
-  CopyImageData(context, out_image, out);
-}
-
-}  // namespace lite
-}  // namespace paddle
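Host-side usage sketch with illustrative buffer sizes: a 1-D bias selects the channel_add kernel, a full 4-D bias selects elementwise_add, and the kernels must first be registered on the context, exactly as the tests later in this patch do.

std::vector<float> in(1 * 16 * 8 * 8, 1.f);
std::vector<float> bias(16, 2.f);
std::vector<float> out(in.size());

paddle::lite::CLContext context;
context.AddKernel("channel_add", "image/channel_add_kernel.cl");
context.AddKernel("elementwise_add", "image/elementwise_add_kernel.cl");

paddle::lite::elementwise_add(
    &context,
    in.data(), paddle::lite::DDim(std::vector<int64_t>{1, 16, 8, 8}),
    bias.data(), paddle::lite::DDim(std::vector<int64_t>{16}),
    out.data(), paddle::lite::DDim(std::vector<int64_t>{1, 16, 8, 8}));
// Every element of `out` should now be 3.f.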
diff --git a/lite/backends/opencl/cl_caller.h b/lite/backends/opencl/cl_caller.h
deleted file mode 100644
index ed5c9153d3..0000000000
--- a/lite/backends/opencl/cl_caller.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "lite/backends/opencl/cl_context.h"
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-
-bool InitOpenCLRuntime(std::string cl_path);
-
-/// An elementwise_add method that embeds the OpenCL logic; it is used as a
-/// black box so that the framework can remain simple.
-/// NOTE Currently, these methods are quite expensive; we will optimize them
-/// later.
-void elementwise_add(CLContext* context,
-                     const float* in,
-                     const DDim& in_dim,
-                     const float* bias,
-                     const DDim& bias_dim,
-                     float* out,
-                     const DDim& out_dim);
-
-void pool(CLContext* context,
-          const std::string pooling_type,
-          const int pad_h,
-          const int pad_w,
-          const int stride_h,
-          const int stride_w,
-          const int ksize_h,
-          const int ksize_w,
-          const float* in,
-          const DDim& in_dim,
-          float* out,
-          const DDim& out_dim);
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc
deleted file mode 100644
index 0fcb99486e..0000000000
--- a/lite/backends/opencl/cl_context.cc
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include "lite/backends/opencl/cl_context.h" -#include -#include -#include -#include "lite/backends/opencl/cl_runtime.h" -#include "lite/backends/opencl/cl_utility.h" -#include "lite/utils/cp_logging.h" -#include "lite/utils/replace_stl/stream.h" - -namespace paddle { -namespace lite { - -cl::CommandQueue &CLContext::GetCommandQueue() { - return CLRuntime::Global()->command_queue(); -} - -cl::Context &CLContext::GetContext() { return CLRuntime::Global()->context(); } - -cl::Program &CLContext::GetProgram(const std::string &file_name, - const std::string &options) { - STL::stringstream program_key_ss; - program_key_ss << file_name << options; - std::string program_key = program_key_ss.str(); - auto it = programs_.find(program_key); - if (it != programs_.end()) { - VLOG(3) << " --- program -> " << program_key << " has been built --- "; - return *(it->second); - } - - auto program = CLRuntime::Global()->CreateProgram( - GetContext(), CLRuntime::Global()->cl_path() + "/cl_kernel/" + file_name); - - VLOG(3) << " --- begin build program -> " << program_key << " --- "; - CLRuntime::Global()->BuildProgram(program.get(), options); - VLOG(3) << " --- end build program -> " << program_key << " --- "; - - programs_[program_key] = std::move(program); - - return *(programs_[program_key]); -} - -void CLContext::AddKernel(const std::string &kernel_name, - const std::string &file_name, - const std::string &options) { - cl_int status{CL_SUCCESS}; - VLOG(3) << " --- to get program " << file_name << " --- "; - auto program = GetProgram(file_name, options); - VLOG(3) << " --- end get program --- "; - VLOG(3) << " --- to create kernel: " << kernel_name << " --- "; - std::unique_ptr kernel( - new cl::Kernel(program, kernel_name.c_str(), &status)); - CL_CHECK_FATAL(status); - VLOG(3) << " --- end create kernel --- "; - kernels_.emplace_back(std::move(kernel)); - STL::stringstream kernel_key; - kernel_key << kernel_name << options; - kernel_offset_[kernel_key.str()] = kernels_.size() - 1; -} - -cl::Kernel &CLContext::GetKernel(const int index) { - VLOG(3) << " --- kernel count: " << kernels_.size() << " --- "; - CHECK(static_cast(index) < kernels_.size()) - << "The index must be less than the size of kernels."; - CHECK(kernels_[index] != nullptr) - << "The target kernel pointer cannot be null."; - return *(kernels_[index]); -} - -cl::Kernel &CLContext::GetKernel(const std::string &name) { - auto it = kernel_offset_.find(name); - CHECK(it != kernel_offset_.end()) << "Cannot find the kernel function: " - << name; - return GetKernel(it->second); -} - -cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) { - // n c h w - auto image_dim = image.tensor_dims(); - if (image_dim.size() == 4) { - auto n = image_dim[0]; - auto h = image_dim[2]; - auto w = image_dim[3]; - auto image_width = image.ImageWidth(); - auto work_size_0 = image_width / w; - auto work_size_1 = w; - auto work_size_2 = n * h; - return cl::NDRange{static_cast(work_size_0), - static_cast(work_size_1), - static_cast(work_size_2)}; - } else if (image_dim.size() == 2) { - return cl::NDRange{static_cast(1), - static_cast(image.ImageWidth()), - static_cast(image.ImageHeight())}; - } else if (image_dim.size() == 1) { - return cl::NDRange{static_cast(1), - static_cast(image.ImageWidth()), - static_cast(1)}; - } else if (image_dim.size() == 3) { - auto c = image_dim[0]; - auto h = image_dim[1]; - auto w = image_dim[2]; - return cl::NDRange{static_cast((c + 3) / 4), - static_cast(w), - static_cast(h)}; - } else { - LOG(FATAL) << "Not support this 
dimension, need to be implemented!"; - return cl::NDRange{}; - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h deleted file mode 100644 index a28f82f40e..0000000000 --- a/lite/backends/opencl/cl_context.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include "lite/backends/opencl/cl_image.h" -#include "lite/backends/opencl/cl_include.h" - -namespace paddle { -namespace lite { - -class CLContext { - public: - cl::CommandQueue &GetCommandQueue(); - - cl::Context &GetContext(); - - cl::Program &GetProgram(const std::string &file_name, - const std::string &options); - - void AddKernel(const std::string &kernel_name, - const std::string &file_name, - const std::string &options = ""); - - cl::Kernel &GetKernel(const int index); - - cl::Kernel &GetKernel(const std::string &name); - - cl::NDRange DefaultWorkSize(const CLImage &image); - - private: - std::unordered_map> programs_; - std::vector> kernels_; - std::map kernel_offset_; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_functions_test.cc b/lite/backends/opencl/cl_functions_test.cc deleted file mode 100644 index b041952b34..0000000000 --- a/lite/backends/opencl/cl_functions_test.cc +++ /dev/null @@ -1,451 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#include -#include -#include -#include "lite/backends/opencl/cl_caller.h" -#include "lite/backends/opencl/cl_context.h" -#include "lite/backends/opencl/cl_image.h" -#include "lite/backends/opencl/cl_runtime.h" -#include "lite/backends/opencl/target_wrapper.h" -#include "lite/core/tensor.h" -#include "lite/utils/cp_logging.h" - -DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path."); - -namespace paddle { -namespace lite { - -TEST(cl_test, runtime_test) { - auto *runtime = CLRuntime::Global(); - CHECK(runtime->IsInitSuccess()); - runtime->set_cl_path(FLAGS_cl_path); - runtime->platform(); - runtime->device(); - runtime->command_queue(); - auto &context = runtime->context(); - auto program = runtime->CreateProgram( - context, - runtime->cl_path() + "/cl_kernel/" + "image/elementwise_add_kernel.cl"); - auto event = runtime->CreateEvent(context); - CHECK(runtime->BuildProgram(program.get())); -} - -TEST(cl_test, context_test) { - auto *runtime = CLRuntime::Global(); - CHECK(runtime->IsInitSuccess()); - runtime->set_cl_path(FLAGS_cl_path); - CLContext context; - context.AddKernel("pool_max", "image/pool_kernel.cl", ""); - context.AddKernel("elementwise_add", "image/elementwise_add_kernel.cl", ""); - context.AddKernel("elementwise_add", "image/elementwise_add_kernel.cl", ""); -} - -TEST(cl_test, kernel_test) { - auto *runtime = CLRuntime::Global(); - CHECK(runtime->IsInitSuccess()); - runtime->set_cl_path(FLAGS_cl_path); - std::unique_ptr context(new CLContext); - context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl"); - context->AddKernel("pool_max", "image/pool_kernel.cl"); - context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl"); - auto kernel = context->GetKernel(2); - - std::unique_ptr in_data(new float[4 * 3 * 256 * 512]); - for (int i = 0; i < 4 * 3 * 256 * 512; i++) { - in_data[i] = 1.f; - } - const DDim in_dim = DDim(std::vector{4, 3, 256, 512}); - CLImage in_image; - in_image.set_tensor_data(in_data.get(), in_dim); - in_image.InitNormalCLImage(context->GetContext()); - LOG(INFO) << in_image; - - std::unique_ptr bias_data(new float[4 * 3 * 256 * 512]); - for (int i = 0; i < 4 * 3 * 256 * 512; i++) { - bias_data[i] = 2.f; - } - const DDim bias_dim = DDim(std::vector{4, 3, 256, 512}); - CLImage bias_image; - bias_image.set_tensor_data(bias_data.get(), bias_dim); - bias_image.InitNormalCLImage(context->GetContext()); - LOG(INFO) << bias_image; - - CLImage out_image; - const DDim out_dim = DDim(std::vector{4, 3, 256, 512}); - out_image.InitEmptyImage(context->GetContext(), out_dim); - LOG(INFO) << out_image; - - cl_int status; - status = kernel.setArg(0, *in_image.cl_image()); - CL_CHECK_FATAL(status); - status = kernel.setArg(1, *bias_image.cl_image()); - CL_CHECK_FATAL(status); - status = kernel.setArg(2, *out_image.cl_image()); - CL_CHECK_FATAL(status); - - size_t width = in_image.ImageWidth(); - size_t height = in_image.ImageHeight(); - auto global_work_size = cl::NDRange{width, height}; - cl::Event event; - status = context->GetCommandQueue().enqueueNDRangeKernel( - kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event); - CL_CHECK_FATAL(status); - status = context->GetCommandQueue().finish(); - CL_CHECK_FATAL(status); - double start_nanos = event.getProfilingInfo(); - double stop_nanos = event.getProfilingInfo(); - double elapsed_micros = (stop_nanos - start_nanos) / 1000.0; - LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us."; - LOG(INFO) << 
out_image; -} - -TEST(cl_test, channel_add_test) { - std::default_random_engine engine; - std::uniform_real_distribution dist(-5, 5); - - const DDim in_dim = DDim(std::vector{4, 16, 256, 512}); - std::unique_ptr in_data(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - in_data[i] = dist(engine); - } - - const DDim bias_dim = DDim(std::vector{16}); - std::unique_ptr bias_data(new float[16]); - for (int i = 0; i < 16; i++) { - bias_data[i] = dist(engine); - } - - std::unique_ptr out_ref(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 16; j++) { - float b = bias_data[j]; - for (int k = 0; k < 256 * 512; k++) { - int index = (i * 16 + j) * 256 * 512 + k; - out_ref[index] = in_data[index] + b; - } - } - } - - const DDim out_dim = DDim(std::vector{4, 16, 256, 512}); - std::unique_ptr out(new float[4 * 16 * 256 * 512]); - - bool status = InitOpenCLRuntime(FLAGS_cl_path); - CHECK(status) << "Fail to initialize OpenCL runtime."; - std::unique_ptr context(new CLContext); - context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl"); - context->AddKernel("channel_add", "image/channel_add_kernel.cl"); - elementwise_add(context.get(), - in_data.get(), - in_dim, - bias_data.get(), - bias_dim, - out.get(), - out_dim); - - int stride = 4 * 16 * 256 * 512 / 20; - for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) { - std::cout << out[i] << " "; - } - std::cout << std::endl; - - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - EXPECT_NEAR(out[i], out_ref[i], 1e-6); - } -} - -TEST(cl_test, elementwise_add_test) { - std::default_random_engine engine; - std::uniform_real_distribution dist(-5, 5); - - const DDim in_dim = DDim(std::vector{4, 16, 256, 512}); - std::unique_ptr in_data(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - in_data[i] = dist(engine); - } - - const DDim bias_dim = DDim(std::vector{4, 16, 256, 512}); - std::unique_ptr bias_data(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - bias_data[i] = dist(engine); - } - - std::unique_ptr out_ref(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - out_ref[i] = in_data[i] + bias_data[i]; - } - - const DDim out_dim = DDim(std::vector{4, 16, 256, 512}); - std::unique_ptr out(new float[4 * 16 * 256 * 512]); - - bool status = InitOpenCLRuntime(FLAGS_cl_path); - CHECK(status) << "Fail to initialize OpenCL runtime."; - std::unique_ptr context(new CLContext); - context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl"); - context->AddKernel("channel_add", "image/channel_add_kernel.cl"); - elementwise_add(context.get(), - in_data.get(), - in_dim, - bias_data.get(), - bias_dim, - out.get(), - out_dim); - - int stride = 4 * 16 * 256 * 512 / 20; - for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) { - std::cout << out[i] << " "; - } - std::cout << std::endl; - - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - EXPECT_NEAR(out[i], out_ref[i], 1e-6); - } -} - -void pool_avg(const int padding_height, - const int padding_width, - const int stride_height, - const int stride_width, - const int ksize_height, - const int ksize_width, - const float *input_data, - const DDim &in_dim, - float *output_data, - const DDim &out_dim) { - const int batch_size = in_dim[0]; - const int input_height = in_dim[2]; - const int input_width = in_dim[3]; - const int output_channels = out_dim[1]; - const int output_height = out_dim[2]; - const int output_width = out_dim[3]; - - const size_t 
input_spatial_size = input_height * input_width; - const size_t output_spatial_size = output_height * output_width; - - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - int channel = i * output_channels + c; - const float *input_ptr = input_data + channel * input_spatial_size; - float *output_ptr = output_data + channel * output_spatial_size; - - for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - - float val = 0.f; - int count = 0; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - val += input_ptr[h * input_width + w]; - ++count; - } - } - output_ptr[ph * output_width + pw] = - (count > 0) ? val * (1.f / count) : 0.f; - } - } - } - } -} - -TEST(cl_test, pool_test) { - std::default_random_engine engine; - std::uniform_real_distribution dist(-5, 5); - - const DDim in_dim = DDim(std::vector{4, 1024, 7, 7}); - std::unique_ptr in_data(new float[4 * 1024 * 7 * 7]); - for (int i = 0; i < 4 * 1024 * 7 * 7; i++) { - in_data[i] = dist(engine); - } - - const DDim out_dim = DDim(std::vector{4, 1024, 1, 1}); - std::unique_ptr out(new float[4 * 1024 * 1 * 1]); - std::unique_ptr out_ref(new float[4 * 1024 * 1 * 1]); - - bool status = InitOpenCLRuntime(FLAGS_cl_path); - CHECK(status) << "Fail to initialize OpenCL runtime."; - std::unique_ptr context(new CLContext); - context->AddKernel("pool_max", "image/pool_kernel.cl"); - context->AddKernel("pool_avg", "image/pool_kernel.cl"); - pool(context.get(), - "avg", - 0, - 0, - 1, - 1, - 7, - 7, - in_data.get(), - in_dim, - out.get(), - out_dim); - pool_avg(0, 0, 1, 1, 7, 7, in_data.get(), in_dim, out_ref.get(), out_dim); - - for (int i = 0; i < 4 * 1024 * 1 * 1; i++) { - EXPECT_NEAR(out[i], out_ref[i], 1e-6); - } -} - -TEST(cl_test, target_wrapper_buffer_test) { - bool inited = InitOpenCLRuntime(FLAGS_cl_path); - CHECK(inited) << "Fail to initialize OpenCL runtime."; - std::unique_ptr context(new CLContext); - std::string kernel_name = "elementwise_add"; - std::string build_options = "-DCL_DTYPE=float"; - context->AddKernel( - kernel_name, "buffer/elementwise_add_kernel.cl", build_options); - std::vector h_a; - std::vector h_b; - std::vector h_out; - std::vector h_ref; - for (int i = 0; i < 10; i++) { - h_a.push_back(3.14f * i); - h_b.push_back(6.28f * i); - h_out.push_back(0); - h_ref.push_back((3.14f + 6.28f) * i); - } - auto *d_a = static_cast( - TargetWrapperCL::Malloc(sizeof(float) * h_a.size())); - auto *d_b = static_cast( - TargetWrapperCL::Malloc(sizeof(float) * h_b.size())); - auto *d_out = - static_cast(TargetWrapperCL::Malloc(sizeof(float) * 10)); - auto *d_copy = - static_cast(TargetWrapperCL::Malloc(sizeof(float) * 10)); - TargetWrapperCL::MemcpySync( - d_a, h_a.data(), sizeof(float) * h_a.size(), IoDirection::HtoD); - TargetWrapperCL::MemcpySync( - d_b, h_b.data(), sizeof(float) * h_b.size(), IoDirection::HtoD); - // x + y: x[n=1, c=10, h=1, w=1], y[c=10] - auto kernel = context->GetKernel(kernel_name + build_options); - cl_int status = kernel.setArg(0, *d_a); - CL_CHECK_FATAL(status); - status = kernel.setArg(1, *d_b); - CL_CHECK_FATAL(status); - status = kernel.setArg(2, *d_out); - CL_CHECK_FATAL(status); - status = kernel.setArg(3, 1); - 
CL_CHECK_FATAL(status); - status = kernel.setArg(4, 10); - CL_CHECK_FATAL(status); - status = kernel.setArg(5, 1); - CL_CHECK_FATAL(status); - auto global_work_size = cl::NDRange{10, 1}; - status = context->GetCommandQueue().enqueueNDRangeKernel( - kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr); - CL_CHECK_FATAL(status); - status = context->GetCommandQueue().finish(); - CL_CHECK_FATAL(status); - TargetWrapperCL::MemcpySync( - h_out.data(), d_out, sizeof(float) * 10, IoDirection::DtoH); - - for (int i = 0; i < 10; i++) { - std::cout << h_out[i] << " "; - } - std::cout << std::endl; - - for (int i = 0; i < 10; i++) { - EXPECT_NEAR(h_out[i], h_ref[i], 1e-5); - } - - TargetWrapperCL::MemcpySync( - d_copy, d_out, sizeof(float) * 10, IoDirection::DtoD); - std::fill(h_out.begin(), h_out.end(), 0); - for (int i = 0; i < 10; i++) { - EXPECT_NEAR(h_out[i], 0, 1e-5); - } - TargetWrapperCL::MemcpySync( - h_out.data(), d_copy, sizeof(float) * 10, IoDirection::DtoH); - for (int i = 0; i < 10; i++) { - EXPECT_NEAR(h_out[i], h_ref[i], 1e-5); - } - - auto *mapped_ptr = - static_cast(TargetWrapperCL::Map(d_copy, 0, sizeof(float) * 10)); - for (int i = 0; i < 10; i++) { - EXPECT_NEAR(mapped_ptr[i], h_ref[i], 1e-5); - } - TargetWrapperCL::Unmap(d_copy, mapped_ptr); - - TargetWrapperCL::Free(d_copy); - TargetWrapperCL::Free(d_out); - TargetWrapperCL::Free(d_b); - TargetWrapperCL::Free(d_a); -} - -TEST(cl_test, target_wrapper_image_test) { - const std::array image_shape{28, 32}; - auto *d_image = static_cast( - TargetWrapperCL::MallocImage(image_shape, PRECISION(kFloat))); - std::array image_pitch; - // Map/Unmap test - auto *h_image = static_cast( - TargetWrapperCL::MapImage(d_image, image_shape, &image_pitch)); - // row_pitch = 448 = 28 * 4 (RGBA: 4 floats) * 4 (float in bytes) - // slice_pitch = 0 - size_t row_pitch = image_pitch[0]; - size_t slice_pitch = image_pitch[1]; - CHECK_EQ(row_pitch, 448); - CHECK_EQ(slice_pitch, 0); - LOG(INFO) << "row_pitch = " << row_pitch << ", slice_pitch " << slice_pitch; - - for (int i = 0; i < 10; i++) { - h_image[i] = 3.14f * i; - } - TargetWrapperCL::Unmap(d_image, h_image); - - auto *h_ptr = static_cast( - TargetWrapperCL::MapImage(d_image, image_shape, &image_pitch)); - for (int i = 0; i < 10; i++) { - EXPECT_NEAR(h_ptr[i], 3.14f * i, 1e-6); - } - TargetWrapperCL::Unmap(d_image, h_ptr); - - // Imagecpy test - std::vector h_image_cpy(28 * 4 * 32); - for (int i = 0; i < 28 * 4 * 32; i++) { - h_image_cpy[i] = 3.14f; - } - TargetWrapperCL::ImgcpySync( - d_image, h_image_cpy.data(), image_shape, image_pitch, IoDirection::HtoD); - auto *d_image_cpy = static_cast( - TargetWrapperCL::MallocImage(image_shape, PRECISION(kFloat))); - TargetWrapperCL::ImgcpySync( - d_image_cpy, d_image, image_shape, image_pitch, IoDirection::DtoD); - std::fill(h_image_cpy.begin(), h_image_cpy.end(), 0); - TargetWrapperCL::ImgcpySync(h_image_cpy.data(), - d_image_cpy, - image_shape, - image_pitch, - IoDirection::DtoH); - for (int i = 0; i < 28 * 4 * 32; i++) { - EXPECT_NEAR(h_image_cpy[i], 3.14f, 1e-6); - } - - TargetWrapperCL::FreeImage(d_image_cpy); - TargetWrapperCL::FreeImage(d_image); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_im2col_test.cc b/lite/backends/opencl/cl_im2col_test.cc deleted file mode 100644 index a0770d34ee..0000000000 --- a/lite/backends/opencl/cl_im2col_test.cc +++ /dev/null @@ -1,330 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include -#include "lite/backends/opencl/cl_context.h" -#include "lite/backends/opencl/cl_runtime.h" -#include "lite/backends/opencl/target_wrapper.h" -#include "lite/core/tensor.h" -#include "lite/utils/cp_logging.h" - -DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path."); - -namespace paddle { -namespace lite { - -template -void PrintData(std::string name, Dtype *a, const int rows, const int cols) { - std::cout << "==== " << name << " ====" << std::endl; - for (int r = 0; r < rows; ++r) { - for (int c = 0; c < cols; ++c) { - std::cout << " " << a[r * cols + c]; - } - std::cout << std::endl; - } -} - -inline bool is_a_ge_zero_and_a_lt_b(int a, int b) { - return static_cast(a) < static_cast(b); -} - -template -void im2col(const Dtype *data_im, - const int channels, - const int height, - const int width, - const int kernel_h, - const int kernel_w, - const int pad_h, - const int pad_w, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, - Dtype *data_col) { - const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; - const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - const int channel_size = height * width; - - for (int channel = 0; channel++ < channels; data_im += channel_size) { - for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { - for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; - for (int output_rows = 0; output_rows < output_h; ++output_rows) { - if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { - for (int output_cols = 0; output_cols < output_w; ++output_cols) { - *(data_col++) = 0; - } - } else { - int input_col = -pad_w + kernel_col * dilation_w; - for (int output_col = 0; output_col < output_w; ++output_col) { - *(data_col++) = (is_a_ge_zero_and_a_lt_b(input_col, width)) - ? 
data_im[input_row * width + input_col] - : 0; - input_col += stride_w; - } - } - input_row += stride_h; - } - } - } - } -} - -// #define CHECK_ERROR -// #define PRINT_RESULT -// #define LOOP_TEST -TEST(cl_test, im2col_test) { - using T = float; - std::string kernel_func_name = "im2col"; - std::string kernel_func_path = "buffer/im2col_kernel.cl"; - -#ifdef LOOP_TEST - for (int n : {1}) { - for (int c : {32}) { - for (int h : {224}) { - for (int w : {224}) { - for (int kernel_h : {3}) { - for (int kernel_w : {3}) { - for (int pad_h : {1}) { - for (int pad_w : {1}) { - for (int stride_h : {2}) { - for (int stride_w : {2}) { - for (int dilation_h : {1}) { - for (int dilation_w : {1}) { -// TODO(yuanshuai): support group for im2col -#else - int n = 8; - int c = 32; - int h = 224; - int w = 224; - int kernel_h = 3; - int kernel_w = 3; - int pad_h = 1; - int pad_w = 1; - int stride_h = 2; - int stride_w = 2; - int dilation_h = 1; - int dilation_w = 1; -#endif - - int img_offset = 0; - int col_offset = 0; - - std::vector input_shape{n, c, h, w}; - int channels = input_shape[1]; - int height = input_shape[2]; - int width = input_shape[3]; - - int height_col = (height + 2 * pad_h - - (dilation_h * (kernel_h - 1) + 1)) / - stride_h + - 1; - int width_col = (width + 2 * pad_w - - (dilation_w * (kernel_w - 1) + 1)) / - stride_w + - 1; - int col_chw = channels * kernel_h * kernel_w * - height_col * width_col; - if (col_chw <= 0 || height_col <= 0 || - width_col <= 0 || channels <= 0) { - VLOG(4) << "col_chw <= 0, skipped"; -#ifdef LOOP_TEST - continue; -#else - return; -#endif - } - - VLOG(4) << "kernel_func_name:" << kernel_func_name - << " kernel_func_path:" << kernel_func_path; - VLOG(4) << "input_shape:" << input_shape[0] << ", " - << input_shape[1] << ", " << input_shape[2] - << ", " << input_shape[3]; - VLOG(4) << "kernel_h:" << kernel_h - << " kernel_w:" << kernel_w - << " pad_h:" << pad_h << " pad_w:" << pad_w - << " stride_h:" << stride_h - << " stride_w:" << stride_w - << " dilation_h:" << dilation_h - << " dilation_w:" << dilation_w; - VLOG(4) << "height_col:" << height_col - << " width_col:" << width_col - << " img_offset:" << img_offset - << " col_offset:" << col_offset - << " col_chw:" << col_chw; - - const DDim input_dim = DDim(input_shape); - const int input_elem_num = input_dim.production(); - T *in_data = static_cast( - calloc(sizeof(T), input_elem_num)); - T *out_data = - static_cast(calloc(sizeof(T), n * col_chw)); - T *out_ref_data = - static_cast(calloc(sizeof(T), n * col_chw)); - for (int i = 0; i < input_elem_num; ++i) { - in_data[i] = i; - } - - // CPU im2col - for (int b = 0; b < n; b++) { - im2col(in_data + b * channels * height * width, - channels, - height, - width, - kernel_h, - kernel_w, - pad_h, - pad_w, - stride_h, - stride_w, - dilation_h, - dilation_w, - out_ref_data + b * col_chw); - } - - // OpenCL im2col - auto *runtime = CLRuntime::Global(); - CHECK(runtime->IsInitSuccess()) - << "Fail to initialize OpenCL runtime."; - runtime->set_cl_path(FLAGS_cl_path); - - std::unique_ptr context(new CLContext); - context->AddKernel(kernel_func_name, - kernel_func_path); - auto kernel = context->GetKernel(kernel_func_name); - - auto *d_in = - static_cast(TargetWrapperCL::Malloc( - sizeof(T) * input_elem_num)); - auto *d_out = static_cast( - TargetWrapperCL::Malloc(sizeof(T) * n * col_chw)); - TargetWrapperCL::MemcpySync( - d_in, - in_data, - sizeof(T) * input_elem_num, - IoDirection::HtoD); - - int n_threads = channels * height_col * width_col; - cl_int status; - int arg_idx = 
0; - for (int b = 0; b < n; b++) { - img_offset = b * channels * height * width; - col_offset = b * col_chw; - arg_idx = 0; - status = kernel.setArg(arg_idx, *d_in); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, img_offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, n_threads); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, kernel_h); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, kernel_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, pad_h); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, pad_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, stride_h); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, stride_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, dilation_h); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, dilation_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, height_col); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, width_col); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *d_out); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, col_offset); - CL_CHECK_FATAL(status); - - auto global_work_size = - cl::NDRange{static_cast(col_chw)}; - status = - context->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size, - cl::NullRange, - nullptr, - nullptr); - CL_CHECK_FATAL(status); - } - - status = context->GetCommandQueue().finish(); - CL_CHECK_FATAL(status); - - TargetWrapperCL::MemcpySync(out_data, - d_out, - sizeof(T) * n * col_chw, - IoDirection::DtoH); - -#ifdef PRINT_RESULT - PrintData("in", in_data, height, width); - PrintData("out_ref", out_ref_data, height, width); - PrintData("out", out_data, height, width); -#endif - - for (int i = 0; i < n * col_chw; ++i) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); -#ifdef CHECK_ERROR - if (abs(out_data[i] - out_ref_data[i]) > 1e-5) { - std::cout << "i:" << i << std::endl; - PrintData("in", in_data, height, width); - PrintData("out_ref", out_ref_data, height, width); - PrintData("out", out_data, height, width); - exit(0); - } -#endif - } - - free(in_data); - free(out_data); - free(out_ref_data); - TargetWrapperCL::Free(d_in); - TargetWrapperCL::Free(d_out); - -#ifdef LOOP_TEST - } // dilation_w - } // dilation_h - } // stride_w - } // stride_h - } // pad_w - } // pad_h - } // kernel_w - } // kernel_h - } // w - } // h - } // c - } // n -#endif -} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_image.cc b/lite/backends/opencl/cl_image.cc deleted file mode 100644 index f6dcd4bbef..0000000000 --- a/lite/backends/opencl/cl_image.cc +++ /dev/null @@ -1,160 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "lite/backends/opencl/cl_image.h" -#include -#include "lite/backends/opencl/cl_runtime.h" -#include "lite/backends/opencl/cl_utility.h" -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { - -std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) { - int width = cl_image.image_dims_[0]; - int height = cl_image.image_dims_[1]; - - float* image_data = new float[height * width * 4]; - cl::Image* image = cl_image.cl_image(); - const std::array origin{0, 0, 0}; - const std::array region{ - static_cast(width), static_cast(height), 1}; - cl_int err = CLRuntime::Global()->command_queue().enqueueReadImage( - *image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr); - CL_CHECK_FATAL(err); - - float* tensor_data = new float[cl_image.numel()]; - auto* converter = cl_image.image_converter(); - converter->ImageToNCHW( - image_data, tensor_data, cl_image.image_dims_, cl_image.tensor_dims_); - int stride = cl_image.numel() / 20; - stride = stride > 0 ? stride : 1; - - os << " dims: " << cl_image.tensor_dims_ << "\n"; - for (int i = 0; i < cl_image.numel(); i += stride) { - os << tensor_data[i] << " "; - } - - delete[] tensor_data; - delete[] image_data; - - return os; -} - -void CLImage::set_tensor_data(const float* tensor_data, const DDim& dim) { - auto numel = dim.production(); - tensor_data_.reset(new float[numel]); - memcpy(tensor_data_.get(), tensor_data, numel * sizeof(float)); - tensor_dims_ = dim; -} - -void CLImage::InitCLImage(const cl::Context& context) { - CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!"; - image_converter_.reset(new CLImageConverterFolder); - InitCLImage(context, image_converter_.get()); -} - -void CLImage::InitNormalCLImage(const cl::Context& context) { - CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!"; - image_converter_.reset(new CLImageConverterNormal); - InitCLImage(context, image_converter_.get()); -} - -void CLImage::InitNImage(const cl::Context& context) { - CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!"; - CHECK(tensor_dims_.size() == 4) << " Tensor dim is not 4."; - image_converter_.reset(new CLImageConverterNWBlock); - InitCLImage(context, image_converter_.get()); -} - -void CLImage::InitDWImage(const cl::Context& context) { - CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!"; - CHECK(tensor_dims_.size() == 4) << " Tensor dim is not 4."; - image_converter_.reset(new CLImageConverterDWBlock); - InitCLImage(context, image_converter_.get()); -} - -void CLImage::InitEmptyImage(const cl::Context& context, const DDim& dim) { - CHECK(tensor_data_ == nullptr) - << " Empty image tensor data shouldn't have value"; - - tensor_dims_ = dim; - image_converter_.reset(new CLImageConverterNormal); - - VLOG(3) << " to get image dims "; - image_dims_ = image_converter_->InitImageDimInfoWith(tensor_dims_); - VLOG(3) << " end get image dims " << image_dims_; - - InitCLImage(context, image_dims_[0], image_dims_[1], nullptr); - - cl_event_ = CLRuntime::Global()->CreateEvent(context); - initialized_ = true; - VLOG(3) << " end init cl image "; -} - -void CLImage::InitEmptyWithImageDim(const cl::Context& context, - const DDim& image_dims) { - VLOG(3) << " to get image dims "; - image_dims_ = image_dims; - VLOG(3) << " end get image dims " << image_dims_; - - InitCLImage(context, image_dims_[0], image_dims_[1], nullptr); - - cl_event_ = CLRuntime::Global()->CreateEvent(context); - initialized_ = true; - VLOG(3) << " end init cl image"; 
-} - -void CLImage::InitCLImage(const cl::Context& context, - CLImageConverterBase* converter) { - CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!"; - - VLOG(3) << " begin init cl image "; - image_dims_ = converter->InitImageDimInfoWith(tensor_dims_); - - float* image_data = new float[image_dims_.production() * 4]; - - VLOG(3) << " convert to image "; - converter->NCHWToImage(tensor_data_.get(), image_data, tensor_dims_); - VLOG(3) << " end convert to image "; - - InitCLImage(context, image_dims_[0], image_dims_[1], image_data); - - delete[] image_data; - tensor_data_ = nullptr; - cl_event_ = CLRuntime::Global()->CreateEvent(context); - initialized_ = true; - VLOG(3) << " end init cl image "; -} - -void CLImage::InitCLImage(const cl::Context& context, - int width, - int height, - void* data) { - cl::ImageFormat img_format(CL_RGBA, CL_FLOAT); - cl_int err; - cl_image_.reset( - new cl::Image2D(context, - CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0), - img_format, - width, - height, - 0, - data, - &err)); - CL_CHECK_FATAL(err); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_image.h b/lite/backends/opencl/cl_image.h deleted file mode 100644 index f3a5f6361f..0000000000 --- a/lite/backends/opencl/cl_image.h +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include "lite/backends/opencl/cl_image_converter.h" -#include "lite/backends/opencl/cl_include.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { - -class CLImage { - // For debug - friend std::ostream& operator<<(std::ostream& os, const CLImage& image); - - public: - CLImage() = default; - /* - * Will not hold input tensor data, memcpy in this method. - * */ - void set_tensor_data(const float* tensor_data, const DDim& dim); - - bool IsInit() { return initialized_; } - /* - * Need call set_tensor_data first. - * Folder when one dim or two dim. - * */ - void InitCLImage(const cl::Context& context); - - void InitNormalCLImage(const cl::Context& context); - - void InitNImage(const cl::Context& context); - - void InitDWImage(const cl::Context& context); - - void InitEmptyImage(const cl::Context& context, const DDim& dim); - - void InitEmptyWithImageDim(const cl::Context& context, - const DDim& image_dims); - - cl::Image* cl_image() const { return cl_image_.get(); } - - const DDim& image_dims() const { return image_dims_; } - - inline size_t ImageWidth() const { return image_dims_[0]; } - - inline size_t ImageHeight() const { return image_dims_[1]; } - - const DDim& tensor_dims() const { return tensor_dims_; } - - /*with_da - * Resize original tensor dim. 
- * */ - inline CLImage& Resize(const DDim& dims) { - tensor_dims_ = dims; - return *this; - } - - template - T* data() const { - CHECK(!initialized_) << "CL image has initialized, tensor data has been " - "deleted, can't use tensor data!"; - return reinterpret_cast(tensor_data_); - } - - /* - * Numel of tensor dim - * */ - inline int64_t numel() const { return tensor_dims_.production(); } - - /* - * Original tensor dim - * */ - - cl::UserEvent& cl_event() const { return *cl_event_; } - - CLImageConverterBase* image_converter() const { - return image_converter_.get(); - } - - private: - void InitCLImage(const cl::Context& context, CLImageConverterBase* converter); - - void InitCLImage(const cl::Context& context, - int width, - int height, - void* data); - - bool initialized_ = false; - std::unique_ptr cl_image_{nullptr}; - std::unique_ptr cl_event_{nullptr}; - DDim tensor_dims_; - DDim image_dims_; - std::unique_ptr tensor_data_{nullptr}; - std::unique_ptr image_converter_{nullptr}; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_image_converter.cc b/lite/backends/opencl/cl_image_converter.cc deleted file mode 100644 index 402f710d7a..0000000000 --- a/lite/backends/opencl/cl_image_converter.cc +++ /dev/null @@ -1,461 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
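Editor's note: the private `InitCLImage(context, width, height, data)` above ultimately allocates a `CL_RGBA`/`CL_FLOAT` 2D image. A minimal sketch of the same allocation using the standard OpenCL C++ wrapper, detached from Lite (the function name is ours, and the wrapper header path depends on the SDK):

```cpp
#define CL_HPP_TARGET_OPENCL_VERSION 200
#define CL_HPP_MINIMUM_OPENCL_VERSION 110
#include <CL/cl2.hpp>  // header name/location varies by OpenCL SDK

// Each texel of a CL_RGBA/CL_FLOAT image packs four floats, which is why the
// converters in this patch group tensor channels into blocks of 4.
cl::Image2D MakeRGBAFloatImage(const cl::Context& ctx,
                               size_t width, size_t height,
                               const float* host_data /* may be nullptr */) {
  cl::ImageFormat format(CL_RGBA, CL_FLOAT);
  cl_int err = CL_SUCCESS;
  cl::Image2D image(ctx,
                    CL_MEM_READ_WRITE | (host_data ? CL_MEM_COPY_HOST_PTR : 0),
                    format, width, height, /*row_pitch=*/0,
                    const_cast<float*>(host_data), &err);
  // The deleted code checks err with CL_CHECK_FATAL; assert or throw here.
  return image;
}
```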
*/ - -#include "lite/backends/opencl/cl_image_converter.h" -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { - -DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) { - size_t new_dims[] = {1, 1, 1, 1}; - for (size_t j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - size_t N, C, H, W; - N = new_dims[0]; - C = new_dims[1]; - H = new_dims[2]; - W = new_dims[3]; - size_t width = W * ((C + 3) / 4); - size_t height = H * N; - return DDim( - std::vector({static_cast(width), - static_cast(height)})); -} - -void CLImageConverterDefault::NCHWToImage(float *nchw, - float *image, - const DDim &tensor_dim) { - size_t new_dims[] = {1, 1, 1, 1}; - for (size_t j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - - size_t N, C, H, W; - N = new_dims[0]; - C = new_dims[1]; - H = new_dims[2]; - W = new_dims[3]; - - DDim in_image_dim = InitImageDimInfoWith(tensor_dim); - - VLOG(3) << " tensor dim: " << tensor_dim; - VLOG(3) << " image dim: " << in_image_dim; - - size_t width = in_image_dim[0]; - size_t w_block = width / W; - - float *p = nchw; - size_t i0 = 0; - for (size_t n = 0; n < N; n++) { - for (size_t c = 0; c < w_block * 4; c++) { - size_t i1 = i0 + (c / 4) * W; - for (size_t h = 0; h < H; h++) { - size_t i2 = (i1 << 2) + c % 4; - for (size_t w = 0; w < W; w++) { - if (c < C) { - // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + - // (c % 4); - image[i2] = *p; - i2 += 4; - p++; - } else { - image[i2] = 0.0; - i2 += 4; - } - } - i1 += width; - } - } - i0 += width * H; - } -} - -void CLImageConverterDefault::ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) { - size_t new_dims[] = {1, 1, 1, 1}; - for (size_t j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - - size_t N, C, H, W; - N = new_dims[0]; - C = new_dims[1]; - H = new_dims[2]; - W = new_dims[3]; - - size_t width = image_dim[0]; - float *p = tensor; - - size_t i0 = 0; - for (size_t n = 0; n < N; n++) { - for (size_t c = 0; c < C; c++) { - size_t i1 = i0 + (c / 4) * W; - for (size_t h = 0; h < H; h++) { - size_t i2 = (i1 << 2) + c % 4; - for (size_t w = 0; w < W; w++) { - *p = image[i2]; - i2 += 4; - p++; - } - i1 += width; - } - } - i0 += width * H; - } -} - -DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) { - if (tensor_dim.size() <= 2) { - size_t tdim[2] = {1, 1}; - if (tensor_dim.size() == 1) { - tdim[1] = tensor_dim[0]; - } else { - tdim[0] = tensor_dim[0]; - tdim[1] = tensor_dim[1]; - } - size_t width = (tdim[1] + 3) / 4; - size_t height = tdim[0]; - - width_of_one_block_ = width; - height_of_one_block_ = height; - c_block_ = 1; - - return DDim( - std::vector({static_cast(width), - static_cast(height)})); - - } else { - size_t new_dims[] = {1, 1, 1, 1}; - for (size_t j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - size_t N, C, H, W; - N = new_dims[0]; - C = new_dims[1]; - H = new_dims[2]; - W = new_dims[3]; - size_t width = W * ((C + 3) / 4); - size_t height = H * N; - - width_of_one_block_ = W; - height_of_one_block_ = H; - c_block_ = width / W; - - return DDim( - std::vector({static_cast(width), - static_cast(height)})); - } -} - -void CLImageConverterFolder::NCHWToImage(float *tensor, - float *image, - const DDim &tensor_dim) { - CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) - << " Tensor dim is not 
support!"; - - if (tensor_dim.size() > 2) { - CLImageConverterDefault default_converter; - default_converter.NCHWToImage(tensor, image, tensor_dim); - - } else { - size_t tdim[2] = {1, 1}; - if (tensor_dim.size() == 1) { - tdim[1] = tensor_dim[0]; - } else { - tdim[0] = tensor_dim[0]; - tdim[1] = tensor_dim[1]; - } - - DDim image_dim = InitImageDimInfoWith(tensor_dim); - size_t width = image_dim[0]; - - for (size_t h = 0; h < tdim[0]; h++) { - for (size_t w = 0; w < tdim[1]; w++) { - image[(h * width + w / 4) * 4 + (w % 4)] = tensor[h * tdim[1] + w]; - } - } - } -} - -void CLImageConverterFolder::ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) { - if (tensor_dim.size() > 2) { - CLImageConverterDefault default_converter; - default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim); - - } else { - size_t width = image_dim[0]; - size_t H = 1, W = 1; - - if (tensor_dim.size() == 2) { - H = tensor_dim[0]; - W = tensor_dim[1]; - } else if (tensor_dim.size() == 1) { - W = tensor_dim[0]; - } - - float *p = tensor; - - for (size_t h = 0; h < H; h++) { - for (size_t w = 0; w < W; w++) { - p[h * W + w] = image[(h * width + w / 4) * 4 + (w % 4)]; - } - } - } -} - -DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { - CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; - size_t N, C, H, W; - N = tensor_dim[0]; - C = tensor_dim[1]; - H = tensor_dim[2]; - W = tensor_dim[3]; - size_t width = W * ((N + 3) / 4); - size_t height = C * H; - return DDim( - std::vector({static_cast(width), - static_cast(height)})); -} - -void CLImageConverterNWBlock::NCHWToImage(float *tensor, - float *image, - const DDim &tensor_dim) { - CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; - auto image_dim = InitImageDimInfoWith(tensor_dim); - float *p = tensor; - size_t N = tensor_dim[0]; - size_t C = tensor_dim[1]; - size_t H = tensor_dim[2]; - size_t W = tensor_dim[3]; - size_t width = image_dim[0]; - size_t height = image_dim[1]; - size_t block = image_dim[0] / tensor_dim[3]; - - for (size_t n = 0; n < block * 4; n++) { - for (size_t c = 0; c < C; c++) { - for (size_t h = 0; h < H; ++h) { - for (size_t w = 0; w < W; ++w) { - size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + - w * 4 + n % 4; - if (n < N) { - image[index] = *p; - p++; - } else { - image[index] = 0.0; - } - if (index >= (width * height * 4)) { - LOG(INFO) << " index out of range "; - } - } - } - } - } - VLOG(3) << " init done"; -} - -void CLImageConverterNWBlock::ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) { - CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; - float *p = tensor; - size_t N = tensor_dim[0]; - size_t C = tensor_dim[1]; - size_t H = tensor_dim[2]; - size_t W = tensor_dim[3]; - size_t width = image_dim[0]; - size_t height = image_dim[1]; - - for (size_t n = 0; n < N; n++) { - for (size_t c = 0; c < C; c++) { - for (size_t h = 0; h < H; ++h) { - for (size_t w = 0; w < W; ++w) { - size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + - w * 4 + n % 4; - *p = image[index]; - p++; - if (index >= (width * height * 4)) { - LOG(INFO) << " index out of range "; - } - } - } - } - } - VLOG(3) << " init done"; -} - -DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { - CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; - size_t N, C, H, W; - N = tensor_dim[0]; - C = tensor_dim[1]; - H = tensor_dim[2]; - W = tensor_dim[3]; - size_t width = W * 
((N + 3) / 4); - size_t height = C * H; - return DDim( - std::vector({static_cast(width), - static_cast(height)})); -} - -void CLImageConverterDWBlock::NCHWToImage(float *tensor, - float *image, - const DDim &tensor_dim) { - size_t new_dims[] = {1, 1, 1, 1}; - for (size_t j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - - size_t N, C, H, W; - N = new_dims[1]; - C = new_dims[0]; - H = new_dims[2]; - W = new_dims[3]; - - DDim in_image_dim = InitImageDimInfoWith(tensor_dim); - - VLOG(3) << " tensor dim: " << tensor_dim; - VLOG(3) << " image dim: " << in_image_dim; - - size_t width = in_image_dim[0]; - size_t w_block = width / W; - - float *p = tensor; - size_t i0 = 0; - for (size_t n = 0; n < N; n++) { - for (size_t c = 0; c < w_block * 4; c++) { - size_t i1 = i0 + (c / 4) * W; - for (size_t h = 0; h < H; h++) { - size_t i2 = (i1 << 2) + c % 4; - for (size_t w = 0; w < W; w++) { - if (c < C) { - // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + - // (c % 4); - image[i2] = *p; - i2 += 4; - p++; - } else { - image[i2] = 0.0; - i2 += 4; - } - } - i1 += width; - } - } - i0 += width * H; - } -} - -void CLImageConverterDWBlock::ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) { - CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; - float *p = tensor; - size_t N = tensor_dim[1]; - size_t C = tensor_dim[0]; - size_t H = tensor_dim[2]; - size_t W = tensor_dim[3]; - size_t width = image_dim[0]; - - size_t i0 = 0; - for (size_t n = 0; n < N; n++) { - for (size_t c = 0; c < C; c++) { - size_t i1 = i0 + (c / 4) * W; - for (size_t h = 0; h < H; h++) { - size_t i2 = (i1 << 2) + c % 4; - for (size_t w = 0; w < W; w++) { - *p = image[i2]; - i2 += 4; - p++; - } - i1 += width; - } - } - i0 += width * H; - } -} - -DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) { - size_t new_dims[] = {1, 1, 1, 1}; - for (size_t j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - size_t N, C, H, W; - N = new_dims[0]; - C = new_dims[1]; - H = new_dims[2]; - W = new_dims[3]; - size_t width = W * ((C + 3) / 4); - size_t height = H * N; - - width_of_one_block_ = W; - height_of_one_block_ = H; - c_block_ = width / W; - - return DDim( - std::vector({static_cast(width), - static_cast(height)})); -} - -void CLImageConverterNormal::NCHWToImage(float *tensor, - float *image, - const DDim &tensor_dim) { - CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) - << " Tensor dim is not support!"; - - CLImageConverterDefault default_converter; - default_converter.NCHWToImage(tensor, image, tensor_dim); -} - -void CLImageConverterNormal::ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) { - CLImageConverterDefault default_converter; - default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim); -} - -DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith( - const DDim &tensor_dim) { - CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; - size_t N, C; - N = tensor_dim[0]; - C = tensor_dim[1]; - size_t width = (C + 3) / 4; - size_t height = N * 16; // N * (wino_blk_size + 2) * (wino_blk_size + 2) - return DDim( - std::vector({static_cast(width), - static_cast(height)})); -} - -void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, - float *image, - const DDim &tensor_dim) {} - -void CLImageConverterWinoTransWeight::ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const 
DDim &tensor_dim) {} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_image_converter.h b/lite/backends/opencl/cl_image_converter.h deleted file mode 100644 index 6faa804557..0000000000 --- a/lite/backends/opencl/cl_image_converter.h +++ /dev/null @@ -1,139 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { - -class CLImageConverterBase { - public: - virtual ~CLImageConverterBase() {} - - virtual void NCHWToImage(float *nchw, - float *image, - const DDim &tensor_dim) = 0; - - virtual void ImageToNCHW(float *image, - float *nchw, - const DDim &image_dim, - const DDim &tensor_dim) = 0; - virtual DDim InitImageDimInfoWith(const DDim &tensor_dim) = 0; -}; - -class CLImageConverterDefault : public CLImageConverterBase { - public: - DDim InitImageDimInfoWith(const DDim &tensor_dim) override; - void NCHWToImage(float *nchw, float *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) override; -}; - -class CLImageConverterFolder : public CLImageConverterBase { - public: - DDim InitImageDimInfoWith(const DDim &tensor_dim) override; - void NCHWToImage(float *tensor, - float *image, - const DDim &tensor_dim) override; - void ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) override; - - /* - * width of original tensor - * */ - inline size_t WidthOfOneBlock() const { return width_of_one_block_; } - - /* - * height of original tensor - * */ - inline size_t HeightOfOneBlock() const { return height_of_one_block_; } - - int GetCBlock() const { return c_block_; } - - private: - int c_block_; - int width_of_one_block_; - int height_of_one_block_; -}; - -class CLImageConverterNormal : public CLImageConverterBase { - public: - DDim InitImageDimInfoWith(const DDim &tensor_dim) override; - void NCHWToImage(float *tensor, - float *image, - const DDim &tensor_dim) override; - void ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) override; - - /* - * width of original tensor - * */ - inline size_t WidthOfOneBlock() const { return width_of_one_block_; } - - /* - * height of original tensor - * */ - inline size_t HeightOfOneBlock() const { return height_of_one_block_; } - - int GetCBlock() const { return c_block_; } - - private: - int c_block_; - int width_of_one_block_; - int height_of_one_block_; -}; - -class CLImageConverterNWBlock : public CLImageConverterBase { - DDim InitImageDimInfoWith(const DDim &tensor_dim) override; - void NCHWToImage(float *tensor, - float *image, - const DDim &tensor_dim) override; - void ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) override; -}; -class CLImageConverterDWBlock : public CLImageConverterBase { - DDim InitImageDimInfoWith(const DDim &tensor_dim) override; - void NCHWToImage(float 
*tensor, - float *image, - const DDim &tensor_dim) override; - void ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) override; -}; - -class CLImageConverterWinoTransWeight : public CLImageConverterBase { - public: - DDim InitImageDimInfoWith(const DDim &tensor_dim) override; - void NCHWToImage(float *tensor, - float *image, - const DDim &tensor_dim) override; - void ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) override; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_include.h b/lite/backends/opencl/cl_include.h deleted file mode 100644 index 254782d629..0000000000 --- a/lite/backends/opencl/cl_include.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#define CL_TARGET_OPENCL_VERSION 200 -#define CL_HPP_TARGET_OPENCL_VERSION 200 -#define CL_HPP_MINIMUM_OPENCL_VERSION 110 - -#include diff --git a/lite/backends/opencl/cl_kernel/buffer/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/depthwise_conv2d_kernel.cl deleted file mode 100644 index ab575ba9b3..0000000000 --- a/lite/backends/opencl/cl_kernel/buffer/depthwise_conv2d_kernel.cl +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
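Editor's note: a worked example of the "default" image layout implemented by the converters above. An NCHW tensor maps to a `W * ceil(C/4)` by `H * N` RGBA image, with four consecutive channels packed into one texel; this standalone sketch reproduces the index bookkeeping of `CLImageConverterDefault::NCHWToImage`:

```cpp
#include <cstdio>

int main() {
  const size_t N = 1, C = 3, H = 224, W = 224;
  const size_t width = W * ((C + 3) / 4);  // 224 texels per row
  const size_t height = H * N;             // 224 rows
  std::printf("image dims: %zu x %zu\n", width, height);

  // Flat float index of tensor element (n, c, h, w) in the image buffer,
  // matching the commented-out formula in CLImageConverterDefault:
  // (n*width*H + h*width + (c/4)*W + w) * 4 + (c % 4)
  const size_t n = 0, c = 2, h = 10, w = 5;
  const size_t idx =
      ((n * width * H + h * width + (c / 4) * W + w) << 2) + c % 4;
  std::printf("flat float index of (n=%zu,c=%zu,h=%zu,w=%zu): %zu\n",
              n, c, h, w, idx);
  return 0;
}
```

The channels beyond `C` within the last block of 4 are zero-filled, which keeps kernel-side `float4` loads well defined.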
*/
-
-#include <cl_common.h>
-
-__kernel void depthwise_conv2d(const int numel, // num of elements
-                               __global CL_DTYPE* input_data,
-                               const int height,
-                               const int width,
-                               const int conved_channel,
-                               const int conved_height,
-                               const int conved_width,
-                               const int kernel_h,
-                               const int kernel_w,
-                               const int stride_h,
-                               const int stride_w,
-                               const int pad_h,
-                               const int pad_w,
-                               __global CL_DTYPE* output_data,
-                               __global CL_DTYPE* weight_data,
-                               __global CL_DTYPE* bias_data) {
-  int index = get_global_id(0);
-  int tmp = get_global_size(0);
-  for(index; index < numel; index += tmp) {
-    const int pw = index % conved_width;
-    const int ph = (index / conved_width) % conved_height;
-    const int c = (index / conved_width / conved_height) % conved_channel;
-    const int n = index / conved_width / conved_height / conved_channel;
-    int hstart = ph * stride_h - pad_h;
-    int wstart = pw * stride_w - pad_w;
-    int hend = min(hstart + kernel_h, height + pad_h);
-    int wend = min(wstart + kernel_w, width + pad_w);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    hend = min(hend, height);
-    wend = min(wend, width);
-    CL_DTYPE v = 0;
-    __global CL_DTYPE* input_slice =
-        input_data + (n * conved_channel + c) * height * width;
-    __global CL_DTYPE* weight_slice =
-        weight_data + c * kernel_h * kernel_w;
-    int khstart = hend < kernel_h ? kernel_h - hend : 0;
-    int kwstart = wend < kernel_w? kernel_w - wend : 0;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        v += input_slice[h * width + w]
-             * weight_slice[(khstart + h - hstart) * kernel_w + (kwstart + w - wstart)];
-      }
-    }
-    if(bias_data != NULL){
-      v += bias_data[c];
-    }
-#ifdef RELU
-    output_data[index] = activation(v);
-#else
-    output_data[index] = v;
-#endif
-  }
-}
diff --git a/lite/backends/opencl/cl_kernel/buffer/elementwise_add_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/elementwise_add_kernel.cl
deleted file mode 100644
index bb6faea629..0000000000
--- a/lite/backends/opencl/cl_kernel/buffer/elementwise_add_kernel.cl
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
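Editor's note: each work-item of the depthwise kernel above computes one output element. A scalar C++ reference that mirrors its clamp/offset arithmetic one-to-one (a sketch for checking, not code from this patch; parameter names follow the kernel):

```cpp
#include <algorithm>
#include <vector>

float DepthwiseConvAt(const std::vector<float>& input,
                      const std::vector<float>& weight,
                      const float* bias,  // one value per channel, or nullptr
                      int conved_channel, int height, int width,
                      int kernel_h, int kernel_w,
                      int stride_h, int stride_w, int pad_h, int pad_w,
                      int n, int c, int ph, int pw) {
  int hstart = ph * stride_h - pad_h;
  int wstart = pw * stride_w - pad_w;
  int hend = std::min(hstart + kernel_h, height + pad_h);
  int wend = std::min(wstart + kernel_w, width + pad_w);
  hstart = std::max(hstart, 0);
  wstart = std::max(wstart, 0);
  hend = std::min(hend, height);
  wend = std::min(wend, width);
  // When the window is clipped at the top/left border, the filter taps are
  // offset by the clipped amount (khstart/kwstart in the kernel):
  int khstart = hend < kernel_h ? kernel_h - hend : 0;
  int kwstart = wend < kernel_w ? kernel_w - wend : 0;
  const float* in_slice =
      input.data() + (n * conved_channel + c) * height * width;
  const float* w_slice = weight.data() + c * kernel_h * kernel_w;
  float v = 0.f;
  for (int h = hstart; h < hend; ++h) {
    for (int w = wstart; w < wend; ++w) {
      v += in_slice[h * width + w] *
           w_slice[(khstart + h - hstart) * kernel_w + (kwstart + w - wstart)];
    }
  }
  if (bias) v += bias[c];
  return v;
}
```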
*/ - -#include - -__kernel void elementwise_add(__global const CL_DTYPE* x_data, - __global const CL_DTYPE* y_data, - __global CL_DTYPE* out_data, - const int batch, - const int channels, - const int num) { - - const int c = get_global_id(0); // c: [0, channels) - const int b = get_global_id(1); // b: [0, batch) - - if ((c >= channels) || (b >= batch)) { - return; - } - - const int offset = (b * channels + c) * num; - - __global const CL_DTYPE* din_ptr = x_data + offset; - const CL_DTYPE diny_data = y_data[c]; - __global CL_DTYPE* dout_ptr = out_data + offset; - - for (int n = 0; n < num; ++n) { // n: [0, h*w) - *dout_ptr = *din_ptr + diny_data; -#ifdef RELU - *dout_ptr = activation(*dout_ptr); -#endif - ++dout_ptr; - ++din_ptr; - } -} diff --git a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl deleted file mode 100644 index b8dbf62c06..0000000000 --- a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl +++ /dev/null @@ -1,424 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - - -#define SRC(i, j) src[i * src_width + j] -#define DST(i, j) dst[i * src_height + j] -__kernel -void mat_transpose(__global const CL_DTYPE* src, - __global CL_DTYPE* dst, - const int src_height, const int src_width) { - const int col = get_global_id(0); // [0, src_width) columns of src - const int row = get_global_id(1); // [0, src_height) rows of src - DST(col, row) = SRC(row, col); -} - - -// fc_gemm_naive: keep for check -// a: x_d -// b: filter_d -// c: output_d -__kernel -void fc_gemm_naive(__global const CL_DTYPE* a, - __global const CL_DTYPE* b, - __global const CL_DTYPE* bias, - __global CL_DTYPE* c, - const int M, const int N, const int K) { - const int row = get_global_id(0); // [0, M) height of out == m - const int col = get_global_id(1); // [0, N) width of out == n - - if ((col >= N) || (row >= M)) { - return; - } - - CL_DTYPE a0, b0, - c0 = (bias && col < N) ? bias[col] : 0; - - for (int p = 0; p < K; ++p) { - a0 = *(a + row * K + p); - b0 = *(b + p * N + col); - c0 += a0 * b0; - } - -#ifdef RELU - c[row * N + col] = activation(c0); -#else - c[row * N + col] = c0; -#endif -} - - -// gemm_batch_naive: used for conv1x1, gemm of im2col_gemm -// a: filter_d -// b: x_d -// c: output_d -__kernel -void gemm_batch_naive(__global const CL_DTYPE* a, - __global const CL_DTYPE* b, - __global const CL_DTYPE* bias, - __global CL_DTYPE* c, - const int M, const int N, const int K, const int batch_size) { - const int row = get_global_id(0); // [0, M) height of out == m - const int col = get_global_id(1); // [0, N) width of out == n - const int bidx = get_global_id(2); // [0, batch_size) - - const __global CL_DTYPE* cur_b = b + K * N * bidx; - __global CL_DTYPE* cur_c = c + M * N * bidx; - - if ((col >= N) || (row >= M) || (bidx >= batch_size)) { - return; - } - - CL_DTYPE a0, b0, - c0 = (bias && col < N) ? 
bias[row] : 0; - - for (int p = 0; p < K; ++p) { - a0 = *(a + row * K + p); - b0 = *(cur_b + p * N + col); - c0 += a0 * b0; - } - -#ifdef RELU - cur_c[row * N + col] = activation(c0); -#else - cur_c[row * N + col] = c0; -#endif -} - - -// gemm_batch_8x4_buf_buf_N_N: used for conv1x1, gemm of im2col_gemm -// a: filter_d -// b: x_d -// c: output_d - -//#define PRINT_KERNEL -__kernel -void gemm_batch(__global const CL_DTYPE* Aptr, - __global const CL_DTYPE* Bptr, - __global const CL_DTYPE* bias, - __global CL_DTYPE* Cptr, - const int M, const int N, const int K, const int batch_size) { - - int row = get_global_id(0) << 3; // [0, M >> 3) height of out == m - int col = get_global_id(1) << 2; // [0, N >> 2) width of out == n - const int bidx = get_global_id(2); // [0, batch_size) - - // update B(input), C(output) with batch_size - Aptr += mul24(row, K); // A += row * K - Bptr += mad24(mul24(K, N), bidx, col); // B += K * N * bidx + col - Cptr += mad24(mul24(M, N), bidx, mul24(row, N)); // C += M * N * bidx + row * N - - CL_DTYPE4 a8x4[8]; - CL_DTYPE4 b4x4[4] = {0.f, 0.f, 0.f, 0.f}; - CL_DTYPE4 c8x4[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - - if (bias) { - c8x4[0] = bias[row]; - c8x4[1] = bias[row + 1]; - c8x4[2] = bias[row + 2]; - c8x4[3] = bias[row + 3]; - c8x4[4] = bias[row + 4]; - c8x4[5] = bias[row + 5]; - c8x4[6] = bias[row + 6]; - c8x4[7] = bias[row + 7]; - } - - // main loop of K - short pos = 0; - for (; pos < K - 3; pos += 4) { - b4x4[0] = vload4(0, Bptr + mul24(pos, N)); - b4x4[1] = vload4(0, Bptr + mul24(pos+1, N)); - b4x4[2] = vload4(0, Bptr + mul24(pos+2, N)); - b4x4[3] = vload4(0, Bptr + mul24(pos+3, N)); - - // main compute of main loop K: pos + 3 < K - #pragma unroll(8) - for (int i = 0; i < 8 && i < M; ++i) { // M direction - a8x4[i] = vload4(0, Aptr + mad24(i, K, pos)); - - c8x4[i] += a8x4[i].x * b4x4[0]; - c8x4[i] += a8x4[i].y * b4x4[1]; - c8x4[i] += a8x4[i].z * b4x4[2]; - c8x4[i] += a8x4[i].w * b4x4[3]; - } - } - - // compute left K - if (pos < K) { - b4x4[0] = 0.0f; - b4x4[1] = 0.0f; - b4x4[2] = 0.0f; - // b4x4[3] = 0.0f; // impossible used - switch (K - pos) { - case 3: - b4x4[2] = vload4(0, Bptr + mul24(pos+2, N)); - - case 2: - b4x4[1] = vload4(0, Bptr + mul24(pos+1, N)); - - case 1: - b4x4[0] = vload4(0, Bptr + mul24(pos, N)); - } - - #pragma unroll(8) - for (int i = 0; i < 8; i++) { - a8x4[i] = vload4(0, Aptr + mad24(i, K, pos)); - - c8x4[i] += a8x4[i].x * b4x4[0] + - a8x4[i].y * b4x4[1] + - a8x4[i].z * b4x4[2]; - } - } - -#ifdef RELU - #pragma unroll(8) - for (int i = 0; i < 8; ++i) { - c8x4[i] = fmax(c8x4[i], (CL_DTYPE4)0.f); - } -#endif - - // store c - if (row + 7 < M && col + 3 < N) { - #pragma unroll(8) - for (int i = 0; i < 8; i++) { // M direction - vstore4(c8x4[i], 0, Cptr + mad24(i, N, col)); - } - } else { - for (int i = 0; i < 8 && i + row < M; ++i) { // M direction - if (col + 3 < N) { - vstore4(c8x4[i], 0, Cptr + mad24(i, N, col)); - } else { - switch (N - col) { - case 3: - *(Cptr + mad24(i, N, col + 2)) = c8x4[i].s2; - case 2: - *(Cptr + mad24(i, N, col + 1)) = c8x4[i].s1; - case 1: - *(Cptr + mad24(i, N, col)) = c8x4[i].s0; - } - } - } - } -} - - -// fc_gemv_naive: keep for check -// used for fc with M = 1 -// a: param.input {M, K} -// b: param.w {K, N} -// c: param.output {M, N} -__kernel -void fc_gemv_naive(__global const CL_DTYPE* a, - __global const CL_DTYPE* b, - __global const CL_DTYPE* bias, - __global CL_DTYPE* c, - const int M, const int N, const int K) { - const int col = get_global_id(0); // gws[0]: [0, N) width of B == N - - if (col 
>= N) { - return; - } - CL_DTYPE c0 = bias ? bias[col] : 0; - for (int p = 0; p < K; ++p) { - CL_DTYPE a0 = *(a + p); - CL_DTYPE b0 = *(b + p * N + col); - c0 += a0 * b0; - } - -#ifdef RELU - c[col] = activation(c0); -#else - c[col] = c0; -#endif -} - - -// fc_gemv_1x4: for fc with M = 1 -// a: param.input {M, K} -// b: param.w {K, N} -// c: param.output {M, N} -__kernel -void fc_gemv_1x4(__global const CL_DTYPE* a, - __global const CL_DTYPE* b, - __global const CL_DTYPE* bias, - __global CL_DTYPE* c, - const int M, const int N, const int K) { - const int col = get_global_id(0) << 2; // gws[0]: [0, N >> 2) height of B == N - - if (col + 3 < N) { - CL_DTYPE4 c0 = 0.0f; - if (bias) { - c0.x = bias[col]; - c0.y = bias[col+1]; - c0.z = bias[col+2]; - c0.w = bias[col+3]; - } - - // main loop of K - int p = 0; - for (; p < K - 3; p += 4) { - CL_DTYPE4 a0 = vload4(0, a + p); - CL_DTYPE4 b0 = vload4(0, b + p * N + col); - CL_DTYPE4 b1 = vload4(0, b + (p+1) * N + col); - CL_DTYPE4 b2 = vload4(0, b + (p+2) * N + col); - CL_DTYPE4 b3 = vload4(0, b + (p+3) * N + col); - - c0 += a0.x * b0; - c0 += a0.y * b1; - c0 += a0.z * b2; - c0 += a0.w * b3; - } - - // compute left K - CL_DTYPE4 b2 = 0.0f, - b1 = 0.0f, - b0 = 0.0f, - a0 = 0.0f; - switch (K - p) { - case 3: { - b2 = vload4(0, b + (p+2) * N + col); - a0.z = a[p + 2]; - } - case 2: { - b1 = vload4(0, b + (p+1) * N + col); - a0.y = a[p + 1]; - } - case 1: { - b0 = vload4(0, b + (p) * N + col); - a0.x = a[p]; - } - } - c0 += a0.x * b0; - c0 += a0.y * b1; - c0 += a0.z * b2; - - // store res -#ifdef RELU - if (col % 4 == 0) { - vstore4(fmax(c0, (CL_DTYPE4)0.f), 0, c + col); - } else { - switch (col % 4) { - case 3: - c[col + 2] = activation(c0.z); - case 2: - c[col + 1] = activation(c0.y); - case 1: - c[col] = activation(c0.x); - } - } -#else - if (col % 4 == 0) { - vstore4(c0, 0, c + col); - } else { - switch (col % 4) { - case 3: - c[col + 2] = c0.z; - case 2: - c[col + 1] = c0.y; - case 1: - c[col] = c0.x; - } - } -#endif - } else { - const int left_col = N - col; - for (int col_offset = 0; col_offset < left_col; ++col_offset) { - CL_DTYPE c0 = bias ? bias[col] : 0; - for (int p = 0; p < K; ++p) { - CL_DTYPE b0 = *(b + p * N + col + col_offset); - CL_DTYPE a0 = *(a + p); - c0 += a0 * b0; - } -#ifdef RELU - c[col + col_offset] = activation(c0); -#else - c[col + col_offset] = c0; -#endif - } - } -} - - -// fc_gemm_4x4: for fc with M = 1 -// a: param.input {M, K} -// b: param.w {K, N} -// c: param.output {M, N} -__kernel -void fc_gemm_4x4(__global const CL_DTYPE* a, - __global const CL_DTYPE* b, - __global const CL_DTYPE* bias, - __global CL_DTYPE* c, - const int M, const int N, const int K) { - const int row = get_global_id(0) << 2; // id: [0, M>>2) height of out == M - const int col = get_global_id(1) << 2; // id: [0, N>>2) width of out == N - - if (row+3 < M && col+3 < N) { - CL_DTYPE bias0 = bias ? bias[col] : 0, - bias1 = bias ? bias[col+1] : 0, - bias2 = bias ? bias[col+2] : 0, - bias3 = bias ? 
bias[col+3] : 0; - - CL_DTYPE c00 = bias0, c01 = bias1, c02 = bias2, c03 = bias3, - c10 = bias0, c11 = bias1, c12 = bias2, c13 = bias3, - c20 = bias0, c21 = bias1, c22 = bias2, c23 = bias3, - c30 = bias0, c31 = bias1, c32 = bias2, c33 = bias3; - - for (int p = 0; p < K; ++p) { - CL_DTYPE - a00 = *(a + row * K + p), - a10 = *(a + (row + 1) * K + p), - a20 = *(a + (row + 2) * K + p), - a30 = *(a + (row + 3) * K + p), - - b00 = *(b + p * N + col), - b01 = *(b + p * N + (col + 1)), - b02 = *(b + p * N + (col + 2)), - b03 = *(b + p * N + (col + 3)); - - c00 += a00 * b00; c01 += a00 * b01; c02 += a00 * b02; c03 += a00 * b03; - c10 += a10 * b00; c11 += a10 * b01; c12 += a10 * b02; c13 += a10 * b03; - c20 += a20 * b00; c21 += a20 * b01; c22 += a20 * b02; c23 += a20 * b03; - c30 += a30 * b00; c31 += a30 * b01; c32 += a30 * b02; c33 += a30 * b03; - } -#if defined(RELU) - c[row*N+col] = fmax(c00, 0); c[row*N+(col+1)] = fmax(c01, 0); c[row*N+(col+2)] = fmax(c02, 0); c[row*N+(col+3)] = fmax(c03, 0); - c[(row+1)*N+col] = fmax(c10, 0); c[(row+1)*N+(col+1)] = fmax(c11, 0); c[(row+1)*N+(col+2)] = fmax(c12, 0); c[(row+1)*N+(col+3)] = fmax(c13, 0); - c[(row+2)*N+col] = fmax(c20, 0); c[(row+2)*N+(col+1)] = fmax(c21, 0); c[(row+2)*N+(col+2)] = fmax(c22, 0); c[(row+2)*N+(col+3)] = fmax(c23, 0); - c[(row+3)*N+col] = fmax(c30, 0); c[(row+3)*N+(col+1)] = fmax(c31, 0); c[(row+3)*N+(col+2)] = fmax(c32, 0); c[(row+3)*N+(col+3)] = fmax(c33, 0); -#else - c[row*N+col] = c00; c[row*N+(col+1)] = c01; c[row*N+(col+2)] = c02; c[row*N+(col+3)] = c03; - c[(row+1)*N+col] = c10; c[(row+1)*N+(col+1)] = c11; c[(row+1)*N+(col+2)] = c12; c[(row+1)*N+(col+3)] = c13; - c[(row+2)*N+col] = c20; c[(row+2)*N+(col+1)] = c21; c[(row+2)*N+(col+2)] = c22; c[(row+2)*N+(col+3)] = c23; - c[(row+3)*N+col] = c30; c[(row+3)*N+(col+1)] = c31; c[(row+3)*N+(col+2)] = c32; c[(row+3)*N+(col+3)] = c33; -#endif - } else { - for (int cidx = col; cidx < N; ++cidx) { - for (int ridx = row; ridx < M; ++ridx) { - CL_DTYPE a0, b0, c0 = bias ? bias[cidx] : 0; - for (int p = 0; p < K; ++p) { - a0 = *(a + ridx * K + p); - b0 = *(b + p * N + cidx), - c0 += a0 * b0; - } -#if defined(RELU) - c[ridx * N + cidx] = fmax(c0, 0); -#else - c[ridx * N + cidx] = c0; -#endif - } - } - } -} diff --git a/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl deleted file mode 100644 index fe71f4c6ff..0000000000 --- a/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
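Editor's note: `fc_gemm_4x4` above covers a 4x4 output tile per work-item (`row = gid(0) << 2`, `col = gid(1) << 2`) and `gemm_batch` an 8x4 tile per work-item per batch. One plausible host-side NDRange sizing under that tiling (an assumption for illustration, not host code from this patch; the kernels' scalar tail branches absorb any remainder):

```cpp
#include <cstdio>

int main() {
  const int M = 17, N = 30, batch = 3;  // example problem sizes
  std::printf("fc_gemm_4x4 gws = {%d, %d}\n", (M + 3) / 4, (N + 3) / 4);
  std::printf("gemm_batch  gws = {%d, %d, %d}\n",
              (M + 7) / 8, (N + 3) / 4, batch);
  return 0;
}
```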
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -#define CL_DTYPE float - -__kernel -void im2col(__global const CL_DTYPE* data_im, const int img_offset, - const int col_chw, - const int height, const int width, - const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int height_col, const int width_col, - __global CL_DTYPE* col_data, const int col_offset) { - int index = get_global_id(0); // [0, col_chw) - - data_im = data_im + img_offset; - col_data = col_data + col_offset; - - if(index < col_chw) { - int w_out = index % width_col; - int h_index = index / width_col; - int h_out = h_index % height_col; - int channel_in = h_index / height_col; - - int channel_out = channel_in * kernel_h * kernel_w; - int h_in = h_out * stride_h - pad_h; - int w_in = w_out * stride_w - pad_w; - - __global CL_DTYPE* col_data_ptr = col_data; - col_data_ptr += (channel_out * height_col + h_out) * width_col + w_out; - __global const CL_DTYPE* data_im_ptr = data_im; - data_im_ptr += (channel_in * height + h_in) * width + w_in; - - int dh = 0; - for (int i = 0; i < kernel_h; ++i) { - int dw = 0; - for (int j = 0; j < kernel_w; ++j) { - int h = h_in + dh; - int w = w_in + dw; - *col_data_ptr = (h >= 0 && w >= 0 && h < height && w < width) - ? data_im_ptr[dh * width + dw] - : 0; - col_data_ptr += height_col * width_col; - dw += dilation_w; - } - dh += dilation_h; - } - } -} - diff --git a/lite/backends/opencl/cl_kernel/buffer/mat_mul_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/mat_mul_kernel.cl deleted file mode 100644 index f6c88c9430..0000000000 --- a/lite/backends/opencl/cl_kernel/buffer/mat_mul_kernel.cl +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
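Editor's note: a standalone decode of a single im2col work-item index, mirroring the kernel above. Each of the `channels * height_col * width_col` items owns one (channel, output-pixel) pair and writes one `kernel_h * kernel_w` column of the patch matrix:

```cpp
#include <cstdio>

int main() {
  const int height_col = 112, width_col = 112;
  const int stride_h = 2, stride_w = 2, pad_h = 1, pad_w = 1;
  const int index = 54321;  // arbitrary global id, same decode as the kernel
  const int w_out = index % width_col;
  const int h_index = index / width_col;
  const int h_out = h_index % height_col;
  const int channel_in = h_index / height_col;
  const int h_in = h_out * stride_h - pad_h;  // patch origin; may be negative
  const int w_in = w_out * stride_w - pad_w;  // inside the zero padding
  std::printf("channel=%d out=(%d,%d) patch origin=(%d,%d)\n",
              channel_in, h_out, w_out, h_in, w_in);
  return 0;
}
```

Out-of-bounds taps are written as 0 by the kernel's bounds check, which is what makes the negative patch origins safe.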
*/ - -#include - -#if 0 -// naive gemm: keep for check -__kernel -void mat_mul(__global const CL_DTYPE* x, - __global const CL_DTYPE* y, - __global CL_DTYPE* out, - const int M, const int N, const int K) { - const int row = get_global_id(0); // [0, M) columns of out == m - const int col = get_global_id(1); // [0, N) rows of out == n - - if ((col >= N) || (row >= M)) { - return; - } - - CL_DTYPE x0, y0, - out0 = 0; - - for (int p = 0; p < K; ++p) { - x0 = *(x + row * K + p); - y0 = *(y + p * N + col); - out0 += x0 * y0; - } - - out[row * N + col] = out0; -} -#endif // naive gemm - -__kernel -void mat_mul(__global const CL_DTYPE* a, - __global const CL_DTYPE* b, - __global CL_DTYPE* c, - const int M, const int N, const int K) { - const int row = get_global_id(0) << 2; // id: [0, M>>2) height of out == M - const int col = get_global_id(1) << 2; // id: [0, N>>2) width of out == N - - if (row+3 < M && col+3 < N) { - CL_DTYPE c00 = 0, c01 = 0, c02 = 0, c03 = 0, - c10 = 0, c11 = 0, c12 = 0, c13 = 0, - c20 = 0, c21 = 0, c22 = 0, c23 = 0, - c30 = 0, c31 = 0, c32 = 0, c33 = 0; - - for (int p = 0; p < K; p++) { - - CL_DTYPE a00 = *(a + row * K + p), - a10 = *(a + (row + 1) * K + p), - a20 = *(a + (row + 2) * K + p), - a30 = *(a + (row + 3) * K + p), - - b00 = *(b + p * N + col), - b01 = *(b + p * N + (col+1)), - b02 = *(b + p * N + (col+2)), - b03 = *(b + p * N + (col+3)); - - c00 += a00 * b00; c01 += a00 * b01; c02 += a00 * b02; c03 += a00 * b03; - c10 += a10 * b00; c11 += a10 * b01; c12 += a10 * b02; c13 += a10 * b03; - c20 += a20 * b00; c21 += a20 * b01; c22 += a20 * b02; c23 += a20 * b03; - c30 += a30 * b00; c31 += a30 * b01; c32 += a30 * b02; c33 += a30 * b03; - } - c[row*N+col] = c00; c[row*N+(col+1)] = c01; c[row*N+(col+2)] = c02; c[row*N+(col+3)] = c03; - c[(row+1)*N+col] = c10; c[(row+1)*N+(col+1)] = c11; c[(row+1)*N+(col+2)] = c12; c[(row+1)*N+(col+3)] = c13; - c[(row+2)*N+col] = c20; c[(row+2)*N+(col+1)] = c21; c[(row+2)*N+(col+2)] = c22; c[(row+2)*N+(col+3)] = c23; - c[(row+3)*N+col] = c30; c[(row+3)*N+(col+1)] = c31; c[(row+3)*N+(col+2)] = c32; c[(row+3)*N+(col+3)] = c33; - } else { - for(int cidx = col; cidx < N; ++cidx) { - for (int ridx = row; ridx < M; ++ridx) { - CL_DTYPE a0, b0, c0 = 0; - for (int p = 0; p < K; ++p) { - a0 = *(a + ridx * K + p); - b0 = *(b + p * N + cidx), - c0 += a0 * b0; - } - c[ridx * N + cidx] = c0; - } - } - } -} - diff --git a/lite/backends/opencl/cl_kernel/buffer/pool_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/pool_kernel.cl deleted file mode 100644 index edf8f119eb..0000000000 --- a/lite/backends/opencl/cl_kernel/buffer/pool_kernel.cl +++ /dev/null @@ -1,112 +0,0 @@ -/************************************************************************************* - * Copyright (c) 2015, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation and/or - * other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, - * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, - * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - **************************************************************************************/ - -#include - -#define MIN_VALUE -FLT_MAX - -__kernel void pool_max(const int numel, // num of elements - __global CL_DTYPE* input_data, - const int channels, - const int height, - const int width, - const int pooled_height, - const int pooled_width, - const int kernel_h, - const int kernel_w, - const int stride_h, - const int stride_w, - const int pad_h, - const int pad_w, - __global CL_DTYPE* output_data) { - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < numel; index += tmp) { - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - const int hend = min(hstart + kernel_h, height); - const int wend = min(wstart + kernel_w, width); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - CL_DTYPE maxval = MIN_VALUE; - int maxidx = -1; - input_data = - input_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - if (input_data[h * width + w] > maxval) { - maxidx = h * width + w; - maxval = input_data[maxidx]; - } - } - } - output_data[index] = maxval; - } -} - -__kernel void pool_avg(const int numel, - __global CL_DTYPE* input_data, - const int channels, - const int height, - const int width, - const int pooled_height, - const int pooled_width, - const int kernel_h, - const int kernel_w, - const int stride_h, - const int stride_w, - const int pad_h, - const int pad_w, - __global CL_DTYPE* output_data) { - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < numel; index+=tmp) { - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - const int pool_size = (hend - hstart) * (wend - wstart); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, height); - wend = min(wend, width); - CL_DTYPE aveval = 0; - input_data = - input_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - aveval += input_data[h * width + w]; - } - } - output_data[index] = aveval / pool_size; - } -} diff --git a/lite/backends/opencl/cl_kernel/buffer/relu_kernel.cl 
b/lite/backends/opencl/cl_kernel/buffer/relu_kernel.cl
deleted file mode 100644
index b07dc8132f..0000000000
--- a/lite/backends/opencl/cl_kernel/buffer/relu_kernel.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <cl_common.h>
-
-__kernel void relu(__global const CL_DTYPE* x_data, const int count, __global CL_DTYPE* out_data) {
-  const int index = get_global_id(0);
-  if (index < count) {
-    out_data[index] = activation(x_data[index]);
-  }
-}
diff --git a/lite/backends/opencl/cl_kernel/cl_common.h b/lite/backends/opencl/cl_kernel/cl_common.h
deleted file mode 100644
index ec67aa676d..0000000000
--- a/lite/backends/opencl/cl_kernel/cl_common.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#define GET_VEC_TYPE(type__, size__) type__##size__
-#define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__)
-#define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4)
-
-inline CL_DTYPE activation(CL_DTYPE in
-#ifdef PRELU
-                           ,
-                           CL_DTYPE prelu_alpha
-#endif
-                           ) {
-  CL_DTYPE output;
-#ifdef PRELU
-  output = select(prelu_alpha * in, in, in >= (CL_DTYPE)0);
-#endif
-
-#ifdef RELU
-  output = fmax(in, (CL_DTYPE)0);
-#endif
-  return output;
-}
diff --git a/lite/backends/opencl/cl_kernel/image/channel_add_kernel.cl b/lite/backends/opencl/cl_kernel/image/channel_add_kernel.cl
deleted file mode 100644
index c106377830..0000000000
--- a/lite/backends/opencl/cl_kernel/image/channel_add_kernel.cl
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
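Editor's note: the `RELU`/`PRELU` branches in `cl_common.h` above are not runtime switches; they are selected when the host compiles the kernel source with `-D` macros (cf. `CLRuntime::BuildProgram` later in this patch, which already appends `-cl-fast-relaxed-math` and the `cl_kernel` include path). A sketch of how such options might be assembled; the function name and exact option strings are illustrative:

```cpp
#include <iostream>
#include <string>

// Options passed to cl::Program::build(devices, opts.c_str()); CL_DTYPE and
// RELU specialize the kernel source at compile time.
std::string KernelBuildOptions(bool fuse_relu) {
  std::string opts = "-DCL_DTYPE=float";
  if (fuse_relu) opts += " -DRELU";
  return opts;
}

int main() {
  std::cout << KernelBuildOptions(true) << "\n";  // "-DCL_DTYPE=float -DRELU"
  return 0;
}
```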
*/ - -__kernel void channel_add(__read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t outputImage, __private const int w) { - int x = get_global_id(0); - int y = get_global_id(1); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 coords; - coords.x = x; - coords.y = y; - int2 coords_bias; - coords_bias.x = x/w; - coords_bias.y = 0; - float4 in = read_imagef(input, sampler, coords); - float4 biase = read_imagef(bias, sampler, coords_bias); - float4 output = in + biase; - write_imagef(outputImage, coords, output); - } diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl deleted file mode 100644 index ecf719ae93..0000000000 --- a/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -__kernel void elementwise_add(__read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t outputImage) { - int x = get_global_id(0); - int y = get_global_id(1); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 coords; - coords.x = x; - coords.y = y; - float4 in = read_imagef(input, sampler, coords); - float4 biase = read_imagef(bias, sampler, coords); - float4 output = in + biase; - write_imagef(outputImage,coords,output); - } diff --git a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl b/lite/backends/opencl/cl_kernel/image/pool_kernel.cl deleted file mode 100644 index 0ca3b9141d..0000000000 --- a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
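Editor's note: the coordinate math behind `channel_add` above assumes the input image stores an NCHW tensor as `(ceil(C/4) * W)` by `(N * H)` texels, so a work-item's channel block is `x / w` and the per-channel bias image is a single row indexed by that block (`coords_bias = (x / w, 0)`). A standalone check:

```cpp
#include <cstdio>

int main() {
  const int w = 224;           // tensor width, the kernel's `w` argument
  const int x = 1350, y = 17;  // example work-item coordinates
  std::printf("channel block=%d, w within block=%d, bias texel=(%d, 0), row=%d\n",
              x / w, x % w, x / w, y);
  return 0;
}
```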
*/ - -#define MIN_VALUE -FLT_MAX - -__kernel void pool_max( - __private const int in_height, __private const int in_width, - __private const int out_height, __private const int out_width, - __private const int pad_top, __private const int pad_left, - __private const int stride_h, __private const int stride_w, - __private const int ksize_h, __private const int ksize_w, - __read_only image2d_t input, __write_only image2d_t output) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_height; - const int out_h = out_nh % out_height; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - int start_h = out_h * stride_h - pad_top; - int end_h = min(start_h + ksize_h, in_height); - start_h = max(start_h,0); - - int start_w = out_w * stride_w - pad_left; - int end_w = min(start_w + ksize_w, in_width); - start_w = max(start_w,0); - - const int pos_in_x = out_c * in_width; - const int pos_in_y = out_n * in_height; - float4 max_value = (float4)(MIN_VALUE); - for (int y = start_h; y < end_h; ++y) { - for (int x = start_w; x < end_w; ++x) { - float4 tmp = read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); - max_value = max(max_value, tmp); - } - } - - const int pos_out_x = mad24(out_c, out_width, out_w); - write_imagef(output, (int2)(pos_out_x, out_nh), max_value); -} - -__kernel void pool_avg( - __private const int in_height, __private const int in_width, - __private const int out_height, __private const int out_width, - __private const int pad_top, __private const int pad_left, - __private const int stride_h, __private const int stride_w, - __private const int ksize_h, __private const int ksize_w, - __read_only image2d_t input, __write_only image2d_t output) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_height; - const int out_h = out_nh % out_height; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - int start_h = max(out_h * stride_h - pad_top, 0); - int end_h = min(start_h + ksize_h, in_height); - - int start_w = max(out_w * stride_w - pad_left, 0); - int end_w = min(start_w + ksize_w, in_width); - - const int pos_in_x = out_c * in_width; - const int pos_in_y = out_n * in_height; - float4 sum = (float4)(0.0f); - int num = 0; - for (int y = start_h; y < end_h; ++y) { - for (int x = start_w; x < end_w; ++x) { - sum += read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); - num++; - } - } - float4 avg = sum / num; - const int pos_out_x = mad24(out_c, out_width, out_w); - write_imagef(output, (int2)(pos_out_x, out_nh), avg); -} diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc deleted file mode 100644 index c2504ab611..0000000000 --- a/lite/backends/opencl/cl_runtime.cc +++ /dev/null @@ -1,170 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
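Editor's note: the window arithmetic of the image-based `pool_avg` above, checked standalone. Its divisor is the number of elements actually inside the input, so padding is excluded from the average:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
  const int in_h = 7, ksize = 3, stride = 2, pad = 1;
  for (int out_h = 0; out_h < 4; ++out_h) {
    const int start = std::max(out_h * stride - pad, 0);  // clamped first
    const int end = std::min(start + ksize, in_h);        // then extended
    std::printf("out_h=%d window=[%d,%d) count=%d\n",
                out_h, start, end, end - start);
  }
  return 0;
}
```

Note the buffer-path `pool_avg` earlier in this patch computes `pool_size` from the padded window before clamping, so the two variants divide by different counts at the borders; the image variant also extends the window from the already-clamped start. Anyone unifying the two paths should pick one convention deliberately.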
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "lite/backends/opencl/cl_runtime.h"
-#include <fstream>
-#include <string>
-#include <utility>
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-
-CLRuntime* CLRuntime::Global() {
-  static CLRuntime cl_runtime_;
-  cl_runtime_.Init();
-  return &cl_runtime_;
-}
-
-CLRuntime::~CLRuntime() {
-  if (command_queue_ != nullptr) {
-    command_queue_->finish();
-  }
-  // For controlling the destruction order:
-  command_queue_.reset();
-  context_.reset();
-  device_.reset();
-  platform_.reset();
-}
-
-bool CLRuntime::Init() {
-  if (initialized_) {
-    return true;
-  }
-  bool is_platform_init = InitializePlatform();
-  bool is_device_init = InitializeDevice();
-  is_init_success_ = is_platform_init && is_device_init;
-  initialized_ = true;
-  return initialized_;
-}
-
-cl::Platform& CLRuntime::platform() {
-  CHECK(platform_ != nullptr) << "platform_ is not initialized!";
-  return *platform_;
-}
-
-cl::Context& CLRuntime::context() {
-  if (context_ == nullptr) {
-    context_ = CreateContext();
-  }
-  return *context_;
-}
-
-cl::Device& CLRuntime::device() {
-  CHECK(device_ != nullptr) << "device_ is not initialized!";
-  return *device_;
-}
-
-cl::CommandQueue& CLRuntime::command_queue() {
-  if (command_queue_ == nullptr) {
-    command_queue_ = CreateCommandQueue(context());
-  }
-  return *command_queue_;
-}
-
-std::unique_ptr<cl::Program> CLRuntime::CreateProgram(
-    const cl::Context& context, std::string file_name) {
-  std::ifstream file{file_name, std::ios::binary | std::ios::ate};
-  CHECK(file.is_open()) << "Can't open file from " << file_name;
-  auto size = file.tellg();
-  CHECK(size > 0) << "size is too small.";
-  std::string content(size, '\0');
-  file.seekg(0);
-  file.read(&content[0], size);
-  cl::Program::Sources sources;
-  sources.push_back(content);
-  auto prog =
-      std::unique_ptr<cl::Program>(new cl::Program(context, sources, &status_));
-  VLOG(4) << "OpenCL kernel file name: " << file_name;
-  VLOG(4) << "Program source size: " << content.size();
-  CL_CHECK_FATAL(status_);
-  return std::move(prog);
-}
-
-std::unique_ptr<cl::UserEvent> CLRuntime::CreateEvent(
-    const cl::Context& context) {
-  auto event =
-      std::unique_ptr<cl::UserEvent>(new cl::UserEvent(context, &status_));
-  CL_CHECK_FATAL(status_);
-  return std::move(event);
-}
-
-bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) {
-  std::string build_option = options + " -cl-fast-relaxed-math -I " +
-                             CLRuntime::Global()->cl_path() + "/cl_kernel";
-  status_ = program->build({*device_}, build_option.c_str());
-  CL_CHECK_ERROR(status_);
-
-  if (status_ != CL_SUCCESS) {
-    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device()) ==
-        CL_BUILD_ERROR) {
-      std::string log = program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(device());
-      LOG(FATAL) << "Program build error: " << log;
-    }
-    return false;
-  }
-
-  return true;
-}
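The runtime deleted above is a lazy singleton: Global() constructs it once and runs Init(), while context() and command_queue() are created on first use. For reference, a minimal sketch of how kernel code drove this API; the cl_path value and the -DCL_DTYPE_float build define are placeholders, while the CLRuntime calls and the elementwise_add kernel name come from the files in this patch:

// Illustrative sketch only, not part of the original sources.
#include "lite/backends/opencl/cl_runtime.h"

void BuildExampleKernel() {
  auto* runtime = paddle::lite::CLRuntime::Global();  // lazily runs Init()
  runtime->set_cl_path("/data/local/tmp/opencl");     // placeholder path
  auto program = runtime->CreateProgram(
      runtime->context(),
      runtime->cl_path() + "/cl_kernel/image/elementwise_add_kernel.cl");
  // BuildProgram appends "-cl-fast-relaxed-math -I <cl_path>/cl_kernel".
  if (runtime->BuildProgram(program.get(), "-DCL_DTYPE_float")) {
    cl::Kernel kernel(*program, "elementwise_add");   // kernel shown earlier
    (void)kernel;
  }
}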
device: " << device_name; - auto image_support = device_->getInfo(); - if (image_support) { - LOG(INFO) << "The chosen device supports image processing."; - } else { - LOG(INFO) << "The chosen device doesn't support image processing!"; - return false; - } - auto ext_data = device_->getInfo(); - VLOG(4) << "The extensions supported by this device: " << ext_data; - if (ext_data.find("cl_khr_fp16") != std::string::npos) { - LOG(INFO) << "The chosen device supports the half data type."; - } else { - LOG(INFO) << "The chosen device doesn't support the half data type!"; - } - auto max_units = device_->getInfo(); - LOG(INFO) << "The chosen device has " << max_units << " compute units."; - auto local_mem = device_->getInfo(); - LOG(INFO) << "The local memory size of the chosen device is " - << static_cast(local_mem) / 1024 << " KB."; - return true; -} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h deleted file mode 100644 index 0859780c69..0000000000 --- a/lite/backends/opencl/cl_runtime.h +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include "lite/backends/opencl/cl_include.h" -#include "lite/backends/opencl/cl_utility.h" - -namespace paddle { -namespace lite { - -class CLRuntime { - public: - static CLRuntime* Global(); - - bool Init(); - - cl::Platform& platform(); - - cl::Context& context(); - - cl::Device& device(); - - cl::CommandQueue& command_queue(); - - std::unique_ptr CreateProgram(const cl::Context& context, - std::string file_name); - - std::unique_ptr CreateEvent(const cl::Context& context); - - bool BuildProgram(cl::Program* program, const std::string& options = ""); - - bool IsInitSuccess() { return is_init_success_; } - - std::string cl_path() { return cl_path_; } - - void set_cl_path(std::string cl_path) { cl_path_ = cl_path; } - - private: - CLRuntime() = default; - - ~CLRuntime(); - - bool InitializePlatform(); - - bool InitializeDevice(); - - std::shared_ptr CreateContext() { - auto context = std::make_shared( - std::vector{device()}, nullptr, nullptr, nullptr, &status_); - CL_CHECK_FATAL(status_); - return context; - } - - std::shared_ptr CreateCommandQueue( - const cl::Context& context) { - cl_command_queue_properties properties = 0; - -#ifdef LITE_WITH_PROFILE - properties |= CL_QUEUE_PROFILING_ENABLE; -#endif // LITE_WITH_PROFILE - auto queue = std::make_shared( - context, device(), properties, &status_); - CL_CHECK_FATAL(status_); - return queue; - } - - std::string cl_path_; - - std::shared_ptr platform_{nullptr}; - - std::shared_ptr context_{nullptr}; - - std::shared_ptr device_{nullptr}; - - std::shared_ptr command_queue_{nullptr}; - - cl_int status_{CL_SUCCESS}; - - bool initialized_{false}; - - bool is_init_success_{false}; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_utility.cc 
b/lite/backends/opencl/cl_utility.cc deleted file mode 100644 index 7c8cca414f..0000000000 --- a/lite/backends/opencl/cl_utility.cc +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/opencl/cl_utility.h" - -namespace paddle { -namespace lite { - -const char *opencl_error_to_str(cl_int error) { -#define CASE_CL_CONSTANT(NAME) \ - case NAME: \ - return #NAME; - // Suppose that no combinations are possible. - switch (error) { - CASE_CL_CONSTANT(CL_SUCCESS) - CASE_CL_CONSTANT(CL_DEVICE_NOT_FOUND) - CASE_CL_CONSTANT(CL_DEVICE_NOT_AVAILABLE) - CASE_CL_CONSTANT(CL_COMPILER_NOT_AVAILABLE) - CASE_CL_CONSTANT(CL_MEM_OBJECT_ALLOCATION_FAILURE) - CASE_CL_CONSTANT(CL_OUT_OF_RESOURCES) - CASE_CL_CONSTANT(CL_OUT_OF_HOST_MEMORY) - CASE_CL_CONSTANT(CL_PROFILING_INFO_NOT_AVAILABLE) - CASE_CL_CONSTANT(CL_MEM_COPY_OVERLAP) - CASE_CL_CONSTANT(CL_IMAGE_FORMAT_MISMATCH) - CASE_CL_CONSTANT(CL_IMAGE_FORMAT_NOT_SUPPORTED) - CASE_CL_CONSTANT(CL_BUILD_PROGRAM_FAILURE) - CASE_CL_CONSTANT(CL_MAP_FAILURE) - CASE_CL_CONSTANT(CL_MISALIGNED_SUB_BUFFER_OFFSET) - CASE_CL_CONSTANT(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) - CASE_CL_CONSTANT(CL_INVALID_VALUE) - CASE_CL_CONSTANT(CL_INVALID_DEVICE_TYPE) - CASE_CL_CONSTANT(CL_INVALID_PLATFORM) - CASE_CL_CONSTANT(CL_INVALID_DEVICE) - CASE_CL_CONSTANT(CL_INVALID_CONTEXT) - CASE_CL_CONSTANT(CL_INVALID_QUEUE_PROPERTIES) - CASE_CL_CONSTANT(CL_INVALID_COMMAND_QUEUE) - CASE_CL_CONSTANT(CL_INVALID_HOST_PTR) - CASE_CL_CONSTANT(CL_INVALID_MEM_OBJECT) - CASE_CL_CONSTANT(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR) - CASE_CL_CONSTANT(CL_INVALID_IMAGE_SIZE) - CASE_CL_CONSTANT(CL_INVALID_SAMPLER) - CASE_CL_CONSTANT(CL_INVALID_BINARY) - CASE_CL_CONSTANT(CL_INVALID_BUILD_OPTIONS) - CASE_CL_CONSTANT(CL_INVALID_PROGRAM) - CASE_CL_CONSTANT(CL_INVALID_PROGRAM_EXECUTABLE) - CASE_CL_CONSTANT(CL_INVALID_KERNEL_NAME) - CASE_CL_CONSTANT(CL_INVALID_KERNEL_DEFINITION) - CASE_CL_CONSTANT(CL_INVALID_KERNEL) - CASE_CL_CONSTANT(CL_INVALID_ARG_INDEX) - CASE_CL_CONSTANT(CL_INVALID_ARG_VALUE) - CASE_CL_CONSTANT(CL_INVALID_ARG_SIZE) - CASE_CL_CONSTANT(CL_INVALID_KERNEL_ARGS) - CASE_CL_CONSTANT(CL_INVALID_WORK_DIMENSION) - CASE_CL_CONSTANT(CL_INVALID_WORK_GROUP_SIZE) - CASE_CL_CONSTANT(CL_INVALID_WORK_ITEM_SIZE) - CASE_CL_CONSTANT(CL_INVALID_GLOBAL_OFFSET) - CASE_CL_CONSTANT(CL_INVALID_EVENT_WAIT_LIST) - CASE_CL_CONSTANT(CL_INVALID_EVENT) - CASE_CL_CONSTANT(CL_INVALID_OPERATION) - CASE_CL_CONSTANT(CL_INVALID_GL_OBJECT) - CASE_CL_CONSTANT(CL_INVALID_BUFFER_SIZE) - CASE_CL_CONSTANT(CL_INVALID_MIP_LEVEL) - CASE_CL_CONSTANT(CL_INVALID_GLOBAL_WORK_SIZE) - CASE_CL_CONSTANT(CL_INVALID_PROPERTY) - - default: - return "UNKNOWN ERROR CODE"; - } -#undef CASE_CL_CONSTANT -} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_utility.h b/lite/backends/opencl/cl_utility.h deleted file mode 100644 index b7f14c15e6..0000000000 --- a/lite/backends/opencl/cl_utility.h +++ /dev/null @@ -1,46 +0,0 @@ -/* 
Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "lite/backends/opencl/cl_include.h" -#include "lite/utils/cp_logging.h" -#include "lite/utils/string.h" - -namespace paddle { -namespace lite { - -const char* opencl_error_to_str(cl_int error); - -#define CL_CHECK_ERROR(err_code__) \ - if (err_code__ != CL_SUCCESS) { \ - LOG(ERROR) << string_format( \ - "OpenCL error with code %s happened in file %s at line %d. " \ - "Exiting.\n", \ - opencl_error_to_str(err_code__), \ - __FILE__, \ - __LINE__); \ - } - -#define CL_CHECK_FATAL(err_code__) \ - if (err_code__ != CL_SUCCESS) { \ - LOG(FATAL) << string_format( \ - "OpenCL error with code %s happened in file %s at line %d. " \ - "Exiting.\n", \ - opencl_error_to_str(err_code__), \ - __FILE__, \ - __LINE__); \ - } -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_wrapper.cc b/lite/backends/opencl/cl_wrapper.cc deleted file mode 100644 index 357ac8c2d6..0000000000 --- a/lite/backends/opencl/cl_wrapper.cc +++ /dev/null @@ -1,732 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#include "lite/backends/opencl/cl_wrapper.h"
-#include <dlfcn.h>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace lite {
-CLWrapper *CLWrapper::Global() {
-  static CLWrapper wrapper;
-  return &wrapper;
-}
-
-CLWrapper::CLWrapper() {
-  CHECK(InitHandle()) << "Fail to initialize the OpenCL library!";
-  InitFunctions();
-}
-
-bool CLWrapper::InitHandle() {
-  const std::vector<std::string> paths = {
-      "libOpenCL.so",
-#if defined(__aarch64__)
-      // Qualcomm Adreno with Android
-      "/system/vendor/lib64/libOpenCL.so",
-      "/system/lib64/libOpenCL.so",
-      // Arm Mali with Android
-      "/system/vendor/lib64/egl/libGLES_mali.so",
-      "/system/lib64/egl/libGLES_mali.so",
-      // Arm Linux
-      "/usr/lib/aarch64-linux-gnu/libOpenCL.so",
-#else
-      // Qualcomm Adreno with Android
-      "/system/vendor/lib/libOpenCL.so",
-      "/system/lib/libOpenCL.so",
-      // Arm Mali with Android
-      "/system/vendor/lib/egl/libGLES_mali.so",
-      "/system/lib/egl/libGLES_mali.so",
-      // Arm Linux
-      "/usr/lib/arm-linux-gnueabihf/libOpenCL.so",
-#endif
-  };
-  std::string target_lib = "Unknown";
-  for (auto path : paths) {
-    handle_ = dlopen(path.c_str(), RTLD_LAZY);
-    if (handle_ != nullptr) {
-      target_lib = path;
-      break;
-    }
-  }
-  VLOG(4) << "Load the OpenCL library from " << target_lib;
-  if (handle_ != nullptr) {
-    return true;
-  } else {
-    return false;
-  }
-}
-
-void CLWrapper::InitFunctions() {
-  CHECK(handle_ != nullptr) << "The library handle can't be null!";
-
-#define PADDLE_DLSYM(cl_func)                                        \
-  do {                                                               \
-    cl_func##_ = (cl_func##Type)dlsym(handle_, #cl_func);            \
-    if (cl_func##_ == nullptr) {                                     \
-      LOG(ERROR) << "Cannot find the " << #cl_func                   \
-                 << " symbol in libOpenCL.so!";                      \
-      break;                                                         \
-    }                                                                \
-    VLOG(4) << "Loaded the " << #cl_func << " symbol successfully."; \
-  } while (false)
-
-  PADDLE_DLSYM(clGetPlatformIDs);
-  PADDLE_DLSYM(clGetPlatformInfo);
-  PADDLE_DLSYM(clBuildProgram);
-  PADDLE_DLSYM(clEnqueueNDRangeKernel);
-  PADDLE_DLSYM(clSetKernelArg);
-  PADDLE_DLSYM(clRetainMemObject);
-  PADDLE_DLSYM(clReleaseMemObject);
-  PADDLE_DLSYM(clEnqueueUnmapMemObject);
-  PADDLE_DLSYM(clRetainCommandQueue);
-  PADDLE_DLSYM(clCreateContext);
-  PADDLE_DLSYM(clCreateContextFromType);
-  PADDLE_DLSYM(clReleaseContext);
-  PADDLE_DLSYM(clWaitForEvents);
-  PADDLE_DLSYM(clReleaseEvent);
-  PADDLE_DLSYM(clEnqueueWriteBuffer);
-  PADDLE_DLSYM(clEnqueueReadBuffer);
-  PADDLE_DLSYM(clEnqueueReadImage);
-  PADDLE_DLSYM(clGetProgramBuildInfo);
-  PADDLE_DLSYM(clRetainProgram);
-  PADDLE_DLSYM(clEnqueueMapBuffer);
-  PADDLE_DLSYM(clEnqueueMapImage);
-  PADDLE_DLSYM(clCreateCommandQueue);
-  PADDLE_DLSYM(clCreateCommandQueueWithProperties);
-  PADDLE_DLSYM(clReleaseCommandQueue);
-  PADDLE_DLSYM(clCreateProgramWithBinary);
-  PADDLE_DLSYM(clRetainContext);
-  PADDLE_DLSYM(clGetContextInfo);
-  PADDLE_DLSYM(clReleaseProgram);
-  PADDLE_DLSYM(clFlush);
-  PADDLE_DLSYM(clFinish);
-  PADDLE_DLSYM(clGetProgramInfo);
-  PADDLE_DLSYM(clCreateKernel);
-  PADDLE_DLSYM(clRetainKernel);
-  PADDLE_DLSYM(clCreateBuffer);
-  PADDLE_DLSYM(clCreateImage2D);
-  PADDLE_DLSYM(clCreateImage);
-  PADDLE_DLSYM(clCreateUserEvent);
-  PADDLE_DLSYM(clCreateProgramWithSource);
-  PADDLE_DLSYM(clReleaseKernel);
-  PADDLE_DLSYM(clGetDeviceInfo);
-  PADDLE_DLSYM(clGetDeviceIDs);
-  PADDLE_DLSYM(clRetainDevice);
-  PADDLE_DLSYM(clReleaseDevice);
-  PADDLE_DLSYM(clRetainEvent);
-  PADDLE_DLSYM(clGetKernelWorkGroupInfo);
-  PADDLE_DLSYM(clGetEventInfo);
-  PADDLE_DLSYM(clGetEventProfilingInfo);
-  PADDLE_DLSYM(clGetImageInfo);
-  PADDLE_DLSYM(clEnqueueCopyBuffer);
-  PADDLE_DLSYM(clEnqueueWriteImage);
-  PADDLE_DLSYM(clEnqueueCopyImage);
-
-#undef
PADDLE_DLSYM -} - -} // namespace lite -} // namespace paddle - -CL_API_ENTRY cl_int CL_API_CALL clGetPlatformIDs(cl_uint num_entries, - cl_platform_id *platforms, - cl_uint *num_platforms) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetPlatformIDs()( - num_entries, platforms, num_platforms); -} - -CL_API_ENTRY cl_int CL_API_CALL clGetPlatformInfo(cl_platform_id platform, - cl_platform_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetPlatformInfo()( - platform, - param_name, - param_value_size, - param_value, - param_value_size_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clBuildProgram( - cl_program program, - cl_uint num_devices, - const cl_device_id *device_list, - const char *options, - void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), - void *user_data) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clBuildProgram()( - program, num_devices, device_list, options, pfn_notify, user_data); -} - -CL_API_ENTRY cl_int CL_API_CALL -clEnqueueNDRangeKernel(cl_command_queue command_queue, - cl_kernel kernel, - cl_uint work_dim, - const size_t *global_work_offset, - const size_t *global_work_size, - const size_t *local_work_size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueNDRangeKernel()( - command_queue, - kernel, - work_dim, - global_work_offset, - global_work_size, - local_work_size, - num_events_in_wait_list, - event_wait_list, - event); -} - -CL_API_ENTRY cl_int CL_API_CALL clSetKernelArg(cl_kernel kernel, - cl_uint arg_index, - size_t arg_size, - const void *arg_value) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clSetKernelArg()( - kernel, arg_index, arg_size, arg_value); -} - -CL_API_ENTRY cl_int CL_API_CALL clRetainMemObject(cl_mem memobj) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clRetainMemObject()(memobj); -} - -CL_API_ENTRY cl_int CL_API_CALL clReleaseMemObject(cl_mem memobj) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clReleaseMemObject()(memobj); -} - -CL_API_ENTRY cl_int CL_API_CALL -clEnqueueUnmapMemObject(cl_command_queue command_queue, - cl_mem memobj, - void *mapped_ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueUnmapMemObject()( - command_queue, - memobj, - mapped_ptr, - num_events_in_wait_list, - event_wait_list, - event); -} - -CL_API_ENTRY cl_int CL_API_CALL clRetainCommandQueue( - cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clRetainCommandQueue()( - command_queue); -} - -CL_API_ENTRY cl_context CL_API_CALL -clCreateContext(const cl_context_properties *properties, - cl_uint num_devices, - const cl_device_id *devices, - void(CL_CALLBACK *pfn_notify)(const char *errinfo, - const void *private_info, - size_t cb, - void *user_data), - void *user_data, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clCreateContext()( - properties, num_devices, devices, pfn_notify, user_data, errcode_ret); -} - -CL_API_ENTRY cl_context CL_API_CALL -clCreateContextFromType(const cl_context_properties *properties, - cl_device_type device_type, - 
void(CL_CALLBACK *pfn_notify)(const char *errinfo, - const void *private_info, - size_t cb, - void *user_data), - void *user_data, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clCreateContextFromType()( - properties, device_type, pfn_notify, user_data, errcode_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clReleaseContext(cl_context context) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clReleaseContext()(context); -} - -CL_API_ENTRY cl_int CL_API_CALL clWaitForEvents( - cl_uint num_events, const cl_event *event_list) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clWaitForEvents()(num_events, - event_list); -} - -CL_API_ENTRY cl_int CL_API_CALL clReleaseEvent(cl_event event) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clReleaseEvent()(event); -} - -CL_API_ENTRY cl_int CL_API_CALL -clEnqueueWriteBuffer(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_write, - size_t offset, - size_t size, - const void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueWriteBuffer()( - command_queue, - buffer, - blocking_write, - offset, - size, - ptr, - num_events_in_wait_list, - event_wait_list, - event); -} - -CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReadBuffer(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_read, - size_t offset, - size_t size, - void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueReadBuffer()( - command_queue, - buffer, - blocking_read, - offset, - size, - ptr, - num_events_in_wait_list, - event_wait_list, - event); -} - -CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReadImage(cl_command_queue command_queue, - cl_mem image, - cl_bool blocking_read, - const size_t *origin, - const size_t *region, - size_t row_pitch, - size_t slice_pitch, - void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueReadImage()( - command_queue, - image, - blocking_read, - origin, - region, - row_pitch, - slice_pitch, - ptr, - num_events_in_wait_list, - event_wait_list, - event); -} - -CL_API_ENTRY cl_int CL_API_CALL -clGetProgramBuildInfo(cl_program program, - cl_device_id device, - cl_program_build_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetProgramBuildInfo()( - program, - device, - param_name, - param_value_size, - param_value, - param_value_size_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clRetainProgram(cl_program program) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clRetainProgram()(program); -} - -CL_API_ENTRY void *CL_API_CALL -clEnqueueMapBuffer(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_map, - cl_map_flags map_flags, - size_t offset, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueMapBuffer()( - command_queue, - buffer, - blocking_map, - map_flags, - offset, - size, - num_events_in_wait_list, - event_wait_list, - 
event, - errcode_ret); -} - -CL_API_ENTRY void *CL_API_CALL -clEnqueueMapImage(cl_command_queue command_queue, - cl_mem image, - cl_bool blocking_map, - cl_map_flags map_flags, - const size_t *origin, - const size_t *region, - size_t *image_row_pitch, - size_t *image_slice_pitch, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueMapImage()( - command_queue, - image, - blocking_map, - map_flags, - origin, - region, - image_row_pitch, - image_slice_pitch, - num_events_in_wait_list, - event_wait_list, - event, - errcode_ret); -} - -CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL -clCreateCommandQueue(cl_context context, - cl_device_id device, - cl_command_queue_properties properties, - cl_int *errcode_ret) - CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED { - return paddle::lite::CLWrapper::Global()->clCreateCommandQueue()( - context, device, properties, errcode_ret); -} - -CL_API_ENTRY cl_command_queue CL_API_CALL clCreateCommandQueueWithProperties( - cl_context context, - cl_device_id device, - const cl_queue_properties *properties, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_0 { - return paddle::lite::CLWrapper::Global() - ->clCreateCommandQueueWithProperties()( - context, device, properties, errcode_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clReleaseCommandQueue( - cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clReleaseCommandQueue()( - command_queue); -} - -CL_API_ENTRY cl_program CL_API_CALL -clCreateProgramWithBinary(cl_context context, - cl_uint num_devices, - const cl_device_id *device_list, - const size_t *lengths, - const unsigned char **binaries, - cl_int *binary_status, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clCreateProgramWithBinary()( - context, - num_devices, - device_list, - lengths, - binaries, - binary_status, - errcode_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clRetainContext(cl_context context) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clRetainContext()(context); -} - -CL_API_ENTRY cl_int CL_API_CALL clGetContextInfo(cl_context context, - cl_context_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetContextInfo()( - context, param_name, param_value_size, param_value, param_value_size_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clReleaseProgram(cl_program program) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clReleaseProgram()(program); -} - -CL_API_ENTRY cl_int CL_API_CALL clFlush(cl_command_queue command_queue) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clFlush()(command_queue); -} - -CL_API_ENTRY cl_int CL_API_CALL clFinish(cl_command_queue command_queue) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clFinish()(command_queue); -} - -CL_API_ENTRY cl_int CL_API_CALL clGetProgramInfo(cl_program program, - cl_program_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetProgramInfo()( - program, param_name, param_value_size, param_value, param_value_size_ret); -} - -CL_API_ENTRY cl_kernel CL_API_CALL 
clCreateKernel(cl_program program, - const char *kernel_name, - cl_int *errcode_ret) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clCreateKernel()( - program, kernel_name, errcode_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clRetainKernel(cl_kernel kernel) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clRetainKernel()(kernel); -} - -CL_API_ENTRY cl_mem CL_API_CALL clCreateBuffer(cl_context context, - cl_mem_flags flags, - size_t size, - void *host_ptr, - cl_int *errcode_ret) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clCreateBuffer()( - context, flags, size, host_ptr, errcode_ret); -} - -CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL -clCreateImage2D(cl_context context, - cl_mem_flags flags, - const cl_image_format *image_format, - size_t image_width, - size_t image_height, - size_t image_row_pitch, - void *host_ptr, - cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED { - return paddle::lite::CLWrapper::Global()->clCreateImage2D()(context, - flags, - image_format, - image_width, - image_height, - image_row_pitch, - host_ptr, - errcode_ret); -} - -CL_API_ENTRY cl_mem CL_API_CALL -clCreateImage(cl_context context, - cl_mem_flags flags, - const cl_image_format *image_format, - const cl_image_desc *image_desc, - void *host_ptr, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2 { - return paddle::lite::CLWrapper::Global()->clCreateImage()( - context, flags, image_format, image_desc, host_ptr, errcode_ret); -} - -CL_API_ENTRY cl_event CL_API_CALL clCreateUserEvent( - cl_context context, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_1 { - return paddle::lite::CLWrapper::Global()->clCreateUserEvent()(context, - errcode_ret); -} - -CL_API_ENTRY cl_program CL_API_CALL -clCreateProgramWithSource(cl_context context, - cl_uint count, - const char **strings, - const size_t *lengths, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clCreateProgramWithSource()( - context, count, strings, lengths, errcode_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clReleaseKernel(cl_kernel kernel) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clReleaseKernel()(kernel); -} - -CL_API_ENTRY cl_int CL_API_CALL clGetDeviceInfo(cl_device_id device, - cl_device_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetDeviceInfo()( - device, param_name, param_value_size, param_value, param_value_size_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDs(cl_platform_id platform, - cl_device_type device_type, - cl_uint num_entries, - cl_device_id *devices, - cl_uint *num_devices) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetDeviceIDs()( - platform, device_type, num_entries, devices, num_devices); -} - -CL_API_ENTRY cl_int CL_API_CALL clRetainDevice(cl_device_id device) - CL_API_SUFFIX__VERSION_1_2 { - return paddle::lite::CLWrapper::Global()->clRetainDevice()(device); -} - -CL_API_ENTRY cl_int CL_API_CALL clReleaseDevice(cl_device_id device) - CL_API_SUFFIX__VERSION_1_2 { - return paddle::lite::CLWrapper::Global()->clReleaseDevice()(device); -} - -CL_API_ENTRY cl_int CL_API_CALL clRetainEvent(cl_event event) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clRetainEvent()(event); -} - -CL_API_ENTRY cl_int CL_API_CALL clGetKernelWorkGroupInfo( - cl_kernel 
kernel, - cl_device_id device, - cl_kernel_work_group_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetKernelWorkGroupInfo()( - kernel, - device, - param_name, - param_value_size, - param_value, - param_value_size_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clGetEventInfo(cl_event event, - cl_event_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetEventInfo()( - event, param_name, param_value_size, param_value, param_value_size_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clGetEventProfilingInfo( - cl_event event, - cl_profiling_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetEventProfilingInfo()( - event, param_name, param_value_size, param_value, param_value_size_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clGetImageInfo(cl_mem image, - cl_image_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetImageInfo()( - image, param_name, param_value_size, param_value, param_value_size_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL -clEnqueueCopyBuffer(cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_buffer, - size_t src_offset, - size_t dst_offset, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueCopyBuffer()( - command_queue, - src_buffer, - dst_buffer, - src_offset, - dst_offset, - size, - num_events_in_wait_list, - event_wait_list, - event); -} - -CL_API_ENTRY cl_int CL_API_CALL -clEnqueueWriteImage(cl_command_queue command_queue, - cl_mem image, - cl_bool blocking_write, - const size_t *origin, - const size_t *region, - size_t input_row_pitch, - size_t input_slice_pitch, - const void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueWriteImage()( - command_queue, - image, - blocking_write, - origin, - region, - input_row_pitch, - input_slice_pitch, - ptr, - num_events_in_wait_list, - event_wait_list, - event); -} - -CL_API_ENTRY cl_int CL_API_CALL -clEnqueueCopyImage(cl_command_queue command_queue, - cl_mem src_image, - cl_mem dst_image, - const size_t *src_origin, - const size_t *dst_origin, - const size_t *region, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueCopyImage()( - command_queue, - src_image, - dst_image, - src_origin, - dst_origin, - region, - num_events_in_wait_list, - event_wait_list, - event); -} diff --git a/lite/backends/opencl/cl_wrapper.h b/lite/backends/opencl/cl_wrapper.h deleted file mode 100644 index 35ef33e5a2..0000000000 --- a/lite/backends/opencl/cl_wrapper.h +++ /dev/null @@ -1,572 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "lite/backends/opencl/cl_include.h" -#include "lite/utils/cp_logging.h" - -#if CL_HPP_TARGET_OPENCL_VERSION < 200 -#define CL_API_SUFFIX__VERSION_2_0 -#endif - -namespace paddle { -namespace lite { - -class CLWrapper final { - public: - static CLWrapper *Global(); - // Platform APIs - using clGetPlatformIDsType = cl_int (*)(cl_uint, cl_platform_id *, cl_uint *); - using clGetPlatformInfoType = - cl_int (*)(cl_platform_id, cl_platform_info, size_t, void *, size_t *); - using clBuildProgramType = cl_int (*)(cl_program, - cl_uint, - const cl_device_id *, - const char *, - void (*pfn_notify)(cl_program, void *), - void *); - using clEnqueueNDRangeKernelType = cl_int (*)(cl_command_queue, - cl_kernel, - cl_uint, - const size_t *, - const size_t *, - const size_t *, - cl_uint, - const cl_event *, - cl_event *); - using clSetKernelArgType = cl_int (*)(cl_kernel, - cl_uint, - size_t, - const void *); - using clRetainMemObjectType = cl_int (*)(cl_mem); - using clReleaseMemObjectType = cl_int (*)(cl_mem); - using clEnqueueUnmapMemObjectType = cl_int (*)( - cl_command_queue, cl_mem, void *, cl_uint, const cl_event *, cl_event *); - using clRetainCommandQueueType = cl_int (*)(cl_command_queue command_queue); - using clCreateContextType = cl_context (*)(const cl_context_properties *, - cl_uint, - const cl_device_id *, - void(CL_CALLBACK *)( // NOLINT - const char *, - const void *, - size_t, - void *), - void *, - cl_int *); - using clCreateContextFromTypeType = - cl_context (*)(const cl_context_properties *, - cl_device_type, - void(CL_CALLBACK *)( // NOLINT - const char *, - const void *, - size_t, - void *), - void *, - cl_int *); - using clReleaseContextType = cl_int (*)(cl_context); - using clWaitForEventsType = cl_int (*)(cl_uint, const cl_event *); - using clReleaseEventType = cl_int (*)(cl_event); - using clEnqueueWriteBufferType = cl_int (*)(cl_command_queue, - cl_mem, - cl_bool, - size_t, - size_t, - const void *, - cl_uint, - const cl_event *, - cl_event *); - using clEnqueueReadBufferType = cl_int (*)(cl_command_queue, - cl_mem, - cl_bool, - size_t, - size_t, - void *, - cl_uint, - const cl_event *, - cl_event *); - using clEnqueueReadImageType = cl_int (*)(cl_command_queue, - cl_mem, - cl_bool, - const size_t *, - const size_t *, - size_t, - size_t, - void *, - cl_uint, - const cl_event *, - cl_event *); - using clGetProgramBuildInfoType = cl_int (*)(cl_program, - cl_device_id, - cl_program_build_info, - size_t, - void *, - size_t *); - using clRetainProgramType = cl_int (*)(cl_program program); - using clEnqueueMapBufferType = void *(*)(cl_command_queue, - cl_mem, - cl_bool, - cl_map_flags, - size_t, - size_t, - cl_uint, - const cl_event *, - cl_event *, - cl_int *); - using clEnqueueMapImageType = void *(*)(cl_command_queue, - cl_mem, - cl_bool, - cl_map_flags, - const size_t *, - const size_t *, - size_t *, - size_t *, - cl_uint, - const cl_event *, - cl_event *, - cl_int *); - using clCreateCommandQueueType = cl_command_queue(CL_API_CALL *)( // NOLINT - cl_context, - cl_device_id, - cl_command_queue_properties, - cl_int *); - using 
clCreateCommandQueueWithPropertiesType = cl_command_queue (*)( - cl_context, cl_device_id, const cl_queue_properties *, cl_int *); - using clReleaseCommandQueueType = cl_int (*)(cl_command_queue); - using clCreateProgramWithBinaryType = cl_program (*)(cl_context, - cl_uint, - const cl_device_id *, - const size_t *, - const unsigned char **, - cl_int *, - cl_int *); - using clRetainContextType = cl_int (*)(cl_context context); - using clGetContextInfoType = - cl_int (*)(cl_context, cl_context_info, size_t, void *, size_t *); - using clReleaseProgramType = cl_int (*)(cl_program program); - using clFlushType = cl_int (*)(cl_command_queue command_queue); - using clFinishType = cl_int (*)(cl_command_queue command_queue); - using clGetProgramInfoType = - cl_int (*)(cl_program, cl_program_info, size_t, void *, size_t *); - using clCreateKernelType = cl_kernel (*)(cl_program, const char *, cl_int *); - using clRetainKernelType = cl_int (*)(cl_kernel kernel); - using clCreateBufferType = - cl_mem (*)(cl_context, cl_mem_flags, size_t, void *, cl_int *); - using clCreateImage2DType = cl_mem(CL_API_CALL *)(cl_context, // NOLINT - cl_mem_flags, - const cl_image_format *, - size_t, - size_t, - size_t, - void *, - cl_int *); - using clCreateImageType = cl_mem (*)(cl_context, - cl_mem_flags, - const cl_image_format *, - const cl_image_desc *, - void *, - cl_int *); - using clCreateUserEventType = cl_event (*)(cl_context, cl_int *); - using clCreateProgramWithSourceType = cl_program (*)( - cl_context, cl_uint, const char **, const size_t *, cl_int *); - using clReleaseKernelType = cl_int (*)(cl_kernel kernel); - using clGetDeviceInfoType = - cl_int (*)(cl_device_id, cl_device_info, size_t, void *, size_t *); - using clGetDeviceIDsType = cl_int (*)( - cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *); - using clRetainDeviceType = cl_int (*)(cl_device_id); - using clReleaseDeviceType = cl_int (*)(cl_device_id); - using clRetainEventType = cl_int (*)(cl_event); - using clGetKernelWorkGroupInfoType = cl_int (*)(cl_kernel, - cl_device_id, - cl_kernel_work_group_info, - size_t, - void *, - size_t *); - using clGetEventInfoType = cl_int (*)(cl_event event, - cl_event_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - using clGetEventProfilingInfoType = cl_int (*)(cl_event event, - cl_profiling_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - using clGetImageInfoType = - cl_int (*)(cl_mem, cl_image_info, size_t, void *, size_t *); - - using clEnqueueCopyBufferType = cl_int (*)(cl_command_queue, - cl_mem, - cl_mem, - size_t, - size_t, - size_t, - cl_uint, - const cl_event *, - cl_event *); - using clEnqueueWriteImageType = cl_int (*)(cl_command_queue, - cl_mem, - cl_bool, - const size_t *, - const size_t *, - size_t, - size_t, - const void *, - cl_uint, - const cl_event *, - cl_event *); - using clEnqueueCopyImageType = cl_int (*)(cl_command_queue, - cl_mem, - cl_mem, - const size_t *, - const size_t *, - const size_t *, - cl_uint, - const cl_event *, - cl_event *); - - clGetPlatformIDsType clGetPlatformIDs() { - CHECK(clGetPlatformIDs_ != nullptr) << "Cannot load clGetPlatformIDs!"; - return clGetPlatformIDs_; - } - - clGetPlatformInfoType clGetPlatformInfo() { - CHECK(clGetPlatformInfo_ != nullptr) << "Cannot load clGetPlatformInfo!"; - return clGetPlatformInfo_; - } - - clBuildProgramType clBuildProgram() { - CHECK(clBuildProgram_ != nullptr) << "Cannot load clBuildProgram!"; - return 
clBuildProgram_; - } - - clEnqueueNDRangeKernelType clEnqueueNDRangeKernel() { - CHECK(clEnqueueNDRangeKernel_ != nullptr) - << "Cannot load clEnqueueNDRangeKernel!"; - return clEnqueueNDRangeKernel_; - } - - clSetKernelArgType clSetKernelArg() { - CHECK(clSetKernelArg_ != nullptr) << "Cannot load clSetKernelArg!"; - return clSetKernelArg_; - } - - clRetainMemObjectType clRetainMemObject() { - CHECK(clRetainMemObject_ != nullptr) << "Cannot load clRetainMemObject!"; - return clRetainMemObject_; - } - - clReleaseMemObjectType clReleaseMemObject() { - CHECK(clReleaseMemObject_ != nullptr) << "Cannot load clReleaseMemObject!"; - return clReleaseMemObject_; - } - - clEnqueueUnmapMemObjectType clEnqueueUnmapMemObject() { - CHECK(clEnqueueUnmapMemObject_ != nullptr) - << "Cannot load clEnqueueUnmapMemObject!"; - return clEnqueueUnmapMemObject_; - } - - clRetainCommandQueueType clRetainCommandQueue() { - CHECK(clRetainCommandQueue_ != nullptr) - << "Cannot load clRetainCommandQueue!"; - return clRetainCommandQueue_; - } - - clCreateContextType clCreateContext() { - CHECK(clCreateContext_ != nullptr) << "Cannot load clCreateContext!"; - return clCreateContext_; - } - - clCreateContextFromTypeType clCreateContextFromType() { - CHECK(clCreateContextFromType_ != nullptr) - << "Cannot load clCreateContextFromType!"; - return clCreateContextFromType_; - } - - clReleaseContextType clReleaseContext() { - CHECK(clReleaseContext_ != nullptr) << "Cannot load clReleaseContext!"; - return clReleaseContext_; - } - - clWaitForEventsType clWaitForEvents() { - CHECK(clWaitForEvents_ != nullptr) << "Cannot load clWaitForEvents!"; - return clWaitForEvents_; - } - - clReleaseEventType clReleaseEvent() { - CHECK(clReleaseEvent_ != nullptr) << "Cannot load clReleaseEvent!"; - return clReleaseEvent_; - } - - clEnqueueWriteBufferType clEnqueueWriteBuffer() { - CHECK(clEnqueueWriteBuffer_ != nullptr) - << "Cannot loadcl clEnqueueWriteBuffer!"; - return clEnqueueWriteBuffer_; - } - - clEnqueueReadBufferType clEnqueueReadBuffer() { - CHECK(clEnqueueReadBuffer_ != nullptr) - << "Cannot load clEnqueueReadBuffer!"; - return clEnqueueReadBuffer_; - } - - clEnqueueReadImageType clEnqueueReadImage() { - CHECK(clEnqueueReadImage_ != nullptr) << "Cannot load clEnqueueReadImage!"; - return clEnqueueReadImage_; - } - - clGetProgramBuildInfoType clGetProgramBuildInfo() { - CHECK(clGetProgramBuildInfo_ != nullptr) - << "Cannot load clGetProgramBuildInfo!"; - return clGetProgramBuildInfo_; - } - - clRetainProgramType clRetainProgram() { - CHECK(clRetainProgram_ != nullptr) << "Cannot load clRetainProgram!"; - return clRetainProgram_; - } - - clEnqueueMapBufferType clEnqueueMapBuffer() { - CHECK(clEnqueueMapBuffer_ != nullptr) << "Cannot load clEnqueueMapBuffer!"; - return clEnqueueMapBuffer_; - } - - clEnqueueMapImageType clEnqueueMapImage() { - CHECK(clEnqueueMapImage_ != nullptr) << "Cannot load clEnqueueMapImage!"; - return clEnqueueMapImage_; - } - - clCreateCommandQueueType clCreateCommandQueue() { - CHECK(clCreateCommandQueue_ != nullptr) - << "Cannot load clCreateCommandQueue!"; - return clCreateCommandQueue_; - } - - clCreateCommandQueueWithPropertiesType clCreateCommandQueueWithProperties() { - CHECK(clCreateCommandQueueWithProperties_ != nullptr) - << "Cannot load clCreateCommandQueueWithProperties!"; - return clCreateCommandQueueWithProperties_; - } - - clReleaseCommandQueueType clReleaseCommandQueue() { - CHECK(clReleaseCommandQueue_ != nullptr) - << "Cannot load clReleaseCommandQueue!"; - return clReleaseCommandQueue_; - } 
- - clCreateProgramWithBinaryType clCreateProgramWithBinary() { - CHECK(clCreateProgramWithBinary_ != nullptr) - << "Cannot load clCreateProgramWithBinary!"; - return clCreateProgramWithBinary_; - } - - clRetainContextType clRetainContext() { - CHECK(clRetainContext_ != nullptr) << "Cannot load clRetainContext!"; - return clRetainContext_; - } - - clGetContextInfoType clGetContextInfo() { - CHECK(clGetContextInfo_ != nullptr) << "Cannot load clGetContextInfo!"; - return clGetContextInfo_; - } - - clReleaseProgramType clReleaseProgram() { - CHECK(clReleaseProgram_ != nullptr) << "Cannot load clReleaseProgram!"; - return clReleaseProgram_; - } - - clFlushType clFlush() { - CHECK(clFlush_ != nullptr) << "Cannot load clFlush!"; - return clFlush_; - } - - clFinishType clFinish() { - CHECK(clFinish_ != nullptr) << "Cannot load clFinish!"; - return clFinish_; - } - - clGetProgramInfoType clGetProgramInfo() { - CHECK(clGetProgramInfo_ != nullptr) << "Cannot load clGetProgramInfo!"; - return clGetProgramInfo_; - } - - clCreateKernelType clCreateKernel() { - CHECK(clCreateKernel_ != nullptr) << "Cannot load clCreateKernel!"; - return clCreateKernel_; - } - - clRetainKernelType clRetainKernel() { - CHECK(clRetainKernel_ != nullptr) << "Cannot load clRetainKernel!"; - return clRetainKernel_; - } - - clCreateBufferType clCreateBuffer() { - CHECK(clCreateBuffer_ != nullptr) << "Cannot load clCreateBuffer!"; - return clCreateBuffer_; - } - - clCreateImage2DType clCreateImage2D() { - CHECK(clCreateImage2D_ != nullptr) << "Cannot load clCreateImage2D!"; - return clCreateImage2D_; - } - - clCreateImageType clCreateImage() { - CHECK(clCreateImage_ != nullptr) << "Cannot load clCreateImage!"; - return clCreateImage_; - } - - clCreateUserEventType clCreateUserEvent() { - CHECK(clCreateUserEvent_ != nullptr) << "Cannot load clCreateUserEvent!"; - return clCreateUserEvent_; - } - - clCreateProgramWithSourceType clCreateProgramWithSource() { - CHECK(clCreateProgramWithSource_ != nullptr) - << "Cannot load clCreateProgramWithSource!"; - return clCreateProgramWithSource_; - } - - clReleaseKernelType clReleaseKernel() { - CHECK(clReleaseKernel_ != nullptr) << "Cannot load clReleaseKernel!"; - return clReleaseKernel_; - } - - clGetDeviceInfoType clGetDeviceInfo() { - CHECK(clGetDeviceInfo_ != nullptr) << "Cannot load clGetDeviceInfo!"; - return clGetDeviceInfo_; - } - - clGetDeviceIDsType clGetDeviceIDs() { - CHECK(clGetDeviceIDs_ != nullptr) << "Cannot load clGetDeviceIDs!"; - return clGetDeviceIDs_; - } - - clRetainDeviceType clRetainDevice() { - CHECK(clRetainDevice_ != nullptr) << "Cannot load clRetainDevice!"; - return clRetainDevice_; - } - - clReleaseDeviceType clReleaseDevice() { - CHECK(clReleaseDevice_ != nullptr) << "Cannot load clReleaseDevice!"; - return clReleaseDevice_; - } - - clRetainEventType clRetainEvent() { - CHECK(clRetainEvent_ != nullptr) << "Cannot load clRetainEvent!"; - return clRetainEvent_; - } - - clGetKernelWorkGroupInfoType clGetKernelWorkGroupInfo() { - CHECK(clGetKernelWorkGroupInfo_ != nullptr) - << "Cannot load clGetKernelWorkGroupInfo!"; - return clGetKernelWorkGroupInfo_; - } - - clGetEventInfoType clGetEventInfo() { - CHECK(clGetEventInfo_ != nullptr) << "Cannot load clGetEventInfo!"; - return clGetEventInfo_; - } - - clGetEventProfilingInfoType clGetEventProfilingInfo() { - CHECK(clGetEventProfilingInfo_ != nullptr) - << "Cannot load clGetEventProfilingInfo!"; - return clGetEventProfilingInfo_; - } - - clGetImageInfoType clGetImageInfo() { - CHECK(clGetImageInfo_ != nullptr) << 
"Cannot load clGetImageInfo!"; - return clGetImageInfo_; - } - - clEnqueueCopyBufferType clEnqueueCopyBuffer() { - CHECK(clEnqueueCopyBuffer_ != nullptr) - << "Cannot load clEnqueueCopyBuffer!"; - return clEnqueueCopyBuffer_; - } - - clEnqueueWriteImageType clEnqueueWriteImage() { - CHECK(clEnqueueWriteImage_ != nullptr) - << "Cannot load clEnqueueWriteImage!"; - return clEnqueueWriteImage_; - } - - clEnqueueCopyImageType clEnqueueCopyImage() { - CHECK(clEnqueueCopyImage_ != nullptr) << "Cannot load clEnqueueCopyImage!"; - return clEnqueueCopyImage_; - } - - private: - CLWrapper(); - CLWrapper(const CLWrapper &) = delete; - CLWrapper &operator=(const CLWrapper &) = delete; - bool InitHandle(); - void InitFunctions(); - void *handle_{nullptr}; - clGetPlatformIDsType clGetPlatformIDs_{nullptr}; - clGetPlatformInfoType clGetPlatformInfo_{nullptr}; - clBuildProgramType clBuildProgram_{nullptr}; - clEnqueueNDRangeKernelType clEnqueueNDRangeKernel_{nullptr}; - clSetKernelArgType clSetKernelArg_{nullptr}; - clRetainMemObjectType clRetainMemObject_{nullptr}; - clReleaseMemObjectType clReleaseMemObject_{nullptr}; - clEnqueueUnmapMemObjectType clEnqueueUnmapMemObject_{nullptr}; - clRetainCommandQueueType clRetainCommandQueue_{nullptr}; - clCreateContextType clCreateContext_{nullptr}; - clCreateContextFromTypeType clCreateContextFromType_{nullptr}; - clReleaseContextType clReleaseContext_{nullptr}; - clWaitForEventsType clWaitForEvents_{nullptr}; - clReleaseEventType clReleaseEvent_{nullptr}; - clEnqueueWriteBufferType clEnqueueWriteBuffer_{nullptr}; - clEnqueueReadBufferType clEnqueueReadBuffer_{nullptr}; - clEnqueueReadImageType clEnqueueReadImage_{nullptr}; - clGetProgramBuildInfoType clGetProgramBuildInfo_{nullptr}; - clRetainProgramType clRetainProgram_{nullptr}; - clEnqueueMapBufferType clEnqueueMapBuffer_{nullptr}; - clEnqueueMapImageType clEnqueueMapImage_{nullptr}; - clCreateCommandQueueType clCreateCommandQueue_{nullptr}; - clCreateCommandQueueWithPropertiesType clCreateCommandQueueWithProperties_{ - nullptr}; - clReleaseCommandQueueType clReleaseCommandQueue_{nullptr}; - clCreateProgramWithBinaryType clCreateProgramWithBinary_{nullptr}; - clRetainContextType clRetainContext_{nullptr}; - clGetContextInfoType clGetContextInfo_{nullptr}; - clReleaseProgramType clReleaseProgram_{nullptr}; - clFlushType clFlush_{nullptr}; - clFinishType clFinish_{nullptr}; - clGetProgramInfoType clGetProgramInfo_{nullptr}; - clCreateKernelType clCreateKernel_{nullptr}; - clRetainKernelType clRetainKernel_{nullptr}; - clCreateBufferType clCreateBuffer_{nullptr}; - clCreateImage2DType clCreateImage2D_{nullptr}; - clCreateImageType clCreateImage_{nullptr}; - clCreateUserEventType clCreateUserEvent_{nullptr}; - clCreateProgramWithSourceType clCreateProgramWithSource_{nullptr}; - clReleaseKernelType clReleaseKernel_{nullptr}; - clGetDeviceInfoType clGetDeviceInfo_{nullptr}; - clGetDeviceIDsType clGetDeviceIDs_{nullptr}; - clRetainDeviceType clRetainDevice_{nullptr}; - clReleaseDeviceType clReleaseDevice_{nullptr}; - clRetainEventType clRetainEvent_{nullptr}; - clGetKernelWorkGroupInfoType clGetKernelWorkGroupInfo_{nullptr}; - clGetEventInfoType clGetEventInfo_{nullptr}; - clGetEventProfilingInfoType clGetEventProfilingInfo_{nullptr}; - clGetImageInfoType clGetImageInfo_{nullptr}; - clEnqueueCopyBufferType clEnqueueCopyBuffer_{nullptr}; - clEnqueueWriteImageType clEnqueueWriteImage_{nullptr}; - clEnqueueCopyImageType clEnqueueCopyImage_{nullptr}; -}; -} // namespace lite -} // namespace paddle diff --git 
a/lite/backends/opencl/target_wrapper.cc b/lite/backends/opencl/target_wrapper.cc
deleted file mode 100644
index eb324fcb0f..0000000000
--- a/lite/backends/opencl/target_wrapper.cc
+++ /dev/null
@@ -1,341 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/opencl/target_wrapper.h"
-#include <algorithm>
-#include <array>
-#include "lite/backends/opencl/cl_include.h"
-#include "lite/backends/opencl/cl_runtime.h"
-#include "lite/backends/opencl/cl_utility.h"
-
-namespace paddle {
-namespace lite {
-
-static cl_channel_type GetCLChannelType(const PrecisionType type) {
-  switch (type) {
-    case PRECISION(kFloat):
-      return CL_FLOAT;
-    case PRECISION(kInt32):
-      return CL_SIGNED_INT32;
-    case PRECISION(kInt8):
-      return CL_SIGNED_INT8;
-    default:
-      LOG(FATAL) << "Unsupported image channel type: " << PrecisionToStr(type);
-      return 0;
-  }
-}
-
-void *TargetWrapperCL::Malloc(size_t size) {
-  cl_int status;
-  cl::Buffer *buffer = new cl::Buffer(CLRuntime::Global()->context(),
-                                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
-                                      size,
-                                      nullptr,
-                                      &status);
-  if (status != CL_SUCCESS) {
-    delete buffer;
-    buffer = nullptr;
-  }
-  CL_CHECK_FATAL(status);
-  return buffer;
-}
-
-void TargetWrapperCL::Free(void *ptr) {
-  if (ptr != nullptr) {
-    cl::Buffer *cl_buffer = static_cast<cl::Buffer *>(ptr);
-    delete cl_buffer;
-  }
-}
-
-void *TargetWrapperCL::MallocImage(const std::array<size_t, 2> &image_shape,
-                                   PrecisionType data_type) {
-  cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(data_type));
-  cl_int status;
-  size_t width = image_shape[0];
-  size_t height = image_shape[1];
-  cl::Image2D *cl_image =
-      new cl::Image2D(CLRuntime::Global()->context(),
-                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
-                      img_format,
-                      width,
-                      height,
-                      0,
-                      nullptr,
-                      &status);
-  if (status != CL_SUCCESS) {
-    delete cl_image;
-    cl_image = nullptr;
-  }
-  CL_CHECK_FATAL(status);
-  return cl_image;
-}
-
-void TargetWrapperCL::FreeImage(void *image) {
-  if (image != nullptr) {
-    cl::Image2D *cl_image = static_cast<cl::Image2D *>(image);
-    delete cl_image;
-  }
-}
-
-void *TargetWrapperCL::Map(void *buffer, size_t offset, size_t size) {
-  cl::Buffer *cl_buffer = static_cast<cl::Buffer *>(buffer);
-  cl_int status;
-  void *mapped_ptr = CLRuntime::Global()->command_queue().enqueueMapBuffer(
-      *cl_buffer,
-      CL_TRUE,
-      CL_MAP_READ | CL_MAP_WRITE,
-      offset,
-      size,
-      nullptr,
-      nullptr,
-      &status);
-  if (status != CL_SUCCESS) {
-    mapped_ptr = nullptr;
-  }
-  CL_CHECK_FATAL(status);
-  return mapped_ptr;
-}
-
-void *TargetWrapperCL::MapImage(void *image,
-                                const std::array<size_t, 2> &image_shape,
-                                std::array<size_t, 2> *image_pitch) {
-  cl::Image2D *cl_image = static_cast<cl::Image2D *>(image);
-  size_t width = image_shape[0];
-  size_t height = image_shape[1];
-  size_t *row_pitch = image_pitch->data();
-  size_t *slice_pitch = image_pitch->data() + 1;
-  std::array<size_t, 3> origin{{0, 0, 0}};
-  std::array<size_t, 3> region{{width, height, 1}};
-  cl_int status;
-  void *mapped_ptr = CLRuntime::Global()->command_queue().enqueueMapImage(
-      *cl_image,
-      CL_TRUE,
-      CL_MAP_READ | CL_MAP_WRITE,
-      origin,
-      region,
-      row_pitch,
-      slice_pitch,
-      nullptr,
-      nullptr,
-      &status);
-  if (status != CL_SUCCESS) {
-    mapped_ptr = nullptr;
-  }
-  CL_CHECK_FATAL(status);
-  return mapped_ptr;
-}
-
-void TargetWrapperCL::Unmap(void *cl_obj, void *mapped_ptr) {
-  cl::Memory *mem_obj = static_cast<cl::Memory *>(cl_obj);
-  cl_int status = CLRuntime::Global()->command_queue().enqueueUnmapMemObject(
-      *mem_obj, mapped_ptr, nullptr, nullptr);
-  CL_CHECK_FATAL(status);
-}
-
-void TargetWrapperCL::MemcpySync(void *dst,
-                                 const void *src,
-                                 size_t size,
-                                 IoDirection dir) {
-  cl_int status;
-  cl::Event event;
-  auto stream = CLRuntime::Global()->command_queue();
-  switch (dir) {
-    case IoDirection::DtoD:
-      status = stream.enqueueCopyBuffer(*static_cast<const cl::Buffer *>(src),
-                                        *static_cast<cl::Buffer *>(dst),
-                                        0,
-                                        0,
-                                        size,
-                                        nullptr,
-                                        &event);
-      CL_CHECK_FATAL(status);
-      event.wait();
-      break;
-    case IoDirection::HtoD:
-      status = stream.enqueueWriteBuffer(*static_cast<cl::Buffer *>(dst),
-                                         CL_TRUE,
-                                         0,
-                                         size,
-                                         src,
-                                         nullptr,
-                                         nullptr);
-      CL_CHECK_FATAL(status);
-      break;
-    case IoDirection::DtoH:
-      status = stream.enqueueReadBuffer(*static_cast<const cl::Buffer *>(src),
-                                        CL_TRUE,
-                                        0,
-                                        size,
-                                        dst,
-                                        nullptr,
-                                        nullptr);
-      CL_CHECK_FATAL(status);
-      break;
-    default:
-      LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
-  }
-}
-
-void TargetWrapperCL::MemcpyAsync(void *dst,
-                                  const void *src,
-                                  size_t size,
-                                  IoDirection dir,
-                                  const stream_t &stream) {
-  cl_int status;
-  switch (dir) {
-    case IoDirection::DtoD:
-      status = stream.enqueueCopyBuffer(*static_cast<const cl::Buffer *>(src),
-                                        *static_cast<cl::Buffer *>(dst),
-                                        0,
-                                        0,
-                                        size,
-                                        nullptr,
-                                        nullptr);
-      CL_CHECK_FATAL(status);
-      break;
-    case IoDirection::HtoD:
-      status = stream.enqueueWriteBuffer(*static_cast<cl::Buffer *>(dst),
-                                         CL_FALSE,
-                                         0,
-                                         size,
-                                         src,
-                                         nullptr,
-                                         nullptr);
-      CL_CHECK_FATAL(status);
-      break;
-    case IoDirection::DtoH:
-      status = stream.enqueueReadBuffer(*static_cast<const cl::Buffer *>(src),
-                                        CL_FALSE,
-                                        0,
-                                        size,
-                                        dst,
-                                        nullptr,
-                                        nullptr);
-      CL_CHECK_FATAL(status);
-      break;
-    default:
-      LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
  }
-}
-
-void TargetWrapperCL::ImgcpySync(void *dst,
-                                 const void *src,
-                                 const std::array<size_t, 2> &image_shape,
-                                 const std::array<size_t, 2> &image_pitch,
-                                 IoDirection dir) {
-  size_t width = image_shape[0];
-  size_t height = image_shape[1];
-  size_t row_pitch = image_pitch[0];
-  size_t slice_pitch = image_pitch[1];
-  std::array<size_t, 3> origin{{0, 0, 0}};
-  std::array<size_t, 3> region{{width, height, 1}};
-  cl_int status;
-  cl::Event event;
-  auto stream = CLRuntime::Global()->command_queue();
-  switch (dir) {
-    case IoDirection::DtoD:
-      status = stream.enqueueCopyImage(*static_cast<const cl::Image2D *>(src),
-                                       *static_cast<cl::Image2D *>(dst),
-                                       origin,
-                                       origin,
-                                       region,
-                                       nullptr,
-                                       &event);
-      CL_CHECK_FATAL(status);
-      event.wait();
-      break;
-    case IoDirection::HtoD:
-      status = stream.enqueueWriteImage(*static_cast<cl::Image2D *>(dst),
-                                        CL_TRUE,
-                                        origin,
-                                        region,
-                                        row_pitch,
-                                        slice_pitch,
-                                        src,
-                                        nullptr,
-                                        nullptr);
-      CL_CHECK_FATAL(status);
-      break;
-    case IoDirection::DtoH:
-      status = stream.enqueueReadImage(*static_cast<const cl::Image2D *>(src),
-                                       CL_TRUE,
-                                       origin,
-                                       region,
-                                       row_pitch,
-                                       slice_pitch,
-                                       dst,
-                                       nullptr,
-                                       nullptr);
-      CL_CHECK_FATAL(status);
-      break;
-    default:
-      LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
-  }
-}
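The synchronous helpers above block either through CL_TRUE blocking flags or an explicit event.wait(); the Async variants pass CL_FALSE and the caller's queue instead. For reference, a blocking host-to-device-to-host round trip through this interface; the element count is illustrative, the calls match the code above:

// Illustrative sketch only, not part of the original sources.
#include <vector>
#include "lite/backends/opencl/target_wrapper.h"

void RoundTrip() {
  using paddle::lite::TargetWrapperCL;
  using paddle::lite::IoDirection;
  std::vector<float> host_in(1024, 1.f), host_out(1024, 0.f);
  const size_t bytes = host_in.size() * sizeof(float);
  void* device_buf = TargetWrapperCL::Malloc(bytes);  // a cl::Buffer underneath
  TargetWrapperCL::MemcpySync(device_buf, host_in.data(), bytes,
                              IoDirection::HtoD);  // enqueueWriteBuffer, CL_TRUE
  TargetWrapperCL::MemcpySync(host_out.data(), device_buf, bytes,
                              IoDirection::DtoH);  // enqueueReadBuffer, CL_TRUE
  TargetWrapperCL::Free(device_buf);
}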
std::array origin{{0, 0, 0}}; - std::array region{{width, height, 1}}; - cl_int status; - switch (dir) { - case IoDirection::DtoD: - status = stream.enqueueCopyImage(*static_cast(src), - *static_cast(dst), - origin, - origin, - region, - nullptr, - nullptr); - CL_CHECK_FATAL(status); - break; - case IoDirection::HtoD: - status = stream.enqueueWriteImage(*static_cast(dst), - CL_FALSE, - origin, - region, - row_pitch, - slice_pitch, - src, - nullptr, - nullptr); - CL_CHECK_FATAL(status); - break; - case IoDirection::DtoH: - status = stream.enqueueReadImage(*static_cast(src), - CL_FALSE, - origin, - region, - row_pitch, - slice_pitch, - dst, - nullptr, - nullptr); - CL_CHECK_FATAL(status); - break; - default: - LOG(FATAL) << "Unsupported IoDirection " << static_cast(dir); - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/target_wrapper.h b/lite/backends/opencl/target_wrapper.h deleted file mode 100644 index 8ff8e6fd40..0000000000 --- a/lite/backends/opencl/target_wrapper.h +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "lite/backends/opencl/cl_include.h" -#include "lite/core/target_wrapper.h" - -namespace paddle { -namespace lite { - -using TargetWrapperCL = - TargetWrapper; -// This interface should be specified by each kind of target. 
-template <> -class TargetWrapper { - public: - using stream_t = cl::CommandQueue; - using event_t = cl::Event; - - static size_t num_devices() { return 0; } - static size_t maximum_stream() { return 0; } - - static void CreateStream(stream_t* stream) {} - static void DestroyStream(const stream_t& stream) {} - - static void CreateEvent(event_t* event) {} - static void DestroyEvent(const event_t& event) {} - - static void RecordEvent(const event_t& event) {} - static void SyncEvent(const event_t& event) {} - - static void StreamSync(const stream_t& stream) {} - - static void* Malloc(size_t size); - static void Free(void* ptr); - - static void* MallocImage(const std::array& image_shape, - PrecisionType data_type); - static void FreeImage(void* image); - - static void* Map(void* buffer, size_t offset, size_t size); - static void* MapImage(void* image, - const std::array& image_shape, - std::array* image_pitch); - static void Unmap(void* cl_obj, void* mapped_ptr); - - static void MemcpySync(void* dst, - const void* src, - size_t size, - IoDirection dir); - static void MemcpyAsync(void* dst, - const void* src, - size_t size, - IoDirection dir, - const stream_t& stream); - static void ImgcpySync(void* dst, - const void* src, - const std::array& image_shape, - const std::array& image_pitch, - IoDirection dir); - static void ImgcpyAsync(void* dst, - const void* src, - const std::array& image_shape, - const std::array& image_pitch, - IoDirection dir, - const stream_t& stream); -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/CMakeLists.txt b/lite/backends/x86/CMakeLists.txt deleted file mode 100644 index 34e0800130..0000000000 --- a/lite/backends/x86/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -if (NOT LITE_WITH_X86) - return() -endif() - -configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h) -configure_file(warpctc_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/warpctc_lib_path.h) - -lite_cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) -lite_cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) -lite_cc_library(target_wrapper_x86 SRCS target_wrapper.cc) -lite_cc_library(x86_cpu_info SRCS cpu_info.cc DEPS xbyak) - -add_subdirectory(jit) -add_subdirectory(math) diff --git a/lite/backends/x86/cpu_info.cc b/lite/backends/x86/cpu_info.cc deleted file mode 100644 index c2759d6191..0000000000 --- a/lite/backends/x86/cpu_info.cc +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/backends/x86/cpu_info.h" - -#ifdef PADDLE_WITH_XBYAK -#include "xbyak/xbyak.h" -#include "xbyak/xbyak_util.h" -#endif - -#ifdef __APPLE__ -#include -#include -#elif defined(_WIN32) -#define NOMINMAX // msvc max/min macro conflict with std::min/max -#include -#else -#include -#endif // _WIN32 - -#include -#include - -DEFINE_double(fraction_of_cpu_memory_to_use, - 1, - "Default use 100% of CPU memory for PaddlePaddle," - "reserve the rest for page tables, etc"); -DEFINE_uint64(initial_cpu_memory_in_mb, - 500ul, - "Initial CPU memory for PaddlePaddle, in MD unit."); - -DEFINE_double( - fraction_of_cuda_pinned_memory_to_use, - 0.5, - "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," - "reserve the rest for page tables, etc"); - -// If use_pinned_memory is true, CPUAllocator calls mlock, which -// returns pinned and locked memory as staging areas for data exchange -// between host and device. Allocates too much would reduce the amount -// of memory available to the system for paging. So, by default, we -// should set false to use_pinned_memory. -DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory."); - -namespace paddle { -namespace lite { -namespace x86 { - -size_t CpuTotalPhysicalMemory() { -#ifdef __APPLE__ - int mib[2]; - mib[0] = CTL_HW; - mib[1] = HW_MEMSIZE; - int64_t size = 0; - size_t len = sizeof(size); - if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size; - return 0L; -#elif defined(_WIN32) - MEMORYSTATUSEX sMeminfo; - sMeminfo.dwLength = sizeof(sMeminfo); - GlobalMemoryStatusEx(&sMeminfo); - return sMeminfo.ullTotalPhys; -#else - int64_t pages = sysconf(_SC_PHYS_PAGES); - int64_t page_size = sysconf(_SC_PAGE_SIZE); - return pages * page_size; -#endif -} - -size_t CpuMaxAllocSize() { - // For distributed systems, it requires configuring and limiting - // the fraction of memory to use. - return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); -} - -size_t CpuMinChunkSize() { - // Allow to allocate the minimum chunk size is 4 KB. - return 1 << 12; -} - -size_t CpuMaxChunkSize() { - // Allow to allocate the maximum chunk size is roughly 3% of CPU memory, - // or the initial_cpu_memory_in_mb. - return std::min( - static_cast(CpuMaxAllocSize() / 32), - static_cast(FLAGS_initial_cpu_memory_in_mb * 1 << 20)); -} - -size_t CUDAPinnedMaxAllocSize() { - // For distributed systems, it requires configuring and limiting - // the fraction of memory to use. - return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory(); -} - -size_t CUDAPinnedMinChunkSize() { - // Allow to allocate the minimum chunk size is 64 KB. - return 1 << 16; -} - -size_t CUDAPinnedMaxChunkSize() { - // Allow to allocate the maximum chunk size is roughly 1/256 of CUDA_PINNED - // memory. 
- return CUDAPinnedMaxAllocSize() / 256; -} - -#ifdef PADDLE_WITH_XBYAK -static Xbyak::util::Cpu cpu; -bool MayIUse(const cpu_isa_t cpu_isa) { - using namespace Xbyak::util; // NOLINT - switch (cpu_isa) { - case sse42: - return cpu.has(Cpu::tSSE42); - case avx: - return cpu.has(Cpu::tAVX); - case avx2: - return cpu.has(Cpu::tAVX2); - case avx512f: - return cpu.has(Cpu::tAVX512F); - case avx512_core: - return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) && - cpu.has(Cpu::tAVX512VL) && cpu.has(Cpu::tAVX512DQ); - case avx512_core_vnni: - return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) && - cpu.has(Cpu::tAVX512VL) && cpu.has(Cpu::tAVX512DQ) && - cpu.has(Cpu::tAVX512_VNNI); - case avx512_mic: - return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512CD) && - cpu.has(Cpu::tAVX512ER) && cpu.has(Cpu::tAVX512PF); - case avx512_mic_4ops: - return true && MayIUse(avx512_mic) && cpu.has(Cpu::tAVX512_4FMAPS) && - cpu.has(Cpu::tAVX512_4VNNIW); - case isa_any: - return true; - } - return false; -} -#else -bool MayIUse(const cpu_isa_t cpu_isa) { - if (cpu_isa == isa_any) { - return true; - } else { - return false; - } -} -#endif - -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/cpu_info.h b/lite/backends/x86/cpu_info.h deleted file mode 100644 index c60cc4798c..0000000000 --- a/lite/backends/x86/cpu_info.h +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#ifdef _WIN32 -#if defined(__AVX2__) -#include // avx2 -#elif defined(__AVX__) -#include // avx -#endif // AVX -#else // WIN32 -#ifdef __AVX__ -#include -#endif -#endif // WIN32 - -#if defined(_WIN32) -#define ALIGN32_BEG __declspec(align(32)) -#define ALIGN32_END -#else -#define ALIGN32_BEG -#define ALIGN32_END __attribute__((aligned(32))) -#endif // _WIN32 - -namespace paddle { -namespace lite { -namespace x86 { - -size_t CpuTotalPhysicalMemory(); - -//! Get the maximum allocation size for a machine. -size_t CpuMaxAllocSize(); - -//! Get the maximum allocation size for a machine. -size_t CUDAPinnedMaxAllocSize(); - -//! Get the minimum chunk size for buddy allocator. -size_t CpuMinChunkSize(); - -//! Get the maximum chunk size for buddy allocator. -size_t CpuMaxChunkSize(); - -//! Get the minimum chunk size for buddy allocator. -size_t CUDAPinnedMinChunkSize(); - -//! Get the maximum chunk size for buddy allocator. 
-size_t CUDAPinnedMaxChunkSize(); - -typedef enum { - isa_any, - sse42, - avx, - avx2, - avx512f, - avx512_core, - avx512_core_vnni, - avx512_mic, - avx512_mic_4ops, -} cpu_isa_t; // Instruction set architecture - -// May I use some instruction -bool MayIUse(const cpu_isa_t cpu_isa); - -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/cupti_lib_path.h.in b/lite/backends/x86/cupti_lib_path.h.in deleted file mode 100644 index 017384bfbb..0000000000 --- a/lite/backends/x86/cupti_lib_path.h.in +++ /dev/null @@ -1,17 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#define CUPTI_LIB_PATH "@CUPTI_LIBRARY_PATH@" diff --git a/lite/backends/x86/dynamic_loader.cc b/lite/backends/x86/dynamic_loader.cc deleted file mode 100644 index 0f27a19cf5..0000000000 --- a/lite/backends/x86/dynamic_loader.cc +++ /dev/null @@ -1,263 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "lite/backends/x86/dynamic_loader.h" - -#include -#include // NOLINT -#include - -#include "gflags/gflags.h" -#include "glog/logging.h" -#include "lite/backends/x86/cupti_lib_path.h" -#include "lite/backends/x86/port.h" -#include "lite/backends/x86/warpctc_lib_path.h" -#include "lite/utils/paddle_enforce.h" - -DEFINE_string(cudnn_dir, - "", - "Specify path for loading libcudnn.so. For instance, " - "/usr/local/cudnn/lib. If empty [default], dlopen " - "will search cudnn from LD_LIBRARY_PATH"); - -DEFINE_string(cuda_dir, - "", - "Specify path for loading cuda library, such as libcublas, " - "libcurand. For instance, /usr/local/cuda/lib64. If default, " - "dlopen will search cuda from LD_LIBRARY_PATH"); - -DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); - -DEFINE_string(nccl_dir, - "", - "Specify path for loading nccl library, such as libcublas, " - "libcurand. For instance, /usr/local/cuda/lib64. 
If default, " - "dlopen will search cuda from LD_LIBRARY_PATH"); - -DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); - -DEFINE_string( - tensorrt_dir, - "", - "Specify path for loading tensorrt library, such as libnvinfer.so."); - -DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); - -namespace paddle { -namespace lite { -namespace x86 { -static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH; -static constexpr char warpctc_lib_path[] = WARPCTC_LIB_PATH; - -#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) -static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll"; -static constexpr char* win_curand_lib = "curand64_" PADDLE_CUDA_BINVER ".dll"; -static constexpr char* win_cudnn_lib = "cudnn64_" PADDLE_CUDNN_BINVER ".dll"; -#endif - -static inline std::string join(const std::string& part1, - const std::string& part2) { - // directory separator - const char sep = '/'; - if (!part2.empty() && part2.front() == sep) { - return part2; - } - std::string ret; - ret.reserve(part1.size() + part2.size() + 1); - ret = part1; - if (!ret.empty() && ret.back() != sep) { - ret += sep; - } - ret += part2; - return ret; -} - -static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, - int dynload_flags) { - VLOG(3) << "Try to find library: " << dso_path - << " from default system path."; - // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH - // and /usr/local/lib path - void* dso_handle = dlopen(dso_path.c_str(), dynload_flags); - -// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to -// bring System Integrity Projection (SIP), if dso_handle -// is null, search from default package path in Mac OS. -#if defined(__APPLE__) || defined(__OSX__) - if (nullptr == dso_handle) { - dso_handle = - dlopen(join("/usr/local/cuda/lib/", dso_path).c_str(), dynload_flags); - if (nullptr == dso_handle) { - if (dso_path == "libcudnn.dylib") { - LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n " - "For instance, sudo tar -xzf " - "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo " - "chmod a+r /usr/local/cuda/include/cudnn.h " - "/usr/local/cuda/lib/libcudnn*"; - } - } - } -#endif - - if (nullptr == dso_handle) { - LOG(WARNING) << "Can not find library: " << dso_path - << ". The process maybe hang. 
Please try to add the lib path " - "to LD_LIBRARY_PATH."; - } - return dso_handle; -} - -static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, - const std::string& dso_name, - bool throw_on_error = true) { -#if !defined(_WIN32) - int dynload_flags = RTLD_LAZY | RTLD_LOCAL; -#else - int dynload_flags = 0; -#endif // !_WIN32 - void* dso_handle = nullptr; - - std::string dlPath = dso_name; - if (search_root.empty()) { - dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags); - } else { - // search xxx.so from custom path - dlPath = join(search_root, dso_name); - dso_handle = dlopen(dlPath.c_str(), dynload_flags); -#if !defined(_WIN32) - auto errorno = dlerror(); -#else - auto errorno = GetLastError(); -#endif // !_WIN32 - // if not found, search from default path - if (nullptr == dso_handle) { - LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" - << errorno << ")"; - if (dlPath.find("nccl") != std::string::npos) { - LOG(INFO) - << "You may need to install 'nccl2' from NVIDIA official website: " - << "https://developer.nvidia.com/nccl/nccl-download" - << "before install PaddlePaddle"; - } - dlPath = dso_name; - dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags); - } - } - auto error_msg = - "Failed to find dynamic library: %s ( %s ) \n Please specify " - "its path correctly using following ways: \n Method. set " - "environment variable LD_LIBRARY_PATH on Linux or " - "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: " - "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, " - "using the DYLD_LIBRARY_PATH is impossible unless System " - "Integrity Protection (SIP) is disabled."; -#if !defined(_WIN32) - auto errorno = dlerror(); -#else - auto errorno = GetLastError(); -#endif // !_WIN32 - if (throw_on_error) { - CHECK(dso_handle != nullptr); - // PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, errorno); - } else if (nullptr == dso_handle) { - // LOG(WARNING) << string::Sprintf(error_msg, dlPath, errorno); - } - - return dso_handle; -} - -void* GetCublasDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); -#endif -} - -void* GetCUDNNDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib); -#else - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false); -#endif -} - -void* GetCUPTIDsoHandle() { - std::string cupti_path = cupti_lib_path; - if (!FLAGS_cupti_dir.empty()) { - cupti_path = FLAGS_cupti_dir; - } -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", false); -#else - return GetDsoHandleFromSearchPath(cupti_path, "libcupti.so", false); -#endif -} - -void* GetCurandDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); -#endif -} - -void* GetWarpCTCDsoHandle() { - std::string warpctc_dir = 
warpctc_lib_path; - if (!FLAGS_warpctc_dir.empty()) { - warpctc_dir = FLAGS_warpctc_dir; - } -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(warpctc_dir, "warpctc.dll"); -#else - return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.so"); -#endif -} - -void* GetNCCLDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib"); -#else - return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so"); -#endif -} - -void* GetTensorRtDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib"); -#else - return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so"); -#endif -} - -void* GetMKLMLDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll"); -#else - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); -#endif -} - -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/dynamic_loader.h b/lite/backends/x86/dynamic_loader.h deleted file mode 100644 index 81c277ffc8..0000000000 --- a/lite/backends/x86/dynamic_loader.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -namespace paddle { -namespace lite { -namespace x86 { - -#ifndef _WIN32 -#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__)) -#else -#define DECLARE_TYPE(__name, ...) decltype(auto) -#endif - -void* GetCublasDsoHandle(); -void* GetCUDNNDsoHandle(); -void* GetCUPTIDsoHandle(); -void* GetCurandDsoHandle(); -void* GetWarpCTCDsoHandle(); -void* GetNCCLDsoHandle(); -void* GetTensorRtDsoHandle(); -void* GetMKLMLDsoHandle(); - -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/CMakeLists.txt b/lite/backends/x86/jit/CMakeLists.txt deleted file mode 100644 index e4113832c6..0000000000 --- a/lite/backends/x86/jit/CMakeLists.txt +++ /dev/null @@ -1,26 +0,0 @@ - -set(jit_file ${PADDLE_BINARY_DIR}/lite/backends/x86/jit/kernels.h) -file(WRITE ${jit_file} "// Generated by the lite/backends/x86/jit/CMakeLists.txt. 
DO NOT EDIT!\n\n") -file(APPEND ${jit_file} "\#pragma once\n") -file(APPEND ${jit_file} "\#include \"lite/backends/x86/jit/helper.h\"\n") -file(APPEND ${jit_file} "\#include \"lite/backends/x86/jit/registry.h\"\n\n") - -set(JIT_KERNEL_DEPS x86_cpu_info cblas gflags xxhash) - -file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc) -lite_cc_library(jit_kernel_base SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) - -# refer must go first -add_subdirectory(refer) -add_subdirectory(more) -if(WITH_XBYAK) - add_subdirectory(gen) -endif() - -lite_cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) -#lite_cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper) - -#if(NOT WIN32) - #lite_cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper tensor) -#endif() diff --git a/lite/backends/x86/jit/README.en.md b/lite/backends/x86/jit/README.en.md deleted file mode 100644 index cd2aa5c242..0000000000 --- a/lite/backends/x86/jit/README.en.md +++ /dev/null @@ -1,103 +0,0 @@ -# JIT Kernel - -JIT (Just-In-Time) Kernel contains actually generated code together with other implementations of the same logic. -Each implementation has its own condition for use, defined in `CanBeUsed`. -They are combined to get the best performance for one single independent function. -They can be very simple functions like vector multiply, or complicated functions like LSTM. -They can also be composed with other existing jit kernels to build up a complex function. -Currently it is only supported on CPU. - -## Contents - -```txt -PaddlePaddle/Paddle/paddle/fluid/ -├── ... -└── lite/ - ├── .../ - └── jit/ - ├── ... - ├── gen/ - │ └── ... - ├── more/ - │ ├── ... - │ ├── mkl/ - │ │ └── ... - │ ├── mkldnn/ - │ │ └── ... - │ ├── mix/ - │ │ └── ... - │ ├── intrinsic/ - │ │ └── ... - │ └── openblas/ - │ └── ... - └── refer/ - └── ... -``` - -All basic definitions of jit kernels live in `lite/backends/x86/jit`, including the three key folders `refer`, `gen`, and `more`. Each kernel has exactly one unique name, while it may have several implementations with the same functionality. - -- `refer`: Each kernel must have one reference implementation on CPU; it should focus only on correctness and must not depend on any third-party libraries. -- `gen`: The generated code is kept here. These implementations are designed for the best performance and depend on Xbyak. -- `more`: All other implementations are kept in this folder, with one directory per library kind or method kind, such as mkl, mkldnn, openblas or intrinsic code. Each implementation should have its own advantage. - -## How to use - -We provide these methods to get the functions: -- `GetAllCandidateFuncs`. It returns all the supported implementations. All of them produce the same result, so you can run a runtime benchmark to choose which one should actually be used. -- `GetDefaultBestFunc`. It returns one default function pointer, tuned offline with some general configurations and attributes. This should cover most situations. -- `KernelFuncs::Cache()`. It gets the default best function and caches it for the next call with the same attribute. -- `GetReferFunc`. It returns only the CPU reference code; all the other implementations keep the same logic as this reference code. A verification sketch is shown right after this list.
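A minimal verification sketch (not part of the original README; the buffers `src_data`, `dst_ref`, `dst_out` and the size `width` are assumed to be prepared by the caller):

```cpp
  using T = float;
  jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum);
  // Reference implementation: the ground truth every candidate must match.
  auto ref = jit::GetReferFunc<jit::SeqPoolTuple<T>>();
  ref(src_data, dst_ref, &attr);
  // Default best implementation, fetched from the cache.
  auto best =
      jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(attr);
  best(src_data, dst_out, &attr);
  // dst_out should match dst_ref element-wise within floating-point tolerance.
```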
- -And here are some examples: - -Get from cache: - -```cpp - using T = float; - jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum); - auto seqpool_func = jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(attr); - seqpool_func(src_data, dst_data, &attr); -``` - -Get all implementations and run each once: - -```cpp - using T = float; - jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum); - auto funcs = jit::GetAllCandidateFuncsWithTypes<jit::SeqPoolTuple<T>, platform::CPUPlace>(attr); - for (auto f : funcs) { - LOG(INFO) << "Kernel implementation type: " << f.first; - f.second(src_data, dst_data, &attr); - } -``` - -All kernels are included in `lite/backends/x86/jit/kernels.h`, which is automatically generated at compile time; you only need to include this one header to get all the registered kernels. - -## Solid Test - -- Unit Test - All functions should be compared with the corresponding reference functions, covering both the `float` and `double` data types. -- Benchmark - All functions should be benchmarked, making sure that `jit::GetDefaultBestFunc` obtains the best performance under all attributes. - -# How to add new kernel - -## Required - -1. Add `your_key` to `KernelType`. -2. Add your new `KernelTuple`, which must include `your_key`. It should be a combination of the data type, attribute type and function type. You can refer to `SeqPoolTuple`. -3. Add the reference function of `your_key` (see the sketch after this list). -Note: - - it must run on CPU and must not depend on any third-party library. - - Add `USE_JITKERNEL_REFER(your_key)` in `refer/CMakeLists.txt` to make sure this code can be used. -4. Add a unit test in `test.cc`, and verify at least `float` and `double`. -Test more data types for some special functions if necessary, for example `int8`. -5. Add functions in `benchmark.cc` to benchmark all functions of the same `KernelType`, and make sure `GetDefaultBestFunc` always gets the best one.
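To make step 3 concrete, here is a sketch of a reference implementation for a hypothetical `kVAddOne` kernel (the key and the function are illustrative, not an existing kernel in this codebase):

```cpp
// refer: a plain CPU loop, correct by construction and free of
// third-party dependencies.
template <typename T>
void VAddOne(const T* x, T* y, int n) {
  for (int i = 0; i < n; ++i) {
    y[i] = x[i] + static_cast<T>(1);
  }
}
```

It would then be registered through the refer registration macro and enabled with `USE_JITKERNEL_REFER(kVAddOne)` in `refer/CMakeLists.txt`.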
-## Optional - -Add more implementations of `your_key` for better performance. - -1. Add functions based on generated code in `gen`. They should be derived from `JitCode` and have a corresponding creator derived from `JitCodeCreator`, registered under `your_key`. -2. If a new attribute type is added, you should specialize `JitCodeKey` for this type. -3. Add more functions in `more`; you can use any third-party library you wish, like mkl, mkldnn or intrinsic code, to reach the best performance. diff --git a/lite/backends/x86/jit/README.md b/lite/backends/x86/jit/README.md deleted file mode 100644 index 6998c5d867..0000000000 --- a/lite/backends/x86/jit/README.md +++ /dev/null @@ -1,94 +0,0 @@ -# JIT Kernel - -Combine function templates with JIT to generate the kernel functions that are needed. -A kernel here is a compute unit at a smaller granularity than an Operator kernel, focusing on performance on different hardware. It can have implementations from multiple third-party libraries, and each implementation has its own `CanBeUsed` function that decides under which conditions it may be called. -The functions implemented here can be very fine-grained, such as vector multiply, or complex logic such as LSTM; complex logic can also be composed from its own lower-level functions. -Currently only high-performance computation on CPU is supported. - -## Directory structure - -```txt -PaddlePaddle/Paddle/paddle/fluid/ -├── ... -└── lite/ - ├── .../ - └── jit/ - ├── ... - ├── gen/ - │ └── ... - ├── more/ - │ ├── ... - │ ├── mkl/ - │ │ └── ... - │ ├── mkldnn/ - │ │ └── ... - │ ├── mix/ - │ │ └── ... - │ ├── intrinsic/ - │ │ └── ... - │ └── openblas/ - │ └── ... - └── refer/ - └── ... -``` - -The definitions of the base classes live in the root directory, which contains the three directories gen, more and refer. Each directory holds one or more kinds of implementation. Every kernel must have a reference implementation, used as the baseline for unit tests; all other implementations are optional. -- gen: code generated with jit, which depends on the xbyak library. This implementation cares most about performance. -- refer: the reference implementation. Every kernel must have a CPU reference implementation, which cares mainly about the correctness of the algorithm logic. -- more: further implementations can be placed here, including mkl, mkldnn, intrinsic, openblas and so on, as well as compositions of existing kernels. - -## Dynamic acquisition - -- `GetAllCandidateFuncs` returns, for a given kernel type, all function implementations that satisfy the requirements. All implementations are guaranteed to produce the same result but differ in speed, so you can benchmark them at runtime for the concrete input attributes and pick the best one manually. -- `GetDefaultBestFunc` returns one default best implementation. It is the result of offline tuning with some general configurations, and covers the best choice in most cases. -- `KernelFuncs::Cache()` returns the default best function and caches the function pointer; when the same attribute appears again, the cached pointer is returned directly, otherwise a new one is created from the attribute. -- `GetReferFunc` returns the most primitive logic of the kernel. It is independent of the kernel's input size and attributes, and there is one and only one implementation, on CPU. It represents the original logic of the kernel, and all other implementations keep their logic consistent with it. - -### Examples - -To call any kernel, you only need to include the header `"lite/backends/x86/jit/kernels.h"`, which is generated automatically at compile time. - -Get the default best function directly from the cache: - -```cpp - using T = float; - jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum); - auto seqpool_func = jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(attr); - seqpool_func(src_data, dst_data, &attr); -``` - -Run all implementations once and print each implementation type: - -```cpp - using T = float; - jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum); - auto funcs = jit::GetAllCandidateFuncsWithTypes<jit::SeqPoolTuple<T>, platform::CPUPlace>(attr); - for (auto f : funcs) { - LOG(INFO) << "Kernel implementation type: " << f.first; - f.second(src_data, dst_data, &attr); - } -``` - -## Tests - -- Logic tests: every implementation must be compared against the refer code and meet the precision requirements, for both the float and double data types. -- Performance tests: the performance of all implementations is compared, including against the final `jit::GetDefaultBestFunc`, whose result must be the best under all conditions. - -# How to add a new kernel - -1. Add `your_key` to `KernelType`. -2. Implement the reference logic. It must be a CPU implementation and must not depend on any third-party library. After implementing it, add `USE_JITKERNEL_REFER(your_key)` in `refer/CMakeLists.txt` to enable the kernel. -3. (optional) Implement more algorithms in the `more` directory; they may depend on third-party libraries such as mkl, intrinsic or mkldnn. -4. (optional) Implement Xbyak-based code generation in the `gen` directory. The jitcode needs its own `JitCodeCreator`, registered under the same `KernelType` as refer. -5. Add a new `KernelTuple`, in one-to-one correspondence with `KernelType`. It bundles all the types: the data type, the attribute type, and the returned function type. You can refer to `SeqPoolTuple`. A newly added attribute type needs a specialization of the `JitCodeKey` method. -6. Add a unit test in `test.cc`, covering at least the `float` and `double` data types, and additional data types such as `int8` where necessary. -7. Add the corresponding performance comparison in `benchmark.cc`. For one kernel, all implementations must be compared, and the implementation obtained by `GetDefaultBestFunc` must always be the fastest. - -# Advantages -- Convenient interfaces and flexible invocation. -- One set of logic can have multiple implementations, relying on multiple third-party libraries without interfering with each other. -- A clear directory structure, avoiding the poor readability caused by many macro definitions in a single file. -- Easy to optimize: an implementation can be tuned for one specific attribute without affecting performance under other attributes. -- Multiple platforms are supported, including Linux, Mac and Windows; at minimum every platform works correctly, and each platform can later be optimized specifically. The framework layer can use a unified interface without caring about the underlying implementation. diff --git a/lite/backends/x86/jit/benchmark.cc b/lite/backends/x86/jit/benchmark.cc deleted file mode 100644 index c49984691e..0000000000 --- a/lite/backends/x86/jit/benchmark.cc +++ /dev/null @@ -1,576 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
*/ - -#include -#include -#include -#include -#include "gflags/gflags.h" -#include "glog/logging.h" -#include "lite/backends/x86/jit/kernels.h" -#include "lite/backends/x86/legacy_place.h" -#include "lite/core/tensor.h" - -DEFINE_int32(burning, 10, "Burning times."); -DEFINE_int32(repeat, 3000, "Repeat times."); -DEFINE_int32(max_size, 1000, "The Max size would be tested."); -DEFINE_string(filter, "", "The Benchmark name would be run."); - -class BenchJITKernel { - public: - BenchJITKernel() = default; - virtual ~BenchJITKernel() = default; - virtual void Run() = 0; - virtual const char* Name() = 0; - virtual const char* Dtype() = 0; - virtual const char* Place() = 0; -}; - -static std::vector g_all_benchmarks; - -BenchJITKernel* InsertBenchmark(BenchJITKernel* b) { - g_all_benchmarks.push_back(b); - return b; -} - -#define BENCH_JITKERNEL(name, dtype, place) \ - class BenchJITKernel_##name##_##dtype##_##place##_ : public BenchJITKernel { \ - public: \ - const char* Name() override { return #name; } \ - const char* Dtype() override { return #dtype; } \ - const char* Place() override { return #place; } \ - void Run() override; \ - }; \ - static auto inserted_##name##_##dtype##_##place##_ UNUSED = \ - InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_()); \ - void BenchJITKernel_##name##_##dtype##_##place##_::Run() - -void RUN_ALL_BENCHMARK() { - for (auto p : g_all_benchmarks) { - if (!FLAGS_filter.empty() && FLAGS_filter != p->Name()) { - continue; - } - LOG(INFO) << "Benchmark " << p->Name() << "." << p->Dtype() << "." - << p->Place(); - p->Run(); - } -} - -template -void RandomVec(const int n, - T* a, - const T lower = static_cast(-20.f), - const T upper = static_cast(20.f), - unsigned int seed = 100) { - std::mt19937 rng(seed); - std::uniform_real_distribution uniform_dist(0, 1); - for (int i = 0; i < n; ++i) { - a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); - } -} - -std::vector TestSizes() { - std::vector s; - for (int i = 1; i <= FLAGS_max_size; ++i) { - s.push_back(i); - } - return s; -} - -template -struct BenchFunc { - // return this function avg time - // TODO(TJ): clear cache every time - double operator()(const typename KernelTuple::func_type tgt, Args... args) { - for (int i = 0; i < FLAGS_burning; ++i) { - tgt(args...); - } - auto start = paddle::lite::PosixInNsec() * 1e-3; - for (int i = 0; i < FLAGS_repeat; ++i) { - tgt(args...); - } - auto end = paddle::lite::PosixInNsec() * 1e-3; - return static_cast(end - start) / FLAGS_repeat; - } -}; - -namespace jit = paddle::lite::jit; - -template -void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... 
args) { - BenchFunc benchmark; - std::vector> infos; - auto funcs = jit::GetAllCandidateFuncsWithTypes(attr); - for (auto f : funcs) { - infos.push_back(std::make_pair(f.first, benchmark(f.second, args...))); - } - - // Test result from Get function - auto tgt = jit::KernelFuncs::Cache().At(attr); - if (!tgt) { - LOG(FATAL) << "Target can not be empty!"; - } - infos.push_back(std::make_pair("Target", benchmark(tgt, args...))); - - // print - std::ostringstream loginfos; - loginfos << "Kernel Type " << jit::to_string(KernelTuple::kernel_type) << ": " - << attr << ": "; - for (auto pair : infos) { - loginfos << pair.first << " takes " << pair.second << " us; "; - } - LOG(INFO) << loginfos.str(); -} - -using Tensor = paddle::framework::Tensor; - -template -void BenchKernelXYZN() { - using T = typename KernelTuple::data_type; - for (int d : TestSizes()) { - Tensor x, y, z; - x.Resize({d}); - y.Resize({d}); - z.Resize({d}); - T* x_data = x.mutable_data(PlaceType()); - T* y_data = y.mutable_data(PlaceType()); - T* z_data = z.mutable_data(PlaceType()); - RandomVec(d, x_data); - RandomVec(d, y_data); - BenchAllImpls( - d, x.data(), y.data(), z_data, d); - // test inplace - BenchAllImpls(d, x.data(), z_data, z_data, d); - } -} - -template -void BenchKernelAXYN() { - using T = typename KernelTuple::data_type; - for (int d : TestSizes()) { - const T a = static_cast(3); - Tensor x, y; - x.Resize({d}); - y.Resize({d}); - T* x_data = x.mutable_data(PlaceType()); - T* y_data = y.mutable_data(PlaceType()); - RandomVec(d, x_data); - BenchAllImpls(d, &a, x.data(), y_data, d); - // test inplace - BenchAllImpls(d, &a, x.data(), x_data, d); - } -} - -template -void BenchKernelXRN() { - using T = typename KernelTuple::data_type; - for (int d : TestSizes()) { - Tensor x; - RandomVec(d, x.mutable_data({d}, PlaceType())); - T res; - BenchAllImpls(d, x.data(), &res, d); - } -} - -template -void BenchKernelXYN() { - using T = typename KernelTuple::data_type; - for (int d : TestSizes()) { - Tensor x, y; - x.Resize({d}); - y.Resize({d}); - T* x_data = x.mutable_data(PlaceType()); - T* y_data = y.mutable_data(PlaceType()); - RandomVec(d, x_data); - BenchAllImpls(d, x.data(), y_data, d); - } -} - -template -void BenchKernelLSTM() { - using T = typename KernelTuple::data_type; - for (bool use_peephole : {true, false}) { - for (int d : TestSizes()) { - const jit::lstm_attr_t attr( - d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh, use_peephole); - Tensor x, ct_1, ct, ht, wp, checked; - x.Resize({4 * d}); - ct_1.Resize({d}); - ct.Resize({d}); - ht.Resize({d}); - wp.Resize({3 * d}); - checked.Resize({2 * d}); - auto place = PlaceType(); - RandomVec(x.numel(), x.mutable_data(place), -2.f, 2.f); - RandomVec(wp.numel(), wp.mutable_data(place), -2.f, 2.f); - RandomVec(ct_1.numel(), ct_1.mutable_data(place), -2.f, 2.f); - const T* ct_1_data = ct_1.data(); - const T* wp_data = wp.data(); - T* x_data = x.mutable_data(place); - T* checked_data = checked.mutable_data(place); - T* ct_data = ct.mutable_data(place); - T* ht_data = ht.mutable_data(place); - jit::lstm_t step; - step.gates = x_data; - step.ct_1 = ct_1_data; - step.ct = ct_data; - step.ht = ht_data; - if (use_peephole) { - step.wp = wp_data; - step.checked = checked_data; - } - BenchAllImpls(attr, &step, &attr); - } - } -} - -template -void BenchKernelGRU() { - using T = typename KernelTuple::data_type; - for (int d : TestSizes()) { - const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); - auto place = PlaceType(); - Tensor x, ht_1, ht; - x.Resize({3 * d}); - 
ht_1.Resize({d}); - ht.Resize({d}); - RandomVec(3 * d, x.mutable_data(place), -2.f, 2.f); - RandomVec(d, ht_1.mutable_data(place), -2.f, 2.f); - const T* ht_1_data = ht_1.data(); - T* x_data = x.mutable_data(place); - T* ht_data = ht.mutable_data(place); - jit::gru_t step; - step.gates = x_data; - step.ht_1 = ht_1_data; - step.ht = ht_data; - BenchAllImpls(attr, &step, &attr); - } -} - -template -void BenchKernelSeqPool() { - using T = typename KernelTuple::data_type; - std::vector pool_types = { - jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; - for (auto type : pool_types) { - for (int w : TestSizes()) { - jit::seq_pool_attr_t attr(w, type); - for (int h : TestSizes()) { - attr.h = h; - Tensor x, y; - x.Resize({h * w}); - y.Resize({w}); - RandomVec(h * w, x.mutable_data(PlaceType()), -2.f, 2.f); - const T* x_data = x.data(); - T* y_data = y.mutable_data(PlaceType()); - BenchAllImpls(attr, x_data, y_data, &attr); - } - } - } -} - -template -void BenchKernelEmbSeqPool() { - using T = typename KernelTuple::data_type; - std::vector pool_types = {jit::SeqPoolType::kSum}; - int64_t tbl_h = 1e4; - for (int tbl_w : {10, 16, 256}) { - Tensor table; - table.Resize({tbl_h, tbl_w}); - RandomVec(tbl_h * tbl_w, table.mutable_data(PlaceType()), -2.f, 2.f); - const T* table_data = table.data(); - for (auto type : pool_types) { - for (int idx_w : {1, 2, 10, 16}) { - for (int idx_h : {1, 2, 9, 13, 16}) { - int64_t out_w = tbl_w * idx_w; - jit::emb_seq_pool_attr_t attr( - tbl_h, tbl_w, idx_h, idx_w, out_w, type); - Tensor idx, out; - idx.Resize({idx_h, idx_w}); - out.Resize({out_w}); - RandomVec(idx_h * idx_w, - idx.mutable_data(PlaceType()), - 0, - tbl_h - 1); - const int64_t* idx_data = idx.data(); - T* o_data = out.mutable_data(PlaceType()); - BenchAllImpls( - attr, table_data, idx_data, o_data, &attr); - } - } - } - } -} - -template -void BenchKernelSgd() { - using T = typename KernelTuple::data_type; - const T lr = 0.1; - auto UnDuplicatedRandomVec = []( - int n, const int64_t lower, const int64_t upper) -> std::vector { - PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); - PADDLE_ENFORCE_GT(n, 0); - std::vector all, out; - for (int i = 0; i < n; ++i) { - all.push_back(i); - } - std::random_shuffle(all.begin(), all.end()); - out.insert(out.begin(), all.begin(), all.begin() + n); - return out; - }; - for (int param_h : {1, 1000}) { - for (int grad_w : {1, 2, 8, 16, 30, 256}) { - // only benchmark inplace - Tensor param; - param.Resize({param_h, grad_w}); - T* param_data = param.mutable_data(PlaceType()); - RandomVec(param_h * grad_w, param_data, -2.f, 2.f); - for (int rows_size = 1; rows_size <= std::min(param_h, 10); ++rows_size) { - Tensor grad; - grad.Resize({rows_size, grad_w}); - std::vector rows = - UnDuplicatedRandomVec(rows_size, 0, rows_size - 1); - RandomVec( - rows_size * grad_w, grad.mutable_data(PlaceType()), -2.f, 2.f); - const T* grad_data = grad.data(); - const int64_t* rows_data = rows.data(); - jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size); - BenchAllImpls( - attr, &lr, param_data, grad_data, rows_data, param_data, &attr); - } - } - } -} - -template -void BenchKernelMatMul() { - using T = typename KernelTuple::data_type; - for (int m : {1, 2, 3, 4}) { - for (int n : TestSizes()) { - for (int k : TestSizes()) { - Tensor a, b, c; - a.Resize({m * k}); - b.Resize({k * n}); - c.Resize({m * n}); - RandomVec(m * k, a.mutable_data(PlaceType()), -2.f, 2.f); - RandomVec(k * n, b.mutable_data(PlaceType()), -2.f, 2.f); - const T* a_data = 
a.data(); - const T* b_data = b.data(); - T* c_data = c.mutable_data(PlaceType()); - const jit::matmul_attr_t attr{m, n, k}; - BenchAllImpls( - attr, a_data, b_data, c_data, &attr); - } - } - } -} - -template -void BenchKernelSoftmax() { - using T = typename KernelTuple::data_type; - for (int bs : {1, 2, 10}) { - for (int n : TestSizes()) { - Tensor x, y; - x.Resize({bs, n}); - y.Resize({bs, n}); - RandomVec(bs * n, x.mutable_data(PlaceType()), -2.f, 2.f); - const T* x_data = x.data(); - T* y_data = y.mutable_data(PlaceType()); - BenchAllImpls(n, x_data, y_data, n, bs, 1); - } - } -} - -template -void BenchKernelLayerNorm() { - using T = typename KernelTuple::data_type; - const T epsilon = 9.99999975e-06; - for (int n : {1, 2, 10}) { - for (int x_dim_0 : {1, 9, 17, 50}) { - int left = n * x_dim_0; - for (int x_dim_1 : TestSizes()) { - int right = x_dim_1; - int sz = left * right; - Tensor x, mean, var, scale, bias, out; - x.Resize({n, x_dim_0, x_dim_1}); - out.Resize({n, x_dim_0, x_dim_1}); - mean.Resize({n, x_dim_0}); - var.Resize({n, x_dim_0}); - scale.Resize({x_dim_1}); - bias.Resize({x_dim_1}); - - RandomVec(sz, x.mutable_data(PlaceType()), -2.f, 2.f); - RandomVec(left, mean.mutable_data(PlaceType()), -2.f, 2.f); - RandomVec(left, var.mutable_data(PlaceType()), -2.f, 2.f); - RandomVec(right, scale.mutable_data(PlaceType()), -2.f, 2.f); - RandomVec(right, bias.mutable_data(PlaceType()), -2.f, 2.f); - - const T* scale_data = scale.data(); - const T* bias_data = bias.data(); - T* x_data = x.data(); - T* mean_data = mean.data(); - T* var_data = var.data(); - T* out_data = out.mutable_data(PlaceType()); - - BenchAllImpls(right, - x_data, - out_data, - mean_data, - var_data, - scale_data, - bias_data, - left, - epsilon, - right); - } - } - } -} - -template -void BenchKernelCRFDecoding() { - using T = typename KernelTuple::data_type; - constexpr int state_trans_base_idx = 2; - for (int seq_len : {1, 11, 17, 50}) { - for (int tag_num : TestSizes()) { - int x_sz = seq_len * tag_num; - int w_sz = (tag_num + state_trans_base_idx) * tag_num; - Tensor x, w, alpha, track; - x.Resize({seq_len, tag_num}); - w.Resize({tag_num + state_trans_base_idx, tag_num}); - alpha.Resize({seq_len, tag_num}); - track.Resize({seq_len, tag_num}); - - RandomVec(x_sz, x.mutable_data(PlaceType()), -2.f, 2.f); - RandomVec(w_sz, w.mutable_data(PlaceType()), -2.f, 2.f); - - const T* x_data = x.data(); - const T* w_data = w.data(); - T* alpha_data = alpha.mutable_data(PlaceType()); - int* track_data = track.mutable_data(PlaceType()); - - BenchAllImpls( - tag_num, seq_len, x_data, w_data, alpha_data, track_data, tag_num); - } - } -} - -template -void BenchKernelVBroadcast() { - using T = typename KernelTuple::data_type; - for (int64_t w : {1, 16, 64, 100, 256}) { - Tensor x; - x.Resize({w}); - RandomVec(w, x.mutable_data(PlaceType())); - const T* x_data = x.data(); - for (int h : TestSizes()) { - Tensor y; - y.Resize({h * w}); - T* y_data = y.mutable_data(PlaceType()); - BenchAllImpls( - w, x_data, y_data, static_cast(h), w); - } - } -} - -#define BenchKernelVMul BenchKernelXYZN -#define BenchKernelVAdd BenchKernelXYZN -#define BenchKernelVAddRelu BenchKernelXYZN -#define BenchKernelVSub BenchKernelXYZN - -#define BenchKernelVScal BenchKernelAXYN -#define BenchKernelVAddBias BenchKernelAXYN - -#define BenchKernelVRelu BenchKernelXYN -#define BenchKernelVIdentity BenchKernelXYN -#define BenchKernelVSquare BenchKernelXYN -#define BenchKernelVExp BenchKernelXYN -#define BenchKernelVSigmoid BenchKernelXYN -#define 
BenchKernelVTanh BenchKernelXYN -#define BenchKernelVCopy BenchKernelXYN - -#define BenchKernelHMax BenchKernelXRN -#define BenchKernelHSum BenchKernelXRN - -#define BenchKernelLSTMCtHt BenchKernelLSTM -#define BenchKernelLSTMC1H1 BenchKernelLSTM - -#define BenchKernelGRUH1 BenchKernelGRU -#define BenchKernelGRUHtPart1 BenchKernelGRU -#define BenchKernelGRUHtPart2 BenchKernelGRU - -using CPUPlace = paddle::lite::fluid::CPUPlace; - -#define BENCH_FP32_CPU(name) \ - BENCH_JITKERNEL(name, FP32, CPU) { \ - BenchKernel##name, CPUPlace>(); \ - } - -// xyzn -BENCH_FP32_CPU(VMul); -BENCH_FP32_CPU(VAdd); -BENCH_FP32_CPU(VAddRelu); -BENCH_FP32_CPU(VSub); - -// axyn -BENCH_FP32_CPU(VScal); -BENCH_FP32_CPU(VAddBias); - -// xyn -BENCH_FP32_CPU(VRelu); -BENCH_FP32_CPU(VIdentity); -BENCH_FP32_CPU(VSquare); -BENCH_FP32_CPU(VExp); -BENCH_FP32_CPU(VSigmoid); -BENCH_FP32_CPU(VTanh); -BENCH_FP32_CPU(VCopy); - -// xrn -BENCH_FP32_CPU(HMax); -BENCH_FP32_CPU(HSum); - -// LSTM -BENCH_FP32_CPU(LSTMCtHt); -BENCH_FP32_CPU(LSTMC1H1); - -// GRU -BENCH_FP32_CPU(GRUH1); -BENCH_FP32_CPU(GRUHtPart1); -BENCH_FP32_CPU(GRUHtPart2); - -BENCH_FP32_CPU(LayerNorm); -BENCH_FP32_CPU(CRFDecoding); - -BENCH_FP32_CPU(SeqPool); -BENCH_FP32_CPU(EmbSeqPool); -BENCH_FP32_CPU(MatMul); -BENCH_FP32_CPU(Softmax); -BENCH_FP32_CPU(Sgd); -BENCH_FP32_CPU(VBroadcast); - -// Benchmark all jit kernels including jitcode, mkl and refer. -// To use this tool, run command: ./benchmark [options...] -// Options: -// --burning: the burning time before count -// --repeat: the repeat times -// --max_size: the max size would be tested -// --filter: the bench name would be run -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - google::InitGoogleLogging(argv[0]); - LOG(INFO) << "Burning " << FLAGS_burning << " times, Repeat " << FLAGS_repeat - << " times."; - - RUN_ALL_BENCHMARK(); -} diff --git a/lite/backends/x86/jit/gen/CMakeLists.txt b/lite/backends/x86/jit/gen/CMakeLists.txt deleted file mode 100644 index 99244ea9bd..0000000000 --- a/lite/backends/x86/jit/gen/CMakeLists.txt +++ /dev/null @@ -1,36 +0,0 @@ - -file(GLOB jitcode_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") - -cc_library(jit_kernel_jitcode SRCS ${jitcode_cc_srcs} DEPS jit_kernel_base xbyak) -set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE) - -function(USE_JITKERNEL_GEN TARGET) - file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n") -endfunction() - -# use gen jitcode kernel by name -USE_JITKERNEL_GEN(kMatMul) -USE_JITKERNEL_GEN(kVMul) -USE_JITKERNEL_GEN(kVAdd) -USE_JITKERNEL_GEN(kVSub) -USE_JITKERNEL_GEN(kVAddRelu) -USE_JITKERNEL_GEN(kVScal) -USE_JITKERNEL_GEN(kVAddBias) -USE_JITKERNEL_GEN(kVRelu) -USE_JITKERNEL_GEN(kVSquare) -USE_JITKERNEL_GEN(kVIdentity) -USE_JITKERNEL_GEN(kVExp) -USE_JITKERNEL_GEN(kVSigmoid) -USE_JITKERNEL_GEN(kVTanh) -USE_JITKERNEL_GEN(kLSTMCtHt) -USE_JITKERNEL_GEN(kLSTMC1H1) -USE_JITKERNEL_GEN(kGRUH1) -USE_JITKERNEL_GEN(kGRUHtPart1) -USE_JITKERNEL_GEN(kGRUHtPart2) -USE_JITKERNEL_GEN(kNCHW16CMulNC) -USE_JITKERNEL_GEN(kSeqPool) -USE_JITKERNEL_GEN(kHMax) -USE_JITKERNEL_GEN(kHSum) -USE_JITKERNEL_GEN(kEmbSeqPool) -USE_JITKERNEL_GEN(kSgd) -USE_JITKERNEL_GEN(kVBroadcast) diff --git a/lite/backends/x86/jit/gen/act.cc b/lite/backends/x86/jit/gen/act.cc deleted file mode 100644 index f1f261c199..0000000000 --- a/lite/backends/x86/jit/gen/act.cc +++ /dev/null @@ -1,164 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/gen/act.h" -#include -#include "lite/backends/x86/cpu_info.h" -#include "lite/backends/x86/jit/registry.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -const float ALIGN32_BEG exp_float_consts[] ALIGN32_END = { - REPEAT_8TIMES(1.f), - REPEAT_8TIMES(2.f), - REPEAT_8TIMES(0.5f), - REPEAT_8TIMES(EXP_HIG), - REPEAT_8TIMES(EXP_LOW), - REPEAT_8TIMES(CEPHES_LOG2EF), - REPEAT_8TIMES(CEPHES_EXP_C1), - REPEAT_8TIMES(CEPHES_EXP_C2), - REPEAT_8TIMES(CEPHES_EXP_P0), - REPEAT_8TIMES(CEPHES_EXP_P1), - REPEAT_8TIMES(CEPHES_EXP_P2), - REPEAT_8TIMES(CEPHES_EXP_P3), - REPEAT_8TIMES(CEPHES_EXP_P4), - REPEAT_8TIMES(CEPHES_EXP_P5), - REPEAT_8TIMES(EXP_MAX_INPUT), - REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX), - REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)}; - -const int ALIGN32_BEG exp_int_0x7f[] ALIGN32_END = {REPEAT_8TIMES(0x7f)}; -int ALIGN32_BEG g_tmp_mem[16] ALIGN32_END = {0}; - -void VActJitCode::genCode() { - int offset = 0; - for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { - vmovups(ymm_src, ptr[param1 + offset]); - act(ymm_dst, ymm_src, type_); - vmovups(ptr[param2 + offset], ymm_dst); - offset += sizeof(float) * YMM_FLOAT_BLOCK; - } - int rest = num_ % YMM_FLOAT_BLOCK; - while (rest > 0) { - int block = XMM_FLOAT_BLOCK; - if (rest >= 4) { - block = 4; - vmovups(xmm_src, ptr[param1 + offset]); - } else if (rest >= 2) { - block = 2; - vmovq(xmm_src, ptr[param1 + offset]); - } else { - block = 1; - vmovss(xmm_src, ptr[param1 + offset]); - } - act(xmm_dst, xmm_src, type_); - if (rest >= 4) { - vmovups(ptr[param2 + offset], xmm_dst); - } else if (rest >= 2) { - vmovq(ptr[param2 + offset], xmm_dst); - } else { - vmovss(ptr[param2 + offset], xmm_dst); - } - offset += sizeof(float) * block; - rest -= block; - } - ret(); -} - -#define DECLARE_ACT_CREATOR(name) \ - class name##Creator : public JitCodeCreator { \ - public: \ - bool CanBeUsed(const int& attr) const override; \ - size_t CodeSize(const int& d) const override; \ - std::unique_ptr CreateJitCode(const int& attr) const override { \ - return make_unique(attr, CodeSize(attr)); \ - } \ - } - -DECLARE_ACT_CREATOR(VRelu); -DECLARE_ACT_CREATOR(VSquare); -DECLARE_ACT_CREATOR(VIdentity); -DECLARE_ACT_CREATOR(VExp); -DECLARE_ACT_CREATOR(VSigmoid); -DECLARE_ACT_CREATOR(VTanh); - -// TODO(TJ): tuning use me -bool VReluCreator::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx); -} - -bool VSquareCreator::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx); -} - -bool VIdentityCreator::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx); -} - -bool VExpCreator::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx) && d < 32; -} - -bool VSigmoidCreator::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx); -} - -bool VTanhCreator::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx); -} - -size_t VReluCreator::CodeSize(const int& d) const { - return 96 /* init size */ + - (d / 
YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ * - 8 /* average bytes for each instruction */; -} - -size_t VSquareCreator::CodeSize(const int& d) const { - return 96 + (d / YMM_FLOAT_BLOCK + 3) * 4 * 8; -} - -size_t VIdentityCreator::CodeSize(const int& d) const { - return 96 + (d / YMM_FLOAT_BLOCK + 3) * 4 * 8; -} - -size_t VExpCreator::CodeSize(const int& d) const { - return 96 + (d / YMM_FLOAT_BLOCK + 3) * 70 * 8; -} - -size_t VSigmoidCreator::CodeSize(const int& d) const { - return 96 + (d / YMM_FLOAT_BLOCK + 3) * 82 * 8; -} - -size_t VTanhCreator::CodeSize(const int& d) const { - return 96 + (d / YMM_FLOAT_BLOCK + 3) * 84 * 8; -} - -#undef DECLARE_ACT_CREATOR - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace gen = paddle::lite::jit::gen; - -REGISTER_JITKERNEL_GEN(kVRelu, gen::VReluCreator); -REGISTER_JITKERNEL_GEN(kVSquare, gen::VSquareCreator); -REGISTER_JITKERNEL_GEN(kVIdentity, gen::VIdentityCreator); -REGISTER_JITKERNEL_GEN(kVExp, gen::VExpCreator); -REGISTER_JITKERNEL_GEN(kVSigmoid, gen::VSigmoidCreator); -REGISTER_JITKERNEL_GEN(kVTanh, gen::VTanhCreator); diff --git a/lite/backends/x86/jit/gen/act.h b/lite/backends/x86/jit/gen/act.h deleted file mode 100644 index 6366cff3c8..0000000000 --- a/lite/backends/x86/jit/gen/act.h +++ /dev/null @@ -1,347 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ - -#pragma once - -#include -#include -#include "lite/backends/x86/jit/gen/jitcode.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -extern const float exp_float_consts[]; -extern const int exp_int_0x7f[]; -extern int g_tmp_mem[]; - -#define EXP_HIG 88.3762626647949f -#define EXP_LOW -88.3762626647949f -#define CEPHES_LOG2EF 1.44269504088896341 -#define CEPHES_EXP_C1 0.693359375 -#define CEPHES_EXP_C2 -2.12194440e-4 -#define CEPHES_EXP_P0 1.9875691500E-4 -#define CEPHES_EXP_P1 1.3981999507E-3 -#define CEPHES_EXP_P2 8.3334519073E-3 -#define CEPHES_EXP_P3 4.1665795894E-2 -#define CEPHES_EXP_P4 1.6666665459E-1 -#define CEPHES_EXP_P5 5.0000001201E-1 - -#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val - -#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float) - -class VActFunc : public JitCode { - public: - explicit VActFunc(size_t code_size, void* code_ptr) - : JitCode(code_size, code_ptr) {} - virtual void genCode() = 0; - - protected: - // compute RELU with ymm, xmm - template - void relu_jmm(JMM& dst, JMM& src, int zero_idx = 15) { // NOLINT - JMM zero = JMM(zero_idx); - vxorps(zero, zero, zero); - vmaxps(dst, src, zero); - } - - // compute SQUARE with ymm, xmm - template - void square_jmm(JMM& dst, JMM& src) { // NOLINT - vmulps(dst, src, src); - } - - // compute EXP with ymm, xmm - template - void exp_jmm(JMM& dst, // NOLINT - JMM& src, // NOLINT - int src_idx = 11, - int fx_idx = 12, // NOLINT - int fy_idx = 13, - int mask_idx = 14, - int tmp_idx = 15) { - using namespace x86; // NOLINT - // check all idx can not equal - JMM jmm_src = JMM(src_idx); - JMM jmm_fx = JMM(fx_idx); - JMM jmm_fy = JMM(fy_idx); - JMM jmm_mask = JMM(mask_idx); - JMM jmm_tmp = JMM(tmp_idx); - reg64_t reg_ptr_global = rax; - push(reg_ptr_global); - vmovaps(jmm_src, src); - mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); - vminps(jmm_src, jmm_src, jmm_tmp); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]); - vmaxps(jmm_src, jmm_src, jmm_tmp); - // express exp(x) as exp(g + n*log(2)) - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); - vmulps(jmm_fx, jmm_src, jmm_tmp); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); - vaddps(jmm_fx, jmm_fx, jmm_tmp); - vroundps(jmm_fy, jmm_fx, 0x01); - // if greater, substract 1 - vcmpgtps(jmm_mask, jmm_fy, jmm_fx); - vmovaps(jmm_tmp, ptr[reg_ptr_global]); - vandps(jmm_mask, jmm_mask, jmm_tmp); - vsubps(jmm_fx, jmm_fy, jmm_mask); - vmovaps(jmm_tmp, ptr[reg_ptr_global + 
OFFSET_EXP_C1]); - vmulps(jmm_fy, jmm_fx, jmm_tmp); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); - JMM ymm_z = JMM(jmm_mask.getIdx()); - vmulps(ymm_z, jmm_fx, jmm_tmp); - vsubps(jmm_src, jmm_src, jmm_fy); - vsubps(jmm_src, jmm_src, ymm_z); - vmulps(ymm_z, jmm_src, jmm_src); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); - vmulps(dst, jmm_src, jmm_tmp); - for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; - i += (YMM_FLOAT_BLOCK * sizeof(float))) { - vmovaps(jmm_tmp, ptr[reg_ptr_global + i]); // P1~P4 - vaddps(dst, dst, jmm_tmp); - vmulps(dst, dst, jmm_src); - } - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); - vaddps(dst, dst, jmm_tmp); - vmulps(dst, dst, ymm_z); - vaddps(dst, dst, jmm_src); - vmovaps(jmm_tmp, ptr[reg_ptr_global]); - vaddps(dst, dst, jmm_tmp); - // build 2^n - JMM ymm_int = jmm_fx; - vcvttps2dq(ymm_int, jmm_fx); - mov(reg_ptr_global, reinterpret_cast(exp_int_0x7f)); - vmovdqa(jmm_tmp, ptr[reg_ptr_global]); - if (MayIUse(avx2) || std::is_same::value) { - vpaddd(ymm_int, ymm_int, jmm_tmp); - vpslld(ymm_int, ymm_int, 23); - } else if (MayIUse(avx)) { - xmm_t xtmp1 = xmm_t(ymm_int.getIdx()); - xmm_t xtmp2 = xmm_t(jmm_tmp.getIdx()); - reg64_t reg_ptr_tmp = reg_ptr_global; - mov(reg_ptr_tmp, reinterpret_cast(g_tmp_mem)); - vmovdqa(ptr[reg_ptr_tmp], ymm_int); - vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], jmm_tmp); - vpaddd(xtmp1, xtmp1, xtmp2); - vpslld(xtmp1, xtmp1, 23); - vmovdqa(ptr[reg_ptr_tmp], xtmp1); - // next 128bits - vmovdqa(xtmp1, ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)]); - vmovdqa(xtmp2, - ptr[reg_ptr_tmp + - (YMM_FLOAT_BLOCK + XMM_FLOAT_BLOCK) * sizeof(float)]); - vpaddd(xtmp1, xtmp1, xtmp2); - vpslld(xtmp1, xtmp1, 23); - vmovdqa(ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)], xtmp1); - // load out - vmovdqa(ymm_int, ptr[reg_ptr_tmp]); - } - vmulps(dst, dst, ymm_int); - pop(reg_ptr_global); - } - - // compute SIGMOID with ymm, xmm - template - void sigmoid_jmm(JMM& dst, // NOLINT - JMM& src, // NOLINT - int src_idx = 11, // NOLINT - int fx_idx = 12, - int fy_idx = 13, - int mask_idx = 14, - int tmp_idx = 15) { - // y = 1 / (1 + e^-x) - JMM jmm_tmp = JMM(tmp_idx); - JMM jmm_src = JMM(src_idx); - reg64_t reg_ptr_global = rax; - push(reg_ptr_global); - vmovaps(jmm_src, src); - mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]); - vminps(jmm_src, jmm_src, jmm_tmp); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]); - vmaxps(jmm_src, jmm_src, jmm_tmp); - vxorps(jmm_tmp, jmm_tmp, jmm_tmp); - vsubps(jmm_src, jmm_tmp, jmm_src); - exp_jmm(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); - vaddps(dst, dst, jmm_tmp); - vdivps(dst, jmm_tmp, dst); - pop(reg_ptr_global); - } - - // compute TANH with ymm, xmm - template - void tanh_jmm(JMM& dst, // NOLINT - JMM& src, // NOLINT - int src_idx = 11, // NOLINT - int fx_idx = 12, - int fy_idx = 13, - int mask_idx = 14, - int tmp_idx = 15) { - // y = 2 / (1 + e^(-2x)) - 1 - JMM jmm_src = JMM(src_idx); - JMM jmm_tmp = JMM(tmp_idx); - JMM jmm_zero = JMM(mask_idx); - reg64_t reg_ptr_global = rax; - push(reg_ptr_global); - vmovaps(jmm_src, src); - mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); - vxorps(jmm_zero, jmm_zero, jmm_zero); - vsubps(jmm_tmp, jmm_zero, jmm_tmp); - vmulps(jmm_src, jmm_src, jmm_tmp); - exp_jmm(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, 
tmp_idx); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); - vaddps(dst, dst, jmm_tmp); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); - vdivps(dst, jmm_tmp, dst); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); - vsubps(dst, dst, jmm_tmp); - pop(reg_ptr_global); - } - - // compute IDENTITY with ymm, xmm - template - void identity_jmm(JMM& dst, JMM& src, int zero_idx) { // NOLINT - JMM zero = JMM(zero_idx); - vxorps(zero, zero, zero); - vaddps(dst, src, zero); - // TODO(TJ): use below - // dst.setIdx(src.getIdx()); - } - - template - void act(JMM& dst, JMM& src, operand_type type) { // NOLINT - // use 11~15 - switch (type) { - case operand_type::RELU: - relu_jmm(dst, src, 15); - break; - case operand_type::SQUARE: - square_jmm(dst, src); - break; - case operand_type::EXP: - exp_jmm(dst, src, 11, 12, 13, 14, 15); - break; - case operand_type::SIGMOID: - sigmoid_jmm(dst, src, 11, 12, 13, 14, 15); - break; - case operand_type::TANH: - tanh_jmm(dst, src, 11, 12, 13, 14, 15); - break; - case operand_type::IDENTITY: - identity_jmm(dst, src, 15); - break; - default: - LOG(FATAL) << "Do not support this operand type: " << type; - break; - } - } -}; - -class VActJitCode : public VActFunc { - public: - explicit VActJitCode(int d, - operand_type type, - size_t code_size, - void* code_ptr = nullptr) - : VActFunc(code_size, code_ptr), num_(d), type_(type) { - if (!(type_ == operand_type::RELU || type_ == operand_type::EXP || - type_ == operand_type::SIGMOID || type_ == operand_type::TANH || - type_ == operand_type::IDENTITY || type_ == operand_type::SQUARE)) { - LOG(FATAL) << "Do not support this operand type: " << type_; - } - this->genCode(); - } - - std::string name() const override { - std::string base = "VActJitCode"; - switch (type_) { - case operand_type::RELU: - base += "_Relu"; - break; - case operand_type::SQUARE: - base += "_Square"; - break; - case operand_type::EXP: - base += "_Exp"; - break; - case operand_type::SIGMOID: - base += "_Sigmoid"; - break; - case operand_type::TANH: - base += "_Tanh"; - break; - case operand_type::IDENTITY: - base += "_Identity"; - break; - default: - break; - } - return base; - } - void genCode() override; - - protected: - int num_; - operand_type type_; - reg64_t param1{abi_param1}; - reg64_t param2{abi_param2}; - - xmm_t xmm_src = xmm_t(0); - ymm_t ymm_src = ymm_t(0); - - xmm_t xmm_dst = xmm_t(1); - ymm_t ymm_dst = ymm_t(1); -}; - -#define DECLARE_ACT_JITCODE(name, op_type) \ - class name##JitCode : public VActJitCode { \ - public: \ - explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \ - : VActJitCode(d, op_type, code_size, code_ptr) {} \ - }; - -DECLARE_ACT_JITCODE(VRelu, operand_type::RELU); -DECLARE_ACT_JITCODE(VSquare, operand_type::SQUARE); -DECLARE_ACT_JITCODE(VIdentity, operand_type::IDENTITY); -DECLARE_ACT_JITCODE(VExp, operand_type::EXP); -DECLARE_ACT_JITCODE(VSigmoid, operand_type::SIGMOID); -DECLARE_ACT_JITCODE(VTanh, operand_type::TANH); - -#undef DECLARE_ACT_JITCODE - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/blas.cc b/lite/backends/x86/jit/gen/blas.cc deleted file mode 100644 index 0bddea6ace..0000000000 --- a/lite/backends/x86/jit/gen/blas.cc +++ /dev/null @@ -1,190 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
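exp_jmm above encodes the classic Cephes-style range reduction entirely in AVX registers, and sigmoid_jmm and tanh_jmm reuse it through the identities 1/(1+e^-x) and 2/(1+e^-2x)-1. A scalar C++ sketch of the same approximation, using the CEPHES_* constants from this header, may help when auditing the generated assembly (std::ldexp stands in for the vpaddd/vpslld exponent-building trick):

#include <cmath>
#include <cstdio>

// Scalar model of exp_jmm: clamp x to [EXP_LOW, EXP_HIG], write
// exp(x) = 2^n * exp(g) with n = round(x * log2(e)), then evaluate a
// degree-5 polynomial for exp(g).
float CephesExp(float x) {
  x = std::fmin(x, 88.3762626647949f);   // EXP_HIG
  x = std::fmax(x, -88.3762626647949f);  // EXP_LOW
  float fx = x * 1.44269504088896341f + 0.5f;  // x * log2(e), biased by 0.5
  float n = std::floor(fx);                    // vroundps toward -infinity
  // g = x - n * ln(2); ln(2) is split into C1 + C2 for extra precision
  float g = x - n * 0.693359375f + n * 2.12194440e-4f;
  float p = 1.9875691500e-4f;    // P0
  p = p * g + 1.3981999507e-3f;  // P1
  p = p * g + 8.3334519073e-3f;  // P2
  p = p * g + 4.1665795894e-2f;  // P3
  p = p * g + 1.6666665459e-1f;  // P4
  p = p * g + 5.0000001201e-1f;  // P5
  float eg = p * g * g + g + 1.0f;
  // The JIT builds 2^n by adding 0x7f to n and shifting it into the float
  // exponent field (vpaddd + vpslld 23); std::ldexp is the portable version.
  return std::ldexp(eg, static_cast<int>(n));
}

int main() {
  for (float x : {-1.0f, 0.0f, 1.0f, 10.0f}) {
    std::printf("x=%5.1f approx=%g libm=%g\n", x, CephesExp(x), std::exp(x));
  }
  return 0;
}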
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "lite/backends/x86/jit/gen/blas.h"
-#include
-#include "lite/backends/x86/jit/registry.h"
-
-namespace paddle {
-namespace lite {
-namespace jit {
-namespace gen {
-
-void VXXJitCode::genCode() {
-  // No stack frame is needed here, and the AVX512 registers need not be
-  // saved because this kernel does not use AVX512.
-  int offset = 0;
-  if (with_relu_) {
-    vxorps(ymm_zero, ymm_zero, ymm_zero);
-  }
-  if (scalar_index_ == 1) {
-    vbroadcastss(ymm_src1, ptr[param1]);
-  } else if (scalar_index_ == 2) {
-    vbroadcastss(ymm_src2, ptr[param2]);
-  }
-  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
-    if (scalar_index_ != 1) {
-      vmovups(ymm_src1, ptr[param1 + offset]);
-    }
-    if (scalar_index_ != 2) {
-      vmovups(ymm_src2, ptr[param2 + offset]);
-    }
-    if (type_ == operand_type::MUL) {
-      vmulps(ymm_dst, ymm_src1, ymm_src2);
-    } else if (type_ == operand_type::ADD) {
-      vaddps(ymm_dst, ymm_src1, ymm_src2);
-    } else if (type_ == operand_type::SUB) {
-      vsubps(ymm_dst, ymm_src1, ymm_src2);
-    }
-    if (with_relu_) {
-      vmaxps(ymm_dst, ymm_zero, ymm_dst);
-    }
-    vmovups(ptr[param3 + offset], ymm_dst);
-    offset += sizeof(float) * YMM_FLOAT_BLOCK;
-  }
-  int rest = num_ % YMM_FLOAT_BLOCK;
-  while (rest > 0) {
-    int block = XMM_FLOAT_BLOCK;
-    if (rest >= 4) {
-      block = 4;
-      if (scalar_index_ != 1) {
-        vmovups(xmm_src1, ptr[param1 + offset]);
-      }
-      if (scalar_index_ != 2) {
-        vmovups(xmm_src2, ptr[param2 + offset]);
-      }
-    } else if (rest >= 2) {
-      block = 2;
-      if (scalar_index_ != 1) {
-        vmovq(xmm_src1, ptr[param1 + offset]);
-      }
-      if (scalar_index_ != 2) {
-        vmovq(xmm_src2, ptr[param2 + offset]);
-      }
-    } else {
-      block = 1;
-      if (scalar_index_ != 1) {
-        vmovss(xmm_src1, ptr[param1 + offset]);
-      }
-      if (scalar_index_ != 2) {
-        vmovss(xmm_src2, ptr[param2 + offset]);
-      }
-    }
-    switch (type_) {
-      case operand_type::MUL:
-        vmulps(xmm_dst, xmm_src1, xmm_src2);
-        break;
-      case operand_type::ADD:
-        vaddps(xmm_dst, xmm_src1, xmm_src2);
-        break;
-      case operand_type::SUB:
-        vsubps(xmm_dst, xmm_src1, xmm_src2);
-        break;
-      default:
-        break;
-    }
-    if (with_relu_) {
-      vmaxps(xmm_dst, xmm_zero, xmm_dst);
-    }
-    if (rest >= 4) {
-      vmovups(ptr[param3 + offset], xmm_dst);
-    } else if (rest >= 2) {
-      vmovq(ptr[param3 + offset], xmm_dst);
-    } else {
-      vmovss(ptr[param3 + offset], xmm_dst);
-    }
-    offset += sizeof(float) * block;
-    rest -= block;
-  }
-  ret();
-}
-
-void NCHW16CMulNCJitCode::genCode() {
-  // RDI is ptr x_input
-  // RSI is ptr y_input
-  // RDX is ptr output
-  // RCX is height
-  // r8 is width
-
-  push(rbx);
-
-  xor_(rax, rax);
-  xor_(r10, r10);
-  vmovups(zmm3, ptr[rsi]);
-
-  L("h_loop");
-  xor_(rbx, rbx);
-  L("w_loop");
-  vmovups(zmm2, ptr[rdi + rax]);
-  vmulps(zmm1, zmm2, zmm3);
-  vmovups(ptr[rdx + rax], zmm1);
-  add(rax, 64);
-  inc(rbx);
-  cmp(r8, rbx);
-  jnz("w_loop");
-  inc(r10);
-  cmp(r10, rcx);
-  jnz("h_loop");
-
-  pop(rbx);
-  ret();
-}
-
-class NCHW16CMulNCCreator : public JitCodeCreator<int> {
- public:
-  bool CanBeUsed(const int& attr) const override {
-    return x86::MayIUse(x86::avx512f);
-  }
-  size_t CodeSize(const int& d) const override { return 256 * 1024; }
-  std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override {
return make_unique(attr, CodeSize(attr)); - } -}; - -#define DECLARE_BLAS_CREATOR(name) \ - class name##Creator : public JitCodeCreator { \ - public: \ - bool CanBeUsed(const int& attr) const override { \ - return x86::MayIUse(x86::avx) && attr <= 1024; \ - } \ - size_t CodeSize(const int& d) const override { \ - return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \ - } \ - std::unique_ptr CreateJitCode(const int& attr) const override { \ - return make_unique(attr, CodeSize(attr)); \ - } \ - } - -DECLARE_BLAS_CREATOR(VMul); -DECLARE_BLAS_CREATOR(VAdd); -DECLARE_BLAS_CREATOR(VSub); -DECLARE_BLAS_CREATOR(VAddRelu); -DECLARE_BLAS_CREATOR(VScal); -DECLARE_BLAS_CREATOR(VAddBias); - -#undef DECLARE_BLAS_CREATOR - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace gen = paddle::lite::jit::gen; - -REGISTER_JITKERNEL_GEN(kVMul, gen::VMulCreator); -REGISTER_JITKERNEL_GEN(kVAdd, gen::VAddCreator); -REGISTER_JITKERNEL_GEN(kVSub, gen::VSubCreator); -REGISTER_JITKERNEL_GEN(kVAddRelu, gen::VAddReluCreator); -REGISTER_JITKERNEL_GEN(kVScal, gen::VScalCreator); -REGISTER_JITKERNEL_GEN(kVAddBias, gen::VAddBiasCreator); -REGISTER_JITKERNEL_GEN(kNCHW16CMulNC, gen::NCHW16CMulNCCreator); diff --git a/lite/backends/x86/jit/gen/blas.h b/lite/backends/x86/jit/gen/blas.h deleted file mode 100644 index 39920195b2..0000000000 --- a/lite/backends/x86/jit/gen/blas.h +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once - -#include -#include "glog/logging.h" -#include "lite/backends/x86/jit/gen/jitcode.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -// function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu) -class VXXJitCode : public JitCode { - public: - explicit VXXJitCode(int d, - operand_type type, - int scalar_index, - bool with_relu, - size_t code_size = 256 * 1024, - void* code_ptr = nullptr) - : JitCode(code_size, code_ptr), - num_(d), - type_(type), - scalar_index_(scalar_index), - with_relu_(with_relu) { - if (!(type_ == operand_type::MUL || type_ == operand_type::ADD || - type_ == operand_type::SUB)) { - LOG(FATAL) << "Do not support this operand type: " << type_; - } - this->genCode(); - } - - std::string name() const override { - std::string base = "VXXJitCode"; - if (scalar_index_ == 1) { - base += "_Scalar"; - } else { - base += "_Vec"; - } - if (type_ == operand_type::MUL) { - base += "_Mul"; - } else if (type_ == operand_type::ADD) { - base += "_Add"; - } else if (type_ == operand_type::SUB) { - base += "_SUB"; - } - if (scalar_index_ == 2) { - base += "_Scalar"; - } else { - base += "_Vec"; - } - base += (with_relu_ ? 
"_Relu" : ""); - base += "_D" + std::to_string(num_); - return base; - } - void genCode() override; - - private: - int num_; - operand_type type_; - int scalar_index_; - bool with_relu_; - reg64_t param1{abi_param1}; - reg64_t param2{abi_param2}; - reg64_t param3{abi_param3}; - - xmm_t xmm_src1 = xmm_t(0); - xmm_t xmm_src2 = xmm_t(1); - xmm_t xmm_dst = xmm_t(2); - xmm_t xmm_zero = xmm_t(3); - - ymm_t ymm_src1 = ymm_t(0); - ymm_t ymm_src2 = ymm_t(1); - ymm_t ymm_dst = ymm_t(2); - ymm_t ymm_zero = ymm_t(3); -}; - -#define DECLARE_BLAS_JITCODE(name, op_type, scalar_idx, with_relu) \ - class name##JitCode : public VXXJitCode { \ - public: \ - explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \ - : VXXJitCode(d, op_type, scalar_idx, with_relu, code_size, code_ptr) { \ - } \ - }; - -DECLARE_BLAS_JITCODE(VMul, operand_type::MUL, 0, false); -DECLARE_BLAS_JITCODE(VAdd, operand_type::ADD, 0, false); -DECLARE_BLAS_JITCODE(VSub, operand_type::SUB, 0, false); -DECLARE_BLAS_JITCODE(VAddRelu, operand_type::ADD, 0, true); -DECLARE_BLAS_JITCODE(VScal, operand_type::MUL, 1, false); -DECLARE_BLAS_JITCODE(VAddBias, operand_type::ADD, 1, false); - -#undef DECLARE_BLAS_JITCODE - -// nChw16c = nChw16c .* NC -class NCHW16CMulNCJitCode : public JitCode { - public: - DECLARE_JIT_CODE(NCHW16CMulNCJitCode); - explicit NCHW16CMulNCJitCode(int d /*unused*/, - size_t code_size, - void* code_ptr = nullptr) - : JitCode(code_size, code_ptr) { - this->genCode(); - } - void genCode() override; -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/embseqpool.cc b/lite/backends/x86/jit/gen/embseqpool.cc deleted file mode 100644 index 2ff6894383..0000000000 --- a/lite/backends/x86/jit/gen/embseqpool.cc +++ /dev/null @@ -1,148 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ - -#include "lite/backends/x86/jit/gen/embseqpool.h" -#include // offsetof -#include -#include -#include "lite/backends/x86/jit/gen/act.h" // for exp_float_consts ones -#include "lite/backends/x86/jit/registry.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -void EmbSeqPoolJitCode::genCode() { - preCode(); - constexpr int block = YMM_FLOAT_BLOCK; - constexpr int max_num_regs = 8; - const int num_block = tbl_w_ / block; - const int num_groups = num_block / max_num_regs; - const size_t block_size = sizeof(float) * block; - std::vector groups(num_groups, max_num_regs); - int rest_num_regs = num_block % max_num_regs; - if (rest_num_regs > 0) { - groups.push_back(rest_num_regs); - } - - // protect param_dst - mov(reg_ptr_param_dst, param_dst); - mov(reg_idx_width_in_byte, - qword[param_attr + offsetof(emb_seq_pool_attr_t, index_width)]); - mov(reg_idx_height, - qword[param_attr + offsetof(emb_seq_pool_attr_t, index_height)]); - mov(rax, sizeof(int64_t)); - mul(reg_idx_width_in_byte); - mov(reg_idx_width_in_byte, rax); - const size_t tbl_width_in_byte = sizeof(float) * tbl_w_; - int acc_num_regs = 0; - for (int num_regs : groups) { - Label l_next_idx_w, l_next_idx_h, l_save_now; - xor_(reg_idx_w_i_in_byte, reg_idx_w_i_in_byte); - mov(reg_ptr_dst_i, reg_ptr_param_dst); - add(reg_ptr_dst_i, acc_num_regs * block_size); - - L(l_next_idx_w); - { - // h == 0 - mov(reg_ptr_idx_i, param_idx); - add(reg_ptr_idx_i, reg_idx_w_i_in_byte); - mov(reg_idx, qword[reg_ptr_idx_i]); - mov(rax, tbl_width_in_byte); - mul(reg_idx); - mov(reg_ptr_tbl_i, rax); // reg is offset now - add(reg_ptr_tbl_i, param_tbl); // reg is ptr_i now - size_t w_offset = 0; - for (int reg_i = 0; reg_i < num_regs; ++reg_i) { - vmovups(ymm_t(reg_i + num_regs), ptr[reg_ptr_tbl_i + w_offset]); - w_offset += block_size; - } - add(reg_ptr_idx_i, reg_idx_width_in_byte); - - // end condition of idx h - mov(reg_idx_h_end, reg_idx_height); - mov(rax, reg_idx_width_in_byte); - mul(reg_idx_h_end); - mov(reg_idx_h_end, rax); - add(reg_idx_h_end, reg_idx_w_i_in_byte); - add(reg_idx_h_end, param_idx); - - cmp(reg_ptr_idx_i, reg_idx_h_end); - jge(l_save_now, T_NEAR); - L(l_next_idx_h); - { - mov(reg_idx, qword[reg_ptr_idx_i]); - mov(reg_ptr_tbl_i, reg_idx); - mov(rax, tbl_width_in_byte); - mul(reg_idx); - mov(reg_ptr_tbl_i, rax); - add(reg_ptr_tbl_i, param_tbl); - size_t w_offset = 0; - for (int reg_i = 0; reg_i < num_regs; ++reg_i) { - vmovups(ymm_t(reg_i), ptr[reg_ptr_tbl_i + w_offset]); - vaddps( - ymm_t(reg_i + num_regs), ymm_t(reg_i + num_regs), ymm_t(reg_i)); - w_offset += block_size; - } - add(reg_ptr_idx_i, reg_idx_width_in_byte); - cmp(reg_ptr_idx_i, reg_idx_h_end); - jl(l_next_idx_h, T_NEAR); - } // end of idx h - L(l_save_now); - // avg or sqrt here, if needed - w_offset = 0; - for (int reg_i = 0; reg_i < num_regs; ++reg_i) { - vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i + num_regs)); - w_offset += block_size; - } - add(reg_ptr_dst_i, tbl_width_in_byte); - add(reg_idx_w_i_in_byte, sizeof(int64_t)); - cmp(reg_idx_w_i_in_byte, reg_idx_width_in_byte); - jl(l_next_idx_w, T_NEAR); - } // end of idx w - - acc_num_regs += num_regs; - add(param_tbl, num_regs * block_size); // do not use acc_num_regs - } // end of groups - postCode(); -} - -class EmbSeqPoolCreator : public JitCodeCreator { - public: - bool CanBeUsed(const emb_seq_pool_attr_t& attr) const override { - return x86::MayIUse(x86::avx) && attr.table_width % YMM_FLOAT_BLOCK == 0; - } - size_t CodeSize(const emb_seq_pool_attr_t& attr) const override { 
- return 96 + (attr.table_width / YMM_FLOAT_BLOCK) * 96 * 8; - } - std::unique_ptr CreateJitCode( - const emb_seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.table_height, 0); - PADDLE_ENFORCE_GT(attr.table_width, 0); - PADDLE_ENFORCE_GT(attr.index_height, 0); - PADDLE_ENFORCE_GT(attr.index_width, 0); - PADDLE_ENFORCE_GT(attr.out_width, 0); - return make_unique(attr, CodeSize(attr)); - } -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace gen = paddle::lite::jit::gen; - -REGISTER_JITKERNEL_GEN(kEmbSeqPool, gen::EmbSeqPoolCreator); diff --git a/lite/backends/x86/jit/gen/embseqpool.h b/lite/backends/x86/jit/gen/embseqpool.h deleted file mode 100644 index 7cae76f9dd..0000000000 --- a/lite/backends/x86/jit/gen/embseqpool.h +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once - -#include -#include -#include "lite/backends/x86/jit/gen/jitcode.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -class EmbSeqPoolJitCode : public JitCode { - public: - explicit EmbSeqPoolJitCode(const emb_seq_pool_attr_t& attr, - size_t code_size = 256 * 1024, - void* code_ptr = nullptr) - : JitCode(code_size, code_ptr), - tbl_w_(attr.table_width), - type_(attr.pool_type) { - if (type_ != SeqPoolType::kSum) { - LOG(FATAL) << "Only support sum pool yet "; - } - this->genCode(); - } - - std::string name() const override { - std::string base = "EmbSeqPoolJitCode"; - if (type_ == SeqPoolType::kSum) { - base += "_Sum"; - } else if (type_ == SeqPoolType::kAvg) { - base += "_Avg"; - } else if (type_ == SeqPoolType::kSqrt) { - base += "_Sqrt"; - } - base += ("_W" + std::to_string(tbl_w_)); - return base; - } - void genCode() override; - - private: - int tbl_w_; - SeqPoolType type_; - reg64_t param_tbl{abi_param1}; - reg64_t param_idx{abi_param2}; - reg64_t param_dst{abi_param3}; - reg64_t param_attr{abi_param4}; - - reg64_t reg_tmp{rax}; - - reg64_t reg_idx_width_in_byte{r8}; - reg64_t reg_idx_height{r9}; - - reg64_t reg_ptr_tbl_i{r10}; - reg64_t reg_idx{r10}; // could use same of reg_ptr_tbl_i - reg64_t reg_ptr_idx_i{r11}; - reg64_t reg_ptr_dst_i{r12}; - reg64_t reg_ptr_param_dst{r13}; // rdx is used in mul so protect param_dst - - reg64_t reg_idx_w_i_in_byte{r14}; - reg64_t reg_idx_h_end{r15}; -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/gru.cc b/lite/backends/x86/jit/gen/gru.cc deleted file mode 100644 index c5737faf13..0000000000 --- a/lite/backends/x86/jit/gen/gru.cc +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
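The EmbSeqPoolJitCode deleted above implements sum pooling over embedding-table rows gathered by an index matrix. A plain-C++ reference of the same semantics, under the layout genCode assumes (row-major table, indices of shape index_height x index_width, output of index_width blocks of table_width floats):

#include <cstdint>
#include <cstdio>

void EmbSeqPoolSumRef(const float* table, const int64_t* idx, float* out,
                      int table_width, int index_height, int index_width) {
  for (int w = 0; w < index_width; ++w) {
    float* dst = out + w * table_width;  // one output block per index column
    for (int j = 0; j < table_width; ++j) dst[j] = 0.f;
    for (int h = 0; h < index_height; ++h) {
      const float* row = table + idx[h * index_width + w] * table_width;
      for (int j = 0; j < table_width; ++j) dst[j] += row[j];
    }
  }
}

int main() {
  const float table[3 * 2] = {1, 1, 2, 2, 3, 3};  // 3 rows of width 2
  const int64_t idx[2 * 1] = {0, 2};              // one column, two lookups
  float out[2];
  EmbSeqPoolSumRef(table, idx, out, 2, 2, 1);
  std::printf("%g %g\n", out[0], out[1]);  // 4 4 (row 0 + row 2)
  return 0;
}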
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/gen/gru.h" -#include // offsetof -#include -#include "lite/backends/x86/jit/registry.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -void GRUJitCode::genCode() { - reg64_t reg_ptr_gates = rax; - reg64_t reg_ptr_ht_1 = r9; - reg64_t reg_ptr_ht = r10; - mov(reg_ptr_gates, ptr[param1 + offsetof(gru_t, gates)]); - mov(reg_ptr_ht_1, ptr[param1 + offsetof(gru_t, ht_1)]); - mov(reg_ptr_ht, ptr[param1 + offsetof(gru_t, ht)]); - ymm_t ymm_one = ymm_t(0); - - if (id_ == 2) { - reg64_t reg_ptr_tmp = r11; - mov(reg_ptr_tmp, reinterpret_cast(exp_float_consts)); - vmovaps(ymm_one, ptr[reg_ptr_tmp + OFFSET_EXP_ONE]); - } - int offset = 0; - int d = num_ * sizeof(float); - for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { - ymm_t ymm_u = ymm_t(1); - ymm_t ymm_r = ymm_t(2); - ymm_t ymm_s = ymm_t(3); - ymm_t ymm_ht_1 = ymm_t(4); - // W: {W_update, W_reset; W_state} - if (id_ == 0 || id_ == 2) { - vmovups(ymm_u, ptr[reg_ptr_gates + offset]); - vmovups(ymm_s, ptr[reg_ptr_gates + offset + 2 * d]); - } - if (id_ == 1) { - vmovups(ymm_r, ptr[reg_ptr_gates + offset + d]); - } - if (id_ == 1 || id_ == 2) { - vmovups(ymm_ht_1, ptr[reg_ptr_ht_1 + offset]); - } - - if (id_ == 0) { - // ht = act_gate(u) * act_cand(s) - act(ymm_u, ymm_u, act_gate_); - act(ymm_s, ymm_s, act_cand_); - vmulps(ymm_s, ymm_s, ymm_u); - vmovups(ptr[reg_ptr_ht + offset], ymm_s); - } else if (id_ == 1) { - // ht = act_gate(r) * ht_1 - act(ymm_r, ymm_r, act_gate_); - vmulps(ymm_r, ymm_r, ymm_ht_1); - vmovups(ptr[reg_ptr_ht + offset], ymm_r); - } else if (id_ == 2) { - // ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 - ymm_t ymm_one_inner = ymm_t(ymm_one.getIdx()); - act(ymm_u, ymm_u, act_gate_); - act(ymm_s, ymm_s, act_cand_); - vmulps(ymm_s, ymm_s, ymm_u); - vsubps(ymm_u, ymm_one_inner, ymm_u); - vmulps(ymm_u, ymm_ht_1, ymm_u); - vaddps(ymm_u, ymm_s, ymm_u); - vmovups(ptr[reg_ptr_ht + offset], ymm_u); - } - offset += sizeof(float) * YMM_FLOAT_BLOCK; - } - ret(); -} - -#define DECLARE_GRU_CREATOR(name) \ - class name##Creator : public JitCodeCreator { \ - public: \ - /* TODO(TJ): enable more */ \ - bool CanBeUsed(const gru_attr_t& attr) const override { \ - return x86::MayIUse(x86::avx) && attr.d % 8 == 0; \ - } \ - size_t CodeSize(const gru_attr_t& attr) const override { \ - return 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8; \ - } \ - std::unique_ptr CreateJitCode( \ - const gru_attr_t& attr) const override { \ - return make_unique(attr, CodeSize(attr)); \ - } \ - } - -DECLARE_GRU_CREATOR(GRUH1); -DECLARE_GRU_CREATOR(GRUHtPart1); -DECLARE_GRU_CREATOR(GRUHtPart2); - -#undef DECLARE_GRU_CREATOR - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace gen = paddle::lite::jit::gen; - -REGISTER_JITKERNEL_GEN(kGRUH1, gen::GRUH1Creator); -REGISTER_JITKERNEL_GEN(kGRUHtPart1, gen::GRUHtPart1Creator); -REGISTER_JITKERNEL_GEN(kGRUHtPart2, gen::GRUHtPart2Creator); diff --git a/lite/backends/x86/jit/gen/gru.h b/lite/backends/x86/jit/gen/gru.h deleted file mode 100644 index 408f25746d..0000000000 --- 
a/lite/backends/x86/jit/gen/gru.h +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once - -#include -#include "glog/logging.h" -#include "lite/backends/x86/jit/gen/act.h" -#include "lite/backends/x86/jit/gen/jitcode.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -class GRUJitCode : public VActFunc { - public: - explicit GRUJitCode(int id, - const gru_attr_t& attr, - size_t code_size, - void* code_ptr = nullptr) - : VActFunc(code_size, code_ptr), id_(id), num_(attr.d) { - auto typeExchange = [](KernelType type) -> gen::operand_type { - if (type == KernelType::kVSigmoid) { - return operand_type::SIGMOID; - } else if (type == KernelType::kVRelu) { - return operand_type::RELU; - } else if (type == KernelType::kVTanh) { - return operand_type::TANH; - } else if (type == KernelType::kVIdentity) { - return operand_type::IDENTITY; - } else { - LOG(FATAL) << "Do not support this jit::KernelType: " << type; - } - return operand_type::IDENTITY; - }; - act_gate_ = typeExchange(attr.act_gate); - act_cand_ = typeExchange(attr.act_cand); - - this->genCode(); - } - - std::string name() const override { - std::string base = "GRUJitCode"; - if (id_ == 0) { - base += "_H1"; - } else if (id_ == 1) { - base += "_HtPart1"; - } else if (id_ == 2) { - base += "_HtPart2"; - } - auto AddTypeStr = [&](operand_type type) { - switch (type) { - case operand_type::RELU: - base += "_Relu"; - break; - case operand_type::EXP: - base += "_Exp"; - break; - case operand_type::SIGMOID: - base += "_Sigmoid"; - break; - case operand_type::TANH: - base += "_Tanh"; - break; - case operand_type::IDENTITY: - base += "_Identity"; - break; - default: - break; - } - }; - AddTypeStr(act_gate_); - AddTypeStr(act_cand_); - return base; - } - void genCode() override; - - protected: - int id_; - int num_; - operand_type act_gate_; - operand_type act_cand_; - reg64_t param1{abi_param1}; -}; - -#define DECLARE_GRU_JITCODE(name, id) \ - class name##JitCode : public GRUJitCode { \ - public: \ - explicit name##JitCode(const gru_attr_t& attr, \ - size_t code_size, \ - void* code_ptr = nullptr) \ - : GRUJitCode(id, attr, code_size, code_ptr) {} \ - }; - -DECLARE_GRU_JITCODE(GRUH1, 0); -DECLARE_GRU_JITCODE(GRUHtPart1, 1); -DECLARE_GRU_JITCODE(GRUHtPart2, 2); - -#undef DECLARE_GRU_JITCODE - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/hopv.cc b/lite/backends/x86/jit/gen/hopv.cc deleted file mode 100644 index 4304dc48c5..0000000000 --- a/lite/backends/x86/jit/gen/hopv.cc +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
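GRUJitCode splits one GRU step into the three stages registered above (id 0, 1, 2). A scalar sketch of those stages, assuming the common act_gate = sigmoid and act_cand = tanh configuration (the JIT also accepts ReLU and identity):

#include <cmath>
#include <cstdio>

static float sigmoidf(float x) { return 1.f / (1.f + std::exp(-x)); }

// gates packs {u, r, s} contiguously, d floats each, as genCode expects.
void GRURef(const float* gates, const float* ht_1, float* ht, int d, int part) {
  const float* u = gates;
  const float* r = gates + d;
  const float* s = gates + 2 * d;
  for (int i = 0; i < d; ++i) {
    if (part == 0) {         // GRUH1: no previous state
      ht[i] = sigmoidf(u[i]) * std::tanh(s[i]);
    } else if (part == 1) {  // GRUHtPart1: reset gate applied to ht_1
      ht[i] = sigmoidf(r[i]) * ht_1[i];
    } else {                 // GRUHtPart2: blend candidate with ht_1
      float g = sigmoidf(u[i]);
      ht[i] = g * std::tanh(s[i]) + (1.f - g) * ht_1[i];
    }
  }
}

int main() {
  const float gates[3] = {0.f, 0.f, 1.f};  // d = 1: u, r, s
  const float h0 = 0.5f;
  float h1;
  GRURef(gates, &h0, &h1, 1, 2);
  std::printf("%f\n", h1);  // 0.5 * tanh(1) + 0.5 * 0.5
  return 0;
}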
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/gen/hopv.h" -#include -#include "lite/backends/x86/jit/registry.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -void HOPVJitCode::genCode() { - const int num_blocks = num_ / YMM_FLOAT_BLOCK; - int offset = 0; - - if (num_blocks > 0) { - // load one firstly - vmovups(ymm_tmp, ptr[param_src]); - offset += sizeof(float) * YMM_FLOAT_BLOCK; - for (int i = 1; i < num_blocks; ++i) { - vmovups(ymm_src, ptr[param_src + offset]); - process(ymm_tmp, ymm_src, ymm_tmp); - offset += sizeof(float) * YMM_FLOAT_BLOCK; - } - vextractf128(xmm_dst, ymm_tmp, 1); - process(xmm_dst, xmm_dst, xmm_tmp); - } else { - if (type_ == operand_type::MAX) { - vbroadcastss(ymm_dst, ptr[param_src]); - } else if (type_ == operand_type::ADD) { - vxorps(ymm_dst, ymm_dst, ymm_dst); - } - } - - int rest = num_ % YMM_FLOAT_BLOCK; - if (rest >= 4) { - vmovups(xmm_src, ptr[param_src + offset]); - offset += sizeof(float) * 4; - rest -= 4; - process(xmm_dst, xmm_dst, xmm_src); - } - - vpermilps(xmm_tmp, xmm_dst, 16 + 8 + 3); - process(xmm_dst, xmm_dst, xmm_tmp); - - if (rest >= 2) { - vmovq(xmm_src, ptr[param_src + offset]); - offset += sizeof(float) * 2; - rest -= 2; - process(xmm_dst, xmm_dst, xmm_src); - } - - vpermilps(xmm_tmp, xmm_dst, 1); - process(xmm_dst, xmm_dst, xmm_tmp); - - if (rest >= 1) { - vmovss(xmm_src, ptr[param_src + offset]); - process(xmm_dst, xmm_dst, xmm_src); - } - vmovss(ptr[param_dst], xmm_dst); - ret(); -} - -#define DECLARE_HOP_CREATOR(name) \ - class name##Creator : public JitCodeCreator { \ - public: \ - bool CanBeUsed(const int& attr) const override { \ - return x86::MayIUse(x86::avx); \ - } \ - size_t CodeSize(const int& d) const override { \ - return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \ - } \ - std::unique_ptr CreateJitCode(const int& attr) const override { \ - return make_unique(attr, CodeSize(attr)); \ - } \ - } - -DECLARE_HOP_CREATOR(HMax); -DECLARE_HOP_CREATOR(HSum); - -#undef DECLARE_HOP_CREATOR - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace gen = paddle::lite::jit::gen; - -REGISTER_JITKERNEL_GEN(kHMax, gen::HMaxCreator); -REGISTER_JITKERNEL_GEN(kHSum, gen::HSumCreator); diff --git a/lite/backends/x86/jit/gen/hopv.h b/lite/backends/x86/jit/gen/hopv.h deleted file mode 100644 index 801131d630..0000000000 --- a/lite/backends/x86/jit/gen/hopv.h +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
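The kHMax/kHSum generators above reduce a float buffer to a single value; the YMM path accumulates eight-wide partials and folds them with vextractf128 and vpermilps shuffles. The portable equivalent is just:

#include <algorithm>
#include <cstdio>
#include <numeric>

float HMaxRef(const float* x, int n) { return *std::max_element(x, x + n); }
float HSumRef(const float* x, int n) { return std::accumulate(x, x + n, 0.f); }

int main() {
  const float x[5] = {3.f, -1.f, 7.f, 0.f, 2.f};
  std::printf("max=%g sum=%g\n", HMaxRef(x, 5), HSumRef(x, 5));  // max=7 sum=11
  return 0;
}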
*/ - -#pragma once - -#include -#include "glog/logging.h" -#include "lite/backends/x86/jit/gen/jitcode.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -// horizontal operand vector -class HOPVJitCode : public JitCode { - public: - explicit HOPVJitCode(int d, - operand_type type, - size_t code_size = 256 * 1024, - void* code_ptr = nullptr) - : JitCode(code_size, code_ptr), num_(d), type_(type) { - if (!(type_ == operand_type::MAX || type_ == operand_type::ADD)) { - LOG(FATAL) << "Do not support this operand type: " << type_; - } - this->genCode(); - } - - std::string name() const override { - std::string base = "VXXJitCode"; - if (type_ == operand_type::MAX) { - base += "_MAX"; - } else { - base += "_SUM"; - } - return base; - } - void genCode() override; - - protected: - template - void process(JMM& dst, JMM& src1, JMM& src2) { // NOLINT - if (type_ == operand_type::MAX) { - vmaxps(dst, src1, src2); - } else if (type_ == operand_type::ADD) { - vaddps(dst, src1, src2); - } - } - - private: - int num_; - operand_type type_; - reg64_t param_src{abi_param1}; - reg64_t param_dst{abi_param2}; - reg64_t param_attr{abi_param3}; - - ymm_t ymm_tmp = ymm_t(0); - ymm_t ymm_src = ymm_t(1); - ymm_t ymm_dst = ymm_t(2); - - xmm_t xmm_tmp = xmm_t(0); - xmm_t xmm_src = xmm_t(1); - xmm_t xmm_dst = xmm_t(2); -}; - -#define DECLARE_HOP_JITCODE(name, op_type) \ - class name##JitCode : public HOPVJitCode { \ - public: \ - explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \ - : HOPVJitCode(d, op_type, code_size, code_ptr) {} \ - }; - -DECLARE_HOP_JITCODE(HMax, operand_type::MAX); -DECLARE_HOP_JITCODE(HSum, operand_type::ADD); - -#undef DECLARE_HOP_JITCODE - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/jitcode.h b/lite/backends/x86/jit/gen/jitcode.h deleted file mode 100644 index 1840dcac68..0000000000 --- a/lite/backends/x86/jit/gen/jitcode.h +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ - -#pragma once - -#include -#include -#include "lite/backends/x86/cpu_info.h" -#include "lite/backends/x86/jit/gen_base.h" - -#define XBYAK_USE_MMAP_ALLOCATOR -#include "xbyak/xbyak.h" -#include "xbyak/xbyak_util.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -// Application Binary Interface -constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI), - abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX), - abi_param4(Xbyak::Operand::RCX), abi_param5(Xbyak::Operand::R8), - abi_param6(Xbyak::Operand::R9); - -constexpr Xbyak::Operand::Code g_abi_regs[] = {Xbyak::Operand::RBX, - Xbyak::Operand::RBP, - Xbyak::Operand::R12, - Xbyak::Operand::R13, - Xbyak::Operand::R14, - Xbyak::Operand::R15}; - -constexpr int num_g_abi_regs = sizeof(g_abi_regs) / sizeof(g_abi_regs[0]); - -using reg64_t = const Xbyak::Reg64; -using reg32_t = const Xbyak::Reg32; -using xmm_t = const Xbyak::Xmm; -using ymm_t = const Xbyak::Ymm; -using zmm_t = const Xbyak::Zmm; -using Label = Xbyak::Label; - -typedef enum { - MUL = 0, - MAX, - ADD, - SUB, - RELU, - EXP, - SQUARE, - SIGMOID, - TANH, - IDENTITY -} operand_type; - -#define DECLARE_JIT_CODE(codename) \ - std::string name() const override { return #codename; } - -class JitCode : public GenBase, public Xbyak::CodeGenerator { - public: - explicit JitCode(size_t code_size, void* code_ptr = nullptr) - : Xbyak::CodeGenerator( - (code_size % 4096 != 0 ? (code_size / 4096 + 1) * 4096 : code_size), - code_ptr) {} - - virtual void genCode() = 0; - - size_t getSize() const override { return CodeGenerator::getSize(); } - const unsigned char* getCodeInternal() const override { - const Xbyak::uint8* code = CodeGenerator::getCode(); - return code; - } - - protected: - Xbyak::Reg64 param1{abi_param1}; - const int EVEX_max_8b_offt = 0x200; - const Xbyak::Reg64 reg_EVEX_max_8b_offt = rbp; - - virtual void preCode() { - for (int i = 0; i < num_g_abi_regs; ++i) { - push(Xbyak::Reg64(g_abi_regs[i])); - } - if (x86::MayIUse(x86::avx512f)) { - mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt); - } - } - virtual void postCode() { - for (int i = 0; i < num_g_abi_regs; ++i) { - pop(Xbyak::Reg64(g_abi_regs[num_g_abi_regs - 1 - i])); - } - ret(); - } - void L(const char* label) { Xbyak::CodeGenerator::L(label); } - void L(const Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); } - // Enhanced vector extension - Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, - int offt, - bool bcast = false) { - int scale = 0; - // Learn from https://github.com/intel/mkl-dnn - if (EVEX_max_8b_offt <= offt && offt < 3 * EVEX_max_8b_offt) { - offt = offt - 2 * EVEX_max_8b_offt; - scale = 1; - } else if (3 * EVEX_max_8b_offt <= offt && offt < 5 * EVEX_max_8b_offt) { - offt = offt - 4 * EVEX_max_8b_offt; - scale = 2; - } - auto re = Xbyak::RegExp() + base + offt; - if (scale) { - re = re + reg_EVEX_max_8b_offt * scale; - } - if (bcast) { - return zword_b[re]; - } else { - return zword[re]; - } - } -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/lstm.cc b/lite/backends/x86/jit/gen/lstm.cc deleted file mode 100644 index 44e58d0b75..0000000000 --- a/lite/backends/x86/jit/gen/lstm.cc +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
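Note how the JitCode constructor above rounds the requested buffer size up to whole 4 KiB pages before handing it to Xbyak::CodeGenerator. The rounding expression in isolation, as a small sketch:

#include <cstddef>
#include <cstdio>

// Same arithmetic as the JitCode constructor's size argument.
size_t RoundUpToPage(size_t code_size) {
  constexpr size_t kPage = 4096;
  return code_size % kPage != 0 ? (code_size / kPage + 1) * kPage : code_size;
}

int main() {
  std::printf("%zu %zu %zu\n", RoundUpToPage(1), RoundUpToPage(4096),
              RoundUpToPage(4097));  // 4096 4096 8192
  return 0;
}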
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/gen/lstm.h" -#include // offsetof -#include -#include "lite/backends/x86/jit/registry.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -void LSTMJitCode::genCode() { - if (use_peephole_) { - preCode(); - } - reg64_t reg_ptr_gates = rax; - reg64_t reg_ptr_ct_1 = r9; - reg64_t reg_ptr_ct = r10; - reg64_t reg_ptr_ht = r11; - reg64_t reg_ptr_wp = r12; - mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]); - mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]); - mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]); - mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]); - if (use_peephole_) { - mov(reg_ptr_wp, ptr[param1 + offsetof(lstm_t, wp)]); - } - - int offset = 0; - int d = num_ * sizeof(float); - for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { - /* gates: W_ch, W_ih, W_fh, W_oh */ - ymm_t ymm_c = ymm_t(0); - ymm_t ymm_i = ymm_t(1); - ymm_t ymm_f = ymm_t(2); - ymm_t ymm_o = ymm_t(3); - ymm_t ymm_ct_1 = ymm_t(4); - ymm_t ymm_wp0 = ymm_t(5); - ymm_t ymm_wp1 = ymm_t(6); - ymm_t ymm_wp2 = ymm_t(7); - vmovups(ymm_c, ptr[reg_ptr_gates + offset]); - vmovups(ymm_i, ptr[reg_ptr_gates + offset + d]); - vmovups(ymm_f, ptr[reg_ptr_gates + offset + 2 * d]); - vmovups(ymm_o, ptr[reg_ptr_gates + offset + 3 * d]); - if (!compute_c1h1_) { - vmovups(ymm_ct_1, ptr[reg_ptr_ct_1 + offset]); - } - if (use_peephole_) { - vmovups(ymm_wp0, ptr[reg_ptr_wp + offset]); - vmovups(ymm_wp1, ptr[reg_ptr_wp + offset + d]); - vmovups(ymm_wp2, ptr[reg_ptr_wp + offset + 2 * d]); - } - /* C_t = act_cand(c) * act_gate(i) + C_t-1 * act_gate(f) */ - // act_cand(c) - act(ymm_c, ymm_c, act_cand_); - // act_gate(i) or act_gate(ct_1 * wp0 + i) - if (!compute_c1h1_ && use_peephole_) { - vmulps(ymm_wp0, ymm_ct_1, ymm_wp0); - vaddps(ymm_i, ymm_i, ymm_wp0); - } - act(ymm_i, ymm_i, act_gate_); - vmulps(ymm_c, ymm_c, ymm_i); - if (!compute_c1h1_) { - // act_gate(f) or act_gate(ct_1 * wp1 + f) - if (use_peephole_) { - vmulps(ymm_wp1, ymm_ct_1, ymm_wp1); - vaddps(ymm_f, ymm_f, ymm_wp1); - } - act(ymm_f, ymm_f, act_gate_); - // ct - vmulps(ymm_f, ymm_f, ymm_ct_1); - vaddps(ymm_f, ymm_f, ymm_c); - } - /* H_t = act_cell(C_t) * act_gate(o) */ - // act_cell(C_t) - ymm_t ymm_ct = compute_c1h1_ ? 
ymm_c : ymm_f; - ymm_t ymm_tmp = ymm_i; - act(ymm_tmp, ymm_ct, act_cell_); - // act_gate(o) or act_gate(ct * wp2 + o) - if (use_peephole_) { - vmulps(ymm_wp2, ymm_ct, ymm_wp2); - vaddps(ymm_o, ymm_o, ymm_wp2); - } - act(ymm_o, ymm_o, act_gate_); - // ht - vmulps(ymm_o, ymm_o, ymm_tmp); - // save ct and ht - vmovups(ptr[reg_ptr_ct + offset], ymm_ct); - vmovups(ptr[reg_ptr_ht + offset], ymm_o); - offset += sizeof(float) * YMM_FLOAT_BLOCK; - } - - if (use_peephole_) { - postCode(); - } else { - ret(); - } -} - -#define DECLARE_LSTM_CREATOR(name) \ - class name##Creator : public JitCodeCreator { \ - public: \ - /* TODO(TJ): enable more */ \ - bool CanBeUsed(const lstm_attr_t& attr) const override { \ - return x86::MayIUse(x86::avx) && attr.d % 8 == 0; \ - } \ - size_t CodeSize(const lstm_attr_t& attr) const override { \ - return 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8; \ - } \ - std::unique_ptr CreateJitCode( \ - const lstm_attr_t& attr) const override { \ - return make_unique(attr, CodeSize(attr)); \ - } \ - } - -DECLARE_LSTM_CREATOR(LSTMCtHt); -DECLARE_LSTM_CREATOR(LSTMC1H1); - -#undef DECLARE_LSTM_CREATOR - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace gen = paddle::lite::jit::gen; - -REGISTER_JITKERNEL_GEN(kLSTMCtHt, gen::LSTMCtHtCreator); -REGISTER_JITKERNEL_GEN(kLSTMC1H1, gen::LSTMC1H1Creator); diff --git a/lite/backends/x86/jit/gen/lstm.h b/lite/backends/x86/jit/gen/lstm.h deleted file mode 100644 index 141419505c..0000000000 --- a/lite/backends/x86/jit/gen/lstm.h +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
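A scalar reference for the fused LSTM step emitted above (the LSTMCtHt variant without peephole connections), assuming the common act_gate = sigmoid and act_cand = act_cell = tanh configuration:

#include <cmath>
#include <cstdio>

static float sigmoidf(float x) { return 1.f / (1.f + std::exp(-x)); }

// gates packs {c, i, f, o} contiguously, d floats each, as genCode expects:
//   ct = tanh(c) * sigmoid(i) + ct_1 * sigmoid(f)
//   ht = tanh(ct) * sigmoid(o)
void LSTMRef(const float* gates, const float* ct_1, float* ct, float* ht,
             int d) {
  const float* c = gates;
  const float* i = gates + d;
  const float* f = gates + 2 * d;
  const float* o = gates + 3 * d;
  for (int k = 0; k < d; ++k) {
    ct[k] = std::tanh(c[k]) * sigmoidf(i[k]) + ct_1[k] * sigmoidf(f[k]);
    ht[k] = std::tanh(ct[k]) * sigmoidf(o[k]);
  }
}

int main() {
  const float gates[4] = {1.f, 0.f, 0.f, 0.f};  // d = 1
  const float c0 = 0.25f;
  float c1, h1;
  LSTMRef(gates, &c0, &c1, &h1, 1);
  std::printf("ct=%f ht=%f\n", c1, h1);
  return 0;
}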
*/ - -#pragma once - -#include -#include "glog/logging.h" -#include "lite/backends/x86/jit/gen/act.h" -#include "lite/backends/x86/jit/gen/jitcode.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -class LSTMJitCode : public VActFunc { - public: - explicit LSTMJitCode(bool compute_c1h1, - const lstm_attr_t& attr, - size_t code_size, - void* code_ptr = nullptr) - : VActFunc(code_size, code_ptr), - num_(attr.d), - compute_c1h1_(compute_c1h1), - use_peephole_(attr.use_peephole) { - auto typeExchange = [](KernelType type) -> gen::operand_type { - if (type == KernelType::kVSigmoid) { - return operand_type::SIGMOID; - } else if (type == KernelType::kVRelu) { - return operand_type::RELU; - } else if (type == KernelType::kVTanh) { - return operand_type::TANH; - } else if (type == KernelType::kVIdentity) { - return operand_type::IDENTITY; - } else { - LOG(FATAL) << "Do not support this jit::KernelType: " << type; - } - return operand_type::IDENTITY; - }; - act_gate_ = typeExchange(attr.act_gate); - act_cand_ = typeExchange(attr.act_cand); - act_cell_ = typeExchange(attr.act_cell); - - this->genCode(); - } - - std::string name() const override { - std::string base = "LSTMJitCode"; - if (use_peephole_) { - base += "_Peephole"; - } - if (compute_c1h1_) { - base += "_C1H1"; - } - auto AddTypeStr = [&](operand_type type) { - switch (type) { - case operand_type::RELU: - base += "_Relu"; - break; - case operand_type::EXP: - base += "_Exp"; - break; - case operand_type::SIGMOID: - base += "_Sigmoid"; - break; - case operand_type::TANH: - base += "_Tanh"; - break; - case operand_type::IDENTITY: - base += "_Identity"; - break; - default: - break; - } - }; - AddTypeStr(act_gate_); - AddTypeStr(act_cand_); - AddTypeStr(act_cell_); - return base; - } - void genCode() override; - - protected: - int num_; - bool compute_c1h1_; - bool use_peephole_; - operand_type act_gate_; - operand_type act_cand_; - operand_type act_cell_; - reg64_t param1{abi_param1}; -}; - -#define DECLARE_LSTM_JITCODE(name, compute_c1h1) \ - class name##JitCode : public LSTMJitCode { \ - public: \ - explicit name##JitCode(const lstm_attr_t& attr, \ - size_t code_size, \ - void* code_ptr = nullptr) \ - : LSTMJitCode(compute_c1h1, attr, code_size, code_ptr) {} \ - }; - -DECLARE_LSTM_JITCODE(LSTMCtHt, false); -DECLARE_LSTM_JITCODE(LSTMC1H1, true); - -#undef DECLARE_LSTM_JITCODE - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/matmul.cc b/lite/backends/x86/jit/gen/matmul.cc deleted file mode 100644 index 103b9101ba..0000000000 --- a/lite/backends/x86/jit/gen/matmul.cc +++ /dev/null @@ -1,127 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ - -#include "lite/backends/x86/jit/gen/matmul.h" -#include // offsetof -#include -#include -#include "lite/backends/x86/jit/registry.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -void MatMulJitCode::genCode() { - preCode(); - int block, rest; - const auto groups = packed_groups(n_, k_, &block, &rest); - PADDLE_ENFORCE_GT(groups.front(), 0); - - const int block_len = sizeof(float) * block; - const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1; - const int w_reg_idx = x_reg_idx - 1; - // from packed mov(reg_ptr_wgt, ptr[param_attr + offsetof(matmul_attr_t, - // packed_weight)]); - mov(reg_ptr_wgt, param_y); - size_t z_offset = 0; - size_t wgt_offset = 0; - for (size_t g = 0; g < groups.size(); ++g) { - size_t x_offset = 0; - for (int k = 0; k < k_; ++k) { - vbroadcastss(zmm_t(x_reg_idx), ptr[param_x + x_offset]); - // clean - if (k == 0) { - for (int i = 0; i < groups[g]; ++i) { - vxorps(zmm_t(i), zmm_t(i), zmm_t(i)); - } - } - for (int i = 0; i < groups[g]; ++i) { - vmovups(zmm_t(w_reg_idx), ptr[reg_ptr_wgt + wgt_offset]); - vfmadd231ps(zmm_t(i), zmm_t(w_reg_idx), zmm_t(x_reg_idx)); - wgt_offset += block_len; - } - // last one, save - if (k == k_ - 1) { - for (int i = 0; i < groups[g]; ++i) { - // only rest save should be careful - if (rest != 0 && g == groups.size() - 1 && i == groups[g] - 1) { - break; - } - vmovups(ptr[param_z + z_offset + i * block_len], zmm_t(i)); - } - } - x_offset += sizeof(float); - } - z_offset += block_len * groups[g]; - } - - if (rest != 0) { - // below should refine with mask - int reg_idx = groups.back() - 1; - z_offset = (n_ - rest) * sizeof(float); - int inner_block = 8; - while (rest > 0) { - if (rest >= 8) { - inner_block = 8; - vmovups(ptr[param_z + z_offset], ymm_t(reg_idx)); - // shift zmm of inner_block, change reg_idx if update - } else if (rest >= 4) { - inner_block = 4; - vmovups(ptr[param_z + z_offset], xmm_t(reg_idx)); - } else if (rest >= 2) { - inner_block = 2; - vmovq(ptr[param_z + z_offset], xmm_t(reg_idx)); - } else { - inner_block = 1; - vmovss(ptr[param_z + z_offset], xmm_t(reg_idx)); - } - z_offset += inner_block * sizeof(float); - rest -= inner_block; - } - } - - postCode(); -} - -class MatMulCreator : public JitCodeCreator { - public: - bool CanBeUsed(const matmul_attr_t& attr) const override { - return attr.m == 1 && x86::MayIUse(x86::avx512f) && - attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512; - } - size_t CodeSize(const matmul_attr_t& attr) const override { - int block = YMM_FLOAT_BLOCK; - if (x86::MayIUse(x86::avx512f)) { - block = ZMM_FLOAT_BLOCK; - } - return 96 + 4 * attr.k * (attr.n / block + 1) * 8; - } - std::unique_ptr CreateJitCode( - const matmul_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.m, 0); - PADDLE_ENFORCE_GT(attr.n, 0); - PADDLE_ENFORCE_GT(attr.k, 0); - return make_unique(attr, CodeSize(attr)); - } -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace gen = paddle::lite::jit::gen; - -REGISTER_JITKERNEL_GEN(kMatMul, gen::MatMulCreator); diff --git a/lite/backends/x86/jit/gen/matmul.h b/lite/backends/x86/jit/gen/matmul.h deleted file mode 100644 index b1b302b790..0000000000 --- a/lite/backends/x86/jit/gen/matmul.h +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
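MatMulJitCode above only accepts m == 1, so the kernel is really a vector-matrix product tiled over ZMM-width blocks of n, driven by vbroadcastss + vfmadd231ps over packed weights. Its reference semantics in portable C++:

#include <cstdio>

// z[j] = sum over a of x[a] * y[a][j], with y row-major (k x n).
void VecMatRef(const float* x, const float* y, float* z, int k, int n) {
  for (int j = 0; j < n; ++j) z[j] = 0.f;
  for (int a = 0; a < k; ++a) {
    const float xa = x[a];  // the broadcast element, as vbroadcastss does
    for (int j = 0; j < n; ++j) z[j] += xa * y[a * n + j];  // fused mul-add
  }
}

int main() {
  const float x[2] = {1.f, 2.f};
  const float y[2 * 3] = {1, 0, 1, 0, 1, 1};  // k = 2, n = 3, row-major
  float z[3];
  VecMatRef(x, y, z, 2, 3);
  std::printf("%g %g %g\n", z[0], z[1], z[2]);  // 1 2 3
  return 0;
}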
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once - -#include // for malloc and free -#include -#include -#include "glog/logging.h" -#include "lite/backends/x86/jit/gen/jitcode.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -class MatMulJitCode : public JitCode { - public: - explicit MatMulJitCode(const matmul_attr_t& attr, - size_t code_size = 256 * 1024, - void* code_ptr = nullptr) - : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) { - PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet"); - this->genCode(); - } - - std::string name() const override { - std::string base = "MatMulJitCode"; - base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" + - std::to_string(k_); - return base; - } - void genCode() override; - - private: - int m_, n_, k_; - - reg64_t param_x{abi_param1}; - reg64_t param_y{abi_param2}; - reg64_t param_z{abi_param3}; - reg64_t param_attr{abi_param4}; - reg64_t reg_tmp{rax}; - - reg64_t reg_ptr_wgt{r10}; -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/seqpool.cc b/lite/backends/x86/jit/gen/seqpool.cc deleted file mode 100644 index e0cf5e5a5a..0000000000 --- a/lite/backends/x86/jit/gen/seqpool.cc +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ - -#include "lite/backends/x86/jit/gen/seqpool.h" -#include -#include "lite/backends/x86/jit/gen/act.h" // for exp_float_consts ones -#include "lite/backends/x86/jit/registry.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -void SeqPoolJitCode::genCode() { - constexpr int block = YMM_FLOAT_BLOCK; - constexpr int max_num_regs = 8; - const int num_block = w_ / block; - const int num_groups = num_block / max_num_regs; - int rest_num_regs = num_block % max_num_regs; - mov(reg32_int_h, dword[param_attr]); - if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { - mov(reg_tmp, reinterpret_cast(exp_float_consts)); - vmovups(xmm_t(1), ptr[reg_tmp + OFFSET_EXP_ONE]); - mov(reg_tmp, reinterpret_cast(fp_h_)); - fild(dword[param_attr]); - fstp(dword[reg_tmp]); - vmovss(xmm_t(0), ptr[reg_tmp]); - if (type_ == SeqPoolType::kSqrt) { - vsqrtps(xmm_t(0), xmm_t(0)); - } - vdivps(xmm_t(1), xmm_t(1), xmm_t(0)); - vmovss(ptr[reg_tmp], xmm_t(1)); - } - const int group_len = max_num_regs * block * sizeof(float); - for (int g = 0; g < num_groups; ++g) { - pool_height(g * group_len, block, max_num_regs); - } - if (rest_num_regs > 0) { - pool_height(num_groups * group_len, block, rest_num_regs); - } - // part of rest_w * height - const int rest = w_ % block; - pool_height_of_rest_width(rest, (w_ - rest) * sizeof(float), max_num_regs); - ret(); -} - -class SeqPoolCreator : public JitCodeCreator { - public: - bool CanBeUsed(const seq_pool_attr_t& attr) const override { - return x86::MayIUse(x86::avx); - } - size_t CodeSize(const seq_pool_attr_t& attr) const override { - return 96 + - ((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) * - 4 /* load, mul and save */ + - 256) * - 8; - } - std::unique_ptr CreateJitCode( - const seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.w, 0); - PADDLE_ENFORCE_GT(attr.h, 0); - return make_unique(attr, CodeSize(attr)); - } -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace gen = paddle::lite::jit::gen; - -REGISTER_JITKERNEL_GEN(kSeqPool, gen::SeqPoolCreator); diff --git a/lite/backends/x86/jit/gen/seqpool.h b/lite/backends/x86/jit/gen/seqpool.h deleted file mode 100644 index 346179cfbb..0000000000 --- a/lite/backends/x86/jit/gen/seqpool.h +++ /dev/null @@ -1,216 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
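SeqPoolJitCode above sums h rows of width w column-wise and, for the kAvg and kSqrt types, pre-computes the scale 1/h or 1/sqrt(h) once before broadcasting it across the result. A portable sketch of the same pooling:

#include <cmath>
#include <cstdio>

enum class Pool { kSum, kAvg, kSqrt };

void SeqPoolRef(const float* x, float* y, int h, int w, Pool type) {
  for (int j = 0; j < w; ++j) y[j] = 0.f;
  for (int i = 0; i < h; ++i)
    for (int j = 0; j < w; ++j) y[j] += x[i * w + j];  // column-wise sum
  float scale = type == Pool::kAvg
                    ? 1.f / h
                    : (type == Pool::kSqrt
                           ? 1.f / std::sqrt(static_cast<float>(h))
                           : 1.f);
  for (int j = 0; j < w; ++j) y[j] *= scale;
}

int main() {
  const float x[2 * 3] = {1, 2, 3, 3, 2, 1};
  float y[3];
  SeqPoolRef(x, y, 2, 3, Pool::kAvg);
  std::printf("%g %g %g\n", y[0], y[1], y[2]);  // 2 2 2
  return 0;
}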
*/ - -#pragma once - -#include -#include -#include "lite/backends/x86/jit/gen/jitcode.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -class SeqPoolJitCode : public JitCode { - public: - explicit SeqPoolJitCode(const seq_pool_attr_t& attr, - size_t code_size = 256 * 1024, - void* code_ptr = nullptr) - : JitCode(code_size, code_ptr), w_(attr.w), type_(attr.type) { - if (!(type_ == SeqPoolType::kSum || type_ == SeqPoolType::kAvg || - type_ == SeqPoolType::kSqrt)) { - LOG(FATAL) << "Only supported pool type: sum, avg and sqrt."; - } - fp_h_[0] = 1.f; - this->genCode(); - } - - std::string name() const override { - std::string base = "SeqPoolJitCode"; - if (type_ == SeqPoolType::kSum) { - base += "_Sum"; - } else if (type_ == SeqPoolType::kAvg) { - base += "_Avg"; - } else if (type_ == SeqPoolType::kSqrt) { - base += "_Sqrt"; - } - base += ("_W" + std::to_string(w_)); - return base; - } - void genCode() override; - - protected: - template - void pool_height(int w_offset, int block, int max_num_regs) { - int offset = w_offset; - for (int i = 0; i < max_num_regs; ++i) { - vmovups(JMM(i), ptr[param_src + offset]); - offset += sizeof(float) * block; - } - cmp(reg32_int_h, 1); - Label l_next_h, l_h_done; - jle(l_h_done, T_NEAR); - mov(reg_h_i, 1); - mov(reg_tmp, param_src); - add(reg_tmp, w_ * sizeof(float) + w_offset); - L(l_next_h); - { - mov(reg_ptr_src_i, reg_tmp); - for (int i = 0; i < max_num_regs; ++i) { - vmovups(JMM(i + max_num_regs), ptr[reg_ptr_src_i]); - // sum anyway - vaddps(JMM(i), JMM(i), JMM(i + max_num_regs)); - add(reg_ptr_src_i, sizeof(float) * block); - } - inc(reg_h_i); - add(reg_tmp, w_ * sizeof(float)); - cmp(reg_h_i, reg32_int_h); - jl(l_next_h, T_NEAR); - } - L(l_h_done); - // save right now - if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { - mov(reg_tmp, reinterpret_cast(fp_h_)); - vbroadcastss(JMM(max_num_regs), ptr[reg_tmp]); - } - offset = w_offset; - for (int i = 0; i < max_num_regs; ++i) { - if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { - vmulps(JMM(i), JMM(i), JMM(max_num_regs)); - } - vmovups(ptr[param_dst + offset], JMM(i)); - offset += sizeof(float) * block; - } - } - - void pool_height_of_rest_width(int rest, int w_offset, int max_num_regs) { - const int rest_used_num_regs = load_rest(rest, w_offset, 0); - const bool has_block4 = rest / 4 > 0; - const bool has_block2 = (rest % 4) / 2 > 0; - const bool has_block1 = (rest % 2) == 1; - cmp(reg32_int_h, 1); - Label l_next_h, l_h_done; - jle(l_h_done, T_NEAR); - mov(reg_h_i, 1); - mov(reg_tmp, param_src); - add(reg_tmp, w_ * sizeof(float) + w_offset); - L(l_next_h); - { - int reg_idx = 0; - mov(reg_ptr_src_i, reg_tmp); - if (has_block4) { - vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); - add(reg_ptr_src_i, sizeof(float) * 4); - reg_idx++; - } - if (has_block2) { - vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); - add(reg_ptr_src_i, sizeof(float) * 2); - reg_idx++; - } - if (has_block1) { - vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); - reg_idx++; - } - PADDLE_ENFORCE_EQ( - reg_idx, rest_used_num_regs, "All heights should use same regs"); - for (int i = 0; i < reg_idx; ++i) { - vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs)); - } - inc(reg_h_i); - add(reg_tmp, w_ * sizeof(float)); - cmp(reg_h_i, reg32_int_h); - jl(l_next_h, T_NEAR); - } - L(l_h_done); - // save right now - if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { - mov(reg_tmp, 
reinterpret_cast(fp_h_)); - vbroadcastss(xmm_t(max_num_regs), ptr[reg_tmp]); - for (int i = 0; i < rest_used_num_regs; ++i) { - vmulps(xmm_t(i), xmm_t(i), xmm_t(max_num_regs)); - } - } - save_rest(rest, w_offset); - } - - // return the number of used regs, use start from reg 0 - int load_rest(int rest, - int w_offset, - const int num_shift_regs, - const int reg_start = 0) { - const bool has_block4 = rest / 4 > 0; - const bool has_block2 = (rest % 4) / 2 > 0; - const bool has_block1 = (rest % 2) == 1; - int reg_idx = reg_start; - if (has_block4) { - vmovups(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]); - w_offset += sizeof(float) * 4; - reg_idx++; - } - if (has_block2) { - vmovq(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]); - w_offset += sizeof(float) * 2; - reg_idx++; - } - if (has_block1) { - vmovss(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]); - reg_idx++; - } - return reg_idx; - } - - // use reg start from 0 - void save_rest(int rest, int w_offset, int reg_start = 0) { - const bool has_block4 = rest / 4 > 0; - const bool has_block2 = (rest % 4) / 2 > 0; - const bool has_block1 = (rest % 2) == 1; - int reg_idx = reg_start; - if (has_block4) { - vmovups(ptr[param_dst + w_offset], xmm_t(reg_idx)); - w_offset += sizeof(float) * 4; - reg_idx++; - } - if (has_block2) { - vmovq(ptr[param_dst + w_offset], xmm_t(reg_idx)); - w_offset += sizeof(float) * 2; - reg_idx++; - } - if (has_block1) { - vmovss(ptr[param_dst + w_offset], xmm_t(reg_idx)); - } - } - - private: - float ALIGN32_BEG fp_h_[1] ALIGN32_END; - int w_; - SeqPoolType type_; - reg64_t param_src{abi_param1}; - reg64_t param_dst{abi_param2}; - reg64_t param_attr{abi_param3}; - reg64_t reg_tmp{rax}; - - reg32_t reg32_int_h{r8d}; - reg32_t reg32_fp_h{r9d}; - - reg64_t reg_h_i{r10}; - reg64_t reg_ptr_src_i{r11}; -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/sgd.cc b/lite/backends/x86/jit/gen/sgd.cc deleted file mode 100644 index 10659f5084..0000000000 --- a/lite/backends/x86/jit/gen/sgd.cc +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
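pool_height_of_rest_width, load_rest and save_rest above cover the tail rest = w % YMM_FLOAT_BLOCK with at most one 4-float vmovups, one 2-float vmovq and one 1-float vmovss. A small sketch of that decomposition; TailMoves is a hypothetical helper that just counts the moves:

// e.g. rest == 7 -> 4 + 2 + 1 = three moves; rest == 5 -> 4 + 1 = two moves.
int TailMoves(int rest) {
  int moves = 0;
  if (rest / 4 > 0) ++moves;        // one vmovups, 4 floats
  if ((rest % 4) / 2 > 0) ++moves;  // one vmovq, 2 floats
  if (rest % 2 == 1) ++moves;       // one vmovss, 1 float
  return moves;
}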
*/
-
-#include "lite/backends/x86/jit/gen/sgd.h"
-#include <stddef.h>  // offsetof
-#include <memory>
-#include <vector>
-#include "lite/backends/x86/jit/registry.h"
-#include "lite/utils/paddle_enforce.h"
-
-namespace paddle {
-namespace lite {
-namespace jit {
-namespace gen {
-
-void SgdJitCode::genCode() {
-  preCode();
-  constexpr int block = YMM_FLOAT_BLOCK;
-  constexpr int max_num_regs = 7;
-  const int num_block = w_ / block;
-  const int num_groups = num_block / max_num_regs;
-  const size_t block_size = sizeof(float) * block;
-  const size_t width_size = w_ * sizeof(float);
-  std::vector<int> groups(num_groups, max_num_regs);
-  int rest_num_regs = num_block % max_num_regs;
-  if (rest_num_regs > 0) {
-    groups.push_back(rest_num_regs);
-  }
-
-  vbroadcastss(ymm_lr, ptr[param_lr]);
-  // protect rdx
-  mov(reg_ptr_grad_i, param_grad);
-  mov(reg_ptr_rows_i, param_rows);
-
-  mov(reg_rows_size_in_byte,
-      qword[param_attr + offsetof(sgd_attr_t, selected_rows_size)]);
-  mov(rax, sizeof(int64_t));
-  mul(reg_rows_size_in_byte);
-  mov(reg_rows_size_in_byte, rax);
-  add(reg_rows_size_in_byte, reg_ptr_rows_i);
-
-  Label l_next_row;
-  L(l_next_row);
-  {
-    mov(reg_row, qword[reg_ptr_rows_i]);
-    mov(rax, width_size);
-    mul(reg_row);
-    mov(reg_row, rax);
-
-    mov(reg_ptr_param_i, param_param);
-    mov(reg_ptr_out_i, param_out);
-    add(reg_ptr_param_i, reg_row);
-    add(reg_ptr_out_i, reg_row);
-
-    size_t w_offset = 0;
-    for (int num_regs : groups) {
-      // load grad
-      size_t inner_offset = w_offset;
-      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
-        vmovups(ymm_t(reg_i), ptr[reg_ptr_grad_i + inner_offset]);
-        inner_offset += block_size;
-      }
-
-      // load param
-      inner_offset = w_offset;
-      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
-        vmovups(ymm_t(reg_i + num_regs), ptr[reg_ptr_param_i + inner_offset]);
-        inner_offset += block_size;
-      }
-
-      // compute out
-      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
-        vmulps(ymm_t(reg_i), ymm_t(reg_i), ymm_lr);
-        vsubps(ymm_t(reg_i + num_regs), ymm_t(reg_i + num_regs), ymm_t(reg_i));
-      }
-
-      // save out
-      inner_offset = w_offset;
-      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
-        vmovups(ptr[reg_ptr_out_i + inner_offset], ymm_t(reg_i + num_regs));
-        inner_offset += block_size;
-      }
-      w_offset += (block_size * num_regs);
-    }
-
-    add(reg_ptr_grad_i, width_size);
-    add(reg_ptr_rows_i, sizeof(int64_t));
-    cmp(reg_ptr_rows_i, reg_rows_size_in_byte);
-    jl(l_next_row, T_NEAR);
-  }
-
-  postCode();
-}
-
-class SgdCreator : public JitCodeCreator<sgd_attr_t> {
- public:
-  bool CanBeUsed(const sgd_attr_t& attr) const override {
-    return x86::MayIUse(x86::avx) && attr.grad_width % YMM_FLOAT_BLOCK == 0;
-  }
-  size_t CodeSize(const sgd_attr_t& attr) const override {
-    return 96 + (attr.grad_width / YMM_FLOAT_BLOCK) * 32 * 8;
-  }
-  std::unique_ptr<GenBase> CreateJitCode(
-      const sgd_attr_t& attr) const override {
-    PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width);
-    PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height);
-    PADDLE_ENFORCE_GE(attr.selected_rows_size, 0);
-    return make_unique<SgdJitCode>(attr, CodeSize(attr));
-  }
-};
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace lite
-}  // namespace paddle
-
-namespace gen = paddle::lite::jit::gen;
-
-REGISTER_JITKERNEL_GEN(kSgd, gen::SgdCreator);
diff --git a/lite/backends/x86/jit/gen/sgd.h b/lite/backends/x86/jit/gen/sgd.h
deleted file mode 100644
index 303d94f2ab..0000000000
--- a/lite/backends/x86/jit/gen/sgd.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
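SgdJitCode::genCode above vectorizes a selected-rows SGD step. A scalar restatement under the sgd_attr_t definition from kernel_base.h; SgdRef is a hypothetical name, and lr points at a single learning-rate scalar just as param_lr does in the generated code:

#include <cstdint>

void SgdRef(const float* lr, const float* param, const float* grad,
            const int64_t* rows, float* out, const sgd_attr_t* attr) {
  // attr->param_width == attr->grad_width is enforced by SgdCreator.
  for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
    const int64_t row = rows[i];
    for (int64_t j = 0; j < attr->grad_width; ++j) {
      out[row * attr->param_width + j] =
          param[row * attr->param_width + j] -
          lr[0] * grad[i * attr->grad_width + j];
    }
  }
}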
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once - -#include -#include "glog/logging.h" -#include "lite/backends/x86/jit/gen/jitcode.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -class SgdJitCode : public JitCode { - public: - explicit SgdJitCode(const sgd_attr_t& attr, - size_t code_size = 256 * 1024, - void* code_ptr = nullptr) - : JitCode(code_size, code_ptr), w_(attr.grad_width) { - this->genCode(); - } - - DECLARE_JIT_CODE(SgdJitCode); - void genCode() override; - - private: - int w_; - reg64_t param_lr{abi_param1}; - reg64_t param_param{abi_param2}; - reg64_t param_grad{abi_param3}; - reg64_t param_rows{abi_param4}; - reg64_t param_out{abi_param5}; - reg64_t param_attr{abi_param6}; - - ymm_t ymm_lr = ymm_t(15); - - reg64_t reg_ptr_grad_i{r10}; - reg64_t reg_ptr_rows_i{r11}; - reg64_t reg_rows_size_in_byte{r12}; - reg64_t reg_row{r13}; - reg64_t reg_ptr_param_i{r14}; - reg64_t reg_ptr_out_i{r15}; -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/vbroadcast.cc b/lite/backends/x86/jit/gen/vbroadcast.cc deleted file mode 100644 index 9e02dca8c4..0000000000 --- a/lite/backends/x86/jit/gen/vbroadcast.cc +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
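The six abi_param registers in SgdJitCode map one-to-one onto SgdTuple<float>::func_type from kernel_base.h. A hedged sketch of how such a kernel would be fetched and invoked through GenBase::getCode; the setup around it is assumed, not taken from the deleted sources:

#include <cstdint>

using SgdFunc = void (*)(const float*, const float*, const float*,
                         const int64_t*, float*, const sgd_attr_t*);
// SgdJitCode jit(attr);                // emits code into its buffer
// SgdFunc f = jit.getCode<SgdFunc>();  // cast provided by GenBase
// f(&lr, param, grad, rows, out, &attr);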
*/ - -#include "lite/backends/x86/jit/gen/vbroadcast.h" -#include -#include -#include "lite/backends/x86/jit/registry.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -void VBroadcastJitCode::genCode() { - preCode(); - constexpr int block = YMM_FLOAT_BLOCK; - constexpr int max_num_regs = 16; - const int num_block = w_ / block; - const int num_groups = num_block / max_num_regs; - const size_t block_size = sizeof(float) * block; - std::vector groups(num_groups, max_num_regs); - int rest_num_regs = num_block % max_num_regs; - if (rest_num_regs > 0) { - groups.push_back(rest_num_regs); - } - - // protect param_h - mov(reg_height, param_h); - Label l_next_h; - xor_(reg_h_i, reg_h_i); - mov(reg_ptr_dst_i, param_dst); - L(l_next_h); - { - mov(reg_ptr_src_i, param_src); - for (int num_regs : groups) { - size_t w_offset = 0; - for (int reg_i = 0; reg_i < num_regs; ++reg_i) { - vmovups(ymm_t(reg_i), ptr[reg_ptr_src_i + w_offset]); - w_offset += block_size; - } - add(reg_ptr_src_i, num_regs * block_size); - - w_offset = 0; - for (int reg_i = 0; reg_i < num_regs; ++reg_i) { - vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i)); - w_offset += block_size; - } - add(reg_ptr_dst_i, num_regs * block_size); - } // end of groups - inc(reg_h_i); - cmp(reg_h_i, reg_height); - jl(l_next_h, T_NEAR); - } // end of l_next_h - - postCode(); -} - -class VBroadcastCreator : public JitCodeCreator { - public: - bool CanBeUsed(const int64_t& w) const override { - return x86::MayIUse(x86::avx) && w % YMM_FLOAT_BLOCK == 0; - } - size_t CodeSize(const int64_t& w) const override { - return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8; - } - std::unique_ptr CreateJitCode(const int64_t& w) const override { - PADDLE_ENFORCE_GT(w, 0); - return make_unique(w, CodeSize(w)); - } -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace gen = paddle::lite::jit::gen; - -REGISTER_JITKERNEL_GEN(kVBroadcast, gen::VBroadcastCreator); diff --git a/lite/backends/x86/jit/gen/vbroadcast.h b/lite/backends/x86/jit/gen/vbroadcast.h deleted file mode 100644 index 39bcd4965f..0000000000 --- a/lite/backends/x86/jit/gen/vbroadcast.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
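VBroadcastJitCode::genCode above tiles one w-wide source row into h destination rows, one register group at a time. The scalar semantics, with VBroadcastRef as a hypothetical name:

#include <cstdint>

void VBroadcastRef(const float* x, float* y, int64_t h, int64_t w) {
  for (int64_t i = 0; i < h; ++i)
    for (int64_t j = 0; j < w; ++j) y[i * w + j] = x[j];
}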
*/ - -#pragma once - -#include -#include "glog/logging.h" -#include "lite/backends/x86/jit/gen/jitcode.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -class VBroadcastJitCode : public JitCode { - public: - explicit VBroadcastJitCode(const int64_t& w, - size_t code_size = 256 * 1024, - void* code_ptr = nullptr) - : JitCode(code_size, code_ptr), w_(w) { - this->genCode(); - } - - DECLARE_JIT_CODE(VBroadcastJitCode); - void genCode() override; - - private: - int w_; - reg64_t param_src{abi_param1}; - reg64_t param_dst{abi_param2}; - reg64_t param_h{abi_param3}; - reg64_t param_w{abi_param4}; - - reg64_t reg_height{r9}; - reg64_t reg_h_i{r10}; - reg64_t reg_ptr_src_i{r11}; - reg64_t reg_ptr_dst_i{r12}; -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen_base.cc b/lite/backends/x86/jit/gen_base.cc deleted file mode 100644 index 38250d533d..0000000000 --- a/lite/backends/x86/jit/gen_base.cc +++ /dev/null @@ -1,95 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/gen_base.h" -#include -#include -#include -#include -// #include "paddle/fluid/memory/allocation/cpu_allocator.h" // for -// posix_memalign -#include "lite/backends/x86/cpu_info.h" -#include "lite/backends/x86/jit/macro.h" -#include "lite/utils/paddle_enforce.h" - -#ifndef _WIN32 -#define posix_memalign_free free -#endif - -DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); - -namespace paddle { -namespace lite { -namespace jit { - -// refer do not need CanBeUsed, it would be the last one. -void GenBase::dumpCode(const unsigned char* code) const { - if (code) { - static int counter = 0; - std::ostringstream filename; - filename << "paddle_jitcode_" << name() << "." << counter << ".bin"; - counter++; - std::ofstream fout(filename.str(), std::ios::out); - if (fout.is_open()) { - fout.write(reinterpret_cast(code), this->getSize()); - fout.close(); - } - } -} - -void* GenBase::operator new(size_t size) { - void* ptr; - constexpr size_t alignment = 32ul; - PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), - 0, - "GenBase Alloc %ld error!", - size); - PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); - return ptr; -} - -void GenBase::operator delete(void* ptr) { posix_memalign_free(ptr); } - -std::vector packed_groups(int n, int k, int* block_out, int* rest_out) { - int block; - int max_num_regs; - if (x86::MayIUse(x86::avx512f)) { - block = ZMM_FLOAT_BLOCK; - max_num_regs = 32; - } else { - block = YMM_FLOAT_BLOCK; - max_num_regs = 16; - } - // one for x, one for y, others for z - const int max_used_regs_for_n = max_num_regs - 2; - const int aligned_n = n % block == 0 ? 
n : (n / block + 1) * block;
-  const int num_block = aligned_n / block;
-  const int num_groups = num_block / max_used_regs_for_n;
-  std::vector<int> groups(num_groups, max_used_regs_for_n);
-  int rest_num_regs = num_block % max_used_regs_for_n;
-  if (rest_num_regs != 0) {
-    groups.push_back(rest_num_regs);
-  }
-  if (block_out) {
-    *block_out = block;
-  }
-  if (rest_out) {
-    *rest_out = n % block;
-  }
-  return groups;
-}
-
-}  // namespace jit
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/jit/gen_base.h b/lite/backends/x86/jit/gen_base.h
deleted file mode 100644
index b5f942615a..0000000000
--- a/lite/backends/x86/jit/gen_base.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <gflags/gflags.h>
-#include <memory>  // for unique_ptr
-#include <string>
-#include <vector>
-#include "lite/backends/x86/jit/kernel_base.h"
-
-DECLARE_bool(dump_jitcode);
-
-namespace paddle {
-namespace lite {
-namespace jit {
-
-class GenBase : public Kernel {
- public:
-  virtual ~GenBase() = default;
-  virtual std::string name() const = 0;
-  virtual size_t getSize() const = 0;
-  virtual const unsigned char* getCodeInternal() const = 0;
-  const char* ImplType() const override { return "JitCode"; }
-  template <typename Func>
-  Func getCode() const {
-    const unsigned char* code = this->getCodeInternal();
-    if (FLAGS_dump_jitcode) {
-      this->dumpCode(code);
-    }
-    // Note: failed to cast with reinterpret_cast on Mac clang,
-    // then workaround with const_cast. Any better idea is appreciated.
-    return reinterpret_cast<Func>(const_cast<unsigned char*>(code));
-  }
-
-  void* operator new(size_t size);
-  void operator delete(void* ptr);
-  void* operator new[](size_t size) { return operator new(size); }
-  void operator delete[](void* ptr) { operator delete(ptr); }
-
- protected:
-  void dumpCode(const unsigned char* code) const;
-};
-
-// A Creator is used to create the jitcode and save it in the pool.
-// Every JitCode should have one creator.
-class GenCreator {
- public:
-  virtual ~GenCreator() = default;
-};
-
-template <typename Attr>
-class JitCodeCreator : public GenCreator {
- public:
-  virtual ~JitCodeCreator() = default;
-
-  // condition when this jit code can be used.
-  virtual bool CanBeUsed(const Attr& attr) const = 0;
-
-  // estimate this code size
-  virtual size_t CodeSize(const Attr& attr) const = 0;
-
-  // create this code
-  virtual std::unique_ptr<GenBase> CreateJitCode(const Attr& attr) const = 0;
-};
-
-// unify the method of packing groups:
-// output the packed groups that are used in weights, the block size and the
-// rest size
-std::vector<int> packed_groups(int n,
-                               int k,
-                               int* block = nullptr,
-                               int* rest = nullptr);
-
-}  // namespace jit
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/jit/helper.cc b/lite/backends/x86/jit/helper.cc
deleted file mode 100644
index 8322f7ebd2..0000000000
--- a/lite/backends/x86/jit/helper.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
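A worked example of packed_groups under AVX-only assumptions (block = 8, 16 YMM registers, two reserved for x and y, so at most 14 blocks per group); the numbers follow directly from the arithmetic above:

#include <vector>

void PackedGroupsExample() {
  // n = 200: aligned_n = 200, num_block = 25 -> groups = {14, 11}, rest = 0
  // n = 20:  aligned_n = 24,  num_block = 3  -> groups = {3},      rest = 4
  int block = 0, rest = 0;
  std::vector<int> groups = packed_groups(/*n=*/20, /*k=*/8, &block, &rest);
  // groups == {3}, block == 8, rest == 4 on a machine without AVX512F
}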
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/helper.h" -#include // tolower -#include -#include -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace jit { - -#define ONE_CASE(key) \ - case key: \ - return #key - -const char* to_string(KernelType kt) { - switch (kt) { - ONE_CASE(kNone); - ONE_CASE(kVMul); - ONE_CASE(kVAdd); - ONE_CASE(kVAddRelu); - ONE_CASE(kVSub); - ONE_CASE(kVScal); - ONE_CASE(kStrideScal); - ONE_CASE(kVAddBias); - ONE_CASE(kVRelu); - ONE_CASE(kVBroadcast); - ONE_CASE(kVCopy); - ONE_CASE(kVIdentity); - ONE_CASE(kVExp); - ONE_CASE(kVSquare); - ONE_CASE(kVSigmoid); - ONE_CASE(kVTanh); - ONE_CASE(kLSTMCtHt); - ONE_CASE(kLSTMC1H1); - ONE_CASE(kGRUH1); - ONE_CASE(kGRUHtPart1); - ONE_CASE(kGRUHtPart2); - ONE_CASE(kCRFDecoding); - ONE_CASE(kLayerNorm); - ONE_CASE(kNCHW16CMulNC); - ONE_CASE(kSeqPool); - ONE_CASE(kMatMul); - ONE_CASE(kHMax); - ONE_CASE(kHSum); - ONE_CASE(kStrideASum); - ONE_CASE(kSoftmax); - ONE_CASE(kEmbSeqPool); - ONE_CASE(kSgd); - default: - LOG(FATAL) << "Not support type: %d, or forget to add it."; - return "NOT JITKernel"; - } - return nullptr; -} - -const char* to_string(SeqPoolType tp) { - switch (tp) { - ONE_CASE(kNonePoolType); - ONE_CASE(kSum); - ONE_CASE(kAvg); - ONE_CASE(kSqrt); - default: - LOG(FATAL) << "Not support type: %d, or forget to add it."; - return "NOT PoolType"; - } - return nullptr; -} -#undef ONE_CASE - -KernelType to_kerneltype(const std::string& act) { - std::string lower = act; - std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower); - if (lower == "relu" || lower == "vrelu") { - return kVRelu; - } else if (lower == "identity" || lower == "videntity" || lower == "") { - return kVIdentity; - } else if (lower == "exp" || lower == "vexp") { - return kVExp; - } else if (lower == "sigmoid" || lower == "vsigmoid") { - return kVSigmoid; - } else if (lower == "tanh" || lower == "vtanh") { - return kVTanh; - } - LOG(FATAL) << "Not support type: %s, or forget to add this case"; - return kNone; -} - -template <> -void pack_weights(const float* src, float* dst, int n, int k) { - int block, rest; - const auto groups = packed_groups(n, k, &block, &rest); - std::for_each(groups.begin(), groups.end(), [&](int i) { - PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0."); - }); - int sum = std::accumulate(groups.begin(), groups.end(), 0); - std::memset(dst, 0, k * sum * block * sizeof(float)); - PADDLE_ENFORCE_GE( - sum * block, n, "The packed n should be equal to or larger than n"); - - const int block_len = sizeof(float) * block; - int n_offset = 0; - - for (size_t g = 0; g < groups.size(); ++g) { - const float* from = src + n_offset; - for (int j = 0; j < k; ++j) { - size_t copy_sz = groups[g] * block_len; - if (g == groups.size() - 1 && rest != 0) { - copy_sz = (groups[g] - 1) * block_len + rest * sizeof(float); - } - std::memcpy(dst, from + j * n, copy_sz); - dst += groups[g] * block; - } - n_offset += groups[g] * block; - } 
-}
-
-template <typename T>
-typename std::enable_if<!std::is_same<T, float>::value>::type pack_weights(
-    const T* src, T* dst, int n, int k) {
-  LOG(FATAL) << "Only support pack with float type.";
-}
-
-}  // namespace jit
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/jit/helper.h b/lite/backends/x86/jit/helper.h
deleted file mode 100644
index b21be9466c..0000000000
--- a/lite/backends/x86/jit/helper.h
+++ /dev/null
@@ -1,267 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <iostream>
-#include <string>
-#include <unordered_map>
-#include <utility>  // for std::move
-#include <vector>
-#include "lite/backends/x86/jit/gen_base.h"
-#include "lite/backends/x86/jit/kernel_base.h"
-#include "lite/backends/x86/jit/kernel_key.h"
-#include "lite/backends/x86/jit/kernel_pool.h"
-#include "lite/utils/paddle_enforce.h"
-
-namespace paddle {
-namespace lite {
-namespace jit {
-
-template <typename KernelTuple, typename PlaceType>
-inline typename std::enable_if<
-    std::is_same<PlaceType, lite::fluid::CPUPlace>::value,
-    const Kernel*>::type
-GetJitCode(const typename KernelTuple::attr_type& attr) {
-  using Attr = typename KernelTuple::attr_type;
-  int64_t key = JitCodeKey<Attr>(attr);
-  auto& codes = JitCodePool<KernelTuple::kernel_type>::Instance();
-  if (codes.Has(key)) {
-    return codes.AllKernels().at(key).get();
-  }
-
-  // a creator is not related to attr, so KernelKey can be used as the key
-  KernelKey kkey(KernelTuple::kernel_type, PlaceType());
-  // pool: (KernelKey(type, place), vector<GenCreatorPtr>)
-  auto& creator_map = JitCodeCreatorPool::Instance().AllCreators();
-  auto iter = creator_map.find(kkey);
-  if (iter != creator_map.end()) {
-    auto& creators = iter->second;
-    for (auto& cur : creators) {
-      auto i = dynamic_cast<const JitCodeCreator<Attr>*>(cur.get());
-      if (i && i->CanBeUsed(attr)) {
-        auto p = i->CreateJitCode(attr);
-        if (p) {
-          auto res = p.get();
-          codes.Insert(key, std::move(p));
-          return res;
-        }
-      }
-    }
-  }
-  return nullptr;
-}
-
-template <typename KernelTuple, typename PlaceType>
-inline typename std::enable_if<
-    !std::is_same<PlaceType, lite::fluid::CPUPlace>::value,
-    const Kernel*>::type
-GetJitCode(const typename KernelTuple::attr_type& attr) {
-  return nullptr;
-}
-
-// Refer code is not related to attr; this overload exists just for the cast.
-// Refer is always on CPUPlace
-template <typename KernelTuple>
-inline const Kernel* GetReferKernel() {
-  auto& ref_pool = ReferKernelPool::Instance().AllKernels();
-  KernelKey kkey(KernelTuple::kernel_type, lite::fluid::CPUPlace());
-  auto ref_iter = ref_pool.find(kkey);
-  PADDLE_ENFORCE(ref_iter != ref_pool.end(),
-                 "Every Kernel should have reference function.");
-  auto& ref_impls = ref_iter->second;
-  for (auto& impl : ref_impls) {
-    auto i = dynamic_cast<const ReferKernel<KernelTuple>*>(impl.get());
-    if (i) {
-      return i;
-    }
-  }
-  return nullptr;
-}
-
-template <typename KernelTuple>
-inline typename KernelTuple::func_type GetReferFunc() {
-  auto ker = GetReferKernel<KernelTuple>();
-  auto p = dynamic_cast<const ReferKernel<KernelTuple>*>(ker);
-  PADDLE_ENFORCE(p, "The Refer kernel should exist");
-  return p->GetFunc();
-}
-
-// Return all Kernels that can be used
-template <typename KernelTuple, typename PlaceType>
-std::vector<const Kernel*> GetAllCandidateKernels(
-    const typename KernelTuple::attr_type& attr) {
-  // the search order should be jitcode > more > refer
-  std::vector<const Kernel*> res;
-  auto jitker = GetJitCode<KernelTuple, PlaceType>(attr);
-  if (jitker) {
-    res.emplace_back(jitker);
-  }
-
-  // more kernelpool: (KernelKey(type, place), vector<KernelPtr>)
-  KernelKey kkey(KernelTuple::kernel_type, PlaceType());
-  auto& pool = KernelPool::Instance().AllKernels();
-  auto iter = pool.find(kkey);
-  if (iter != pool.end()) {
-    auto& impls = iter->second;
-    for (auto& impl : impls) {
-      auto i = dynamic_cast<const KernelMore<KernelTuple>*>(impl.get());
-      if (i && i->CanBeUsed(attr)) {
-        res.emplace_back(i);
-      }
-    }
-  }
-
-  // The last implementation should be the reference function on CPUPlace.
-  auto ref = GetReferKernel<KernelTuple>();
-  PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty.");
-  res.emplace_back(ref);
-  return res;
-}
-
-template <typename KernelTuple, typename PlaceType>
-std::vector<std::pair<std::string, typename KernelTuple::func_type>>
-GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) {
-  using Func = typename KernelTuple::func_type;
-  auto kers = GetAllCandidateKernels<KernelTuple, PlaceType>(attr);
-  std::vector<std::pair<std::string, Func>> res;
-  for (auto k : kers) {
-    std::string name = k->ImplType();
-    if (name == "JitCode") {
-      auto i = dynamic_cast<const GenBase*>(k);
-      PADDLE_ENFORCE(i, "jitcode kernel cast can not fail.");
-      res.emplace_back(std::make_pair(name, i->template getCode<Func>()));
-    } else {
-      auto i = dynamic_cast<const KernelMore<KernelTuple>*>(k);
-      PADDLE_ENFORCE(i, "kernel cast can not fail.");
-      res.emplace_back(std::make_pair(name, i->GetFunc()));
-    }
-  }
-  return res;
-}
-
-template <typename KernelTuple, typename PlaceType>
-std::vector<typename KernelTuple::func_type> GetAllCandidateFuncs(
-    const typename KernelTuple::attr_type& attr) {
-  auto funcs = GetAllCandidateFuncsWithTypes<KernelTuple, PlaceType>(attr);
-  std::vector<typename KernelTuple::func_type> res;
-  for (auto& i : funcs) {
-    res.emplace_back(i.second);
-  }
-  return res;
-}
-
-template <typename KernelTuple, typename PlaceType>
-typename KernelTuple::func_type GetDefaultBestFunc(
-    const typename KernelTuple::attr_type& attr) {
-  auto funcs = GetAllCandidateFuncs<KernelTuple, PlaceType>(attr);
-  PADDLE_ENFORCE_GE(funcs.size(), 1UL);
-  // Here one could run a runtime benchmark for this attr and return the best.
-  // For now just take the first one, since candidates are searched in order
-  // and tuned offline.
-  return funcs[0];
-}
-
-template <typename KernelTuple, typename PlaceType>
-class KernelFuncs {
- public:
-  KernelFuncs() = default;
-  static KernelFuncs& Cache() {
-    static thread_local KernelFuncs<KernelTuple, PlaceType> g_func_cache;
-    return g_func_cache;
-  }
-
-  // the exposed interface to use
-  typename KernelTuple::func_type At(
-      const typename KernelTuple::attr_type& attr) {
-    // Note: this key may not be ideal, since not all kernels have jitcode.
-    int64_t key = JitCodeKey<typename KernelTuple::attr_type>(attr);
-    if (Has(key)) {
-      return funcs_.at(key);
-    }
-    // If this attr is not in the cache yet, get the default best
-    auto func = GetDefaultBestFunc<KernelTuple, PlaceType>(attr);
-    Insert(key, func);
-    return func;
-  }
-
-  typename KernelTuple::func_type operator[](
-      const typename KernelTuple::attr_type& attr) {
-    return At(attr);
-  }
-
- protected:
-  bool Has(int64_t key) const { return funcs_.find(key) != funcs_.end(); }
-  void Insert(int64_t key, typename KernelTuple::func_type func) {
-    funcs_.emplace(key, func);
-  }
-
- private:
-  std::unordered_map<int64_t, typename KernelTuple::func_type> funcs_;
-};
-
-const char* to_string(KernelType kt);
-const char* to_string(SeqPoolType kt);
-
-KernelType to_kerneltype(const std::string& act);
-
-inline std::ostream& operator<<(std::ostream& os, const lstm_attr_t& attr) {
-  os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate)
-     << "],act_cand[" << to_string(attr.act_cand) << "],act_cell["
-     << to_string(attr.act_cell) << "],use_peephole["
-     << (attr.use_peephole ?
"True" : "False") << "]"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) { - os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate) - << "],act_cand[" << to_string(attr.act_cand) << "]"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) { - os << "height_size[" << attr.h << "],width_size[" << attr.w << "],pool_type[" - << to_string(attr.type) << "]"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, - const emb_seq_pool_attr_t& attr) { - os << "table_height[" << attr.table_height << "],table_width[" - << attr.table_width << "],index_height[" << attr.index_height - << "],index_width[" << attr.index_width << "],output_width[" - << attr.out_width << "],pool_type[" << to_string(attr.pool_type) << "]"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const sgd_attr_t& attr) { - os << "param_height[" << attr.param_height << "],param_width[" - << attr.param_width << "],grad_height[" << attr.grad_height - << "],grad_width[" << attr.grad_width << "],selected_rows_size[" - << attr.selected_rows_size << "]"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const matmul_attr_t& attr) { - os << "M[" << attr.m << "],N[" << attr.n << "],K[" << attr.k << "]"; - return os; -} - -// expose the method to pack matmul weight -template -void pack_weights(const T* src, T* dst, int n, int k); - -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/kernel_base.h b/lite/backends/x86/jit/kernel_base.h deleted file mode 100644 index dbe44a78ac..0000000000 --- a/lite/backends/x86/jit/kernel_base.h +++ /dev/null @@ -1,365 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/
-
-#pragma once
-#include <cstdint>
-#include "lite/backends/x86/jit/macro.h"
-
-namespace paddle {
-namespace lite {
-namespace jit {
-
-typedef enum {
-  kNone = 0,
-  // sort by alphabet
-  kCRFDecoding = 1,
-  kEmbSeqPool = 2,
-  kGRUH1,
-  kGRUHtPart1,
-  kGRUHtPart2,
-  kHSum,  // horizontal sum
-  kHMax,  // horizontal max
-  kLSTMCtHt,
-  kLSTMC1H1,
-  kLayerNorm,
-  kMatMul,
-  kNCHW16CMulNC,
-  kSeqPool,
-  kSoftmax,
-  kStrideASum,
-  kStrideScal,
-  kVAdd,
-  kVAddBias,
-  kVAddRelu,
-  kVBroadcast,
-  kVCopy,
-  kVExp,
-  kVIdentity,
-  kVMul,
-  kVRelu,
-  kVScal,
-  kSgd,
-  kVSigmoid,
-  kVSquare,
-  kVSub,
-  kVTanh,
-} KernelType;
-
-typedef enum {
-  kNonePoolType = 0,
-  kSum = 1,
-  kAvg,
-  kSqrt,
-} SeqPoolType;
-
-// x, y, z, n
-template <typename T>
-struct XYZNTuple {
-  typedef T data_type;
-  typedef int attr_type;
-  typedef void (*func_type)(const T*, const T*, T*, int);
-};
-
-// a, x, y, n
-template <typename T>
-struct AXYNTuple : public XYZNTuple<T> {};
-
-// a, x, y, n, stride
-template <typename T>
-struct AXYNSTuple {
-  typedef T data_type;
-  typedef int attr_type;
-  typedef void (*func_type)(const T*, const T*, T*, int, int);
-};
-
-// x, y, n
-template <typename T>
-struct XYNTuple {
-  typedef T data_type;
-  typedef int attr_type;
-  typedef void (*func_type)(const T*, T*, int);
-};
-
-// x, returned value, n
-template <typename T>
-struct XRNTuple : public XYNTuple<T> {};
-
-// x, returned value, n, stride
-template <typename T>
-struct XRNSTuple {
-  typedef T data_type;
-  typedef int attr_type;
-  typedef void (*func_type)(const T*, T*, int, int);
-};
-
-#define DECLARE_KERNELTUPLE(kernel_tuple, type)        \
-  template <typename T>                                \
-  struct type##Tuple : public kernel_tuple<T> {        \
-    static constexpr KernelType kernel_type = k##type; \
-  }
-
-// Each Tuple corresponds to a KernelType
-DECLARE_KERNELTUPLE(XYZNTuple, VMul);
-DECLARE_KERNELTUPLE(XYZNTuple, VAdd);
-DECLARE_KERNELTUPLE(XYZNTuple, VAddRelu);
-DECLARE_KERNELTUPLE(XYZNTuple, VSub);
-
-DECLARE_KERNELTUPLE(AXYNTuple, VScal);
-DECLARE_KERNELTUPLE(AXYNTuple, VAddBias);
-
-DECLARE_KERNELTUPLE(AXYNSTuple, StrideScal);
-
-DECLARE_KERNELTUPLE(XYNTuple, VRelu);
-DECLARE_KERNELTUPLE(XYNTuple, VIdentity);
-DECLARE_KERNELTUPLE(XYNTuple, VSquare);
-DECLARE_KERNELTUPLE(XYNTuple, VExp);
-DECLARE_KERNELTUPLE(XYNTuple, VSigmoid);
-DECLARE_KERNELTUPLE(XYNTuple, VTanh);
-DECLARE_KERNELTUPLE(XYNTuple, VCopy);
-
-DECLARE_KERNELTUPLE(XRNTuple, HMax);
-DECLARE_KERNELTUPLE(XRNTuple, HSum);
-
-DECLARE_KERNELTUPLE(XRNSTuple, StrideASum);
-
-typedef struct {
-  void* gates;  // gates: x_ch, x_ih, x_fh, x_oh
-  const void* ct_1;
-  void* ct;
-  void* ht;
-  /* weight_peephole and checked data are only used in peephole*/
-  const void* wp{nullptr};  //  W_ic, W_fc, W_oc
-  void* checked{nullptr};   // size: 2 * d
-} lstm_t;
-
-typedef struct {
-  void* gates;  // gates: {x_update, x_reset; x_state}
-  const void* ht_1;
-  void* ht;
-} gru_t;
-
-struct rnn_attr_s {
-  int d;
-  KernelType act_gate, act_cand;
-  rnn_attr_s() = default;
-  explicit rnn_attr_s(int _d, KernelType _act_gate, KernelType _act_cand)
-      : d(_d), act_gate(_act_gate), act_cand(_act_cand) {}
-};
-
-struct lstm_attr_s : public rnn_attr_s {
-  bool use_peephole;
-  KernelType act_cell;
-  lstm_attr_s() = default;
-  explicit lstm_attr_s(int _d,
-                       KernelType _act_gate,
-                       KernelType _act_cand,
-                       KernelType _act_cell,
-                       bool _use_peephole = false)
-      : rnn_attr_s(_d, _act_gate, _act_cand),
-        use_peephole(_use_peephole),
-        act_cell(_act_cell) {}
-};
-
-typedef struct rnn_attr_s gru_attr_t;
-typedef struct lstm_attr_s lstm_attr_t;
-
-template <typename T>
-struct LSTMTuple {
-  typedef T data_type;
-  typedef lstm_attr_t attr_type;
typedef void (*func_type)(lstm_t*, const lstm_attr_t*); -}; - -template -struct GRUTuple { - typedef T data_type; - typedef gru_attr_t attr_type; - typedef void (*func_type)(gru_t*, const gru_attr_t*); -}; - -DECLARE_KERNELTUPLE(LSTMTuple, LSTMCtHt); -DECLARE_KERNELTUPLE(LSTMTuple, LSTMC1H1); - -DECLARE_KERNELTUPLE(GRUTuple, GRUH1); -DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart1); -DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart2); - -#undef DECLARE_KERNELTUPLE - -template -struct VBroadcastTuple { - static constexpr KernelType kernel_type = kVBroadcast; - typedef T data_type; - typedef int64_t attr_type; - typedef void (*func_type)(const T*, T*, int64_t, int64_t); -}; - -typedef struct seq_pool_attr_s { - int h, w; // h should always be the first one - SeqPoolType type; - seq_pool_attr_s() = default; - explicit seq_pool_attr_s(int width, SeqPoolType pool_type, int height = 1) - : h(height), w(width), type(pool_type) {} -} seq_pool_attr_t; - -template -struct SeqPoolTuple { - static constexpr KernelType kernel_type = kSeqPool; - typedef T data_type; - typedef seq_pool_attr_t attr_type; - typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*); -}; - -typedef struct emb_seq_pool_attr_s { - int64_t table_height, table_width; - int64_t index_height, index_width; - int64_t out_width; - SeqPoolType pool_type; - emb_seq_pool_attr_s() = default; - explicit emb_seq_pool_attr_s(int64_t tbl_height, - int64_t tbl_width, - int64_t idx_height, - int64_t idx_width, - int64_t output_width, - SeqPoolType seqpool_type = SeqPoolType::kSum) - : table_height(tbl_height), - table_width(tbl_width), - index_height(idx_height), - index_width(idx_width), - out_width(output_width), - pool_type(seqpool_type) {} -} emb_seq_pool_attr_t; - -template -struct EmbSeqPoolTuple { - static constexpr KernelType kernel_type = kEmbSeqPool; - typedef T data_type; - typedef emb_seq_pool_attr_t attr_type; - typedef void (*func_type)(const T*, - const int64_t*, - T*, - const emb_seq_pool_attr_t*); -}; - -typedef struct sgd_attr_s { - int64_t param_height, param_width; - int64_t grad_height, grad_width; - int64_t selected_rows_size; - sgd_attr_s() = default; - explicit sgd_attr_s(int64_t param_h, - int64_t param_w, - int64_t grad_h, - int64_t grad_w, - int64_t selected_rows_sz) - : param_height(param_h), - param_width(param_w), - grad_height(grad_h), - grad_width(grad_w), - selected_rows_size(selected_rows_sz) {} -} sgd_attr_t; - -template -struct SgdTuple { - static constexpr KernelType kernel_type = kSgd; - typedef T data_type; - typedef sgd_attr_t attr_type; - typedef void (*func_type)( - const T*, const T*, const T*, const int64_t*, T*, const sgd_attr_t*); -}; - -typedef struct matmul_attr_s { - int m, n, k; - void* packed_weight{nullptr}; - matmul_attr_s() = default; - explicit matmul_attr_s(int m_, int n_, int k_, void* packed_weight_ = nullptr) - : m(m_), n(n_), k(k_), packed_weight(packed_weight_) {} -} matmul_attr_t; - -template -struct MatMulTuple { - static constexpr KernelType kernel_type = kMatMul; - typedef T data_type; - typedef matmul_attr_t attr_type; - typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*); -}; - -template -struct CRFDecodingTuple { - static constexpr KernelType kernel_type = kCRFDecoding; - typedef T data_type; - typedef int attr_type; - typedef void (*func_type)(const int, const T*, const T*, T*, int*, int); -}; - -template -struct LayerNormTuple { - static constexpr KernelType kernel_type = kLayerNorm; - typedef T data_type; - typedef int attr_type; - typedef void (*func_type)( - T*, 
T*, T*, T*, const T*, const T*, int, const float, int); -}; - -template -struct SoftmaxTuple { - static constexpr KernelType kernel_type = kSoftmax; - typedef T data_type; - typedef int attr_type; - typedef void (*func_type)(const T*, T*, int, int, int); -}; - -// nChw16c = nChw16c .* NC -template -struct NCHW16CMulNCTuple { - static constexpr KernelType kernel_type = kNCHW16CMulNC; - typedef T data_type; - typedef int attr_type; - typedef void (*func_type)(const T*, const T*, T*, int, int); -}; - -// Just for adding to kernel pool without template -class Kernel { - public: - Kernel() = default; - virtual ~Kernel() = default; - virtual const char* ImplType() const = 0; -}; - -template -class KernelMore : public Kernel { - public: - using T = typename KernelTuple::data_type; - using Func = typename KernelTuple::func_type; - using Attr = typename KernelTuple::attr_type; - virtual Func GetFunc() const { return func; } - // specify this kernel can be used, means it should not fail if use it. - virtual bool CanBeUsed(const Attr& attr) const = 0; - - protected: - Func func{nullptr}; -}; - -template -class ReferKernel : public KernelMore { - public: - // Refer code can always be used - bool CanBeUsed(const typename KernelTuple::attr_type& attr) const override { - return true; - } - const char* ImplType() const override { return "Refer"; } -}; - -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/kernel_key.cc b/lite/backends/x86/jit/kernel_key.cc deleted file mode 100644 index a6288fcf19..0000000000 --- a/lite/backends/x86/jit/kernel_key.cc +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
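Given the Kernel / KernelMore / ReferKernel contract above, a refer implementation only has to set func in its constructor; CanBeUsed is always true and ImplType reports "Refer". A hypothetical example for the VAddTuple declared earlier, not one of the actual refer kernels:

template <typename T>
void VAddRefer(const T* x, const T* y, T* z, int n) {
  for (int i = 0; i < n; ++i) z[i] = x[i] + y[i];
}

template <typename T>
class VAddReferKernel : public ReferKernel<VAddTuple<T>> {
 public:
  VAddReferKernel() { this->func = VAddRefer<T>; }
};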
*/ - -#include "lite/backends/x86/jit/kernel_key.h" -#include // XXH64: 13.8 GB/s -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace jit { - -template <> -int64_t JitCodeKey(const int& d) { - return d; -} - -template <> -int64_t JitCodeKey(const int64_t& d) { - return d; -} - -template <> -int64_t JitCodeKey(const gru_attr_t& attr) { - return XXH64(&attr, sizeof(gru_attr_t), 0); -} - -template <> -int64_t JitCodeKey(const lstm_attr_t& attr) { - int keys[5] = {attr.d, - static_cast(attr.act_gate), - static_cast(attr.act_cand), - static_cast(attr.act_cell), - static_cast(attr.use_peephole)}; - return XXH64(keys, sizeof(int) * 5, 0); -} - -template <> -int64_t JitCodeKey(const seq_pool_attr_t& attr) { - int keys[2] = {attr.w, static_cast(attr.type)}; - return XXH64(keys, sizeof(int) * 2, 0); -} - -template <> -int64_t JitCodeKey(const matmul_attr_t& attr) { - return XXH64(&attr, sizeof(int) * 3, 0); // m, n, k -} - -template <> -int64_t JitCodeKey(const emb_seq_pool_attr_t& attr) { - return attr.table_width; -} - -template <> -int64_t JitCodeKey(const sgd_attr_t& attr) { - return attr.grad_width; -} - -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/kernel_key.h b/lite/backends/x86/jit/kernel_key.h deleted file mode 100644 index 6df3a20a4b..0000000000 --- a/lite/backends/x86/jit/kernel_key.h +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once -#include -#include -#include "lite/backends/x86/jit/kernel_base.h" -#include "lite/backends/x86/legacy_place.h" - -namespace paddle { -namespace lite { -namespace jit { - -struct KernelKey { - struct Hash { - size_t operator()(const KernelKey& key) const { - int place = key.place_.which(); // less than 2^8 - int type = static_cast(key.type_) << 8; // less than 2^(32-8) - std::hash hasher; - return hasher(place + type); - } - }; - - KernelType type_; - lite::fluid::Place place_; - - KernelKey(KernelType type, lite::fluid::Place place) - : type_(type), place_(place) {} - size_t hash_key() const { return Hash()(*this); } - - bool operator==(const KernelKey& o) const { - return /*platform::places_are_same_class(place_, o.place_)*/ true && - type_ == o.type_; - } - bool operator!=(const KernelKey& o) const { return !(*this == o); } -}; - -// Every JitCode should have a method to get the key from attribution -template -int64_t JitCodeKey(const Attr& attr); - -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/kernel_pool.cc b/lite/backends/x86/jit/kernel_pool.cc deleted file mode 100644 index 43ad20c90c..0000000000 --- a/lite/backends/x86/jit/kernel_pool.cc +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
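Note which attributes the keys above encode: the seq_pool key hashes only w and the pool type, and the sgd key is just grad_width, so attributes that differ only in runtime height reuse the same generated code. An illustration, assuming the attribute definitions from kernel_base.h:

bool SameSeqPoolKey() {
  seq_pool_attr_t a(/*width=*/32, SeqPoolType::kAvg, /*height=*/1);
  seq_pool_attr_t b(/*width=*/32, SeqPoolType::kAvg, /*height=*/100);
  // height is consumed at run time, not baked into the code,
  // so both attrs map to one cached code blob.
  return JitCodeKey<seq_pool_attr_t>(a) == JitCodeKey<seq_pool_attr_t>(b);
}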
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/kernel_pool.h" -#include // for shared_ptr -#include -#include - -namespace paddle { -namespace lite { -namespace jit { - -JitCodeCreatorPool& JitCodeCreatorPool::Instance() { - static JitCodeCreatorPool g_creator_pool; - return g_creator_pool; -} - -KernelPool& KernelPool::Instance() { - static KernelPool g_kernel_pool; - return g_kernel_pool; -} - -ReferKernelPool& ReferKernelPool::Instance() { - static ReferKernelPool g_refer_kernel_pool; - return g_refer_kernel_pool; -} - -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/kernel_pool.h b/lite/backends/x86/jit/kernel_pool.h deleted file mode 100644 index dc0b1bbf2e..0000000000 --- a/lite/backends/x86/jit/kernel_pool.h +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
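The three process-wide singletons above, together with the thread_local JitCodePool, back the user-facing cache in helper.h. A hedged sketch of a typical call site; the tuple, place type and size d are illustrative choices, not taken from a specific caller in the deleted sources:

void KernelFuncsExample(const float* x, const float* y, float* z, int d) {
  // Resolves to jitcode if a creator accepts d, else a "more" kernel,
  // else the refer implementation; the result is memoized per thread.
  auto f = KernelFuncs<VMulTuple<float>, lite::fluid::CPUPlace>::Cache().At(d);
  f(x, y, z, d);  // z[i] = x[i] * y[i]
}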
*/ - -#pragma once - -#include // for unique_ptr -#include -#include -#include // for move -#include -#include "lite/backends/x86/jit/gen_base.h" -#include "lite/backends/x86/jit/kernel_base.h" -#include "lite/backends/x86/jit/kernel_key.h" - -namespace paddle { -namespace lite { -namespace jit { - -template -class JitCodePool { - typedef std::unique_ptr GenBasePtr; - typedef std::unordered_map JitCodeMap; - - public: - JitCodePool() = default; - static JitCodePool& Instance() { - static thread_local JitCodePool g_jit_codes; - return g_jit_codes; - } - - const JitCodeMap& AllKernels() { return codes_; } - - bool Has(int64_t key) const { return codes_.find(key) != codes_.end(); } - - void Insert(int64_t key, GenBasePtr value) { - codes_.emplace(key, std::move(value)); - } - - private: - JitCodeMap codes_; -}; - -class JitCodeCreatorPool { - typedef std::unique_ptr GenCreatorPtr; - typedef std::unordered_map, - KernelKey::Hash> - GenCreatorPtrMap; - - public: - JitCodeCreatorPool() = default; - static JitCodeCreatorPool& Instance(); - GenCreatorPtrMap& AllCreators() { return creators_; } - void Insert(const KernelKey& key, GenCreatorPtr value) { - if (creators_.find(key) == creators_.end()) { - creators_.emplace(key, std::vector()); - } - creators_.at(key).emplace_back(std::move(value)); - } - - private: - GenCreatorPtrMap creators_; -}; - -typedef std::unique_ptr KernelPtr; -typedef std::unordered_map, KernelKey::Hash> - KernelMap; - -class KernelPool { - public: - static KernelPool& Instance(); - KernelPool() = default; - KernelMap& AllKernels() { return pool_; } - void Insert(const KernelKey& key, KernelPtr value) { - if (pool_.find(key) == pool_.end()) { - pool_.emplace(key, std::vector()); - } - pool_.at(key).emplace_back(std::move(value)); - } - - private: - KernelMap pool_; -}; - -// Every kernel should have refer code and it should be used in unit tests, -// so refer kernels should have it's independent kernel pool -class ReferKernelPool { - public: - static ReferKernelPool& Instance(); - ReferKernelPool() = default; - KernelMap& AllKernels() { return pool_; } - void Insert(const KernelKey& key, KernelPtr value) { - if (pool_.find(key) == pool_.end()) { - pool_.emplace(key, std::vector()); - } - pool_.at(key).emplace_back(std::move(value)); - } - - private: - KernelMap pool_; -}; - -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/macro.h b/lite/backends/x86/jit/macro.h deleted file mode 100644 index 703342252f..0000000000 --- a/lite/backends/x86/jit/macro.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/
-
-#pragma once
-#include
-
-namespace paddle {
-namespace lite {
-namespace jit {
-
-#define SIGMOID_THRESHOLD_MIN -40.0
-#define SIGMOID_THRESHOLD_MAX 13.0
-#define EXP_MAX_INPUT 40.0
-
-#define XMM_FLOAT_BLOCK 4
-#define YMM_FLOAT_BLOCK 8
-#define ZMM_FLOAT_BLOCK 16
-
-}  // namespace jit
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/jit/more/CMakeLists.txt b/lite/backends/x86/jit/more/CMakeLists.txt
deleted file mode 100644
index 2ddbbcd16a..0000000000
--- a/lite/backends/x86/jit/more/CMakeLists.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-
-function(USE_JITKERNEL_MORE TARGET TYPE)
-  file(APPEND ${jit_file} "USE_JITKERNEL_MORE(${TARGET} ${TYPE});\n")
-endfunction()
-
-# enable it later
-if(WITH_MKLML)
-  add_subdirectory(mkl)
-endif()
-
-if(WITH_AVX)
-  add_subdirectory(intrinsic)
-endif()
-
-# mix should be last
-add_subdirectory(mix)
-
-set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} PARENT_SCOPE)
diff --git a/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt b/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt
deleted file mode 100644
index 468937a4f6..0000000000
--- a/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-
-file(GLOB jit_kernel_cc_intrinsic RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
-cc_library(jit_kernel_intrinsic SRCS ${jit_kernel_cc_intrinsic} DEPS jit_kernel_base)
-
-set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_intrinsic PARENT_SCOPE)
-
-# use intrinsic kernels by name and type
-USE_JITKERNEL_MORE(kCRFDecoding, intrinsic)
-USE_JITKERNEL_MORE(kLayerNorm, intrinsic)
diff --git a/lite/backends/x86/jit/more/intrinsic/crf_decoding.cc b/lite/backends/x86/jit/more/intrinsic/crf_decoding.cc
deleted file mode 100644
index d9c939f7ef..0000000000
--- a/lite/backends/x86/jit/more/intrinsic/crf_decoding.cc
+++ /dev/null
@@ -1,185 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "lite/backends/x86/jit/more/intrinsic/crf_decoding.h"
-#include <limits>
-#include "lite/backends/x86/cpu_info.h"
-#include "lite/backends/x86/jit/registry.h"
-
-namespace paddle {
-namespace lite {
-namespace jit {
-namespace more {
-namespace intrinsic {
-// Note: intrinsic code is not built at runtime. For example, a binary built
-// with AVX can only use AVX even when it runs on an AVX512 machine.
-
-void CRFDecoding(const int seq_len,
-                 const float* x,
-                 const float* w,
-                 float* alpha,
-                 int* track,
-                 int tag_num) {
-#ifdef __AVX512F__
-  const int step_size = ZMM_FLOAT_BLOCK;
-#else
-  const int step_size = YMM_FLOAT_BLOCK;
-#endif
-  const int end = tag_num / step_size;
-  const int rest = tag_num % step_size;
-  /* Set up the initial alpha values. */
-  int i_offset = 0;
-  int last_offset = rest - step_size;
-  for (int i = 0; i <= end; ++i) {
-#ifdef __AVX512F__
-    // Declare the variables for the content of weights, input and alpha
-    // values.
-    __m512 w_content, x_content, alpha_content;
-    // Load the relevant data into the variables from un-aligned address.
-    w_content = _mm512_loadu_ps(w + i_offset);
-    x_content = _mm512_loadu_ps(x + i_offset);
-    alpha_content = _mm512_add_ps(w_content, x_content);
-    // Save the alpha value.
-    _mm512_storeu_ps(alpha + i_offset, alpha_content);
-#else
-    // AVX or AVX2: declare the variables for the content of weights, input
-    // and alpha values.
-    __m256 w_content, x_content, alpha_content;
-    // Load the relevant data into the variables from un-aligned address.
-    w_content = _mm256_loadu_ps(w + i_offset);
-    x_content = _mm256_loadu_ps(x + i_offset);
-    alpha_content = _mm256_add_ps(w_content, x_content);
-    _mm256_storeu_ps(alpha + i_offset, alpha_content);
-#endif
-    i_offset += step_size;
-    if (i == end - 1) {
-      if (rest > 0) {
-        i_offset += last_offset;
-      } else {
-        break;
-      }
-    }
-  }
-  // Use the column-major strategy to get the location of maximum score.
-  int seq_offset = 0;
-  constexpr int state_trans_base_idx = 2;
-  for (int k = 1; k < seq_len; ++k) {
-    int j_offset = 0;
-    for (int j = 0; j <= end; ++j) {
-/* Initialize the variables of maximum score and location.*/
-#ifdef __AVX512F__
-      __m512 max_score = _mm512_set1_ps(-std::numeric_limits<float>::max());
-      __m512i max_j = _mm512_setzero_si512();
-#else
-      __m256 max_score = _mm256_set1_ps(-std::numeric_limits<float>::max());
-      __m256i max_j = _mm256_set1_epi32(0);
-#endif
-      /* Calculate the offset of transition_weights.*/
-      int trans_offset = state_trans_base_idx * tag_num + j_offset;
-      for (int i = 0; i < tag_num; ++i) {
-/* Initialize the content of the alpha variable with the related offset.*/
-#ifdef __AVX512F__
-        __m512 alpha_content = _mm512_set1_ps(*(alpha + seq_offset + i));
-        /* Obtain the content of weights from un-aligned address.*/
-        __m512 w_content = _mm512_loadu_ps(w + trans_offset);
-        __m512 score_v = _mm512_add_ps(alpha_content, w_content);
-        __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS);
-        /* AVX512 instructions.*/
-        max_j = _mm512_mask_set1_epi32(max_j, mask, i);
-        /* Update the max_score value.*/
-        max_score = _mm512_max_ps(max_score, score_v);
-
-#else
-        __m256 alpha_content = _mm256_broadcast_ss(alpha + seq_offset + i);
-        /* Obtain the content of weights from un-aligned address.*/
-        __m256 w_content = _mm256_loadu_ps(w + trans_offset);
-        __m256 score_v = _mm256_add_ps(alpha_content, w_content);
-        __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);
-/* According to the mask value, update the index of the max_score.*/
-#ifdef __AVX2__
-        max_j = _mm256_or_si256(
-            _mm256_andnot_si256((__m256i)mask, max_j),
-            _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i)));
-#else
-        __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);
-        __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);
-        __m128i lo_mask =
-            _mm256_extractf128_si256(*(__m256i*)&mask, 0);  // NOLINT
-        __m128i hi_mask =
-            _mm256_extractf128_si256(*(__m256i*)&mask, 1);  // NOLINT
-        lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);
-        hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);
-        lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));
-        hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i));
-        lo_max_j = _mm_or_si128(lo_mask, lo_max_j);
-        hi_max_j = _mm_or_si128(hi_mask, hi_max_j);
-        max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0);
-        max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1);
-#endif
-        /* Update the max_score value.*/
-        max_score = _mm256_max_ps(max_score, score_v);
-
-#endif
-
-        trans_offset += tag_num;
-      }
-/* Update the alpha and track values.
- */
-#ifdef __AVX512F__
-      __m512 x_content =
-          _mm512_loadu_ps(x + seq_offset + tag_num + j_offset);
-      max_score = _mm512_add_ps(max_score, x_content);
-      _mm512_storeu_ps(alpha + seq_offset + tag_num + j_offset, max_score);
-      _mm512_storeu_si512(reinterpret_cast<__m512i*>(track + seq_offset +
-                                                     tag_num + j_offset),
-                          max_j);
-#else
-      __m256 x_content = _mm256_loadu_ps(x + seq_offset + tag_num + j_offset);
-      max_score = _mm256_add_ps(max_score, x_content);
-      _mm256_storeu_ps(alpha + seq_offset + tag_num + j_offset, max_score);
-      _mm256_storeu_si256(
-          reinterpret_cast<__m256i*>(track + seq_offset + tag_num + j_offset),
-          max_j);
-#endif
-
-      /* Calculate the offset of the next step. */
-      j_offset += step_size;
-      if (j == end - 1) {
-        if (rest > 0) {
-          j_offset += last_offset;
-        } else {
-          break;
-        }
-      }
-    }
-    seq_offset += tag_num;
-  }
-}
-
-bool CRFDecodingKernel::CanBeUsed(const int& d) const {
-#ifdef __AVX512F__
-  constexpr int block = ZMM_FLOAT_BLOCK;
-#else
-  constexpr int block = YMM_FLOAT_BLOCK;
-#endif
-  return x86::MayIUse(x86::avx) && d >= block;
-}
-
-} // namespace intrinsic
-} // namespace more
-} // namespace jit
-} // namespace lite
-} // namespace paddle
-
-namespace intrinsic = paddle::lite::jit::more::intrinsic;
-
-REGISTER_JITKERNEL_MORE(kCRFDecoding, intrinsic, intrinsic::CRFDecodingKernel);
diff --git a/lite/backends/x86/jit/more/intrinsic/crf_decoding.h b/lite/backends/x86/jit/more/intrinsic/crf_decoding.h
deleted file mode 100644
index 8a425fb491..0000000000
--- a/lite/backends/x86/jit/more/intrinsic/crf_decoding.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include
-#include "lite/backends/x86/jit/kernel_base.h"
-
-namespace paddle {
-namespace lite {
-namespace jit {
-namespace more {
-namespace intrinsic {
-
-void CRFDecoding(const int seq_len,
-                 const float* x,
-                 const float* w,
-                 float* alpha,
-                 int* track,
-                 int tag_num);
-
-class CRFDecodingKernel : public KernelMore<CRFDecodingTuple<float>> {
- public:
-  CRFDecodingKernel() { this->func = CRFDecoding; }
-  bool CanBeUsed(
-      const typename CRFDecodingTuple<float>::attr_type&) const override;
-  const char* ImplType() const override { return "Intrinsic"; }
-};
-
-} // namespace intrinsic
-} // namespace more
-} // namespace jit
-} // namespace lite
-} // namespace paddle
diff --git a/lite/backends/x86/jit/more/intrinsic/layer_norm.cc b/lite/backends/x86/jit/more/intrinsic/layer_norm.cc
deleted file mode 100644
index bfd3409e65..0000000000
--- a/lite/backends/x86/jit/more/intrinsic/layer_norm.cc
+++ /dev/null
@@ -1,181 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/more/intrinsic/layer_norm.h" -#include -#include "lite/backends/x86/jit/registry.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace more { -namespace intrinsic { - -void LayerNorm(float* x, - float* out, - float* mean, - float* var, - const float* scale, - const float* bias, - int height, - const float epsilon, - int right) { - __m256 sum; - __m256 mean_vec, var_vec; - __m128 hi, lo; - __m256 tmp; - size_t offset; - size_t j; - int block = YMM_FLOAT_BLOCK; - const int rest = right % block; - const int end = right - rest; - - __m256 reverse_num_vec = - _mm256_div_ps(_mm256_set1_ps(1.0), _mm256_set1_ps(right)); - __m256 epsilon_vec = _mm256_set1_ps(epsilon); - int rest_mask = - ((-1) & (~((~0U) >> (sizeof(int) * 8 - (block - rest))))) & 0x0ff; - __m256i mask_vec = _mm256_set_epi32(rest_mask & 0x80 ? 0xffffffff : 0, - rest_mask & 0x40 ? 0xffffffff : 0, - rest_mask & 0x20 ? 0xffffffff : 0, - rest_mask & 0x10 ? 0xffffffff : 0, - rest_mask & 0x8 ? 0xffffffff : 0, - rest_mask & 0x4 ? 0xffffffff : 0, - rest_mask & 0x2 ? 0xffffffff : 0, - rest_mask & 0x1 ? 0xffffffff : 0); - - for (int i = 0; i < height; ++i) { - offset = i * right; - - /* get mean */ - sum = _mm256_setzero_ps(); - for (j = offset; j < end + offset; j += block) { - sum = _mm256_add_ps(sum, _mm256_loadu_ps((const float*)x + j)); - } - if (rest != 0) { - j = offset + right - block; - tmp = _mm256_loadu_ps((const float*)x + j); - tmp = _mm256_blendv_ps(_mm256_setzero_ps(), - tmp, - *(__m256*)&mask_vec); // NOLINT - sum = _mm256_add_ps(sum, tmp); - } - hi = _mm256_extractf128_ps(sum, 1); - lo = _mm256_extractf128_ps(sum, 0); - sum = _mm256_add_ps( - sum, - _mm256_insertf128_ps( - _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 0), lo, 1)); - sum = _mm256_hadd_ps(sum, sum); - sum = _mm256_hadd_ps(sum, sum); - mean_vec = _mm256_mul_ps(sum, reverse_num_vec); - mean[i] = *reinterpret_cast(&mean_vec); - - /* get variance */ - sum = _mm256_setzero_ps(); - for (j = offset; j < end + offset; j += block) { - tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); - tmp = _mm256_mul_ps(tmp, tmp); - sum = _mm256_add_ps(sum, tmp); - } - if (rest != 0) { - j = offset + right - block; - tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); - tmp = _mm256_mul_ps(tmp, tmp); - tmp = _mm256_blendv_ps(_mm256_setzero_ps(), - tmp, - *(__m256*)&mask_vec); // NOLINT - sum = _mm256_add_ps(sum, tmp); - } - hi = _mm256_extractf128_ps(sum, 1); - lo = _mm256_extractf128_ps(sum, 0); - sum = _mm256_add_ps( - sum, - _mm256_insertf128_ps( - _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 0), lo, 1)); - sum = _mm256_hadd_ps(sum, sum); - sum = _mm256_hadd_ps(sum, sum); - var_vec = _mm256_mul_ps(sum, reverse_num_vec); - var[i] = *reinterpret_cast(&var_vec); - - /* get x_norm and calculate output*/ - for (j = offset; j < end + offset; j += block) { - tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); - tmp = _mm256_div_ps(tmp, - _mm256_sqrt_ps(_mm256_add_ps(var_vec, epsilon_vec))); - _mm256_storeu_ps(reinterpret_cast(out) + j, tmp); - } - if (rest != 0) { 
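-      // Tail block: step back to the start of the last full YMM block and
-      // recompute it; the elements that overlap the previous block produce
-      // identical normalized values, so storing over them again is harmless.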
- j = offset + right - block; - tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); - tmp = _mm256_div_ps(tmp, - _mm256_sqrt_ps(_mm256_add_ps(var_vec, epsilon_vec))); - _mm256_storeu_ps(reinterpret_cast(out) + j, tmp); - } - - if (scale) { - if (rest != 0) { - j = offset + right - block; - tmp = _mm256_loadu_ps((const float*)out + j); - } - for (j = offset; j < end + offset; j += block) { - _mm256_storeu_ps( - reinterpret_cast(out) + j, - _mm256_mul_ps(_mm256_loadu_ps((const float*)out + j), - _mm256_loadu_ps((const float*)scale + j - offset))); - } - if (rest != 0) { - j = offset + right - block; - _mm256_storeu_ps( - reinterpret_cast(out) + j, - _mm256_mul_ps(tmp, - _mm256_loadu_ps((const float*)scale + j - offset))); - } - } - - if (bias) { - if (rest != 0) { - j = offset + right - block; - tmp = _mm256_loadu_ps((const float*)out + j); - } - for (j = offset; j < end + offset; j += block) { - _mm256_storeu_ps( - reinterpret_cast(out) + j, - _mm256_add_ps(_mm256_loadu_ps((const float*)out + j), - _mm256_loadu_ps((const float*)bias + j - offset))); - } - if (rest != 0) { - j = offset + right - block; - _mm256_storeu_ps( - reinterpret_cast(out) + j, - _mm256_add_ps(tmp, - _mm256_loadu_ps((const float*)bias + j - offset))); - } - } - } -} - -bool LayerNormKernel::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx) && d >= YMM_FLOAT_BLOCK; -} - -} // namespace intrinsic -} // namespace more -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace intrinsic = paddle::lite::jit::more::intrinsic; - -REGISTER_JITKERNEL_MORE(kLayerNorm, intrinsic, intrinsic::LayerNormKernel); diff --git a/lite/backends/x86/jit/more/intrinsic/layer_norm.h b/lite/backends/x86/jit/more/intrinsic/layer_norm.h deleted file mode 100644 index d8768d52ed..0000000000 --- a/lite/backends/x86/jit/more/intrinsic/layer_norm.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ - -#pragma once - -#include -#include "lite/backends/x86/jit/kernel_base.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace more { -namespace intrinsic { - -void LayerNorm(float* x, - float* out, - float* mean, - float* var, - const float* scale, - const float* bias, - int height, - const float epsilon, - int right); - -class LayerNormKernel : public KernelMore> { - public: - LayerNormKernel() { this->func = LayerNorm; } - bool CanBeUsed( - const typename LayerNormTuple::attr_type&) const override; - const char* ImplType() const override { return "Intrinsic"; } -}; - -} // namespace intrinsic -} // namespace more -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/more/mix/CMakeLists.txt b/lite/backends/x86/jit/more/mix/CMakeLists.txt deleted file mode 100644 index dd039d2915..0000000000 --- a/lite/backends/x86/jit/more/mix/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ - - -file(GLOB jit_kernel_mix_cc RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -cc_library(jit_kernel_mix SRCS ${jit_kernel_mix_cc} DEPS jit_kernel_base) - -set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_mix PARENT_SCOPE) - -USE_JITKERNEL_MORE(kVSigmoid, mix) -USE_JITKERNEL_MORE(kVTanh, mix) -USE_JITKERNEL_MORE(kLSTMCtHt, mix) -USE_JITKERNEL_MORE(kLSTMC1H1, mix) -USE_JITKERNEL_MORE(kGRUH1, mix) -USE_JITKERNEL_MORE(kGRUHtPart1, mix) -USE_JITKERNEL_MORE(kGRUHtPart2, mix) -USE_JITKERNEL_MORE(kSoftmax, mix) diff --git a/lite/backends/x86/jit/more/mix/mix.cc b/lite/backends/x86/jit/more/mix/mix.cc deleted file mode 100644 index b904b8a24c..0000000000 --- a/lite/backends/x86/jit/more/mix/mix.cc +++ /dev/null @@ -1,255 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/more/mix/mix.h" -#include "lite/backends/x86/jit/kernels.h" -#include "lite/backends/x86/jit/registry.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace more { -namespace mix { - -using CPUPlace = lite::fluid::CPUPlace; - -void VSigmoid(const T* x, T* y, int n) { - const float min = SIGMOID_THRESHOLD_MIN; - const float max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - y[i] = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); - y[i] = static_cast(0) - y[i]; - } - auto compute = KernelFuncs, CPUPlace>::Cache().At(n); - compute(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(1) / (static_cast(1) + y[i]); - } -} - -void VTanh(const T* x, T* y, int n) { - const T a = 2, b = -1; - auto compute_scal = KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_addbias = KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_sigmoid = KernelFuncs, CPUPlace>::Cache().At(n); - compute_scal(&a, x, y, n); - compute_sigmoid(y, y, n); - compute_scal(&a, y, y, n); - compute_addbias(&b, y, y, n); -} - -// remain is the product of dimension shapes after the axis dimension -void Softmax(const T* x, T* y, int n, int bs, int remain) { - auto compute_hmax = KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_hsum = KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_vscal = KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_strideasum = - KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_stridescal = - KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_vaddbias = - KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_vexp = KernelFuncs, CPUPlace>::Cache().At(n); - - for (int i = 0; i < bs; ++i) { - T scalar; - compute_hmax(x, &scalar, n); - scalar = static_cast(0) - scalar; - compute_vaddbias(&scalar, x, y, n); // x - max - compute_vexp(y, y, n); - if (remain == 1) { - compute_hsum(y, &scalar, n); - scalar = static_cast(1) / scalar; - compute_vscal(&scalar, y, y, n); - } else { - for (int j = 0; j < remain; ++j) { - compute_strideasum(&y[j], &scalar, n, remain); - scalar = static_cast(1) / scalar; - compute_stridescal(&scalar, &y[j], &y[j], n, remain); - } - } - x += n; - y += n; - } -} - -void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT - if (type == kVSigmoid) { - return KernelFuncs, CPUPlace>::Cache().At(d); - } else if (type == kVRelu) { - return KernelFuncs, CPUPlace>::Cache().At(d); - } else if (type == kVTanh) { - return KernelFuncs, CPUPlace>::Cache().At(d); - } else if (type == kVIdentity) { - return KernelFuncs, CPUPlace>::Cache().At(d); - } - LOG(FATAL) << "Not support type: " << type; - return nullptr; -} - -void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - const T* ct_1 = reinterpret_cast(step->ct_1); - T* ct = reinterpret_cast(step->ct); - T* ht = reinterpret_cast(step->ht); - const T* wp = reinterpret_cast(step->wp); - T* checked = reinterpret_cast(step->checked); - const int d = attr->d; - const int d2 = d * 2; - const int d3 = d * 3; - auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); - auto vadd_d = KernelFuncs, CPUPlace>::Cache().At(d); - auto vadd_d2 = KernelFuncs, CPUPlace>::Cache().At(d2); - auto act_gate_d = getActFunc(attr->act_gate, d); - auto act_gate_d2 = getActFunc(attr->act_gate, d2); - auto act_gate_d3 = getActFunc(attr->act_gate, d3); - auto act_cand_d = getActFunc(attr->act_cand, d); - auto act_cell_d = getActFunc(attr->act_cell, d); - - if (attr->use_peephole) { - vmul_d(wp, ct_1, checked, d); - vmul_d(wp + d, ct_1, checked + d, d); - vadd_d2(checked, gates + d, gates + d, d2); - act_gate_d2(gates + d, gates + d, d2); - } else { - act_gate_d3(gates + d, gates + d, d3); - } - - // C_t = C_t-1 * fgated + cand_gated * igated - act_cand_d(gates, gates, d); - vmul_d(gates, gates + d, gates + d, d); - vmul_d(ct_1, gates + d2, gates + d2, d); - vadd_d(gates + d, gates + d2, ct, d); - - if (attr->use_peephole) { - // get ogated - vmul_d(wp + d2, ct, gates + d, d); - vadd_d(gates + d, gates + 
d3, gates + d3, d); - act_gate_d(gates + d3, gates + d3, d); - } - // H_t = act_cell(C_t) * ogated - act_cell_d(ct, gates + d2, d); - vmul_d(gates + d2, gates + d3, ht, d); -} - -void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - T* ct = reinterpret_cast(step->ct); - T* ht = reinterpret_cast(step->ht); - int d = attr->d; - int d2 = d * 2; - int d3 = d * 3; - auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); - auto vadd_d = KernelFuncs, CPUPlace>::Cache().At(d); - auto act_gate_d = getActFunc(attr->act_gate, d); - auto act_cand_d = getActFunc(attr->act_cand, d); - auto act_cell_d = getActFunc(attr->act_cell, d); - /* C_t = igated * cgated*/ - act_gate_d(gates + d, gates + d, d); - act_cand_d(gates, gates, d); - vmul_d(gates, gates + d, ct, d); - if (attr->use_peephole) { - // get outgated, put W_oc * C_t on igated - const T* wp = reinterpret_cast(step->wp); - vmul_d(wp + d2, ct, gates + d, d); - vadd_d(gates + d, gates + d3, gates + d3, d); - } - /* H_t = act_cell(C_t) * ogated */ - act_gate_d(gates + d3, gates + d3, d); - act_cell_d(ct, gates + d2, d); - vmul_d(gates + d2, gates + d3, ht, d); -} - -// compute h1 without h0 -void GRUH1(gru_t* step, const gru_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - T* ht = reinterpret_cast(step->ht); - int d = attr->d; - int d2 = d * 2; - auto act_gate = getActFunc(attr->act_gate, d); - auto act_cand = getActFunc(attr->act_cand, d); - auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); - act_gate(gates, gates, d); - act_cand(gates + d2, gates + d2, d); - vmul_d(gates, gates + d2, ht, d); -} - -// compute the first part of GRU: ht = act_gate(r) * ht_1 -void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { - // W: {W_update, W_reset; W_state} - T* gates = reinterpret_cast(step->gates); - T* ht = reinterpret_cast(step->ht); - const T* ht_1 = reinterpret_cast(step->ht_1); - auto act_gate = getActFunc(attr->act_gate, attr->d); - auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(attr->d); - act_gate(gates + attr->d, gates + attr->d, attr->d); - vmul_d(ht_1, gates + attr->d, ht, attr->d); -} - -// compute the second part of GRU: -// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 -void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - T* ht = reinterpret_cast(step->ht); - const T* ht_1 = reinterpret_cast(step->ht_1); - int d = attr->d; - auto act_gate = getActFunc(attr->act_gate, d); - auto act_cand = getActFunc(attr->act_cand, d); - T* y = gates + d * 2; - act_gate(gates, gates, d); - act_cand(y, y, d); - // out = zt*ht~ + (1-zt)*ht_1 - for (int i = 0; i < d; ++i) { - ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; - } -} - -// TODO(TJ): tuning me -bool VSigmoidKernel::CanBeUsed(const int& d) const { return true; } - -bool VTanhKernel::CanBeUsed(const int& d) const { return true; } - -bool SoftmaxKernel::CanBeUsed(const int& d) const { return true; } - -bool LSTMCtHtKernel::CanBeUsed(const lstm_attr_t& attr) const { return true; } - -bool LSTMC1H1Kernel::CanBeUsed(const lstm_attr_t& attr) const { return true; } - -bool GRUH1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } - -bool GRUHtPart1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } - -bool GRUHtPart2Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } - -} // namespace mix -} // namespace more -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace mix = paddle::lite::jit::more::mix; - -#define 
REGISTER_MORE_KERNEL(func) \ - REGISTER_JITKERNEL_MORE(k##func, mix, mix::func##Kernel) - -REGISTER_MORE_KERNEL(VSigmoid); -REGISTER_MORE_KERNEL(VTanh); -REGISTER_MORE_KERNEL(Softmax); -REGISTER_MORE_KERNEL(LSTMCtHt); -REGISTER_MORE_KERNEL(LSTMC1H1); -REGISTER_MORE_KERNEL(GRUH1); -REGISTER_MORE_KERNEL(GRUHtPart1); -REGISTER_MORE_KERNEL(GRUHtPart2); - -#undef REGISTER_MORE_KERNEL diff --git a/lite/backends/x86/jit/more/mix/mix.h b/lite/backends/x86/jit/more/mix/mix.h deleted file mode 100644 index 6ade67182c..0000000000 --- a/lite/backends/x86/jit/more/mix/mix.h +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once - -#include -#include "lite/backends/x86/jit/kernel_base.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace more { -namespace mix { -using T = float; - -void VSigmoid(const T* x, T* y, int n); -void VTanh(const T* x, T* y, int n); -void Softmax(const T* x, T* y, int n, int bs, int remain); - -void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr); -void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr); -void GRUH1(gru_t* step, const gru_attr_t* attr); -void GRUHtPart1(gru_t* step, const gru_attr_t* attr); -void GRUHtPart2(gru_t* step, const gru_attr_t* attr); - -#define DECLARE_MORE_KERNEL(name) \ - class name##Kernel : public KernelMore> { \ - public: \ - name##Kernel() { this->func = name; } \ - bool CanBeUsed(const typename name##Tuple::attr_type&) const override; \ - const char* ImplType() const override { return "Mixed"; } \ - } - -// XYN -DECLARE_MORE_KERNEL(VSigmoid); -DECLARE_MORE_KERNEL(VTanh); - -// XRN -DECLARE_MORE_KERNEL(Softmax); - -DECLARE_MORE_KERNEL(LSTMCtHt); -DECLARE_MORE_KERNEL(LSTMC1H1); - -DECLARE_MORE_KERNEL(GRUH1); -DECLARE_MORE_KERNEL(GRUHtPart1); -DECLARE_MORE_KERNEL(GRUHtPart2); - -#undef DECLARE_MORE_KERNEL - -} // namespace mix -} // namespace more -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/more/mkl/CMakeLists.txt b/lite/backends/x86/jit/more/mkl/CMakeLists.txt deleted file mode 100644 index 56f1a62ad4..0000000000 --- a/lite/backends/x86/jit/more/mkl/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ - -cc_library(jit_kernel_mkl SRCS mkl.cc DEPS jit_kernel_base dynload_mklml) -set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE) - -# use mkl kernels by name and type -USE_JITKERNEL_MORE(kMatMul, mkl) -USE_JITKERNEL_MORE(kVMul, mkl) -USE_JITKERNEL_MORE(kVAdd, mkl) -USE_JITKERNEL_MORE(kVScal, mkl) -USE_JITKERNEL_MORE(kStrideScal, mkl) -USE_JITKERNEL_MORE(kVExp, mkl) -USE_JITKERNEL_MORE(kVSquare, mkl) -USE_JITKERNEL_MORE(kVCopy, mkl) -USE_JITKERNEL_MORE(kVSigmoid, mkl) -USE_JITKERNEL_MORE(kVTanh, mkl) -USE_JITKERNEL_MORE(kSeqPool, mkl) -USE_JITKERNEL_MORE(kSoftmax, mkl) -USE_JITKERNEL_MORE(kEmbSeqPool, mkl) -USE_JITKERNEL_MORE(kSgd, mkl) -USE_JITKERNEL_MORE(kVBroadcast, mkl) diff --git a/lite/backends/x86/jit/more/mkl/mkl.cc 
b/lite/backends/x86/jit/more/mkl/mkl.cc deleted file mode 100644 index 7df930f6c0..0000000000 --- a/lite/backends/x86/jit/more/mkl/mkl.cc +++ /dev/null @@ -1,336 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/more/mkl/mkl.h" -#include "lite/backends/x86/cpu_info.h" -#include "lite/backends/x86/jit/refer/refer.h" -#include "lite/backends/x86/jit/registry.h" -#include "lite/backends/x86/mklml.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace more { -namespace mkl { - -template <> -void MatMul(const float* a, - const float* b, - float* c, - const matmul_attr_t* attr) { - lite::x86::cblas_sgemm(CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - attr->m, - attr->n, - attr->k, - 1.f, - a, - attr->k, - b, - attr->n, - 0.f, - c, - attr->n); -} - -template <> -void MatMul(const double* a, - const double* b, - double* c, - const matmul_attr_t* attr) { - lite::x86::cblas_dgemm(CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - attr->m, - attr->n, - attr->k, - 1.0, - a, - attr->k, - b, - attr->n, - 0.0, - c, - attr->n); -} - -template <> -void VMul(const float* x, const float* y, float* z, int n) { - lite::x86::vsMul(n, x, y, z); -} - -template <> -void VMul(const double* x, const double* y, double* z, int n) { - lite::x86::vdMul(n, x, y, z); -} - -template <> -void VAdd(const float* x, const float* y, float* z, int n) { - lite::x86::vsAdd(n, x, y, z); -} - -template <> -void VAdd(const double* x, const double* y, double* z, int n) { - lite::x86::vdAdd(n, x, y, z); -} - -template <> -void VScal(const float* a, const float* x, float* y, int n) { - if (x == y) { - lite::x86::cblas_sscal(n, *a, y, 1); - } else { - refer::VScal(a, x, y, n); - } -} - -template <> -void VScal(const double* a, const double* x, double* y, int n) { - if (x == y) { - lite::x86::cblas_dscal(n, *a, y, 1); - } else { - refer::VScal(a, x, y, n); - } -} - -template <> -void StrideScal( - const float* a, const float* x, float* y, int n, int stride) { - if (x == y) { - lite::x86::cblas_sscal(n / stride, *a, y, stride); - } else { - refer::StrideScal(a, x, y, n, stride); - } -} - -template <> -void StrideScal( - const double* a, const double* x, double* y, int n, int stride) { - if (x == y) { - lite::x86::cblas_dscal(n / stride, *a, y, stride); - } else { - refer::StrideScal(a, x, y, n, stride); - } -} - -template <> -void VExp(const float* x, float* y, int n) { - lite::x86::vsExp(n, x, y); -} - -template <> -void VExp(const double* x, double* y, int n) { - lite::x86::vdExp(n, x, y); -} - -template <> -void VSquare(const float* x, float* y, int n) { - lite::x86::vsSqr(n, x, y); -} - -template <> -void VSquare(const double* x, double* y, int n) { - lite::x86::vdSqr(n, x, y); -} - -template <> -void VCopy(const float* x, float* y, int n) { - lite::x86::cblas_scopy(n, x, 1, y, 1); -} - -template <> -void VCopy(const double* x, double* y, int n) { - lite::x86::cblas_dcopy(n, x, 1, y, 1); -} - -template 
<> -void VAXPY(float a, const float* x, float* y, int n) { - lite::x86::cblas_saxpy(n, a, x, 1, y, 1); -} - -template <> -void VAXPY(double a, const double* x, double* y, int n) { - lite::x86::cblas_daxpy(n, a, x, 1, y, 1); -} - -template <> -void ASum(const float* x, float* res, int n) { - res[0] = lite::x86::cblas_sasum(n, x, 1); -} - -template <> -void ASum(const double* x, double* res, int n) { - res[0] = lite::x86::cblas_dasum(n, x, 1); -} - -template <> -void StrideASum(const float* x, float* res, int n, int stride) { - res[0] = lite::x86::cblas_sasum(n / stride, x, stride); -} - -template <> -void StrideASum(const double* x, double* res, int n, int stride) { - res[0] = lite::x86::cblas_dasum(n / stride, x, stride); -} - -// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 -template <> -bool VMulKernel::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx512f) && d > 512; -} - -template <> -bool VAddKernel::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx) && d > 512; -} - -template <> -bool VScalKernel::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx512f) && d > 512; -} - -template <> -bool StrideScalKernel::CanBeUsed(const int& d) const { - return true; -} - -template <> -bool VExpKernel::CanBeUsed(const int& d) const { - return d > 7; -} - -template <> -bool VSquareKernel::CanBeUsed(const int& d) const { - return d > 7; -} - -template <> -bool VCopyKernel::CanBeUsed(const int& d) const { - return d > 15; -} - -template <> -bool VBroadcastKernel::CanBeUsed(const int64_t& d) const { - return d > 127; -} - -template <> -bool VBroadcastKernel::CanBeUsed(const int64_t& attr) const { - return true; -} - -template <> -bool VSigmoidKernel::CanBeUsed(const int& d) const { - return d > 7; -} - -template <> -bool VTanhKernel::CanBeUsed(const int& d) const { - return d > 7; -} - -template <> -bool SeqPoolKernel::CanBeUsed(const seq_pool_attr_t& attr) const { - return true; -} - -template <> -bool SeqPoolKernel::CanBeUsed(const seq_pool_attr_t& attr) const { - return true; -} - -template <> -bool EmbSeqPoolKernel::CanBeUsed(const emb_seq_pool_attr_t& attr) const { - return true; -} - -template <> -bool EmbSeqPoolKernel::CanBeUsed( - const emb_seq_pool_attr_t& attr) const { - return true; -} - -template <> -bool SgdKernel::CanBeUsed(const sgd_attr_t& attr) const { - return true; -} - -template <> -bool SgdKernel::CanBeUsed(const sgd_attr_t& attr) const { - return true; -} - -template <> -bool MatMulKernel::CanBeUsed(const matmul_attr_t& attr) const { - return x86::MayIUse(x86::avx); -} - -template <> -bool MatMulKernel::CanBeUsed(const matmul_attr_t& attr) const { - return true; -} - -template <> -bool SoftmaxKernel::CanBeUsed(const int& d) const { - // tuned on avx2 - return x86::MayIUse(x86::avx) && d < 60; -} - -#define AWALYS_USE_ME_WITH_DOUBLE(func) \ - template <> \ - bool func##Kernel::CanBeUsed(const int& d) const { \ - return true; \ - } - -AWALYS_USE_ME_WITH_DOUBLE(VMul); -AWALYS_USE_ME_WITH_DOUBLE(VAdd); -AWALYS_USE_ME_WITH_DOUBLE(VScal); -AWALYS_USE_ME_WITH_DOUBLE(StrideScal); -AWALYS_USE_ME_WITH_DOUBLE(VExp); -AWALYS_USE_ME_WITH_DOUBLE(VSigmoid); -AWALYS_USE_ME_WITH_DOUBLE(VTanh); -AWALYS_USE_ME_WITH_DOUBLE(VSquare); -AWALYS_USE_ME_WITH_DOUBLE(VCopy); -AWALYS_USE_ME_WITH_DOUBLE(Softmax); - -#undef AWALYS_USE_ME_WITH_DOUBLE -} // namespace mkl -} // namespace more -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace mkl = paddle::lite::jit::more::mkl; - -#define REGISTER_MKL_KERNEL(func) \ - REGISTER_JITKERNEL_MORE( 
\ - k##func, mkl, mkl::func##Kernel, mkl::func##Kernel) - -REGISTER_MKL_KERNEL(MatMul); -REGISTER_MKL_KERNEL(VMul); -REGISTER_MKL_KERNEL(VAdd); -REGISTER_MKL_KERNEL(VScal); -REGISTER_MKL_KERNEL(StrideScal); -REGISTER_MKL_KERNEL(VExp); -REGISTER_MKL_KERNEL(VSquare); -REGISTER_MKL_KERNEL(VCopy); -REGISTER_MKL_KERNEL(VBroadcast); -REGISTER_MKL_KERNEL(VSigmoid); -REGISTER_MKL_KERNEL(VTanh); -REGISTER_MKL_KERNEL(SeqPool); -REGISTER_MKL_KERNEL(EmbSeqPool); -REGISTER_MKL_KERNEL(Softmax); -REGISTER_MKL_KERNEL(Sgd); - -#undef REGISTER_MKL_KERNEL diff --git a/lite/backends/x86/jit/more/mkl/mkl.h b/lite/backends/x86/jit/more/mkl/mkl.h deleted file mode 100644 index 8b713e537e..0000000000 --- a/lite/backends/x86/jit/more/mkl/mkl.h +++ /dev/null @@ -1,244 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once - -#include -#include -#include -#include "lite/backends/x86/jit/kernel_base.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace more { -namespace mkl { - -template -void MatMul(const T* a, const T* b, T* c, const matmul_attr_t* attr); - -template -void VMul(const T* x, const T* y, T* z, int n); - -template -void VAdd(const T* x, const T* y, T* z, int n); - -template -void VScal(const T* a, const T* x, T* y, int n); - -template -void VExp(const T* x, T* y, int n); - -template -void VSquare(const T* x, T* y, int n); - -template -void VCopy(const T* x, T* y, int n); - -template -void VAXPY(T a, const T* x, T* y, int n); - -template -void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) { - for (int64_t h = 0; h < y_h; ++h) { - VCopy(x, y + h * x_len, x_len); - } -} - -template -void VSigmoid(const T* x, T* y, int n) { - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - y[i] = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); - y[i] = static_cast(0) - y[i]; - } - VExp(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(1) / (static_cast(1) + y[i]); - } -} - -template -void VTanh(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * x[i]; - } - VSigmoid(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * y[i] - static_cast(1); - } -} - -template -void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { - VCopy(x, y, attr->w); - for (int h = 1; h != attr->h; ++h) { - VAXPY(static_cast(1), x + h * attr->w, y, attr->w); - } - if (attr->type == SeqPoolType::kAvg || attr->type == SeqPoolType::kSqrt) { - T scalar = static_cast(1); - if (attr->type == SeqPoolType::kAvg) { - scalar = scalar / static_cast(attr->h); - } else { - scalar = scalar / std::sqrt(static_cast(attr->h)); - } - VScal(&scalar, y, y, attr->w); - } -} - -template -void EmbSeqPool(const T* table, - const int64_t* idx, - T* out, - const emb_seq_pool_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); - auto check_idx_value_valid = [&](int64_t i) { - PADDLE_ENFORCE_LT( - idx[i], attr->table_height, "idx value: %d, i: %d", idx[i], i); - PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i); - }; - - for (int64_t w = 0; w != attr->index_width; ++w) { - check_idx_value_valid(w); - VCopy(table + idx[w] * attr->table_width, - out + w * attr->table_width, - attr->table_width); - } - - for (int64_t h = 1; h < attr->index_height; ++h) { - for (int64_t w = 0; w < attr->index_width; ++w) { - int64_t i = h * attr->index_width + w; - check_idx_value_valid(i); - VAXPY(static_cast(1), - table + idx[i] * attr->table_width, - out + w * attr->table_width, - attr->table_width); - } - } -} - -template -void ASum(const T* x, T* res, int n); - -template -void StrideASum(const T* x, T* res, int n, int stride); - -template -void StrideScal(const T* a, const T* x, T* y, int n, int stride); - -// remain is the product of dimension shapes after the axis dimension -template -void Softmax(const T* x, T* y, int n, int bs, int remain = 1) { - std::vector entities(bs); - for (int i = 0; i < bs; ++i) { - entities[i] = x[i * n]; - for (int c = 1; c < n; ++c) { - entities[i] = x[i * n + c] > entities[i] ? 
x[i * n + c] : entities[i]; - } - for (int c = 0; c < n; ++c) { - y[i * n + c] = x[i * n + c] - entities[i]; - } - } - VExp(y, y, n * bs); - for (int i = 0; i < bs; ++i) { - T sum; - if (remain == 1) { - ASum(&y[i * n], &sum, n); - sum = static_cast(1) / sum; - VScal(&sum, &y[i * n], &y[i * n], n); - } else { - for (int j = 0; j < remain; ++j) { - StrideASum(&y[i * n + j], &sum, n, remain); - sum = static_cast(1) / sum; - StrideScal(&sum, &y[i * n + j], &y[i * n + j], n, remain); - } - } - } -} - -template -void Sgd(const T* lr, - const T* param, - const T* grad, - const int64_t* rows, - T* out, - const sgd_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); - PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); - T scalar = -lr[0]; - int width = attr->grad_width; - if (out == param) { - for (int64_t i = 0; i < attr->selected_rows_size; ++i) { - auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); - VAXPY(scalar, grad + i * width, out + h_idx * width, width); - } - } else { - for (int64_t i = 0; i < attr->selected_rows_size; ++i) { - auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); - VScal(&scalar, grad + i * width, out + h_idx * width, width); - VAdd(param + h_idx * width, - out + h_idx * width, - out + h_idx * width, - width); - } - } -} - -#define DECLARE_MKL_KERNEL(name) \ - template \ - class name##Kernel : public KernelMore> { \ - public: \ - name##Kernel() { this->func = name; } \ - bool CanBeUsed(const typename name##Tuple::attr_type&) const override; \ - const char* ImplType() const override { return "MKL"; } \ - } - -// ABCMNK -DECLARE_MKL_KERNEL(MatMul); - -// XYZN -DECLARE_MKL_KERNEL(VMul); -DECLARE_MKL_KERNEL(VAdd); - -// AXYN -DECLARE_MKL_KERNEL(VScal); -DECLARE_MKL_KERNEL(StrideScal); - -// XYN -DECLARE_MKL_KERNEL(VExp); -DECLARE_MKL_KERNEL(VSigmoid); -DECLARE_MKL_KERNEL(VTanh); -DECLARE_MKL_KERNEL(VSquare); -DECLARE_MKL_KERNEL(VCopy); - -// others -DECLARE_MKL_KERNEL(SeqPool); -DECLARE_MKL_KERNEL(EmbSeqPool); -DECLARE_MKL_KERNEL(Softmax); -DECLARE_MKL_KERNEL(Sgd); -DECLARE_MKL_KERNEL(VBroadcast); - -#undef DECLARE_MKL_KERNEL - -} // namespace mkl -} // namespace more -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/refer/CMakeLists.txt b/lite/backends/x86/jit/refer/CMakeLists.txt deleted file mode 100644 index 7133f59662..0000000000 --- a/lite/backends/x86/jit/refer/CMakeLists.txt +++ /dev/null @@ -1,40 +0,0 @@ - -cc_library(jit_kernel_refer SRCS refer.cc DEPS jit_kernel_base) -set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_refer PARENT_SCOPE) - -function(USE_JITKERNEL_REFER TARGET) - file(APPEND ${jit_file} "USE_JITKERNEL_REFER(${TARGET});\n") -endfunction() - -# use refer kernel by name -USE_JITKERNEL_REFER(kVMul) -USE_JITKERNEL_REFER(kVAdd) -USE_JITKERNEL_REFER(kVAddRelu) -USE_JITKERNEL_REFER(kVSub) -USE_JITKERNEL_REFER(kVScal) -USE_JITKERNEL_REFER(kStrideScal) -USE_JITKERNEL_REFER(kVAddBias) -USE_JITKERNEL_REFER(kVCopy) -USE_JITKERNEL_REFER(kVRelu) -USE_JITKERNEL_REFER(kVIdentity) -USE_JITKERNEL_REFER(kVExp) -USE_JITKERNEL_REFER(kVSigmoid) -USE_JITKERNEL_REFER(kVTanh) -USE_JITKERNEL_REFER(kLSTMCtHt) -USE_JITKERNEL_REFER(kLSTMC1H1) -USE_JITKERNEL_REFER(kGRUH1) -USE_JITKERNEL_REFER(kGRUHtPart1) -USE_JITKERNEL_REFER(kGRUHtPart2) -USE_JITKERNEL_REFER(kCRFDecoding) -USE_JITKERNEL_REFER(kLayerNorm) -USE_JITKERNEL_REFER(kNCHW16CMulNC) -USE_JITKERNEL_REFER(kSeqPool) -USE_JITKERNEL_REFER(kMatMul) 
-USE_JITKERNEL_REFER(kVSquare) -USE_JITKERNEL_REFER(kHSum) -USE_JITKERNEL_REFER(kHMax) -USE_JITKERNEL_REFER(kStrideASum) -USE_JITKERNEL_REFER(kSoftmax) -USE_JITKERNEL_REFER(kEmbSeqPool) -USE_JITKERNEL_REFER(kSgd) -USE_JITKERNEL_REFER(kVBroadcast) diff --git a/lite/backends/x86/jit/refer/refer.cc b/lite/backends/x86/jit/refer/refer.cc deleted file mode 100644 index e1b1240c5d..0000000000 --- a/lite/backends/x86/jit/refer/refer.cc +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/refer/refer.h" -#include "lite/backends/x86/jit/registry.h" - -namespace refer = paddle::lite::jit::refer; - -#define REGISTER_REFER_KERNEL(func) \ - REGISTER_JITKERNEL_REFER( \ - k##func, refer::func##Kernel, refer::func##Kernel) - -REGISTER_REFER_KERNEL(VMul); -REGISTER_REFER_KERNEL(VAdd); -REGISTER_REFER_KERNEL(VAddRelu); -REGISTER_REFER_KERNEL(VSub); - -REGISTER_REFER_KERNEL(VScal); -REGISTER_REFER_KERNEL(StrideScal); -REGISTER_REFER_KERNEL(VAddBias); - -REGISTER_REFER_KERNEL(VRelu); -REGISTER_REFER_KERNEL(VCopy); -REGISTER_REFER_KERNEL(VIdentity); -REGISTER_REFER_KERNEL(VSquare); -REGISTER_REFER_KERNEL(VExp); -REGISTER_REFER_KERNEL(VSigmoid); -REGISTER_REFER_KERNEL(VTanh); - -REGISTER_REFER_KERNEL(LSTMCtHt); -REGISTER_REFER_KERNEL(LSTMC1H1); - -REGISTER_REFER_KERNEL(GRUH1); -REGISTER_REFER_KERNEL(GRUHtPart1); -REGISTER_REFER_KERNEL(GRUHtPart2); - -REGISTER_REFER_KERNEL(CRFDecoding); -REGISTER_REFER_KERNEL(LayerNorm); -REGISTER_REFER_KERNEL(NCHW16CMulNC); -REGISTER_REFER_KERNEL(SeqPool); -REGISTER_REFER_KERNEL(MatMul); -REGISTER_REFER_KERNEL(HMax); -REGISTER_REFER_KERNEL(HSum); -REGISTER_REFER_KERNEL(StrideASum); -REGISTER_REFER_KERNEL(Softmax); -REGISTER_REFER_KERNEL(EmbSeqPool); -REGISTER_REFER_KERNEL(Sgd); -REGISTER_REFER_KERNEL(VBroadcast); - -#undef REGISTER_REFER_KERNEL diff --git a/lite/backends/x86/jit/refer/refer.h b/lite/backends/x86/jit/refer/refer.h deleted file mode 100644 index 119ec7469e..0000000000 --- a/lite/backends/x86/jit/refer/refer.h +++ /dev/null @@ -1,603 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include "lite/backends/x86/jit/helper.h" -#include "lite/backends/x86/jit/kernel_base.h" -#include "lite/backends/x86/jit/macro.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace refer { - -// Refer code only focus on correctness -template -void VMul(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -} - -template -void VAdd(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - } -} - -template -void VAddRelu(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - z[i] = z[i] > 0 ? z[i] : 0; - } -} - -template -void VSub(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] - y[i]; - } -} - -template -void VScal(const T* a, const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = a[0] * x[i]; - } -} - -template -void VAddBias(const T* a, const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = a[0] + x[i]; - } -} - -template -void VCopy(const T* x, T* y, int n) { - std::memcpy(y, x, n * sizeof(T)); -} - -// x shape: (x_len) -// y shape: (h, x_len) -template -void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) { - for (int64_t h = 0; h < y_h; ++h) { - VCopy(x, y + h * x_len, x_len); - } -} - -template -void VRelu(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] > 0 ? x[i] : 0; - } -} - -template -inline void VIdentity(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = x[i]; - } -} - -template -inline void VSquare(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] * x[i]; - } -} - -template -void VExp(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -} - -template -void VSigmoid(const T* x, T* y, int n) { - // y = 1 / (1 + e^-x) - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - T tmp = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); - y[i] = static_cast(1) / (static_cast(1) + std::exp(-tmp)); - } -} - -template -void VTanh(const T* x, T* y, int n) { - // y = 2 * sigmoid(2x) - 1 - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * x[i]; - } - VSigmoid(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * y[i] - static_cast(1); - } -} - -template -void (*getActFunc(KernelType type))(const T*, T*, int) { // NOLINT - if (type == kVSigmoid) { - return VSigmoid; - } else if (type == kVRelu) { - return VRelu; - } else if (type == kVTanh) { - return VTanh; - } else if (type == kVIdentity) { - return VIdentity; - } - LOG(FATAL) << "Not support type: " << type; - return nullptr; -} - -// TODO(TJ): add refer gemm and make LSTM kernels combine as same GRU kernels - -// compute ct and ht -template -void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - const T* ct_1 = reinterpret_cast(step->ct_1); - T* ct = reinterpret_cast(step->ct); - T* ht = reinterpret_cast(step->ht); - const T* wp = reinterpret_cast(step->wp); - T* checked = reinterpret_cast(step->checked); - auto act_gate = getActFunc(attr->act_gate); - auto act_cand = getActFunc(attr->act_cand); - auto act_cell = getActFunc(attr->act_cell); - int d = attr->d; - int d2 = d * 2; - int d3 = d * 3; - // gates: W_ch, W_ih, W_fh, W_oh - if (attr->use_peephole) { - VMul(wp, ct_1, checked, d); - VMul(wp + d, ct_1, checked + d, d); - VAdd(checked, gates + d, gates + d, d2); - act_gate(gates + d, gates + d, d2); - } else { - act_gate(gates + d, gates + d, d3); - } - - // C_t = C_t-1 * fgated + cand_gated * igated - act_cand(gates, gates, d); - VMul(gates, gates + d, gates + d, d); - VMul(ct_1, gates + d2, gates + d2, d); - VAdd(gates + d, gates + d2, ct, d); - - if (attr->use_peephole) { - // get ogated - VMul(wp + d2, ct, gates + d, d); - VAdd(gates + d, gates + d3, gates + d3, d); - act_gate(gates + d3, gates + d3, d); - } - // H_t = act_cell(C_t) * ogated - act_cell(ct, gates + d2, d); - VMul(gates + d2, gates + d3, ht, d); -} - -// compute c1 and h1 without c0 or h0 -template -void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - T* ct = reinterpret_cast(step->ct); - T* ht = reinterpret_cast(step->ht); - auto act_gate = getActFunc(attr->act_gate); - auto act_cand = getActFunc(attr->act_cand); - auto act_cell = getActFunc(attr->act_cell); - int d = attr->d; - int d2 = d * 2; - int d3 = d * 3; - /* C_t = igated * cgated*/ - act_gate(gates + d, gates + d, d); - act_cand(gates, gates, d); - VMul(gates, gates + d, ct, d); - if (attr->use_peephole) { - // get outgated, put W_oc * C_t on igated - const T* wp = reinterpret_cast(step->wp); - VMul(wp + d2, ct, gates + d, d); - VAdd(gates + d, gates + d3, gates + d3, d); - } - /* H_t = act_cell(C_t) * ogated */ - act_gate(gates + d3, gates + d3, d); - act_cell(ct, gates + d2, d); - VMul(gates + d2, gates + d3, ht, d); -} - -// compute h1 without h0 -template -void GRUH1(gru_t* step, const gru_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - T* ht = reinterpret_cast(step->ht); - auto act_gate = getActFunc(attr->act_gate); - auto act_cand = getActFunc(attr->act_cand); - int d = attr->d; - int d2 = d * 2; - act_gate(gates, gates, d); - act_cand(gates + d2, gates + d2, d); - VMul(gates, gates + d2, ht, d); -} - -// compute the first part of GRU: ht = act_gate(r) * ht_1 -template -void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { - // W: {W_update, W_reset; W_state} - T* gates = 
reinterpret_cast(step->gates); - T* ht = reinterpret_cast(step->ht); - const T* ht_1 = reinterpret_cast(step->ht_1); - auto act_gate = getActFunc(attr->act_gate); - act_gate(gates + attr->d, gates + attr->d, attr->d); - VMul(ht_1, gates + attr->d, ht, attr->d); -} - -// compute the second part of GRU: -// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 -template -void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - T* ht = reinterpret_cast(step->ht); - const T* ht_1 = reinterpret_cast(step->ht_1); - auto act_gate = getActFunc(attr->act_gate); - auto act_cand = getActFunc(attr->act_cand); - int d = attr->d; - T* y = gates + d * 2; - act_gate(gates, gates, d); - act_cand(y, y, d); - // out = zt*ht~ + (1-zt)*ht_1 - for (int i = 0; i < d; ++i) { - ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; - } -} - -template -void CRFDecoding(const int seq_len, - const T* x, - const T* w, - T* alpha, - int* track, - int right) { - constexpr int state_trans_base_idx = 2; - for (int i = 0; i < right; ++i) { - alpha[i] = w[i] + x[i]; - } - for (int k = 1; k < seq_len; ++k) { - for (int i = 0; i < right; ++i) { - T max_score = -std::numeric_limits::max(); - int max_j = 0; - for (int j = 0; j < right; ++j) { - T score = alpha[(k - 1) * right + j] + - w[(j + state_trans_base_idx) * right + i]; - if (score > max_score) { - max_score = score; - max_j = j; - } - } - alpha[k * right + i] = max_score + x[k * right + i]; - track[k * right + i] = max_j; - } - } -} - -template -void LayerNorm(T* x, - T* out, - T* mean, - T* var, - const T* scale, - const T* bias, - int height, - const float epsilon, - int right) { - // get mean - for (int i = 0; i < height; i++) { - T sum = 0.0; - int offset = i * right; - for (int j = 0; j < right; j++) { - sum += x[offset + j]; - } - mean[i] = sum / right; - } - - // get variance - for (int i = 0; i < height; i++) { - T sum = 0.0; - int offset = i * right; - for (int j = 0; j < right; j++) { - sum += (x[offset + j] - mean[i]) * (x[offset + j] - mean[i]); - } - var[i] = sum / right; - } - - for (int i = 0; i < height; i++) { - int offset = i * right; - T sqrt_var = std::sqrt(var[i] + (T)epsilon); - for (int j = 0; j < right; j++) { - out[offset + j] = (x[offset + j] - mean[i]) / sqrt_var; - } - } - if (scale) { - for (int i = 0; i < height; i++) { - int offset = i * right; - for (int j = 0; j < right; j++) { - out[offset + j] *= scale[j]; - } - } - } - - if (bias) { - for (int i = 0; i < height; i++) { - int offset = i * right; - for (int j = 0; j < right; j++) { - out[offset + j] += bias[j]; - } - } - } -} - -template -void NCHW16CMulNC(const T* x, const T* y, T* z, int height, int width) { - int offset = 0; - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - for (int i = 0; i < 16; ++i) { - z[i + offset] = y[i] * x[i + offset]; - } - offset += ZMM_FLOAT_BLOCK; - } - } -} - -template -void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { - for (int w = 0; w < attr->w; ++w) { - const T* src = x + w; - T* dst = y + w; - *dst = static_cast(0); - for (int h = 0; h < attr->h; ++h) { - *dst = *dst + *src; - src += attr->w; - } - } - if (attr->type == SeqPoolType::kAvg || attr->type == SeqPoolType::kSqrt) { - T scalar = static_cast(1); - if (attr->type == SeqPoolType::kAvg) { - scalar = scalar / static_cast(attr->h); - } else { - scalar = scalar / std::sqrt(static_cast(attr->h)); - } - VScal(&scalar, y, y, attr->w); - } -} - -// A(M,K) * B(K,N) = C(M,N) -template -void MatMul(const T* A, 
const T* B, T* C, const matmul_attr_t* attr) { - int M = attr->m; - int N = attr->n; - int K = attr->k; - for (int m = 0; m < M; ++m) { - const T* pa = A + m * K; - T* pc = C + m * N; - for (int n = 0; n < N; ++n) { - const T* pb = B + n; - pc[n] = pa[0] * pb[0]; - for (int k = 1; k < K; ++k) { - pc[n] += pa[k] * pb[k * N]; - } - } - } -} - -template -void HMax(const T* x, T* res, int n) { - res[0] = x[0]; - for (int i = 1; i < n; ++i) { - res[0] = res[0] < x[i] ? x[i] : res[0]; - } -} - -template -void HSum(const T* x, T* res, int n) { - res[0] = x[0]; - for (int i = 1; i < n; ++i) { - res[0] += x[i]; - } -} - -template -void StrideASum(const T* x, T* res, int n, int stride) { - res[0] = x[0]; - for (int i = stride; i < n; i += stride) { - res[0] += std::abs(x[i]); - } -} - -template -void StrideScal(const T* a, const T* x, T* y, int n, int stride) { - for (int i = 0; i < n; ++i) { - if (i % stride == 0) { - y[i] = x[i] * a[0]; - } else { - y[i] = x[i]; - } - } -} - -// y = e^(x - max(x)) -// y = y / sum(y) -// remain is the product of dimension shapes after the axis dimension -template -void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) { - for (int i = 0; i < bs; ++i) { - T scalar; - HMax(x, &scalar, n); - scalar = static_cast(0) - scalar; - VAddBias(&scalar, x, y, n); // x - max - VExp(y, y, n); - if (remain == 1) { - HSum(y, &scalar, n); - scalar = static_cast(1) / scalar; - VScal(&scalar, y, y, n); - } else { - for (int j = 0; j < remain; j++) { - StrideASum(&y[j], &scalar, n, remain); - scalar = static_cast(1) / scalar; - StrideScal(&scalar, &y[j], &y[j], n, remain); - } - } - x += n; - y += n; - } -} - -// embedding seq pool -// table is a matrix with (tbl_h, tbl_w) -// idx is a matrix with (idx_h, idx_w) -// output is a vector with length tbl_w * idx_w -template -void EmbSeqPool(const T* table, - const int64_t* idx, - T* out, - const emb_seq_pool_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); - - auto check_idx_value_valid = [&](int64_t i) { - PADDLE_ENFORCE_LT( - idx[i], attr->table_height, "idx value: %d, i: %d", idx[i], i); - PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i); - }; - - for (int64_t w = 0; w != attr->index_width; ++w) { - check_idx_value_valid(w); - std::memcpy(out + w * attr->table_width, - table + idx[w] * attr->table_width, - attr->table_width * sizeof(T)); - } - - for (int64_t h = 1; h < attr->index_height; ++h) { - for (int64_t w = 0; w < attr->index_width; ++w) { - int64_t i = h * attr->index_width + w; - check_idx_value_valid(i); - VAdd(table + idx[i] * attr->table_width, - out + w * attr->table_width, - out + w * attr->table_width, - attr->table_width); - } - } -} - -// SGD algorithm: -// lr is pointor of learning rate scalar -// param is an input matrix with (param_h, param_w) -// grad is an input matrix with (grad_h, grad_w), here grad_w == param_w -// selected_rows is a vectot with size selected_rows_size( <= grad_h ) -// out is an output matrix with (param_h, param_w) -// -// support both regular and sparse grad -// regular SGD: out[:] = param[:] - lr[0] * grad[:]; -// sparse SGD: out[rows[i]][:] = param[rows[i]][:] - lr[0] * grad[i][:] -// -// Note: when use sparse SGD, and if out != param, -// the out rows which are not selected have not beed changed, which maybe empty -template -void Sgd(const T* lr, - const T* param, - const T* grad, - const int64_t* rows, - T* out, - const lite::jit::sgd_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); - 
PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); - for (int64_t i = 0; i < attr->selected_rows_size; ++i) { - auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); - for (int64_t j = 0; j < attr->grad_width; ++j) { - out[h_idx * attr->grad_width + j] = - param[h_idx * attr->grad_width + j] - - lr[0] * grad[i * attr->grad_width + j]; - } - } -} - -#define DECLARE_REFER_KERNEL(name) \ - template \ - class name##Kernel : public lite::jit::ReferKernel> { \ - public: \ - name##Kernel() { this->func = name; } \ - } - -// const T* x, const T* y, T* z, int n -DECLARE_REFER_KERNEL(VMul); -DECLARE_REFER_KERNEL(VAdd); -DECLARE_REFER_KERNEL(VAddRelu); -DECLARE_REFER_KERNEL(VSub); - -// const T* a, const T* x, T* y, int n -DECLARE_REFER_KERNEL(VScal); -DECLARE_REFER_KERNEL(VAddBias); - -// const T* a, const T* x, T* y, int n, int stride -DECLARE_REFER_KERNEL(StrideScal); - -// const T* x, T* y, int n -DECLARE_REFER_KERNEL(VRelu); -DECLARE_REFER_KERNEL(VIdentity); -DECLARE_REFER_KERNEL(VExp); -DECLARE_REFER_KERNEL(VSigmoid); -DECLARE_REFER_KERNEL(VTanh); -DECLARE_REFER_KERNEL(VSquare); -DECLARE_REFER_KERNEL(VCopy); - -// lstm_t*, const lstm_attr_t* -DECLARE_REFER_KERNEL(LSTMCtHt); -DECLARE_REFER_KERNEL(LSTMC1H1); - -// gru_t*, const gru_attr_t* -DECLARE_REFER_KERNEL(GRUH1); -DECLARE_REFER_KERNEL(GRUHtPart1); -DECLARE_REFER_KERNEL(GRUHtPart2); - -DECLARE_REFER_KERNEL(HMax); -DECLARE_REFER_KERNEL(HSum); - -DECLARE_REFER_KERNEL(StrideASum); - -// others -DECLARE_REFER_KERNEL(CRFDecoding); -DECLARE_REFER_KERNEL(LayerNorm); -DECLARE_REFER_KERNEL(NCHW16CMulNC); -DECLARE_REFER_KERNEL(SeqPool); -DECLARE_REFER_KERNEL(MatMul); -DECLARE_REFER_KERNEL(Softmax); -DECLARE_REFER_KERNEL(EmbSeqPool); -DECLARE_REFER_KERNEL(Sgd); -DECLARE_REFER_KERNEL(VBroadcast); - -#undef DECLARE_REFER_KERNEL - -} // namespace refer -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/registry.h b/lite/backends/x86/jit/registry.h deleted file mode 100644 index 7613a8dd43..0000000000 --- a/lite/backends/x86/jit/registry.h +++ /dev/null @@ -1,178 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once - -#include -#include -#include -#include // for std::move -#include "lite/backends/x86/cpu_info.h" -#include "lite/backends/x86/jit/kernel_base.h" -#include "lite/backends/x86/jit/kernel_pool.h" -#include "lite/backends/x86/legacy_place.h" -#include "lite/utils/macros.h" - -namespace paddle { -namespace lite { -namespace jit { - -// make_unique is supported since c++14 -template -inline std::unique_ptr make_unique(Args&&... 
args) { - static_assert(!std::is_array::value, "T must not be array"); - return std::unique_ptr(new T(std::forward(args)...)); -} - -template -struct JitKernelRegistrarFunctor; - -template -struct JitKernelRegistrarFunctor { - void operator()(KernelType kt) const {} -}; - -template -struct JitKernelRegistrarFunctor { - using KERNEL_IMPL_TYPE = - typename std::tuple_element>::type; - - void operator()(KernelType kt) const { - KernelKey kkey(kt, PlaceType()); - Pool::Instance().Insert(kkey, - std::move(make_unique())); - constexpr auto size = std::tuple_size>::value; - JitKernelRegistrarFunctor - func; - func(kt); - } -}; - -template -class JitKernelRegistrar { - public: - explicit JitKernelRegistrar(KernelType kt) { - JitKernelRegistrarFunctor func; - func(kt); - } - void Touch() {} -}; - -#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ - struct __test_global_namespace_##uniq_name##__ {}; \ - static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ - __test_global_namespace_##uniq_name##__>::value, \ - msg) - -// Refer always on CPUPlace -#define REGISTER_JITKERNEL_REFER(kernel_type, ...) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_refer_CPUPlace, \ - "REGISTER_KERNEL_REFER must be called in global namespace"); \ - static ::paddle::lite::jit::JitKernelRegistrar< \ - ::paddle::lite::jit::ReferKernelPool, \ - ::paddle::lite::fluid::CPUPlace, \ - __VA_ARGS__> \ - __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_( \ - ::paddle::lite::jit::KernelType::kernel_type); \ - int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \ - __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_.Touch(); \ - return 0; \ - } - -// kernel_type: should be in paddle::lite::jit::KernelType -// place_type: should be one of CPUPlace and GPUPlace in paddle::platform -#define REGISTER_KERNEL_MORE(kernel_type, impl_type, place_type, ...) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_##impl_type##_##place_type, \ - "REGISTER_KERNEL_MORE must be called in global namespace"); \ - extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static int __assert_##kernel_type##_##impl_type##_##place_type##_has_refer_ \ - UNUSED = TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static ::paddle::lite::jit::JitKernelRegistrar< \ - ::paddle::lite::jit::KernelPool, \ - ::paddle::lite::fluid::place_type, \ - __VA_ARGS__> \ - __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_( \ - ::paddle::lite::jit::KernelType::kernel_type); \ - int TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() { \ - __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_ \ - .Touch(); \ - return 0; \ - } - -#define REGISTER_JITKERNEL_MORE(kernel_type, impl_type, ...) \ - REGISTER_KERNEL_MORE(kernel_type, impl_type, CPUPlace, __VA_ARGS__) - -#define REGISTER_GPUKERNEL_MORE(kernel_type, impl_type, ...) \ - REGISTER_KERNEL_MORE(kernel_type, impl_type, GPUPlace, __VA_ARGS__) - -#define REGISTER_JITKERNEL_GEN(kernel_type, ...) 
\ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_gen_##kernel_type##_CPUPlace_, \ - "REGISTER_JITKERNEL_GEN must be called in global namespace"); \ - extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static int __assert_gen_##kernel_type##_has_refer_ UNUSED = \ - TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static ::paddle::lite::jit::JitKernelRegistrar< \ - ::paddle::lite::jit::JitCodeCreatorPool, \ - ::paddle::lite::fluid::CPUPlace, \ - __VA_ARGS__> \ - __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_( \ - ::paddle::lite::jit::KernelType::kernel_type); \ - int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() { \ - __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_.Touch(); \ - return 0; \ - } - -#define USE_JITKERNEL_GEN(kernel_type) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_gen_##kernel_type##_CPUPlace_, \ - "USE_JITKERNEL_GEN must be called in global namespace"); \ - extern int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_(); \ - static int use_jitkernel_gen_##kernel_type##_CPUPlace_ UNUSED = \ - TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() - -#define USE_JITKERNEL_REFER(kernel_type) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_refer_CPUPlace_, \ - "USE_JITKERNEL_REFER must be called in global namespace"); \ - extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static int use_jitkernel_##kernel_type##_refer_CPUPlace_ UNUSED = \ - TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() - -#define USE_KERNEL_MORE(kernel_type, impl_type, place_type) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_##impl_type##_##place_type##_, \ - "USE_JITKERNEL_MORE must be called in global namespace"); \ - extern int \ - TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_(); \ - static int use_jitkernel_##kernel_type##_##impl_type##_##place_type##_ \ - UNUSED = \ - TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() - -#define USE_JITKERNEL_MORE(kernel_type, impl_type) \ - USE_KERNEL_MORE(kernel_type, impl_type, CPUPlace) - -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/test.cc b/lite/backends/x86/jit/test.cc deleted file mode 100644 index aafcad579f..0000000000 --- a/lite/backends/x86/jit/test.cc +++ /dev/null @@ -1,1447 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
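Editorial aside before the deleted test file below (not part of the patch): the registration macros above read more easily next to a typical invocation. The following sketch is illustrative only; kVMul and refer::VMulKernel match names visible elsewhere in this diff, while the mkl lines are an assumed example of how an extra impl_type would be wired in.

// In the refer implementation's .cc, invoked from the global namespace:
REGISTER_JITKERNEL_REFER(kVMul,
                         refer::VMulKernel<float>,
                         refer::VMulKernel<double>);
// An optional faster implementation of the same KernelType; "mkl" is an
// assumed impl_type here:
REGISTER_JITKERNEL_MORE(kVMul, mkl,
                        mkl::VMulKernel<float>,
                        mkl::VMulKernel<double>);
// Any object file that needs the kernel forces the registrars to link in:
USE_JITKERNEL_REFER(kVMul);
USE_JITKERNEL_MORE(kVMul, mkl);

Note the "more" and "gen" macros deliberately touch the refer registrar first, so registering a non-refer kernel without its refer fallback fails at link time.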
*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "lite/backends/x86/cpu_info.h" -#include "lite/backends/x86/jit/kernels.h" -#include "lite/backends/x86/legacy_place.h" - -DEFINE_double(acc, 1e-5, "Test accuracy threshold."); - -template -void RandomVec(const int n, - T* a, - const T lower = static_cast(-2.f), - const T upper = static_cast(2.f)) { - static unsigned int seed = 100; - std::mt19937 rng(seed++); - std::uniform_real_distribution uniform_dist(0, 1); - for (int i = 0; i < n; ++i) { - a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); - } -} - -template -void ExpectEQ(const T* target, const T* refer, size_t n) { - if (std::is_floating_point::value) { - for (size_t i = 0; i < n; ++i) { - EXPECT_NEAR(target[i], refer[i], FLAGS_acc) << " at index : " << i; - } - } else { - for (size_t i = 0; i < n; ++i) { - EXPECT_EQ(target[i], refer[i]) << " at index : " << i; - } - } -} - -std::vector TestSizes() { - std::vector s; - for (int i = 1; i < 32; ++i) { - s.push_back(i); - } - // test some large size - s.push_back(100); - s.push_back(1000); - s.push_back(2000); - return s; -} - -namespace jit = paddle::lite::jit; -using CPUPlace = paddle::lite::fluid::CPUPlace; - -template -void TestAllImpls(const typename KernelTuple::attr_type& attr, - const Tester& verifier, - const Args&... args) { - auto funcs = jit::GetAllCandidateFuncsWithTypes(attr); - for (auto f : funcs) { - VLOG(10) << "Test Kernel " << f.first; - verifier(f.second, args...); - } -} - -template -void TestKernelXYZN() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - for (int d : TestSizes()) { - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - - std::vector x(d), y(d), zref(d); - RandomVec(d, x.data()); - RandomVec(d, y.data()); - - std::vector xinp(d), yinp(d); // inplace test - std::copy(x.begin(), x.end(), xinp.begin()); - std::copy(y.begin(), y.end(), yinp.begin()); - - const T* x_data = x.data(); - const T* y_data = y.data(); - T* zref_data = zref.data(); - T* xinp_data = xinp.data(); - T* yinp_data = yinp.data(); - - // test refer code inplace - ref(x_data, y_data, zref_data, d); - ref(x_data, yinp_data, yinp_data, d); - ref(xinp_data, y_data, xinp_data, d); - ExpectEQ(xinp_data, zref_data, d); - ExpectEQ(yinp_data, zref_data, d); - - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& x, - const std::vector& y, - const std::vector& zref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(zref.size(), x.size()); - EXPECT_EQ(zref.size(), y.size()); - const T* x_data = x.data(); - const T* y_data = y.data(); - const T* zref_data = zref.data(); - const int d = zref.size(); - - std::vector ztgt(d); - T* ztgt_data = ztgt.data(); - // test normal - tgt(x_data, y_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ztgt.begin()); - tgt(ztgt_data, y_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - // test inplace y - std::copy(y.begin(), y.end(), ztgt.begin()); - tgt(x_data, ztgt_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - }; - - TestAllImpls(d, verifier, x, y, zref); - } -} - -template -void TestKernelAXYN() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - for (int d : TestSizes()) { - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - - const T a = static_cast(3); - std::vector 
x(d), yref(d); - std::vector xinp(d); // inplace test - RandomVec(d, x.data()); - std::copy(x.begin(), x.end(), xinp.begin()); - - const T* x_data = x.data(); - T* yref_data = yref.data(); - T* xinp_data = xinp.data(); - // test refer code inplace - ref(&a, x_data, yref_data, d); - ref(&a, xinp_data, xinp_data, d); - ExpectEQ(xinp_data, yref_data, d); - - auto verifier = [](const typename KernelTuple::func_type tgt, - const T a, - const std::vector& x, - const std::vector& yref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - const int d = yref.size(); - std::vector ytgt(d); - T* ytgt_data = ytgt.data(); - // test normal - tgt(&a, x_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(&a, ytgt_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - }; - TestAllImpls(d, verifier, a, x, yref); - } -} - -template -void TestKernelXYN() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - for (int d : TestSizes()) { - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - - std::vector x(d), yref(d); - std::vector xinp(d); // inplace test - RandomVec(d, x.data()); - std::copy(x.begin(), x.end(), xinp.begin()); - - const T* x_data = x.data(); - T* yref_data = yref.data(); - T* xinp_data = xinp.data(); - // test refer code inplace - ref(x_data, yref_data, d); - ref(xinp_data, xinp_data, d); - ExpectEQ(xinp_data, yref_data, d); - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& x, - const std::vector& yref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - const int d = yref.size(); - std::vector ytgt(d); - T* ytgt_data = ytgt.data(); - // test normal - tgt(x_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(ytgt_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - }; - TestAllImpls(d, verifier, x, yref); - } -} - -template -void TestKernelXRN() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - auto last_acc = FLAGS_acc; - FLAGS_acc = 1e-4; - for (int d : TestSizes()) { - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - std::vector x(d); - RandomVec(d, x.data()); - T ref_res; - ref(x.data(), &ref_res, d); - - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& x, - const T ref_res) { - EXPECT_TRUE(tgt != nullptr); - T tgt_res; - tgt(x.data(), &tgt_res, x.size()); - ExpectEQ(&tgt_res, &ref_res, 1); - }; - TestAllImpls(d, verifier, x, ref_res); - } - FLAGS_acc = last_acc; -} - -template -void TestKernelLSTM() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; - auto test_sizes = TestSizes(); - test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); - for (int d : test_sizes) { - for (bool use_peephole : {true, false}) { - for (auto& act_gate : all_acts) { - for (auto& act_cand : all_acts) { - for (auto& act_cell : all_acts) { - const jit::lstm_attr_t attr(d, - jit::to_kerneltype(act_gate), - jit::to_kerneltype(act_cand), - jit::to_kerneltype(act_cell), - 
use_peephole); - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - std::vector xsrc(4 * d), wp(3 * d), ct_1(d); - std::vector ct_ref(d), ht_ref(d), checked(2 * d); - RandomVec(4 * d, xsrc.data()); - RandomVec(3 * d, wp.data(), -1.f, 1.f); - RandomVec(d, ct_1.data(), -1.f, 1.f); - // x could be changed after compute, so copy to save src - std::vector x(xsrc.size()); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - const T* ct_1_data = ct_1.data(); - const T* wp_data = wp.data(); - T* x_data = x.data(); - T* checked_data = checked.data(); - T* ct_ref_data = ct_ref.data(); - T* ht_ref_data = ht_ref.data(); - jit::lstm_t step; - step.gates = x_data; - step.ct_1 = ct_1_data; - step.ct = ct_ref_data; - step.ht = ht_ref_data; - if (use_peephole) { - step.wp = wp_data; - step.checked = checked_data; - } - ref(&step, &attr); - VLOG(10) << attr; - - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& xsrc, - const std::vector& wp, - const std::vector& ct_1, - const std::vector& ct_ref, - const std::vector& ht_ref, - const typename KernelTuple::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(ct_ref.size(), ht_ref.size()); - EXPECT_EQ(ct_1.size(), ht_ref.size()); - EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); - EXPECT_EQ(wp.size(), 3 * ht_ref.size()); - - // x could be changed after compute, so copy to save src - int d = ht_ref.size(); - std::vector x(xsrc.size()), ct(ct_ref.size()), - ht(ht_ref.size()); - std::vector checked(2 * d); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - - const T* ct_1_data = ct_1.data(); - const T* wp_data = wp.data(); - const T* ct_ref_data = ct_ref.data(); - const T* ht_ref_data = ht_ref.data(); - T* x_data = x.data(); - T* ct_data = ct.data(); - T* ht_data = ht.data(); - T* checked_data = checked.data(); - - jit::lstm_t step; - step.gates = x_data; - step.ct_1 = ct_1_data; - step.ct = ct_data; - step.ht = ht_data; - if (attr.use_peephole) { - step.wp = wp_data; - step.checked = checked_data; - } - - tgt(&step, &attr); - ExpectEQ(ct_data, ct_ref_data, d); - ExpectEQ(ht_data, ht_ref_data, d); - }; - TestAllImpls( - attr, verifier, xsrc, wp, ct_1, ct_ref, ht_ref, attr); - } - } - } - } - } -} - -template -void TestKernelGRU() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; - auto test_sizes = TestSizes(); - test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); - for (int d : test_sizes) { - for (auto& act_gate : all_acts) { - for (auto& act_cand : all_acts) { - const jit::gru_attr_t attr( - d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand)); - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - std::vector xsrc(3 * d), ht_1(d), ht_ref(d); - RandomVec(3 * d, xsrc.data()); - RandomVec(d, ht_1.data()); - // x could be changed after compute, so copy to save src - std::vector x(xsrc.size()); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - const T* ht_1_data = ht_1.data(); - T* x_data = x.data(); - T* ht_ref_data = ht_ref.data(); - jit::gru_t step; - step.gates = x_data; - step.ht_1 = ht_1_data; - step.ht = ht_ref_data; - ref(&step, &attr); - VLOG(10) << attr; - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& xsrc, - const std::vector& ht_1, - const std::vector& ht_ref, - const typename KernelTuple::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(ht_1.size(), ht_ref.size()); - 
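// (Editorial note, not original code: ht_1 and ht hold d elements while
// the gates buffer holds 3 * d, one d-wide block each for the update
// gate, the reset gate and the candidate state; the size check on xsrc
// just below encodes that layout.)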
EXPECT_EQ(xsrc.size(), 3 * ht_ref.size()); - - // x could be changed after compute, so copy to save src - int d = ht_ref.size(); - std::vector x(xsrc.size()), ht(ht_ref.size()); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - const T* ht_1_data = ht_1.data(); - const T* ht_ref_data = ht_ref.data(); - T* x_data = x.data(); - T* ht_data = ht.data(); - jit::gru_t step; - step.gates = x_data; - step.ht_1 = ht_1_data; - step.ht = ht_data; - tgt(&step, &attr); - ExpectEQ(ht_data, ht_ref_data, d); - }; - TestAllImpls( - attr, verifier, xsrc, ht_1, ht_ref, attr); - } - } - } -} - -template -void TestKernelNCHW16CMulNC() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - const int n = 3, c = 16 * 4, h = 10, w = 10; - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - int sz = n * c * h * w; - std::vector x(sz), y(n * c), zref(sz); - std::vector ztgt(sz), zjit(sz); - RandomVec(sz, x.data()); - RandomVec(n * c, y.data()); - - const T* x_data = x.data(); - const T* y_data = y.data(); - T* zref_data = zref.data(); - T* ztgt_data = ztgt.data(); - T* zjit_data = zjit.data(); - constexpr int simd_width = ZMM_FLOAT_BLOCK; - int C = c / simd_width; - auto tgt = jit::KernelFuncs::Cache().At(0); - auto funcs = jit::GetAllCandidateFuncs(0); - EXPECT_GT(funcs.size(), 0UL); - auto jitcode = funcs[0]; - EXPECT_TRUE(tgt != nullptr); - - if (std::is_same::value && - paddle::lite::x86::MayIUse(paddle::lite::x86::avx512f)) { - EXPECT_TRUE(jitcode != nullptr); - } - for (int ni = 0; ni < n; ni++) { - for (int ci = 0; ci < C; ci++) { - auto ptr_x = - x_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; - auto ptr_zref = - zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - auto ptr_ztgt = - ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - - ref(ptr_x, ptr_y, ptr_zref, h, w); - tgt(ptr_x, ptr_y, ptr_ztgt, h, w); - - if (jitcode) { - auto ptr_zjit = - zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - jitcode(ptr_x, ptr_y, ptr_zjit, h, w); - } - } - } - ExpectEQ(ztgt_data, zref_data, sz); - if (jitcode) { - ExpectEQ(zjit_data, zref_data, sz); - } -} - -template -void TestKernelLayerNorm() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - const T epsilon = 9.99999975e-06; - for (int n : {1, 2, 10}) { - for (int x_dim_0 : {1, 9, 17, 50}) { - int left = n * x_dim_0; - for (int x_dim_1 : TestSizes()) { - int right = x_dim_1; - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - int sz = left * right; - std::vector x(sz), mean(left), var(left), scale(right), bias(right), - outref(sz); - RandomVec(sz, x.data()); - RandomVec(left, mean.data()); - RandomVec(left, var.data()); - RandomVec(right, scale.data()); - RandomVec(right, bias.data()); - - const T* scale_data = scale.data(); - const T* bias_data = bias.data(); - T* x_data = x.data(); - T* mean_data = mean.data(); - T* var_data = var.data(); - T* outref_data = outref.data(); - - ref(x_data, - outref_data, - mean_data, - var_data, - scale_data, - bias_data, - left, - epsilon, - right); - - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& x_, - const std::vector& outref_, - const std::vector& mean_, - const std::vector& var_, - const std::vector& scale, - const std::vector& bias, - const int& left, - const float& epsilon, - const 
typename KernelTuple::attr_type& right) { - EXPECT_TRUE(tgt != nullptr); - std::vector outtgt(outref_.size()); - std::vector x(x_.size()); - std::vector mean(mean_.size()); - std::vector var(var_.size()); - std::vector outref(outref_.size()); - std::copy(x_.begin(), x_.end(), x.begin()); - std::copy(mean_.begin(), mean_.end(), mean.begin()); - std::copy(var_.begin(), var_.end(), var.begin()); - std::copy(outref_.begin(), outref_.end(), outref.begin()); - - EXPECT_EQ(x.size(), static_cast(left * right)); - EXPECT_EQ(outref.size(), static_cast(left * right)); - EXPECT_EQ(mean.size(), static_cast(left)); - EXPECT_EQ(var.size(), static_cast(left)); - EXPECT_EQ(scale.size(), static_cast(right)); - EXPECT_EQ(bias.size(), static_cast(right)); - - const T* scale_data = scale.data(); - const T* bias_data = bias.data(); - T* x_data = x.data(); - T* mean_data = mean.data(); - T* var_data = var.data(); - T* outref_data = outref.data(); - T* outtgt_data = outtgt.data(); - tgt(x_data, - outtgt_data, - mean_data, - var_data, - scale_data, - bias_data, - left, - epsilon, - right); - ExpectEQ(outtgt_data, outref_data, left * right); - }; - TestAllImpls(right, - verifier, - x, - outref, - mean, - var, - scale, - bias, - left, - epsilon, - right); - } - } - } -} - -template -void TestKernelCRFDecoding() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - constexpr int state_trans_base_idx = 2; - auto test_sizes = TestSizes(); - test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000)); - for (int seq_len : {1, 11, 17, 50}) { - for (int tag_num : test_sizes) { - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - int x_sz = seq_len * tag_num; - int w_sz = (tag_num + state_trans_base_idx) * tag_num; - std::vector x(x_sz), w(w_sz), alpharef(x_sz); - std::vector trackref(x_sz); - RandomVec(x_sz, x.data()); - RandomVec(w_sz, w.data()); - - ref(seq_len, - (const T*)x.data(), - (const T*)w.data(), - alpharef.data(), - trackref.data(), - tag_num); - - auto verifier = [](const typename KernelTuple::func_type tgt, - const int& seq_len, - const std::vector& x, - const std::vector& w, - const std::vector& alpharef, - const std::vector& trackref, - const typename KernelTuple::attr_type& tag_num) { - constexpr int state_trans_base_idx = 2; - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(x.size(), static_cast(seq_len * tag_num)); - EXPECT_EQ( - w.size(), - static_cast((tag_num + state_trans_base_idx) * tag_num)); - EXPECT_EQ(alpharef.size(), static_cast(seq_len * tag_num)); - EXPECT_EQ(trackref.size(), static_cast(seq_len * tag_num)); - std::vector alphatgt(alpharef.size()); - std::vector tracktgt(trackref.size()); - memcpy(tracktgt.data(), trackref.data(), tag_num * sizeof(int)); - tgt(seq_len, - (const T*)x.data(), - (const T*)w.data(), - alphatgt.data(), - tracktgt.data(), - tag_num); - ExpectEQ(alpharef.data(), alphatgt.data(), seq_len * tag_num); - ExpectEQ(trackref.data(), tracktgt.data(), seq_len * tag_num); - }; - TestAllImpls( - tag_num, verifier, seq_len, x, w, alpharef, trackref, tag_num); - } - } -} - -template -void TestKernelSeqPool() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - std::vector pool_types = { - jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; - auto test_sizes = TestSizes(); - test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); - for (auto type : pool_types) { - for (int 
w : test_sizes) { - jit::seq_pool_attr_t attr(w, type); - for (int h : test_sizes) { - attr.h = h; - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - std::vector x(h * w), yref(w); - RandomVec(h * w, x.data()); - const T* x_data = x.data(); - T* yref_data = yref.data(); - ref(x_data, yref_data, &attr); - VLOG(10) << attr; - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& x, - const std::vector& yref, - const typename KernelTuple::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(x.size() % yref.size(), static_cast(0)); - int w = yref.size(); - std::vector y(w); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - T* y_data = y.data(); - tgt(x_data, y_data, &attr); - ExpectEQ(y_data, yref_data, w); - }; - TestAllImpls(attr, verifier, x, yref, attr); - } - } - } -} - -template -void TestKernelEmbSeqPool() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - int64_t tbl_h = 1e4; - std::vector pool_types = { - jit::SeqPoolType::kSum}; // only support sum yet - auto test_sizes = TestSizes(); - test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); - for (int tbl_w : test_sizes) { - std::vector table(tbl_h * tbl_w); - RandomVec(tbl_h * tbl_w, table.data()); - const T* table_data = table.data(); - for (auto type : pool_types) { - for (int idx_w : {1, 2, 10, 16}) { - for (int idx_h : {1, 2, 9, 13, 16}) { - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - std::vector idx(idx_h * idx_w); - RandomVec(idx_h * idx_w, idx.data(), 0, tbl_h - 1); - int64_t out_w = tbl_w * idx_w; - std::vector oref(out_w); - const int64_t* idx_data = idx.data(); - T* o_data = oref.data(); - jit::emb_seq_pool_attr_t attr( - tbl_h, tbl_w, idx_h, idx_w, out_w, type); - ref(table_data, idx_data, o_data, &attr); - - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& table, - const std::vector& idx, - const std::vector& oref, - const typename KernelTuple::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ( - table.size(), - static_cast(attr.table_height * attr.table_width)); - EXPECT_EQ( - idx.size(), - static_cast(attr.index_height * attr.index_width)); - EXPECT_EQ(oref.size(), - static_cast(attr.table_width * attr.index_width)); - const T* table_data = table.data(); - const int64_t* idx_data = idx.data(); - const T* oref_data = oref.data(); - int o_w = oref.size(); - std::vector out(o_w); - T* o_data = out.data(); - tgt(table_data, idx_data, o_data, &attr); - ExpectEQ(o_data, oref_data, o_w); - }; - TestAllImpls( - attr, verifier, table, idx, oref, attr); - } - } - } - } -} - -template -void TestKernelMatMul() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - auto last_acc = FLAGS_acc; - // export MKL_CBWR=AVX would make MKL force to use AVX - // export KMP_DETERMINISTIC_REDUCTION=yes would make the result deterministic - FLAGS_acc = 1e-3; - for (int m : {1, 2, 3, 4}) { - for (int n : {1, 2, 3, 4}) { - for (int k : TestSizes()) { - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - std::vector a(m * k), b(k * n), c(m * n); - RandomVec(m * k, a.data()); - RandomVec(k * n, b.data()); - const T* a_data = a.data(); - const T* b_data = b.data(); - T* c_data = c.data(); - const jit::matmul_attr_t attr{m, n, k}; - ref(a_data, b_data, c_data, &attr); - auto verifier = [](const typename KernelTuple::func_type tgt, - 
const std::vector& a, - const std::vector& b, - const std::vector& cref, - const typename KernelTuple::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(a.size(), static_cast(attr.m * attr.k)); - EXPECT_EQ(b.size(), static_cast(attr.k * attr.n)); - EXPECT_EQ(cref.size(), static_cast(attr.m * attr.n)); - std::vector c(cref.size()); - const T* a_data = a.data(); - const T* b_data = b.data(); - const T* cref_data = cref.data(); - T* c_data = c.data(); - tgt(a_data, b_data, c_data, &attr); - ExpectEQ(c_data, cref_data, attr.m * attr.n); - }; - TestAllImpls(attr, verifier, a, b, c, attr); - } - } - } - FLAGS_acc = last_acc; -} - -template -void TestKernelSoftmax() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - for (int bs : {1, 2, 10}) { - for (int n : TestSizes()) { - for (int m : {1, 2, 3}) { // remain - if (m > n || n % m != 0) { - continue; - } - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - std::vector x(bs * n), y(bs * n); - RandomVec(bs * n, x.data()); - const T* x_data = x.data(); - T* y_data = y.data(); - - std::vector xinp(x.size()); // inplace test - std::copy(x.begin(), x.end(), xinp.begin()); - ref(x_data, y_data, n, bs, m); - T* xinp_data = xinp.data(); - ref(xinp_data, xinp_data, n, bs, m); - ExpectEQ(xinp_data, y_data, n * bs); - - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& x, - const std::vector& yref, - int n, - int bs, - int m) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - EXPECT_EQ(x.size(), static_cast(n * bs)); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - std::vector ytgt(n * bs); - T* ytgt_data = ytgt.data(); - // test normal - tgt(x_data, ytgt_data, n, bs, m); - ExpectEQ(ytgt_data, yref_data, n * bs); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(ytgt_data, ytgt_data, n, bs, m); - ExpectEQ(ytgt_data, yref_data, n * bs); - }; - TestAllImpls(n, verifier, x, y, n, bs, m); - } - } - } -} - -template -void TestKernelStrideASum() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - for (int d : TestSizes()) { - for (int m : {1, 2, 3}) { // stride - if (m > d || d % m != 0) { - continue; - } - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - std::vector x(d); - RandomVec(d, x.data()); - T ref_res; - ref(x.data(), &ref_res, d, m); - - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& x, - const T ref_res, - const int m) { - EXPECT_TRUE(tgt != nullptr); - T tgt_res; - tgt(x.data(), &tgt_res, x.size(), m); - ExpectEQ(&tgt_res, &ref_res, 1); - }; - TestAllImpls(d, verifier, x, ref_res, m); - } - } -} - -template -void TestKernelStrideScal() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - for (int d : TestSizes()) { - for (int m : {1, 2, 3}) { // stride - if (m > d || d % m != 0) { - continue; - } - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - - const T a = static_cast(3); - std::vector x(d), yref(d); - std::vector xinp(d); // inplace test - RandomVec(d, x.data()); - std::copy(x.begin(), x.end(), xinp.begin()); - - const T* x_data = x.data(); - T* yref_data = yref.data(); - T* xinp_data = xinp.data(); - // test refer code inplace - ref(&a, x_data, yref_data, d, m); - ref(&a, xinp_data, xinp_data, d, m); - ExpectEQ(xinp_data, yref_data, d); - - 
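// (Editorial illustration of the refer StrideScal behaviour tested here:
// with d = 4, stride m = 2 and a = 3, only indices 0 and 2 are scaled,
// so {1, 1, 1, 1} becomes {3, 1, 3, 1}; the verifier below then checks
// every candidate implementation against this, out of place and in place.)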
auto verifier = [](const typename KernelTuple::func_type tgt, - const T a, - const std::vector& x, - const std::vector& yref, - const int m) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - const int d = yref.size(); - std::vector ytgt(d); - T* ytgt_data = ytgt.data(); - // test normal - tgt(&a, x_data, ytgt_data, d, m); - ExpectEQ(ytgt_data, yref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(&a, ytgt_data, ytgt_data, d, m); - ExpectEQ(ytgt_data, yref_data, d); - }; - TestAllImpls(d, verifier, a, x, yref, m); - } - } -} - -template -void TestKernelSgd() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - const T lr = 0.1; - auto UnDuplicatedRandomVec = []( - int n, const int64_t lower, const int64_t upper) -> std::vector { - PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); - PADDLE_ENFORCE_GT(n, 0); - std::vector all, out; - for (int i = 0; i < n; ++i) { - all.push_back(i); - } - std::random_shuffle(all.begin(), all.end()); - out.insert(out.begin(), all.begin(), all.begin() + n); - return out; - }; - for (int param_h : {1, 10}) { - for (int grad_w : TestSizes()) { - std::vector param(param_h * grad_w); - std::vector param_out(param_h * grad_w); - RandomVec(param_h * grad_w, param.data()); - const T* param_data = param.data(); - T* out_data = param_out.data(); - for (int rows_size = 1; rows_size <= param_h; ++rows_size) { - std::vector grad(rows_size * grad_w); - std::vector rows = - UnDuplicatedRandomVec(rows_size, 0, rows_size - 1); - RandomVec(rows_size * grad_w, grad.data()); - const int64_t* rows_data = rows.data(); - const T* grad_data = grad.data(); - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size); - ref(&lr, param_data, grad_data, rows_data, out_data, &attr); - - // inplace test - std::vector inp(param.size()); - std::copy(param.begin(), param.end(), inp.begin()); - T* inp_data = inp.data(); - ref(&lr, inp_data, grad_data, rows_data, inp_data, &attr); - // only the selected rows should be equal - for (int i = 0; i < rows_size; ++i) { - ExpectEQ( - inp_data + rows[i] * grad_w, out_data + rows[i] * grad_w, grad_w); - } - - auto verifier = [](const typename KernelTuple::func_type tgt, - const T lr, - const std::vector& param, - const std::vector& grad, - const std::vector& rows, - const std::vector& oref, - const typename KernelTuple::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(param.size(), - static_cast(attr.param_height * attr.param_width)); - EXPECT_EQ(grad.size(), - static_cast(attr.grad_height * attr.grad_width)); - EXPECT_EQ(rows.size(), static_cast(attr.selected_rows_size)); - EXPECT_EQ(param.size(), oref.size()); - const T* param_data = param.data(); - const T* grad_data = grad.data(); - const int64_t* rows_data = rows.data(); - const T* oref_data = oref.data(); - - std::vector out(oref.size()); - T* o_data = out.data(); - tgt(&lr, param_data, grad_data, rows_data, o_data, &attr); - // only the selected rows should be equal - for (size_t i = 0; i < rows.size(); ++i) { - ExpectEQ(o_data + rows[i] * attr.grad_width, - oref_data + rows[i] * attr.grad_width, - attr.grad_width); - } - - // inplace - std::copy(param.begin(), param.end(), out.begin()); - tgt(&lr, o_data, grad_data, rows_data, o_data, &attr); - for (size_t i = 0; i < rows.size(); ++i) { - ExpectEQ(o_data + rows[i] 
* attr.grad_width, - oref_data + rows[i] * attr.grad_width, - attr.grad_width); - } - }; - TestAllImpls( - attr, verifier, lr, param, grad, rows, param_out, attr); - } - } - } -} - -template -void TestKernelVBroadcast() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - for (int w : TestSizes()) { - std::vector x(w); - RandomVec(w, x.data()); - const T* x_data = x.data(); - for (int64_t h : {1, 2, 6}) { - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - std::vector y(w * h); - T* y_data = y.data(); - ref(x_data, y_data, h, w); - - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& x, - const std::vector& yref, - const int64_t& h, - const typename KernelTuple::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(x.size(), static_cast(attr)); - EXPECT_EQ(yref.size(), x.size() * h); - std::vector y(yref.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - T* y_data = y.data(); - tgt(x_data, y_data, h, attr); - ExpectEQ(y_data, yref_data, yref.size()); - }; - TestAllImpls( - static_cast(w), verifier, x, y, h, static_cast(w)); - } - } -} - -// test pool -TEST(JITKernel_pool, jitcreator) { - const auto& jitcreators = jit::JitCodeCreatorPool::Instance().AllCreators(); -#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) - EXPECT_EQ(jitcreators.size(), 0UL); -#else - EXPECT_EQ(jitcreators.size(), 25UL); -#endif -} - -TEST(JITKernel_pool, jitpool) { - // jitpool is related with attr - const auto& kers = jit::JitCodePool().Instance().AllKernels(); - EXPECT_EQ(kers.size(), 0UL); - jit::GetAllCandidateKernels, CPUPlace>(3); -// after call GetAllCandidateKernels, it will create jitcode Automatically -#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) - EXPECT_EQ(kers.size(), 0UL); -#else - EXPECT_EQ(kers.size(), 1UL); -#endif -} - -TEST(JITKernel_pool, more) { - const auto& kers = jit::KernelPool::Instance().AllKernels(); - size_t target_num = 8; - -#ifdef __AVX__ - target_num += 2; -#endif - -#ifdef PADDLE_WITH_MKLML - target_num += 12; -#endif - - EXPECT_EQ(kers.size(), target_num); -} - -TEST(JITKernel_pool, refer) { - const auto& kers = jit::ReferKernelPool::Instance().AllKernels(); - EXPECT_EQ(kers.size(), 31UL); -} - -// test helper -TEST(JITKernel_helper, GetAllCandidateKernels) { - auto fp_kers = - jit::GetAllCandidateKernels, CPUPlace>(10); -#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) - EXPECT_GE(fp_kers.size(), 1UL); // refer -#else -#ifdef PADDLE_WITH_MKLML - EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer -#else - EXPECT_GE(fp_kers.size(), 2UL); // jitcode, refer -#endif -#endif - - auto db_kers = - jit::GetAllCandidateKernels, CPUPlace>(10); -#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) - EXPECT_GE(db_kers.size(), 1UL); // refer -#else -#ifdef PADDLE_WITH_MKLML - EXPECT_GE(db_kers.size(), 2UL); // mkl, refer -#else - EXPECT_GE(db_kers.size(), 1UL); // refer -#endif -#endif -} - -TEST(JITKernel_helper, GetAllCandidateFuncsWithTypes) { - auto fp_kers = - jit::GetAllCandidateFuncsWithTypes, CPUPlace>(10); -#if defined(__APPLE__) || defined(__OSX__) - EXPECT_GE(fp_kers.size(), 1UL); // refer -#else -#if !defined(PADDLE_WITH_MKLML) || defined(_WIN32) - EXPECT_GE(fp_kers.size(), 2UL); // jitcode/mkl, refer -#else - EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer -#endif -#endif - - auto db_kers = - jit::GetAllCandidateFuncsWithTypes, CPUPlace>(10); -#if defined(__APPLE__) 
|| defined(__OSX__) || !defined(PADDLE_WITH_MKLML) - EXPECT_GE(db_kers.size(), 1UL); // refer -#else - EXPECT_GE(db_kers.size(), 2UL); // mkl, refer -#endif -} - -TEST(JITKernel_helper, KernelFuncs) { - auto f1 = jit::KernelFuncs, CPUPlace>::Cache().At(3); - auto f2 = jit::KernelFuncs, CPUPlace>::Cache()[3]; - EXPECT_TRUE(f1 != nullptr); - EXPECT_TRUE(f1 == f2); - - auto f3 = jit::KernelFuncs, CPUPlace>::Cache()[5]; -#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) - EXPECT_TRUE(f2 == f3); -#else - EXPECT_TRUE(f2 != f3); -#endif -} - -TEST(JITKernel_helper, GetAllCandidateFuncs) { - auto funcs = jit::GetAllCandidateFuncs, CPUPlace>(10); - auto kers = jit::GetAllCandidateKernels, CPUPlace>(10); - EXPECT_EQ(funcs.size(), kers.size()); - - std::vector x(10), tgt(10); - RandomVec(10, x.data()); - auto best = jit::GetDefaultBestFunc, CPUPlace>(10); - best(x.data(), tgt.data(), 10); - for (auto f : funcs) { - std::vector y(10); - f(x.data(), y.data(), 10); - ExpectEQ(y.data(), tgt.data(), 10); - } -} - -TEST(JITKernel_helper, pack_weights) { - const int N = 8 * 60, K = 2; - float src[K][N], yref[K][N], y[K * N]; - float* x = &(src[0][0]); - float* ref = &(yref[0][0]); - for (int i = 0; i < N * K; ++i) { - *(x + i) = static_cast(i); - } - int block = 0; - std::vector groups; - if (paddle::lite::x86::MayIUse(paddle::lite::x86::avx512f)) { - block = ZMM_FLOAT_BLOCK; - groups.push_back(30); - } else { - block = YMM_FLOAT_BLOCK; - groups.insert(groups.end(), {14, 14, 14, 14, 4}); - } - - int offset = 0; - int acc = 0; - for (int g : groups) { - g = g * block; - for (int k = 0; k < K; ++k) { - for (int i = 0; i < g; ++i) { - *(ref + offset) = src[k][i + acc]; - offset++; - } - } - acc += g; - } - - jit::pack_weights(x, y, N, K); - ExpectEQ(y, ref, N * K); -} - -TEST(JITKernel_helper, attr) { - std::ostringstream out; - // KernelTypes - out << jit::to_string(jit::kNone) << jit::to_string(jit::kCRFDecoding) - << jit::to_string(jit::kEmbSeqPool) << jit::to_string(jit::kGRUH1) - << jit::to_string(jit::kGRUHtPart1) << jit::to_string(jit::kGRUHtPart2) - << jit::to_string(jit::kHSum) << jit::to_string(jit::kHMax) - << jit::to_string(jit::kLSTMCtHt) << jit::to_string(jit::kLSTMC1H1) - << jit::to_string(jit::kLayerNorm) << jit::to_string(jit::kMatMul) - << jit::to_string(jit::kNCHW16CMulNC) << jit::to_string(jit::kSeqPool) - << jit::to_string(jit::kSoftmax) << jit::to_string(jit::kVAdd) - << jit::to_string(jit::kVAddBias) << jit::to_string(jit::kVAddRelu) - << jit::to_string(jit::kVBroadcast) << jit::to_string(jit::kVCopy) - << jit::to_string(jit::kVExp) << jit::to_string(jit::kVIdentity) - << jit::to_string(jit::kVMul) << jit::to_string(jit::kVRelu) - << jit::to_string(jit::kVScal) << jit::to_string(jit::kSgd) - << jit::to_string(jit::kVSigmoid) << jit::to_string(jit::kVSquare) - << jit::to_string(jit::kVSub) << jit::to_string(jit::kVTanh); - EXPECT_EQ(out.str().size(), 234); - - // SeqPoolTypes - out.str(""); - out << jit::to_string(jit::kSum) << jit::to_string(jit::kAvg) - << jit::to_string(jit::kSqrt); - EXPECT_EQ(out.str().size(), 13); - - EXPECT_EQ(jit::to_kerneltype("relu"), jit::kVRelu); - EXPECT_EQ(jit::to_kerneltype("Identity"), jit::kVIdentity); - EXPECT_EQ(jit::to_kerneltype("VEXP"), jit::kVExp); - EXPECT_EQ(jit::to_kerneltype("SigmoiD"), jit::kVSigmoid); - EXPECT_EQ(jit::to_kerneltype("VTanh"), jit::kVTanh); - - out.str(""); - out << jit::lstm_attr_t(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); - EXPECT_EQ(out.str().size(), 89); - - out.str(""); - out << jit::gru_attr_t(8, 
jit::kVIdentity, jit::kVSigmoid); - EXPECT_EQ(out.str().size(), 52); - - out.str(""); - out << jit::seq_pool_attr_t(8, jit::SeqPoolType::kSum); - EXPECT_EQ(out.str().size(), 44); - - out.str(""); - out << jit::emb_seq_pool_attr_t(1, 2, 3, 4, 5, jit::SeqPoolType::kAvg); - EXPECT_EQ(out.str().size(), 93); - - out.str(""); - out << jit::sgd_attr_t(1, 2, 3, 4, 5); - EXPECT_EQ(out.str().size(), 81); - - out.str(""); - out << jit::matmul_attr_t(1, 2, 3); - EXPECT_EQ(out.str().size(), 14); -} - -// test keys -TEST(JITKernel_key, int) { - EXPECT_TRUE(jit::JitCodeKey(2) == jit::JitCodeKey(2)); - EXPECT_TRUE(jit::JitCodeKey(2) == jit::JitCodeKey(2)); - EXPECT_TRUE(jit::JitCodeKey(2) != jit::JitCodeKey(3)); -} - -TEST(JITKernel_key, gru) { - jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh); - jit::gru_attr_t attr2(8, jit::kVSigmoid, jit::kVTanh); - jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh); - jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity); - jit::gru_attr_t attr5(9, jit::kVTanh, jit::kVIdentity); - - auto key1 = jit::JitCodeKey(attr1); - auto key2 = jit::JitCodeKey(attr2); - auto key3 = jit::JitCodeKey(attr3); - auto key4 = jit::JitCodeKey(attr4); - auto key5 = jit::JitCodeKey(attr5); - - EXPECT_TRUE(key1 == key2); - EXPECT_TRUE(key2 != key3); - EXPECT_TRUE(key2 != key4); - EXPECT_TRUE(key2 != key5); - EXPECT_TRUE(key3 != key4); - EXPECT_TRUE(key3 != key5); - EXPECT_TRUE(key4 != key5); -} - -TEST(JITKernel_key, lstm) { - jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); - jit::lstm_attr_t attr2(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); - jit::lstm_attr_t attr3(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); - jit::lstm_attr_t attr4(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh); - jit::lstm_attr_t attr5(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true); - jit::lstm_attr_t attr6(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true); - - auto key1 = jit::JitCodeKey(attr1); - auto key2 = jit::JitCodeKey(attr2); - auto key3 = jit::JitCodeKey(attr3); - auto key4 = jit::JitCodeKey(attr4); - auto key5 = jit::JitCodeKey(attr5); - auto key6 = jit::JitCodeKey(attr6); - - EXPECT_TRUE(key1 == key2); - EXPECT_TRUE(key2 != key3); - EXPECT_TRUE(key2 != key4); - EXPECT_TRUE(key2 != key5); - EXPECT_TRUE(key3 != key4); - EXPECT_TRUE(key3 != key5); - EXPECT_TRUE(key4 != key5); - EXPECT_TRUE(key5 == key6); -} - -TEST(JITKernel_key, seq_pool) { - jit::seq_pool_attr_t attr1(2, jit::SeqPoolType::kSum, 1); - jit::seq_pool_attr_t attr2(2, jit::SeqPoolType::kSum, 3); - jit::seq_pool_attr_t attr3(3, jit::SeqPoolType::kSum, 3); - jit::seq_pool_attr_t attr4(3, jit::SeqPoolType::kAvg, 3); - - auto key1 = jit::JitCodeKey(attr1); - auto key2 = jit::JitCodeKey(attr2); - auto key3 = jit::JitCodeKey(attr3); - auto key4 = jit::JitCodeKey(attr4); - - EXPECT_TRUE(key1 == key2); - EXPECT_TRUE(key2 != key3); - EXPECT_TRUE(key2 != key4); - EXPECT_TRUE(key3 != key4); -} - -TEST(JITKernel_key, matmul) { - jit::matmul_attr_t attr1(1, 2, 3); - jit::matmul_attr_t attr2(1, 2, 3); - jit::matmul_attr_t attr3(1, 3, 3); - jit::matmul_attr_t attr4(2, 3, 4); - - auto key1 = jit::JitCodeKey(attr1); - auto key2 = jit::JitCodeKey(attr2); - auto key3 = jit::JitCodeKey(attr3); - auto key4 = jit::JitCodeKey(attr4); - - EXPECT_TRUE(key1 == key2); - EXPECT_TRUE(key2 != key3); - EXPECT_TRUE(key2 != key4); - EXPECT_TRUE(key3 != key4); -} - -TEST(JITKernel_key, emb_seq_pool) { - jit::emb_seq_pool_attr_t attr1(1, 2, 3, 4, 5, jit::SeqPoolType::kSum); - jit::emb_seq_pool_attr_t attr2(1, 2, 3, 4, 5, 
jit::SeqPoolType::kSum); - jit::emb_seq_pool_attr_t attr3(10, 2, 9, 8, 7, jit::SeqPoolType::kAvg); - jit::emb_seq_pool_attr_t attr4(10, 3, 9, 8, 7, jit::SeqPoolType::kSum); - jit::emb_seq_pool_attr_t attr5(1, 6, 3, 4, 5, jit::SeqPoolType::kSum); - - auto key1 = jit::JitCodeKey(attr1); - auto key2 = jit::JitCodeKey(attr2); - auto key3 = jit::JitCodeKey(attr3); - auto key4 = jit::JitCodeKey(attr4); - auto key5 = jit::JitCodeKey(attr5); - - EXPECT_TRUE(key1 == key2); - EXPECT_TRUE(key2 == key3); - EXPECT_TRUE(key2 != key4); - EXPECT_TRUE(key2 != key5); - EXPECT_TRUE(key4 != key5); -} - -TEST(JITKernel_key, sgd) { - jit::sgd_attr_t attr1(1, 2, 3, 4, 5); - jit::sgd_attr_t attr2(1, 2, 3, 4, 5); - jit::sgd_attr_t attr3(9, 8, 7, 4, 6); - jit::sgd_attr_t attr4(1, 2, 3, 6, 5); - jit::sgd_attr_t attr5(10, 9, 8, 7, 6); - - auto key1 = jit::JitCodeKey(attr1); - auto key2 = jit::JitCodeKey(attr2); - auto key3 = jit::JitCodeKey(attr3); - auto key4 = jit::JitCodeKey(attr4); - auto key5 = jit::JitCodeKey(attr5); - - EXPECT_TRUE(key1 == key2); - EXPECT_TRUE(key2 == key3); - EXPECT_TRUE(key3 != key4); - EXPECT_TRUE(key3 != key5); - EXPECT_TRUE(key4 != key5); -} - -// test kernerls -#define TestKernelVMul TestKernelXYZN -#define TestKernelVAdd TestKernelXYZN -#define TestKernelVAddRelu TestKernelXYZN -#define TestKernelVSub TestKernelXYZN - -#define TestKernelVScal TestKernelAXYN -#define TestKernelVAddBias TestKernelAXYN - -#define TestKernelVRelu TestKernelXYN -#define TestKernelVIdentity TestKernelXYN -#define TestKernelVSquare TestKernelXYN -#define TestKernelVExp TestKernelXYN -#define TestKernelVSigmoid TestKernelXYN -#define TestKernelVTanh TestKernelXYN -#define TestKernelVCopy TestKernelXYN - -#define TestKernelHMax TestKernelXRN -#define TestKernelHSum TestKernelXRN - -#define TestKernelLSTMCtHt TestKernelLSTM -#define TestKernelLSTMC1H1 TestKernelLSTM - -#define TestKernelGRUH1 TestKernelGRU -#define TestKernelGRUHtPart1 TestKernelGRU -#define TestKernelGRUHtPart2 TestKernelGRU - -#define TEST_CPU_KERNEL(kernel_type) \ - TEST(JITKernel, kernel_type) { \ - TestKernel##kernel_type, CPUPlace>(); \ - TestKernel##kernel_type, CPUPlace>(); \ - } - -TEST_CPU_KERNEL(VMul); -TEST_CPU_KERNEL(VAdd); -TEST_CPU_KERNEL(VAddRelu); -TEST_CPU_KERNEL(VSub); - -TEST_CPU_KERNEL(VScal); -TEST_CPU_KERNEL(VAddBias); - -TEST_CPU_KERNEL(VRelu); -TEST_CPU_KERNEL(VIdentity); -TEST_CPU_KERNEL(VSquare); -TEST_CPU_KERNEL(VExp); -TEST_CPU_KERNEL(VSigmoid); -TEST_CPU_KERNEL(VTanh); -TEST_CPU_KERNEL(VCopy); - -TEST_CPU_KERNEL(HMax); -TEST_CPU_KERNEL(HSum); - -TEST_CPU_KERNEL(LSTMCtHt); -TEST_CPU_KERNEL(LSTMC1H1); - -TEST_CPU_KERNEL(GRUH1); -TEST_CPU_KERNEL(GRUHtPart1); -TEST_CPU_KERNEL(GRUHtPart2); - -TEST_CPU_KERNEL(NCHW16CMulNC); -TEST_CPU_KERNEL(LayerNorm); -TEST_CPU_KERNEL(CRFDecoding); - -TEST_CPU_KERNEL(SeqPool); -TEST_CPU_KERNEL(EmbSeqPool); -TEST_CPU_KERNEL(MatMul); -TEST_CPU_KERNEL(Softmax); -TEST_CPU_KERNEL(Sgd); -TEST_CPU_KERNEL(VBroadcast); - -TEST_CPU_KERNEL(StrideASum); -TEST_CPU_KERNEL(StrideScal); diff --git a/lite/backends/x86/legacy_place.h b/lite/backends/x86/legacy_place.h deleted file mode 100644 index 8f96bbd7da..0000000000 --- a/lite/backends/x86/legacy_place.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -namespace paddle { -namespace lite { -namespace fluid { - -// Fake the legacy Place. -struct Place { - int which() const { return 1; } // fake -}; - -struct CPUPlace : Place {}; - -} // namespace fluid -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/CMakeLists.txt b/lite/backends/x86/math/CMakeLists.txt deleted file mode 100644 index 5f440947fe..0000000000 --- a/lite/backends/x86/math/CMakeLists.txt +++ /dev/null @@ -1,62 +0,0 @@ -add_subdirectory(detail) - -function(math_library TARGET) - # math_library is a function to create math library. - # The interface is the same as lite_cc_library. - # But it handle split GPU/CPU code and link some common library. - set(cc_srcs) - set(hip_srcs) - set(math_common_deps context framework_proto) - set(multiValueArgs DEPS) - cmake_parse_arguments(math_library "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) - - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) - list(APPEND cc_srcs ${TARGET}.cc) - endif() - - list(LENGTH cc_srcs cc_srcs_len) - lite_cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps} eigen3 dynload_mklml) -endfunction() - -# please add new math_library in alphabetical order -math_library(concat_and_split) -math_library(context_project DEPS im2col math_function) -math_library(cross_entropy) -math_library(cos_sim_functor) -## math_library(depthwise_conv DEPS cub) -math_library(im2col) -math_library(sample_prob) -math_library(sampler) - -math_library(gru_compute DEPS activation_functions math_function) -## math_library(lstm_compute DEPS activation_functions) - -lite_cc_library(blas SRCS blas.cc DEPS cblas framework_proto eigen3) -math_library(math_function DEPS blas) -math_library(maxouting) -math_library(pooling) -# math_library(selected_rows_functor DEPS selected_rows math_function blas) -math_library(sequence2batch) -math_library(sequence_padding) -math_library(sequence_pooling DEPS math_function jit_kernel_helper) -math_library(sequence_scale) -math_library(softmax DEPS math_function jit_kernel_helper) -math_library(beam_search DEPS math_function) -# -## math_library(matrix_bit_code) -# -math_library(unpooling) -math_library(vol2col) -## math_library(prelu) -math_library(tree2col DEPS math_function) - -# cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) -# cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) -# cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) -# cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col) -# cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) -# cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling) -# cc_test(beam_search_test SRCS beam_search_test.cc DEPS beam_search) -# cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) -# cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) diff --git a/lite/backends/x86/math/beam_search.cc b/lite/backends/x86/math/beam_search.cc deleted file mode 100644 index 93726afcc2..0000000000 --- 
a/lite/backends/x86/math/beam_search.cc +++ /dev/null @@ -1,322 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/x86/math/beam_search.h" -#include -#include -#include "lite/fluid/lod.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -class BeamSearchFunctor { - public: - void operator()(const lite::X86Context &context, - const lite::Tensor *pre_ids, - const lite::Tensor *pre_scores, - const lite::Tensor *ids, - const lite::Tensor *scores, - lite::Tensor *selected_ids, - lite::Tensor *selected_scores, - lite::Tensor *parent_idx, - size_t level, - size_t beam_size, - int end_id, - bool is_accumulated) { - auto abs_lod = lite::fluid::ToAbsOffset(scores->lod()); - auto &high_level = abs_lod[level]; - - auto items = SelectTopBeamSizeItems(pre_ids, - pre_scores, - ids, - scores, - level, - beam_size, - end_id, - is_accumulated); - auto selected_items = ToMap(items, high_level.back()); - if (FLAGS_v == 3) { - VLOG(3) << "selected_items:"; - for (size_t i = 0; i < selected_items.size(); ++i) { - VLOG(3) << "offset: " << i; - for (auto &item : selected_items[i]) { - VLOG(3) << item.ToString(); - } - } - } - - PruneEndBeams(pre_ids, abs_lod, &selected_items, level, end_id); - // calculate the output tensor's height - size_t num_instances = std::accumulate( - std::begin(selected_items), - std::end(selected_items), - 0, - [](size_t a, std::vector &b) { return a + b.size(); }); - // the output tensor shape should be [num_instances, 1] - // auto dims = framework::make_ddim( - // std::vector({static_cast(num_instances), 1})); - lite::DDim dims(std::vector({num_instances, 1L})); - - selected_ids->Resize(dims); - auto *selected_ids_data = selected_ids->mutable_data(TARGET(kX86)); - - selected_scores->Resize(dims); - auto *selected_scores_data = - selected_scores->mutable_data(TARGET(kX86)); - - // auto *selected_ids_data = - // selected_ids->mutable_data(dims, platform::CPUPlace()); - // auto *selected_scores_data = - // selected_scores->mutable_data(dims, platform::CPUPlace()); - parent_idx->Resize({static_cast(num_instances)}); - auto *parent_idx_data = - parent_idx ? parent_idx->mutable_data(TARGET(kX86)) : nullptr; - // auto *parent_idx_data = - // parent_idx - // ? 
parent_idx->mutable_data(
-    //               {static_cast(num_instances)}, platform::CPUPlace())
-    //         : nullptr;
-
-    // fill in data
-    std::vector<size_t> low_level;
-    size_t low_offset = 0;
-    for (auto &items : selected_items) {
-      low_level.push_back(low_offset);
-      for (auto &item : items) {
-        if (parent_idx) {
-          parent_idx_data[low_offset] = static_cast<int>(low_level.size() - 1);
-        }
-        selected_ids_data[low_offset] = item.id;
-        selected_scores_data[low_offset] = item.score;
-        low_offset++;
-      }
-    }
-    low_level.push_back(low_offset);
-
-    // fill lod
-    lite::LoD lod(2);
-    lod[0].assign(high_level.begin(), high_level.end());
-    lod[1].assign(low_level.begin(), low_level.end());
-    // if (!lite::fluid::CheckLoD(lod)) {
-    //   // PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
-    // }
-    selected_ids->set_lod(lod);
-    selected_scores->set_lod(lod);
-  }
-
-  /*
-   * The basic items used for sorting.
-   */
-  struct Item {
-    Item() {}
-    Item(size_t offset, size_t id, float score)
-        : offset(offset), id(id), score(score) {}
-    // offset in the higher lod level.
-    size_t offset;
-    // prefix id in the lower lod level.
-    // size_t prefix;
-    // the candidate id
-    size_t id;
-    // the corresponding score
-    float score;
-
-    inline bool operator<(const Item &in) const {
-      return (score < in.score) ||
-             ((score == in.score) && (offset < in.offset));
-    }
-
-    inline void operator=(const Item &in) {
-      offset = in.offset;
-      id = in.id;
-      score = in.score;
-    }
-
-    std::string ToString() {
-      std::ostringstream os;
-      os << "{";
-      os << "offset: " << offset << ", ";
-      os << "id: " << id << ", ";
-      os << "score: " << score << "";
-      os << "}";
-      return os.str();
-    }
-  };
-
- protected:
-  /*
-   * Prune the source sentences whose branches have all finished; this step is
-   * optional. Pruning must happen one step later than finishing (thus pre_ids
-   * is needed here), since the end tokens must be written out.
-   */
-  void PruneEndBeams(const lite::Tensor *pre_ids,
-                     const lite::LoD &abs_lod,
-                     std::vector<std::vector<Item>> *items,
-                     size_t lod_level,
-                     int end_id) {
-    auto *pre_ids_data = pre_ids->data<int64_t>();
-    auto &high_level = abs_lod[lod_level];
-    for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
-      size_t src_prefix_start = high_level[src_idx];
-      size_t src_prefix_end = high_level[src_idx + 1];
-      bool finish_flag = true;
-      for (size_t offset = src_prefix_start; offset < src_prefix_end;
-           offset++) {
-        for (auto &item : items->at(offset)) {
-          if (item.id != static_cast<size_t>(end_id) ||
-              pre_ids_data[offset] != end_id) {
-            finish_flag = false;
-            break;
-          }
-        }
-        if (!finish_flag) break;
-      }
-      if (finish_flag) {  // all branches of the beam (source sentence) end;
-                          // prune this beam
-        for (size_t offset = src_prefix_start; offset < src_prefix_end;
-             offset++)
-          items->at(offset).clear();
-      }
-    }
-  }
-
-  /*
-   * Transform the items into a map whose key is the offset and whose value is
-   * the items. NOTE low performance.
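 * (Editorial gloss, not original text: the cost noted above comes from
 * copying every Item into a per-offset bucket; operator() then walks the
 * buckets in offset order so each prefix's selected candidates stay
 * grouped together.)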
-   */
-  std::vector<std::vector<Item>> ToMap(
-      const std::vector<std::vector<Item>> &items, size_t element_num) {
-    std::vector<std::vector<Item>> result;
-    result.resize(element_num);
-    for (auto &entries : items) {
-      for (const auto &item : entries) {
-        result[item.offset].push_back(item);
-      }
-    }
-    return result;
-  }
-
-  void Insert(std::vector<Item> *top_beam_ptr,
-              const Item &item,
-              size_t beam_size) {
-    std::vector<Item> &top_beam = *top_beam_ptr;
-
-    size_t num_beams = top_beam.size();
-    if (num_beams < beam_size) {
-      top_beam.resize(num_beams + 1);
-      num_beams++;
-    } else {
-      if (item < top_beam[beam_size - 1]) {
-        return;
-      }
-    }
-
-    for (int k = static_cast<int>(num_beams) - 2; k >= 0; --k) {
-      if (top_beam[k] < item) {
-        top_beam[k + 1] = top_beam[k];
-      } else {
-        top_beam[k + 1] = item;
-        return;
-      }
-    }
-    top_beam[0] = item;
-  }
-
-  /*
-   * For each source, select the top beam_size records.
-   */
-  std::vector<std::vector<Item>> SelectTopBeamSizeItems(
-      const lite::Tensor *pre_ids,
-      const lite::Tensor *pre_scores,
-      const lite::Tensor *ids,
-      const lite::Tensor *scores,
-      size_t lod_level,
-      size_t beam_size,
-      int end_id,
-      bool is_accumulated) {
-    std::vector<std::vector<Item>> result;
-
-    // find the current candidates
-    auto abs_lod = lite::fluid::ToAbsOffset(scores->lod());
-
-    auto *pre_ids_data = pre_ids->data<int64_t>();
-    auto *pre_scores_data = pre_scores->data<float>();
-
-    auto *ids_data = ids ? ids->data<int64_t>() : nullptr;
-    auto *scores_data = scores->data<float>();
-
-    // size_t num_seqs = scores->NumElements(lod_level);
-    size_t num_seqs = scores->lod()[lod_level].size() - 1;
-    size_t seq_width = 1;
-    for (int i = 1; i < scores->dims().size(); i++) {
-      seq_width *= scores->dims()[i];
-    }
-
-    for (size_t seq_id = 0; seq_id < num_seqs; ++seq_id) {
-      size_t seq_offset_start = abs_lod[lod_level][seq_id];
-      size_t seq_offset_end = abs_lod[lod_level][seq_id + 1];
-
-      std::vector<Item> top_beam;
-      top_beam.reserve(beam_size);
-
-      for (size_t offset = seq_offset_start; offset < seq_offset_end;
-           ++offset) {
-        auto pre_id = pre_ids_data[offset];
-        auto pre_score = pre_scores_data[offset];
-        if (pre_id == end_id) {
-          // Allocate all probability mass to end_id for finished branches;
-          // the other candidate ids can be ignored.
-          Item item(offset, end_id, pre_score);
-          Insert(&top_beam, item, beam_size);
-        } else {
-          size_t index = offset * seq_width;
-          for (size_t d = 0; d < seq_width; d++, index++) {
-            int64_t id = ids_data ? ids_data[index] : static_cast<int64_t>(d);
-            float score = is_accumulated
-                              ? scores_data[index]
-                              : pre_score + std::log(scores_data[index]);
-            Item item(offset, id, score);
-            Insert(&top_beam, item, beam_size);
-          }
-        }
-      }
-
-      result.emplace_back(top_beam);
-    }
-
-    if (FLAGS_v == 3) {
-      VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
-      for (auto &items : result) {
-        VLOG(3) << "item set:";
-        for (auto &item : items) {
-          VLOG(3) << item.ToString();
-        }
-      }
-    }
-
-    return result;
-  }
-};
-
-template class BeamSearchFunctor<TARGET(kX86), int>;
-template class BeamSearchFunctor<TARGET(kX86), int64_t>;
-template class BeamSearchFunctor<TARGET(kX86), float>;
-template class BeamSearchFunctor<TARGET(kX86), double>;
-
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/beam_search.h b/lite/backends/x86/math/beam_search.h
deleted file mode 100644
index 40998c89f9..0000000000
--- a/lite/backends/x86/math/beam_search.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "lite/core/context.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -/* - * This is an implementation of beam search. - * - * To explain the details, lets take machine translation task for example, in - * this task, one source sentence is translated to multiple target sentences, - * during this period, one sentence will be translated to multiple translation - * prefixes(target sentence that have not ended), in each time step a prefix - * will have some candidates, input the candidate ids and their corresponding - * scores (probabilities), it will sort and select the top beam_size candidates - * for each source sentence, and store the selected candidates's score and their - * corresponding ids to LoDTensors. - * - * A detailed example: - * - * Input - * - * ids: - * - LoD (should have 2 levels) - * - first level: [0, 1, 4] - * - second level: [0, 1, 2, 3, 4] - * - tensor's data: - * [[4, 2, 5] - * [2, 1, 3] - * [3, 5, 2] - * [8, 2, 1]] - * - * scores: - * - LoD same as `ids` - * - tensor's data - * [[0.5, 0.3, 0.2] - * [0.6, 0.3, 0.1] - * [0.9, 0.5, 0.1] - * [0.7, 0.5, 0.1]] - * - * The inputs means that there are 2 source sentences to translate, and the - * first source has 1 prefix, the second source has 2 prefix. - * - * Lets assume beam size is 2, and the beam search's output should be - * - LoD - * - first level: [0, 1, 2] - * - second level: [0, 2, 4] - * - id tensor's data - * [[4, - * 1, - * 3, - * 8]] - * - score tensor's data - * [[0.5, - * 0.3, - * 0.9, - * 0.7]] - * - * TODO all the prune operations should be in the beam search, so it is better - * to split the beam search algorithm into a sequence of smaller operators, and - * the prune operators can be inserted in this sequence. - */ -template -class BeamSearchFunctor { - public: - /* - * The main function of beam search. - * - * @selected_ids: a [None, 1]-shaped tensor with LoD. - * In a machine translation model, it might be the candidate term id sets, - * each set stored as a varience-length sequence. - * The format might be described with a two-level LoD - * - [[0 1], - * [0 1 2]] - * - [[] - * [0 1]] - * the first level of LoD tells that there are two source sentences. The - * second level describes the details of the candidate id set's offsets in - * the source sentences. - * - * @selected_scores: a LoD tensor with the same shape and LoD with - * selected_ids. - * It stores the corresponding scores of candidate ids in selected_ids. - * - * Return false if all the input tensor is empty, in machine translation task - * that means no candidates is provided, and the task will stop running. 
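- *
- * @parent_idx: an optional output. When it is non-null, the implementation
- * additionally records, for every selected candidate, the index of the
- * source prefix (beam) it was selected from within the flattened beams of
- * the previous step; passing nullptr skips this bookkeeping.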
- */ - void operator()(const lite::Context& context, - const lite::Tensor* pre_ids, - const lite::Tensor* pre_scores, - const lite::Tensor* ids, - const lite::Tensor* scores, - lite::Tensor* selected_ids, - lite::Tensor* selected_scores, - lite::Tensor* parent_idx, - size_t level, - size_t beam_size, - int end_id, - bool is_accumulated); -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/beam_search_test.cc b/lite/backends/x86/math/beam_search_test.cc deleted file mode 100644 index 904870207b..0000000000 --- a/lite/backends/x86/math/beam_search_test.cc +++ /dev/null @@ -1,152 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/beam_search.h" -#include -#include - -void PrepareCPUTensors(paddle::framework::LoDTensor* ids, - paddle::framework::LoDTensor* scores, - paddle::framework::LoDTensor* pre_ids, - paddle::framework::LoDTensor* pre_scores) { - // lod - paddle::framework::LoD lod; - std::vector level0({0, 2, 4}); - std::vector level1({0, 1, 2, 3, 4}); - lod.push_back(level0); - lod.push_back(level1); - ids->set_lod(lod); - scores->set_lod(lod); - - auto dims = paddle::framework::make_ddim({4, 3}); - ids->Resize(dims); - scores->Resize(dims); - - paddle::platform::CPUPlace place; - auto* ids_data = ids->mutable_data(place); - auto* scores_data = scores->mutable_data(place); - std::vector ids_vec_data({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); - std::vector scores_vec_data( - {0.6f, 0.3f, 0.5f, 0.2f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); - - CHECK_EQ(static_cast(ids->numel()), ids_vec_data.size()); - CHECK_EQ(static_cast(ids->numel()), scores_vec_data.size()); - - for (int i = 0; i < ids->numel(); i++) { - ids_data[i] = ids_vec_data[i]; - scores_data[i] = scores_vec_data[i]; - } - - // pre_ids - pre_ids->Resize(paddle::framework::make_ddim({4, 1})); - for (int i = 0; i < 4; i++) { - pre_ids->mutable_data(place)[i] = i + 1; - } - - // pre_scores - pre_scores->Resize(paddle::framework::make_ddim({4, 1})); - for (int i = 0; i < 4; i++) { - pre_scores->mutable_data(place)[i] = 0.1 * (i + 1); - } -} - -template -void TestBeamSearch() { - paddle::framework::LoDTensor ids; - paddle::framework::LoDTensor scores; - paddle::framework::LoDTensor pre_ids; - paddle::framework::LoDTensor pre_scores; - - auto* place = new Place(); - DeviceContext* context = new DeviceContext(*place); - if (paddle::platform::is_cpu_place(*place)) { - PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores); - } else { - paddle::framework::LoDTensor cpu_ids; - paddle::framework::LoDTensor cpu_scores; - paddle::framework::LoDTensor cpu_pre_ids; - paddle::framework::LoDTensor cpu_pre_scores; - - PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores); - - TensorCopySync(cpu_ids, *place, &ids); - TensorCopySync(cpu_scores, *place, &scores); - TensorCopySync(cpu_pre_ids, *place, &pre_ids); - TensorCopySync(cpu_pre_scores, *place, 
&pre_scores); - - ids.set_lod(cpu_ids.lod()); - scores.set_lod(cpu_scores.lod()); - pre_ids.set_lod(cpu_pre_ids.lod()); - pre_scores.set_lod(cpu_pre_scores.lod()); - } - - paddle::framework::LoDTensor selected_ids; - paddle::framework::LoDTensor selected_scores; - paddle::framework::LoDTensor parent_idx; - - size_t level = 0; - size_t beam_size = 2; - int end_id = 0; - paddle::operators::math::BeamSearchFunctor beamsearch; - beamsearch(*context, - &pre_ids, - &pre_scores, - &ids, - &scores, - &selected_ids, - &selected_scores, - &parent_idx, - level, - beam_size, - end_id, - true); - - ASSERT_EQ(selected_ids.lod(), selected_scores.lod()); - - paddle::framework::LoDTensor cpu_selected_ids; - paddle::framework::LoDTensor cpu_selected_scores; - if (paddle::platform::is_cpu_place(*place)) { - cpu_selected_ids = selected_ids; - cpu_selected_scores = selected_scores; - } else { - TensorCopySync( - selected_ids, paddle::platform::CPUPlace(), &cpu_selected_ids); - TensorCopySync( - selected_scores, paddle::platform::CPUPlace(), &cpu_selected_scores); - cpu_selected_ids.set_lod(selected_ids.lod()); - cpu_selected_scores.set_lod(selected_scores.lod()); - } - - std::vector expected_ids({4, 5, 3, 8}); - std::vector expected_scores({0.6f, 0.5f, 0.9f, 0.7f}); - for (int i = 0; i < 4; i++) { - ASSERT_EQ(expected_ids[i], cpu_selected_ids.data()[i]); - ASSERT_EQ(expected_scores[i], cpu_selected_scores.data()[i]); - } - - delete place; - delete context; -} - -TEST(BeamSearch, CPU) { - TestBeamSearch(); -} - -#ifdef PADDLE_WITH_CUDA -TEST(BeamSearch, GPU) { - TestBeamSearch(); -} -#endif diff --git a/lite/backends/x86/math/blas.cc b/lite/backends/x86/math/blas.cc deleted file mode 100644 index 2d21adaf5d..0000000000 --- a/lite/backends/x86/math/blas.cc +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
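-
-// A worked example of the flattening rule implemented below (the dims value
-// is illustrative only): for a tensor with dims [6, 4, 3, 2],
-//
-//   CreateMatrixDescriptor(dims, 2, false);  // height_ = 6 * 4 = 24,
-//                                            // width_ = 3 * 2 = 6, batch 0
-//   CreateMatrixDescriptor(dims, 0, false);  // batch_size_ = 6 * 4 = 24,
-//                                            // height_ = 3, width_ = 2,
-//                                            // stride_ = 3 * 2 = 6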
-
-#include "lite/backends/x86/math/blas.h"
-
-#include <utility>
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim,
-                                     int num_flatten_cols,
-                                     bool trans) {
-  PADDLE_ENFORCE_GT(tensor_dim.size(), 1);
-  MatDescriptor retv;
-  if (num_flatten_cols > 1) {
-    auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols);
-    retv.height_ = flatten_dim[0];
-    retv.width_ = flatten_dim[1];
-  } else {
-    if (tensor_dim.size() == 2) {
-      retv.height_ = tensor_dim[0];
-      retv.width_ = tensor_dim[1];
-    } else {
-      auto dim_vec = tensor_dim.Vectorize();
-      retv.batch_size_ = 1;
-      for (size_t i = 0; i < dim_vec.size() - 2; ++i) {
-        retv.batch_size_ *= dim_vec[i];
-      }
-      retv.height_ = dim_vec[dim_vec.size() - 2];
-      retv.width_ = dim_vec[dim_vec.size() - 1];
-      retv.stride_ = retv.height_ * retv.width_;
-    }
-  }
-  if (trans) {
-    std::swap(retv.width_, retv.height_);
-  }
-  retv.trans_ = trans;
-  return retv;
-}
-
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/blas.h b/lite/backends/x86/math/blas.h
deleted file mode 100644
index c7d5abfce3..0000000000
--- a/lite/backends/x86/math/blas.h
+++ /dev/null
@@ -1,408 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "lite/core/op_lite.h"
-#include "lite/core/tensor.h"
-
-#ifdef PADDLE_WITH_MKLML
-#include "lite/backends/x86/mklml.h"
-#endif
-
-#ifdef PADDLE_WITH_LIBXSMM
-#include <libxsmm.h>
-#endif
-
-#ifdef PADDLE_USE_OPENBLAS
-#include <cblas.h>
-#endif
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-/**
- * Matrix Descriptor of a memory buffer.
- *
- * It is used for Blas::MatMul. The MatMul operator can be batched: if Mat A
- * is [BatchSize, H, W] and Mat B is [BatchSize, H, W], it will be a
- * `batch_size` times of GEMM. The batched GEMM could be faster based on the
- * implementation of the blas library. The batch size could be zero. If any
- * matrix of `matmul` has a batch size, there will be a batched GEMM, too.
- * e.g., if Mat A is [BatchSize, H1, W1] and Mat B is [W1, W2], the result
- * matrix will be [BatchSize, H1, W2].
- *
- * The boolean flag, `trans`, describes whether the memory is the transpose
- * of the matrix or not. If trans is true, the last two dims of the matrix
- * are transposed. The memory layout of the matrix is [Width, Height] or
- * [BatchSize, Width, Height].
- *
- * The MatDescriptor is not only the dimension or shape of a matrix; it also
- * contains the layout and stride of the matrix. It is clearer to have a
- * structure than to reuse `DDim`.
- */
-struct MatDescriptor {
-  int64_t height_;
-  int64_t width_;
-  int64_t stride_{0};
-  int64_t batch_size_{0};
-  bool trans_;
-};
-
-/**
- * Create a Matrix Descriptor from a tensor dim, num_flatten_cols, and
- * transpose flag
- *
- * @param tensor_dim: The dimension of the tensor. The rank of this dimension
- * must be larger than 1.
- * - * @param num_flatten_cols: Reshape a tensor to a matrix. The matrix's first - * dimension(column length) will be the product of tensor's first `num_col_dims` - * dimensions. If num_flatten_cols is zero, the first N-2 dimension will be the - * batch_size of descriptor. - * - * @param trans: True if the matrix is transposed. - */ -extern MatDescriptor CreateMatrixDescriptor(const lite::DDimLite& tensor_dim, - int num_flatten_cols, - bool trans); - -template -class Blas { - public: - explicit Blas(const lite::Context& context) : context_(context) {} - - template - void GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T* A, - const T* B, - T beta, - T* C) const; - - template - void GEMM(bool transA, - bool transB, - int M, - int N, - int K, - T alpha, - const T* A, - int lda, - const T* B, - int ldb, - T beta, - T* C, - int ldc) const; - - template - void GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T* A, - int lda, - const T* B, - int ldb, - T beta, - T* C, - int ldc) const; - -#ifdef PADDLE_WITH_MKLML - template - T* GEMM_ALLOC(const CBLAS_IDENTIFIER id, - const int M, - const int N, - const int K) const; - - template - void GEMM_PACK(const CBLAS_IDENTIFIER id, - const CBLAS_TRANSPOSE trans, - int M, - int N, - int K, - const T alpha, - const T* src, - const int ld, - T* dst) const; - - template - void GEMM_COMPUTE(int transA, - int transB, - int M, - int N, - int K, - const T* A, - const int lda, - const T* B, - const int ldb, - T beta, - T* C, - const int ldc) const; - - template - void GEMM_FREE(T* data) const; -#endif - - template - void MatMul(const int M, - const int N, - const int K, - const T* A, - const T* B, - T* C) const; - - template - void MatMul(const lite::TensorLite& mat_a, - bool trans_a, - const lite::TensorLite& mat_b, - bool trans_b, - T alpha, - lite::TensorLite* mat_out, - T beta) const; - - template - void MatMul(const lite::TensorLite& mat_a, - bool trans_a, - const lite::TensorLite& mat_b, - bool trans_b, - lite::TensorLite* mat_out) const { - MatMul(mat_a, - trans_a, - mat_b, - trans_b, - static_cast(1.0), - mat_out, - static_cast(0.0)); - } - - template - void MatMul(const lite::TensorLite& mat_a, - const lite::TensorLite& mat_b, - lite::TensorLite* mat_out) const { - this->template MatMul(mat_a, false, mat_b, false, mat_out); - } - - template - void AXPY(int n, T alpha, const T* x, T* y) const; - - template - void VADD(int n, const T* x, const T* y, T* z) const; - - template - void VMUL(int n, const T* x, const T* y, T* z) const; - - template - void VCOPY(int n, const T* x, T* y) const; - - template - void VEXP(int n, const T* x, T* y) const; - - template - void VSQUARE(int n, const T* x, T* y) const; - - template - void VPOW(int n, const T* x, T alpha, T* y) const; - - template - void GEMV(bool trans_a, - int M, - int N, - T alpha, - const T* A, - const T* B, - T beta, - T* C) const; - - template - T DOT(int n, const T* x, const T* y) const; - - template - void SCAL(int n, const T a, T* x) const; - - template - T ASUM(int n, T* x, int inc) const; - - template - void BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T* A, - const T* B, - T beta, - T* C, - int batchCount, - int64_t strideA, - int64_t strideB) const; - - template - void MatMul(const lite::TensorLite& mat_a, - const MatDescriptor& dim_a, - const lite::TensorLite& mat_b, - const MatDescriptor& dim_b, - T alpha, - lite::TensorLite* 
mat_out, - T beta) const; - - template - void VINV(int n, const T* a, T* y) const; - - template - void VMERF(int n, const T* a, T* y, int64_t mode) const; - - private: - const lite::Context& context_; -}; - -template -class BlasT : private Blas { - public: - using Blas::Blas; - - template - void GEMM(ARGS... args) const { - Base()->template GEMM(args...); - } - -#ifdef PADDLE_WITH_MKLML - template - T* GEMM_ALLOC(ARGS... args) const { - return Base()->template GEMM_ALLOC(args...); - } - - template - void GEMM_PACK(ARGS... args) const { - Base()->template GEMM_PACK(args...); - } - - template - void GEMM_COMPUTE(ARGS... args) const { - Base()->template GEMM_COMPUTE(args...); - } - - template - void GEMM_FREE(ARGS... args) const { - Base()->template GEMM_FREE(args...); - } -#endif - - template - void MatMul(ARGS... args) const { - Base()->template MatMul(args...); - } - - template - void AXPY(ARGS... args) const { - Base()->template AXPY(args...); - } - - template - void VADD(ARGS... args) const { - Base()->template VADD(args...); - } - - template - void VMUL(ARGS... args) const { - Base()->template VMUL(args...); - } - - template - void VCOPY(ARGS... args) const { - Base()->template VCOPY(args...); - } - - template - void VEXP(ARGS... args) const { - Base()->template VEXP(args...); - } - - template - void VSQUARE(ARGS... args) const { - Base()->template VSQUARE(args...); - } - - template - void VPOW(ARGS... args) const { - Base()->template VPOW(args...); - } - - template - void GEMV(ARGS... args) const { - Base()->template GEMV(args...); - } - - template - T DOT(ARGS... args) const { - return Base()->template DOT(args...); - } - - template - void SCAL(ARGS... args) const { - Base()->template SCAL(args...); - } - - template - T ASUM(ARGS... args) const { - return Base()->template ASUM(args...); - } - - template - void BatchedGEMM(ARGS... args) const { - Base()->template BatchedGEMM(args...); - } - - template - void VINV(ARGS... args) const { - Base()->template VINV(args...); - } - - template - void VMERF(ARGS... args) const { - Base()->template VMERF(args...); - } - - private: - const Blas* Base() const { - return static_cast*>(this); - } -}; - -// template -// inline BlasT GetBlas( -// const framework::ExecutionContext& exe_ctx) { -// return BlasT( -// exe_ctx.template device_context()); -//} - -template -inline BlasT GetBlas(const lite::Context& dev_ctx) { - return BlasT(dev_ctx); -} - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle - -#include "lite/backends/x86/math/blas_impl.h" diff --git a/lite/backends/x86/math/blas_impl.h b/lite/backends/x86/math/blas_impl.h deleted file mode 100644 index c4844a4df3..0000000000 --- a/lite/backends/x86/math/blas_impl.h +++ /dev/null @@ -1,812 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
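-
-// A minimal usage sketch for the Blas wrapper declared in blas.h (the
-// context and tensor names here are illustrative only): given an x86
-// context `ctx` and float tensors `a` [M, K], `b` [K, N], `out` [M, N],
-//
-//   auto blas = math::GetBlas<TARGET(kX86), float>(ctx);
-//   blas.MatMul(a, b, &out);  // out = a * b, i.e. alpha = 1 and beta = 0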
-#pragma once -#include -#include -#include -#include "lite/backends/x86/math/math_function.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -struct CBlas; - -#ifdef PADDLE_WITH_MKLML -template <> -struct CBlas { - template - static void GEMM(ARGS... args) { - lite::x86::cblas_sgemm(args...); - } - - template - static float *GEMM_ALLOC(ARGS... args) { - return lite::x86::cblas_sgemm_alloc(args...); - } - - template - static void GEMM_PACK(ARGS... args) { - lite::x86::cblas_sgemm_pack(args...); - } - - template - static void GEMM_COMPUTE(ARGS... args) { - lite::x86::cblas_sgemm_compute(args...); - } - - template - static void GEMM_FREE(ARGS... args) { - lite::x86::cblas_sgemm_free(args...); - } - -#ifdef PADDLE_WITH_LIBXSMM - template - static void SMM_GEMM(ARGS... args) { - libxsmm_sgemm(args...); - } -#endif - - template - static void AXPY(ARGS... args) { - lite::x86::cblas_saxpy(args...); - } - - template - static void VCOPY(ARGS... args) { - lite::x86::cblas_scopy(args...); - } - - template - static void GEMV(ARGS... args) { - lite::x86::cblas_sgemv(args...); - } - - template - static float DOT(ARGS... args) { - return lite::x86::cblas_sdot(args...); - } - - template - static void SCAL(ARGS... args) { - lite::x86::cblas_sscal(args...); - } - - template - static float ASUM(ARGS... args) { - return lite::x86::cblas_sasum(args...); - } - - template - static void GEMM_BATCH(ARGS... args) { - lite::x86::cblas_sgemm_batch(args...); - } - - template - static void VADD(ARGS... args) { - lite::x86::vsAdd(args...); - } - - template - static void VMUL(ARGS... args) { - lite::x86::vsMul(args...); - } - - template - static void VEXP(ARGS... args) { - lite::x86::vsExp(args...); - } - - template - static void VSQUARE(ARGS... args) { - lite::x86::vsSqr(args...); - } - - template - static void VPOW(ARGS... args) { - lite::x86::vsPowx(args...); - } - - template - static void VINV(ARGS... args) { - lite::x86::vsInv(args...); - } - - template - static void VMERF(ARGS... args) { - lite::x86::vmsErf(args...); - } -}; - -template <> -struct CBlas { - template - static void GEMM(ARGS... args) { - lite::x86::cblas_dgemm(args...); - } - - template - static double *GEMM_ALLOC(ARGS... args) { - return lite::x86::cblas_dgemm_alloc(args...); - } - - template - static void GEMM_PACK(ARGS... args) { - lite::x86::cblas_dgemm_pack(args...); - } - - template - static void GEMM_COMPUTE(ARGS... args) { - lite::x86::cblas_dgemm_compute(args...); - } - - template - static void GEMM_FREE(ARGS... args) { - lite::x86::cblas_dgemm_free(args...); - } - -#ifdef PADDLE_WITH_LIBXSMM - template - static void SMM_GEMM(ARGS... args) { - libxsmm_dgemm(args...); - } -#endif - - template - static void AXPY(ARGS... args) { - lite::x86::cblas_daxpy(args...); - } - - template - static void VCOPY(ARGS... args) { - lite::x86::cblas_dcopy(args...); - } - - template - static void GEMV(ARGS... args) { - lite::x86::cblas_dgemv(args...); - } - - template - static double DOT(ARGS... args) { - return lite::x86::cblas_ddot(args...); - } - - template - static void SCAL(ARGS... args) { - lite::x86::cblas_dscal(args...); - } - - template - static double ASUM(ARGS... args) { - return lite::x86::cblas_dasum(args...); - } - - template - static void GEMM_BATCH(ARGS... args) { - lite::x86::cblas_dgemm_batch(args...); - } - - template - static void VADD(ARGS... args) { - lite::x86::vdAdd(args...); - } - - template - static void VMUL(ARGS... args) { - lite::x86::vdMul(args...); - } - - template - static void VEXP(ARGS... 
args) { - lite::x86::vdExp(args...); - } - - template - static void VSQUARE(ARGS... args) { - lite::x86::vdSqr(args...); - } - - template - static void VPOW(ARGS... args) { - lite::x86::vdPowx(args...); - } - - template - static void VINV(ARGS... args) { - lite::x86::vdInv(args...); - } - - template - static void VMERF(ARGS... args) { - lite::x86::vmdErf(args...); - } -}; - -#else - -template <> -struct CBlas { - template - static void GEMM(ARGS... args) { - cblas_sgemm(args...); - } - - template - static void AXPY(ARGS... args) { - cblas_saxpy(args...); - } - - template - static void VCOPY(ARGS... args) { - cblas_scopy(args...); - } - - template - static void GEMV(ARGS... args) { - cblas_sgemv(args...); - } -}; - -template <> -struct CBlas { - template - static void GEMM(ARGS... args) { - cblas_dgemm(args...); - } - - template - static void AXPY(ARGS... args) { - cblas_daxpy(args...); - } - - template - static void VCOPY(ARGS... args) { - cblas_dcopy(args...); - } - - template - static void GEMV(ARGS... args) { - cblas_dgemv(args...); - } -}; -#endif - -template <> -struct CBlas { - static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } - static void SMM_GEMM(...) { - PADDLE_THROW("float16 SMM_GEMM not supported on CPU"); - } - static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } - static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); } - static void VSQUARE(...) { - PADDLE_THROW("float16 VSQUARE not supported on CPU"); - } - static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); } - static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; - static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); }; - static void ASUM(...) { PADDLE_THROW("float16 ASUM not supported on CPU"); }; -#ifdef PADDLE_WITH_MKLML - static void GEMM_BATCH(...) { - PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); - } -#endif -}; - -#ifdef PADDLE_WITH_MKLML -template <> -template -T *Blas::GEMM_ALLOC(const CBLAS_IDENTIFIER id, - const int M, - const int N, - const int K) const { - return CBlas::GEMM_ALLOC(id, M, N, K); -} - -template <> -template -void Blas::GEMM_PACK(const CBLAS_IDENTIFIER id, - const CBLAS_TRANSPOSE trans, - int M, - int N, - int K, - const T alpha, - const T *src, - const int ld, - T *dst) const { - CBlas::GEMM_PACK(CblasRowMajor, id, trans, M, N, K, alpha, src, ld, dst); -} - -template <> -template -void Blas::GEMM_COMPUTE(int transA, - int transB, - int M, - int N, - int K, - const T *A, - const int lda, - const T *B, - const int ldb, - T beta, - T *C, - const int ldc) const { - CBlas::GEMM_COMPUTE( - CblasRowMajor, transA, transB, M, N, K, A, lda, B, ldb, beta, C, ldc); -} - -template <> -template -void Blas::GEMM_FREE(T *data) const { - CBlas::GEMM_FREE(data); -} -#endif - -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - CBlas::GEMM(CblasRowMajor, - transA, - transB, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} - -template <> -template -void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - CBlas::GEMM(CblasRowMajor, - transA == false ? CblasNoTrans : CblasTrans, - transB == false ? 
CblasNoTrans : CblasTrans, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} - -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - CBlas::GEMM(CblasRowMajor, - transA, - transB, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} - -template -template -void Blas::MatMul(const lite::Tensor &mat_a, - bool trans_a, - const lite::Tensor &mat_b, - bool trans_b, - T alpha, - lite::Tensor *mat_out, - T beta) const { - auto dim_a = mat_a.dims(); - auto dim_b = mat_b.dims(); - auto dim_out = mat_out->dims(); - PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, - "The input and output of matmul be matrix"); - PADDLE_ENFORCE( - mat_a.target() == mat_b.target() && mat_a.target() == mat_out->target(), - "The targets of matrices must be same"); - - int M = dim_out[0]; - int N = dim_out[1]; - int K = !trans_a ? dim_a[1] : dim_a[0]; - - CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans; - CBLAS_TRANSPOSE transB = !trans_b ? CblasNoTrans : CblasTrans; - - this->GEMM(transA, - transB, - M, - N, - K, - alpha, - mat_a.data(), - mat_b.data(), - beta, - mat_out->mutable_data()); -} - -template <> -template -void Blas::AXPY(int n, - T alpha, - const T *x, - T *y) const { - CBlas::AXPY(n, alpha, x, 1, y, 1); -} - -template <> -template -void Blas::VCOPY(int n, const T *x, T *y) const { - CBlas::VCOPY(n, x, 1, y, 1); -} - -template <> -template -void Blas::VADD(int n, - const T *x, - const T *y, - T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VADD(n, x, y, z); -#else - this->template VCOPY(n, y, z); - this->template AXPY(n, 1., x, z); -#endif -} - -template <> -template -void Blas::VMUL(int n, - const T *x, - const T *y, - T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VMUL(n, x, y, z); -#else - // try to find if openblas support vmul - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -#endif -} - -template <> -template -void Blas::VEXP(int n, const T *x, T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VEXP(n, x, y); -#else - // try to find if openblas support vexp - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -#endif -} - -template <> -template -void Blas::VSQUARE(int n, const T *x, T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VSQUARE(n, x, y); -#else - for (int i = 0; i < n; ++i) { - y[i] = x[i] * x[i]; - } -#endif -} - -template <> -template -void Blas::VPOW(int n, const T *x, T a, T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VPOW(n, x, a, y); -#else - for (int i = 0; i < n; ++i) { - y[i] = std::pow(x[i], a); - } -#endif -} - -template <> -template -T Blas::DOT(int n, const T *x, const T *y) const { -#ifdef PADDLE_WITH_MKLML - return CBlas::DOT(n, x, 1, y, 1); -#else - // try to find if openblas support cblas_dot - T sum = 0; - for (int i = 0; i < n; ++i) { - sum += x[i] * y[i]; - } - return sum; -#endif -} - -template <> -template -void Blas::SCAL(int n, const T a, T *x) const { -#ifdef PADDLE_WITH_MKLML - CBlas::SCAL(n, a, x, 1); -#else - // try to find if openblas support cblas_scal - for (int i = 0; i < n; ++i) { - x[i] = a * x[i]; - } -#endif -} - -template <> -template -T Blas::ASUM(int n, T *x, int inc) const { - auto sum = static_cast(0.0); -#ifdef PADDLE_WITH_MKLML - sum = CBlas::ASUM(n, x, inc); -#else - // TODO(jczaja): check if openblas does provide cblas_sasum/cblas_dasum - for (int c = 0; c < n; ++c) { - sum += x[c]; - } 
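-  // NOTE: unlike the cblas call above, this fallback reads x[c] rather than
-  // x[c * inc], i.e. it silently assumes inc == 1.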
-#endif - return sum; -} - -template <> -template -void Blas::GEMV(bool trans_a, - int M, - int N, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans; - CBlas::GEMV(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); -} - -template <> -template -void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - const T *B, - T beta, - T *C, - int batchCount, - int64_t strideA, - int64_t strideB) const { -#ifdef PADDLE_WITH_MKLML - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - auto a_array = std::vector(batchCount); - auto b_array = std::vector(batchCount); - auto c_array = std::vector(batchCount); - for (int k = 0; k < batchCount; ++k) { - a_array[k] = &A[k * strideA]; - b_array[k] = &B[k * strideB]; - c_array[k] = &C[k * M * N]; - } - - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &M, - &N, - &K, - &alpha, - a_array.data(), - &lda, - b_array.data(), - &ldb, - &beta, - c_array.data(), - &ldc, - 1 /* group_count */, - &batchCount); -#else - for (int k = 0; k < batchCount; ++k) { - auto *Ak = &A[k * strideA]; - auto *Bk = &B[k * strideB]; - auto *Ck = &C[k * M * N]; - this->template GEMM(transA, transB, M, N, K, alpha, Ak, Bk, beta, Ck); - } -#endif -} - -template -template -void Blas::MatMul( - const int M, const int N, const int K, const T *A, const T *B, T *C) const { - this->template GEMM(CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - M, - N, - K, - static_cast(1), - A, - K, - B, - N, - static_cast(0), - C, - N); -} - -template <> -template -void Blas::MatMul( - const int M, const int N, const int K, const T *A, const T *B, T *C) const { -#ifdef PADDLE_WITH_LIBXSMM - // Refer to https://github.com/hfp/libxsmm/blob/master/README.md - // But the threshold is custom constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20; - - // Since the matrix is very small, - // so the unit of calculation is already very fast, - // and the if( M*N*K < LIBXSMM_THRESHOLD) would be overhead, - // use xsmm directly. - // Note: SMM use ColMajor - const char transa = 'N'; - const char transb = 'N'; - const T alpha = static_cast(1); - const T beta = static_cast(0); - CBlas::SMM_GEMM( - &transa, &transb, &N, &M, &K, &alpha, B, &N, A, &K, &beta, C, &N); - return; -#endif - - CBlas::GEMM(CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - M, - N, - K, - static_cast(1), - A, - K, - B, - N, - static_cast(0), - C, - N); -} - -template -template -void Blas::MatMul(const lite::Tensor &mat_a, - const MatDescriptor &dim_a, - const lite::Tensor &mat_b, - const MatDescriptor &dim_b, - T alpha, - lite::Tensor *mat_out, - T beta) const { - PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_); - CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans; - CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans; - if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) { - this->template GEMM(transA, - transB, - dim_a.height_, - dim_b.width_, - dim_a.width_, - alpha, - mat_a.data(), - mat_b.data(), - beta, - mat_out->mutable_data()); - } else { - PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ || - dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0); - this->template BatchedGEMM( - transA, - transB, - dim_a.height_, - dim_b.width_, - dim_a.width_, - alpha, - mat_a.data(), - mat_b.data(), - beta, - mat_out->mutable_data(), - dim_a.batch_size_ == 0 ? 
dim_b.batch_size_ : dim_a.batch_size_, - dim_a.stride_, - dim_b.stride_); - } -} -template -template -void Blas::VINV(int n, const T *a, T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VINV(n, a, y); -#else - for (int i = 0; i < n; ++i) { - y[i] = 1.0 / a[i]; - } -#endif -} - -template <> -template -void Blas::VMERF(int n, - const T *a, - T *y, - int64_t mode) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VMERF(n, a, y, mode); -#else - for (int i = 0; i < n; ++i) { - y[i] = std::erf(a[i]); - } -#endif -} - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/concat_and_split.cc b/lite/backends/x86/math/concat_and_split.cc deleted file mode 100644 index bec93dde41..0000000000 --- a/lite/backends/x86/math/concat_and_split.cc +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/x86/math/concat_and_split.h" -#include -#include - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -/* - * All tensors' dimension should be the same and the values of - * each dimension must be the same, except the axis dimension. - */ -template -class ConcatFunctor { - public: - void operator()(const lite::X86Context& context, - const std::vector& input, - int axis, - lite::Tensor* output) { - // TODO(zcd): Add input data validity checking - int num = input.size(); - - int rows = 1; - auto dim_0 = input[0].dims(); - for (int i = 0; i < axis; ++i) { - rows *= dim_0[i]; - } - int out_rows = rows, out_cols = 0; - - std::vector input_cols(input.size()); - for (int i = 0; i < num; ++i) { - int t_cols = input[i].numel() / rows; - out_cols += t_cols; - input_cols[i] = t_cols; - } - // auto cpu_place = boost::get(context.GetPlace()); - - // computation - auto output_data = output->mutable_data(); - int col_idx = 0; - for (int j = 0; j < num; ++j) { - int col_len = input_cols[j]; - auto* input_data = input[j].data(); - for (int k = 0; k < out_rows; ++k) { - // memory::Copy(cpu_place, output_data + k * out_cols + col_idx, - // cpu_place, - // input_data + k * col_len, sizeof(T) * col_len); - std::copy_n(input_data + k * col_len, - col_len, - output_data + k * out_cols + col_idx); - } - col_idx += col_len; - } - } -}; - -/* - * All tensors' dimension should be the same and the values of - * each dimension must be the same, except the axis dimension. 
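- *
- * e.g. splitting a [2, 5] input on axis = 1 against reference shapes [2, 3]
- * and [2, 2] copies, for each of the 2 rows, 3 and then 2 contiguous values
- * into the two outputs.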
- */ -template -class SplitFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - const std::vector& ref_inputs, - const int axis, - std::vector* outputs) { - // TODO(zcd): Add input data validity checking - size_t num = outputs->size(); - - int input_rows = 1; - auto dim_0 = ref_inputs[0]->dims(); - for (int i = 0; i < axis; ++i) { - input_rows *= dim_0[i]; - } - - int input_cols = 0; - - std::vector output_cols(outputs->size()); - for (size_t i = 0; i < num; ++i) { - int t_cols = ref_inputs[i]->numel() / input_rows; - input_cols += t_cols; - output_cols[i] = t_cols; - } - // auto cpu_place = boost::get(context.GetPlace()); - - // computation - for (int k = 0; k < input_rows; ++k) { - const T* src_ptr = input.data() + k * input_cols; - int col_idx = 0; - for (size_t j = 0; j < num; ++j) { - int col_len = output_cols[j]; - auto* out_tensor = outputs->at(j); - if (out_tensor != nullptr) { - T* dst_ptr = out_tensor->mutable_data() + k * col_len; - std::copy_n(src_ptr + col_idx, col_len, dst_ptr); - // memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, - // sizeof(T) * col_len); - } - col_idx += col_len; - } - } - } -}; - -#define DEFINE_FUNCTOR(type) \ - template class ConcatFunctor; \ - template class SplitFunctor; - -FOR_ALL_TYPES(DEFINE_FUNCTOR); - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/concat_and_split.h b/lite/backends/x86/math/concat_and_split.h deleted file mode 100644 index 8c996411cd..0000000000 --- a/lite/backends/x86/math/concat_and_split.h +++ /dev/null @@ -1,83 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "lite/core/context.h" -#include "lite/core/tensor.h" -#include "lite/fluid/data_type.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -/* - * \brief Concatenate the input tensors along the dimension axis. - * TODO(zcd): maybe it needs to be more detailed. - * Examples: - * Input[0] = [[1,2],[3,4]] - * Input[1] = [[5,6]] - * axis = 0 - * - * Output = [[1,2], - * [3,4], - * [5,6]] - */ -template -class ConcatFunctor { - public: - void operator()(const lite::Context& context, - const std::vector& input, - int axis, - lite::Tensor* output); -}; - -/* - * \brief Split the input tensors along the dimension axis into outputs. - * TODO(zcd): maybe it needs to be more detailed. 
- * Examples: - * Input = [[1,2], - * [3,4], - * [5,6]] - * axis = 0 - * - * Output[0] = [[1,2],[3,4]] - * Output[1] = [[5,6]] - */ -template -class SplitFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - const std::vector& ref_inputs, - int axis, - std::vector* outputs); -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle - -#define FOR_ALL_TYPES(macro) \ - macro(int); \ - macro(float); \ - macro(double); \ - macro(bool); \ - macro(int64_t); \ - macro(int16_t); \ - macro(uint8_t); \ - macro(int8_t); \ - macro(::paddle::lite::fluid::float16) diff --git a/lite/backends/x86/math/context_project.cc b/lite/backends/x86/math/context_project.cc deleted file mode 100644 index dafced7780..0000000000 --- a/lite/backends/x86/math/context_project.cc +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/x86/math/context_project.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template class ContextProjectFunctor; -template class ContextProjectFunctor; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/context_project.h b/lite/backends/x86/math/context_project.h deleted file mode 100644 index 0c56e0d759..0000000000 --- a/lite/backends/x86/math/context_project.h +++ /dev/null @@ -1,361 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "lite/backends/x86/math/blas.h" -#include "lite/backends/x86/math/im2col.h" -#include "lite/core/context.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -/* - * \brief Context projection concatenates features in adjacent time-steps in - * a sequence. The i-th row of the output is the concatenation of - * context_length rows of the input. The context_length rows are the - * consecutive rows from the i+shift_start row. - * ContextProjectGradFunctor is the inverse process of ContextProjectFunctor. - * - * \param in Input data. - * \param Shape The shape of Input data: - * [mini-batch, input_hidden_size]. - * - * \param padding_data Padding data. - * \param Shape The shape of Padding data: - * [up_pad + down_pad, input_hidden_size]. - * - * \param col Col data. 
- * \param Shape The shape of Col data: - * [mini-batch, context_length * input_hidden_size]. - * - * For a mini-batch of 2 variable lengths sentences, containing 3, and 1 - * time-steps: - * - * Assumed input (X) is a [4, M, N] float LoDTensor, and X->lod()[0] = [0, 3, - * 4]. - * Besides, for the sake of simplicity, we assume M=1 and N=2. - * - * X = [[a1, a2; - * b1, b2; - * c1, c2] - * [d1, d2]] - * - * This is to say that input (X) has 4 words and the dimension of each word - * representation is 2. - * - * - Case1: - * If context_start is -1 and padding_trainable is false, we use zero to pad - * instead of learned weight to pad, - * and the context_length is 3, the output (Out) is: - * - * Out =[[0, 0, a1, a2, b1, b2; - * a1, a2, b1, b2, c1, c2; - * b1, b2, c1, c2, 0, 0 ] - * [0, 0, d1, d2, 0, 0 ]] - * - * - Case2: - * If context_start is -1 and padding_trainable is true, we use learned weight - * to pad, - * and the context_length is 3, the output (Out) is: - * - * Out = [[w1, w2, a1, a2, b1, b2; - * a1, a2, b1, b2, c1, c2; - * b1, b2, c1, c2, w3, w4] - * [w1, w2, d1, d2, w3, w4]] - * - */ - -template -class ContextProjectFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& in, - const lite::Tensor* padding_data, - bool padding_trainable, - const int context_start, - const int context_length, - const int context_stride, - const int up_pad, - const int down_pad, - lite::Tensor* col) { - auto lod_level_0 = in.lod()[0]; - - math::Im2ColFunctor im2col_ocf; - - std::vector dilation({1, 1}); - std::vector padding({up_pad, 0, down_pad, 0}); - std::vector stride({context_stride, 1}); - - int input_row_begin, input_row_end; - int sequence_height, sequence_width; - sequence_width = in.dims()[1]; - - for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { - if (lod_level_0[i] == lod_level_0[i + 1]) continue; - - input_row_begin = (context_start > 0) - ? 
static_cast(lod_level_0[i]) + context_start - : static_cast(lod_level_0[i]); - input_row_end = static_cast(lod_level_0[i + 1]); - - // lite::Tensor out_t = - // col->Slice(static_cast(lod_level_0[i]), - // static_cast(lod_level_0[i + 1])); - lite::Tensor out_t = - col->Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); - - sequence_height = static_cast(out_t.dims()[0]); - - if (input_row_begin < input_row_end) { - lite::Tensor in_t = in.Slice(input_row_begin, input_row_end); - - std::vector output_shape( - {sequence_height, - 1, - 1, - context_length, - sequence_width}); // output_height, output_width, - // input_channels, filter_height, filter_width - out_t.Resize(output_shape); - - std::vector input_shape( - {1, - input_row_end - input_row_begin, - sequence_width}); // input_channels, input_height, input_width - in_t.Resize(input_shape); - im2col_ocf(context, in_t, dilation, stride, padding, &out_t); - out_t.Resize({sequence_height, context_length * sequence_width}); - } - } - if (padding_trainable) { - PADDLE_ENFORCE(padding_data != nullptr); - for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { - if (lod_level_0[i] == lod_level_0[i + 1]) continue; - - lite::Tensor out_t = - col->Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); - - sequence_height = static_cast(out_t.dims()[0]); - - // add up trainable data - out_t.Resize({static_cast(sequence_height) * context_length, - sequence_width}); - - if (up_pad > 0) { // add up pad - int padding_rows = std::min( - up_pad, static_cast(lod_level_0[i + 1] - lod_level_0[i])); - - for (int k = 0; k < padding_rows; ++k) { - int padding_size = - k + context_length < up_pad ? context_length : up_pad - k; - lite::Tensor out_t_sub = out_t.Slice( - k * context_length, k * context_length + padding_size); - lite::Tensor w_sub = - padding_data->Slice(k, k + padding_size); - - out_t_sub.CopyDataFrom(w_sub); - - // framework::TensorCopy(w_sub, context.GetPlace(), context, - // &out_t_sub); - } - } - if (down_pad > 0) { // add down pad - int down_pad_begin_row = - std::max(0, - (sequence_height - context_start - context_length) + 1) + - 1; - int padding_begin = std::max(0, context_start - sequence_height); - int padding_size = - sequence_height - context_start >= context_length - ? 
1 - : context_length - (sequence_height - context_start); - if (context_start >= sequence_height) padding_size = context_length; - int padding_idx = padding_begin; - for (int t = 0; t + down_pad_begin_row <= sequence_height; - ++t, ++padding_size) { - if (context_start >= sequence_height) padding_size = context_length; - if (padding_size > context_length) { - padding_size = context_length; - padding_idx++; - } - if (padding_begin > 0 || sequence_height == context_start) - padding_idx = padding_begin + t; - - lite::Tensor out_t_sub = out_t.Slice( - (down_pad_begin_row + t) * context_length - padding_size, - (down_pad_begin_row + t) * context_length); - lite::Tensor w_sub = padding_data->Slice( - up_pad + padding_idx, up_pad + padding_idx + padding_size); - out_t_sub.CopyDataFrom(w_sub); - // framework::TensorCopy(w_sub, context.GetPlace(), context, - // &out_t_sub); - } - } - out_t.Resize({sequence_height, - static_cast(context_length) * sequence_width}); - } - } - } -}; - -template -class ContextProjectGradFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& in, - bool padding_trainable, - const int context_start, - const int context_length, - const int context_stride, - const int up_pad, - const int down_pad, - bool pad_grad, - bool input_grad, - lite::Tensor* padding_data, - lite::Tensor* col) { - auto lod_level_0 = in.lod()[0]; - - math::Col2ImFunctor col2im_ocf; - - std::vector dilation({1, 1}); - std::vector padding({up_pad, 0, down_pad, 0}); - std::vector stride({context_stride, 1}); - - int input_row_begin, input_row_end; - int sequence_height, sequence_width; - sequence_width = in.dims()[1]; - auto blas = math::GetBlas, T>(context); - - if (input_grad) { - for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { - if (lod_level_0[i] == lod_level_0[i + 1]) continue; - - input_row_begin = (context_start > 0) - ? static_cast(lod_level_0[i]) + context_start - : static_cast(lod_level_0[i]); - input_row_end = static_cast(lod_level_0[i + 1]); - - lite::Tensor out_t = - col->Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); - - sequence_height = static_cast(out_t.dims()[0]); - - if (input_row_begin < input_row_end) { - lite::Tensor in_t = in.Slice(input_row_begin, input_row_end); - - std::vector output_shape( - {sequence_height, - 1, - 1, - context_length, - sequence_width}); // output_height, output_width, - // input_channels, filter_height, filter_width - out_t.Resize(output_shape); - - std::vector input_shape( - {1, - input_row_end - input_row_begin, - sequence_width}); // input_channels, input_height, input_width - in_t.Resize(input_shape); - - col2im_ocf(context, out_t, dilation, stride, padding, &in_t); - out_t.Resize({sequence_height, context_length * sequence_width}); - } - } - } - if (pad_grad) { - if (padding_trainable) { - for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { - if (lod_level_0[i] == lod_level_0[i + 1]) continue; - - lite::Tensor out_t = - col->Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); - - sequence_height = static_cast(out_t.dims()[0]); - out_t.Resize({static_cast(sequence_height) * context_length, - sequence_width}); - - if (up_pad > 0) { - int padding_rows = std::min( - up_pad, static_cast(lod_level_0[i + 1] - lod_level_0[i])); - - for (int k = 0; k < padding_rows; ++k) { - int padding_size = - k + context_length < up_pad ? 
context_length : up_pad - k; - lite::Tensor out_t_sub = out_t.Slice( - k * context_length, k * context_length + padding_size); - lite::Tensor w_sub = - padding_data->Slice(k, k + padding_size); - blas.AXPY(w_sub.numel(), - static_cast(1), - out_t_sub.data(), - w_sub.data()); - } - } - if (down_pad > 0) { - int down_pad_begin_row = - std::max( - 0, (sequence_height - context_start - context_length) + 1) + - 1; - int padding_begin = std::max(0, context_start - sequence_height); - int padding_size = - sequence_height - context_start >= context_length - ? 1 - : context_length - (sequence_height - context_start); - if (context_start >= sequence_height) padding_size = context_length; - int padding_idx = padding_begin; - for (int t = 0; t + down_pad_begin_row <= sequence_height; - ++t, ++padding_size) { - if (context_start >= sequence_height) - padding_size = context_length; - if (padding_size > context_length) { - padding_size = context_length; - padding_idx++; - } - if (padding_begin > 0 || sequence_height == context_start) - padding_idx = padding_begin + t; - - lite::Tensor out_t_sub = out_t.Slice( - (down_pad_begin_row + t) * context_length - padding_size, - (down_pad_begin_row + t) * context_length); - lite::Tensor w_sub = padding_data->Slice( - up_pad + padding_idx, up_pad + padding_idx + padding_size); - blas.AXPY(w_sub.numel(), - static_cast(1), - out_t_sub.data(), - w_sub.data()); - } - } - out_t.Resize({sequence_height, - static_cast(context_length) * sequence_width}); - } - } - } - } -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/cos_sim_functor.cc b/lite/backends/x86/math/cos_sim_functor.cc deleted file mode 100644 index 8dffa380f1..0000000000 --- a/lite/backends/x86/math/cos_sim_functor.cc +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#include "lite/backends/x86/math/cos_sim_functor.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-template <typename T>
-struct CosSimDyFunctor<lite::TargetType::kX86, T> {
-  void operator()(const lite::X86Context& ctx,
-                  const T* x_norm,
-                  const T* y_norm,
-                  const T* x,
-                  const T* y,
-                  const T* z,
-                  const T* dz,
-                  const size_t rows,
-                  const size_t cols,
-                  T* dy) const {
-    for (size_t row_id = 0; row_id < rows; ++row_id) {
-      auto xy_norm_prod = x_norm[row_id] * y_norm[0];
-      auto dz_data = dz[row_id];
-      auto z_data = z[row_id];
-      auto* x_data = x + cols * row_id;
-      auto reciprocal_xy_norm_prod = 1 / xy_norm_prod;
-
-      auto y_norm_square = y_norm[0] * y_norm[0];
-      auto reciprocal_y_norm_square = 1 / y_norm_square;
-      for (size_t i = 0; i < cols; ++i) {
-        dy[i] += dz_data * (x_data[i] * reciprocal_xy_norm_prod -
-                            z_data * y[i] * reciprocal_y_norm_square);
-      }
-    }
-  }
-};
-
-template struct CosSimDyFunctor<lite::TargetType::kX86, float>;
-template struct CosSimDyFunctor<lite::TargetType::kX86, double>;
-
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/cos_sim_functor.h b/lite/backends/x86/math/cos_sim_functor.h
deleted file mode 100644
index 16470f302a..0000000000
--- a/lite/backends/x86/math/cos_sim_functor.h
+++ /dev/null
@@ -1,187 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include "lite/core/context.h"
-#include "lite/utils/macros.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-template <typename T, bool same_row>
-struct CosSimFunctor {
-  CosSimFunctor(const T* x, const T* y, T* x_norm, T* y_norm, T* z, int cols)
-      : x_norm_(x_norm),
-        y_norm_(y_norm),
-        x_(x),
-        y_(y),
-        z_(z),
-        cols_(static_cast<size_t>(cols)) {}
-
-  inline HOSTDEVICE void operator()(size_t row_id) const {
-    auto* x = x_ + cols_ * row_id;
-    T xx = 0, xy = 0, yy = 0;
-    if (same_row) {
-      auto* y = y_ + cols_ * row_id;
-      T tep_x, tep_y;
-      for (size_t i = 0; i < cols_; ++i) {
-        tep_x = x[i];
-        tep_y = y[i];
-        xx += tep_x * tep_x;
-        yy += tep_y * tep_y;
-        xy += tep_x * tep_y;
-      }
-      xx = sqrt(xx);
-      yy = sqrt(yy);
-      y_norm_[row_id] = yy;
-      x_norm_[row_id] = xx;
-      z_[row_id] = xy / (xx * yy);
-    } else {  // This could be written in a better way.
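-      // In this branch y_ is a single row shared by every row of x_, so
-      // only the row_id == 0 iteration writes y_norm_[0].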
- T tep_x, tep_y; - for (size_t i = 0; i < cols_; ++i) { - tep_x = x[i]; - tep_y = y_[i]; - xx += tep_x * tep_x; - yy += tep_y * tep_y; - xy += tep_x * tep_y; - } - xx = sqrt(xx); - yy = sqrt(yy); - if (row_id == 0) y_norm_[0] = yy; - x_norm_[row_id] = xx; - z_[row_id] = xy / (xx * yy); - } - } - - T* x_norm_; - T* y_norm_; - const T* x_; - const T* y_; - T* z_; - const size_t cols_; -}; - -template -struct CosSimGradFunctor { - CosSimGradFunctor(const T* x_norm, - const T* y_norm, - const T* x, - const T* y, - const T* z, - const T* dz, - T* dx, - int cols) - : x_norm_(x_norm), - y_norm_(y_norm), - x_(x), - y_(y), - z_(z), - dz_(dz), - dx_(dx), - cols_(static_cast(cols)) {} - - inline HOSTDEVICE void operator()(size_t row_id) const { - auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; - auto xy_norm_prod = x_norm_[row_id] * y_norm_[row_id]; - auto dz = dz_[row_id]; - auto z = z_[row_id]; - - auto* dx = dx_ + cols_ * row_id; - auto* x = x_ + cols_ * row_id; - auto* y = y_ + cols_ * row_id; - - auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; - auto reciprocal_x_norm_square = 1 / x_norm_square; - for (size_t i = 0; i < cols_; ++i) { - dx[i] = dz * (y[i] * reciprocal_xy_norm_prod - - z * x[i] * reciprocal_x_norm_square); - } - } - - const T* x_norm_; - const T* y_norm_; - const T* x_; - const T* y_; - const T* z_; - const T* dz_; - T* dx_; - const size_t cols_; -}; - -template -struct CosSimDxFunctor { - CosSimDxFunctor(const T* x_norm, - const T* y_norm, - const T* x, - const T* y, - const T* z, - const T* dz, - T* dx, - int cols) - : x_norm_(x_norm), - y_norm_(y_norm), - x_(x), - y_(y), - z_(z), - dz_(dz), - dx_(dx), - cols_(static_cast(cols)) {} - - inline HOSTDEVICE void operator()(size_t row_id) const { - auto xy_norm_prod = x_norm_[row_id] * y_norm_[0]; - auto dz = dz_[row_id]; - auto z = z_[row_id]; - auto* x = x_ + cols_ * row_id; - auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; - auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; - auto* dx = dx_ + cols_ * row_id; - auto reciprocal_x_norm_square = 1 / x_norm_square; - - for (size_t i = 0; i < cols_; ++i) { - dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod - - z * x[i] * reciprocal_x_norm_square); - } - } - const T* x_norm_; - const T* y_norm_; - const T* x_; - const T* y_; - const T* z_; - const T* dz_; - T* dx_; - const size_t cols_; -}; - -template -struct CosSimDyFunctor { - void operator()(const lite::Context& ctx, - const T* x_norm, - const T* y_norm, - const T* x, - const T* y, - const T* z, - const T* dz, - const size_t rows, - const size_t cols, - T* dy) const; -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/cpu_vec.h b/lite/backends/x86/math/cpu_vec.h deleted file mode 100644 index 9ff64d53f0..0000000000 --- a/lite/backends/x86/math/cpu_vec.h +++ /dev/null @@ -1,662 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
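
Note: cpu_vec.h, which begins below, follows one pattern throughout -- each AVX
specialization processes 8 floats per iteration (YMM_FLOAT_BLOCK) and finishes
the remainder with scalar code. A minimal self-contained sketch of that
pattern, assuming an AVX-capable build (-mavx); the helper name is hypothetical:

    #include <immintrin.h>

    // Process n floats in 8-wide AVX blocks, then a scalar tail -- the shape
    // of every vec_* specialization in this header. Unaligned loads/stores
    // are used, so x and y need no particular alignment.
    static void scale_avx(const int n, const float a, const float* x, float* y) {
      const int block = 8;            // YMM_FLOAT_BLOCK
      const int end = n - n % block;  // last index covered by full blocks
      const __m256 va = _mm256_set1_ps(a);
      int i = 0;
      for (; i < end; i += block) {
        _mm256_storeu_ps(y + i, _mm256_mul_ps(_mm256_loadu_ps(x + i), va));
      }
      for (; i < n; ++i) {            // scalar tail for the remainder
        y[i] = a * x[i];
      }
    }
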
*/ - -#pragma once -#include -#include -#include -#include "lite/backends/x86/cpu_info.h" -#include "lite/utils/paddle_enforce.h" - -#ifdef PADDLE_WITH_MKLML -#include "lite/backends/x86/mklml.h" -#endif - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -#define SIGMOID_THRESHOLD_MIN -40.0 -#define SIGMOID_THRESHOLD_MAX 13.0 - -#define YMM_FLOAT_BLOCK 8 -#define AVX_DOUBLE_BLOCK 4 -#define YMM_FLOAT_BLOCK 8 -#define AVX2_DOUBLE_BLOCK 4 -#define ZMM_FLOAT_BLOCK 16 -#define AVX512_DOUBLE_BLOCK 8 - -template -inline void vec_exp(const int n, const T* x, T* y) { - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -} - -template -inline void vec_scal(const int n, const T a, T* x) { - for (int i = 0; i < n; ++i) { - x[i] = a * x[i]; - } -} - -#ifdef PADDLE_WITH_MKLML -template <> -inline void vec_exp(const int n, const float* x, float* y) { - constexpr int small_enough = 128; - if (n < small_enough) { - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } - } else { - lite::x86::vsExp(n, x, y); - } -} - -template <> -inline void vec_exp(const int n, const double* x, double* y) { - lite::x86::vdExp(n, x, y); -} - -template <> -inline void vec_scal(const int n, const float a, float* x) { - lite::x86::cblas_sscal(n, a, x, 1); -} - -template <> -inline void vec_scal(const int n, const double a, double* x) { - lite::x86::cblas_dscal(n, a, x, 1); -} -#endif - -// MKL scal only support inplace, choose this if src and dst are not equal -template -inline void vec_scal(const int n, const T a, const T* x, T* y) { - for (int i = 0; i < n; ++i) { - y[i] = a * x[i]; - } -} - -template <> -inline void vec_scal(const int n, - const float a, - const float* x, - float* y) { -#ifdef __AVX__ - constexpr int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_scal(n, a, x, y); - return; - } - const int rest = n % block; - const int end = n - rest; - int i = 0; - __m256 scalar = _mm256_set1_ps(a); - __m256 tmp; -#define MOVE_ONE_STEP \ - tmp = _mm256_loadu_ps(x + i); \ - tmp = _mm256_mul_ps(tmp, scalar); \ - _mm256_storeu_ps(y + i, tmp) - for (i = 0; i < end; i += block) { - MOVE_ONE_STEP; - } -#undef MOVE_ONE_STEP - if (rest == 0) { - return; - } - // can not continue move step if src and dst are inplace - for (i = n - rest; i < n; ++i) { - y[i] = a * x[i]; - } -#else - vec_scal(n, a, x, y); -#endif -} - -template <> -inline void vec_scal(const int n, - const float a, - const float* x, - float* y) { - vec_scal(n, a, x, y); -} - -template <> -inline void vec_scal(const int n, - const float a, - const float* x, - float* y) { - // TODO(TJ): enable me - vec_scal(n, a, x, y); -} - -template -inline void vec_sum(const size_t n, const T* x, T* s) { - s[0] = x[0]; - for (size_t i = 1; i < n; ++i) { - s[0] += x[i]; - } -} - -template <> -inline void vec_sum(const size_t n, - const float* x, - float* s) { -#ifdef __AVX__ - constexpr unsigned int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_sum(n, x, s); - return; - } - - unsigned int i, end; - i = end = 0; - s[0] = 0.f; - - end = n & ~(block - 1); - __m256 tmp = _mm256_setzero_ps(); - for (i = 0; i < end; i += block) { - tmp = _mm256_add_ps(tmp, _mm256_load_ps(x + i)); - } - - __m256 hsum = _mm256_hadd_ps(tmp, tmp); - hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1)); - _mm_store_ss( - s, - _mm_hadd_ps(_mm256_castps256_ps128(hsum), _mm256_castps256_ps128(hsum))); - - for (; i < n; i++) { - s[0] += x[i]; - } -#else - vec_sum(n, x, s); -#endif -} - -template -inline void vec_mul(const size_t n, const T* x, const T* 
y, T* z) { - for (size_t i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -} - -template <> -inline void vec_mul(const size_t n, - const float* x, - const float* y, - float* z) { -#ifdef __AVX__ - constexpr unsigned int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_mul(n, x, y, z); - return; - } - - unsigned int i = 0, end = 0; - end = n & ~(block - 1); - for (i = 0; i < end; i += block) { - _mm256_storeu_ps( - z + i, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i))); - } - - for (; i < n; i++) { - z[i] = x[i] * y[i]; - } -#else - vec_mul(n, x, y, z); -#endif -} - -template -inline void vec_mul_reduce(const size_t n, const T* x, const T* y, T* z) { - z[0] = x[0] * y[0]; - for (size_t i = 1; i < n; ++i) { - z[0] += x[i] * y[i]; - } -} - -template <> -inline void vec_mul_reduce(const size_t n, - const float* x, - const float* y, - float* z) { -#ifdef __AVX__ - constexpr unsigned int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_mul_reduce(n, x, y, z); - return; - } - - unsigned int i = 0, end = 0; - z[0] = 0.f; - - end = n & ~(block - 1); - __m256 tmp = _mm256_setzero_ps(); - for (i = 0; i < end; i += block) { - tmp = _mm256_add_ps( - tmp, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i))); - } - - __m256 hsum = _mm256_hadd_ps(tmp, tmp); - hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1)); - _mm_store_ss( - z, - _mm_hadd_ps(_mm256_castps256_ps128(hsum), _mm256_castps256_ps128(hsum))); - - for (; i < n; i++) { - z[0] += x[i] * y[i]; - } -#else - vec_mul_reduce(n, x, y, z); -#endif -} - -template -inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { - for (int i = 0; i < n; ++i) { - y[i] = a - x[i]; - } -} - -template <> -inline void vec_bias_sub(const int n, - const float a, - const float* x, - float* y) { -#ifdef __AVX__ - constexpr int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_bias_sub(n, a, x, y); - return; - } - const int rest = n % block; - const int end = n - rest; - int i = 0; - __m256 bias = _mm256_set1_ps(a); - __m256 tmp; -#define MOVE_ONE_STEP \ - tmp = _mm256_loadu_ps(x + i); \ - tmp = _mm256_sub_ps(bias, tmp); \ - _mm256_storeu_ps(y + i, tmp) - for (i = 0; i < end; i += block) { - MOVE_ONE_STEP; - } -#undef MOVE_ONE_STEP - if (rest == 0) { - return; - } - // can not continue move step if src and dst are inplace - for (i = n - rest; i < n; ++i) { - y[i] = a - x[i]; - } -#else - vec_bias_sub(n, a, x, y); -#endif -} - -template <> -inline void vec_bias_sub(const int n, - const float a, - const float* x, - float* y) { - vec_bias_sub(n, a, x, y); -} - -template <> -inline void vec_bias_sub(const int n, - const float a, - const float* x, - float* y) { - // TODO(TJ): enable me - vec_bias_sub(n, a, x, y); -} - -// out = x*y + (1-x)*z -template -inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { - for (int i = 0; i < n; ++i) { - out[i] = x[i] * y[i] + (static_cast(1) - x[i]) * z[i]; - } -} - -template <> -inline void vec_cross( - const int n, const float* x, const float* y, const float* z, float* out) { -#ifdef __AVX__ - constexpr int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_cross(n, x, y, z, out); - return; - } - const int rest = n % block; - const int end = n - rest; - int i = 0; - __m256 bias = _mm256_set1_ps(1.f); - __m256 tmpx, tmpy, tmpz; - for (i = 0; i < end; i += block) { - tmpx = _mm256_loadu_ps(x + i); - tmpy = _mm256_loadu_ps(y + i); - tmpz = _mm256_loadu_ps(z + i); - tmpy = _mm256_mul_ps(tmpx, tmpy); - tmpx = _mm256_sub_ps(bias, tmpx); - tmpz = _mm256_mul_ps(tmpx, 
tmpz); - tmpz = _mm256_add_ps(tmpy, tmpz); - _mm256_storeu_ps(out + i, tmpz); - } - if (rest == 0) { - return; - } - // can not continue move step if src and dst are inplace - for (i = n - rest; i < n; ++i) { - out[i] = x[i] * y[i] + (1.f - x[i]) * z[i]; - } -#else - vec_cross(n, x, y, z, out); -#endif -} - -template <> -inline void vec_cross( - const int n, const float* x, const float* y, const float* z, float* out) { - vec_cross(n, x, y, z, out); -} - -template <> -inline void vec_cross( - const int n, const float* x, const float* y, const float* z, float* out) { - // TODO(TJ): enable me - vec_cross(n, x, y, z, out); -} - -template -inline void vec_clip(const size_t n, const T a, const T* x, T* y) { - for (size_t i = 0; i < n; ++i) { - y[i] = x[i] < a ? a : x[i]; - } -} - -template <> -inline void vec_clip(const size_t n, - const float a, - const float* x, - float* y) { -#ifdef __AVX__ - constexpr unsigned int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_clip(n, a, x, y); - return; - } - - unsigned int i = 0, end = 0; - end = n & ~(block - 1); - __m256 threshold = _mm256_set1_ps(a); - - for (i = 0; i < end; i += block) { - _mm256_storeu_ps(y + i, _mm256_max_ps(_mm256_loadu_ps(x + i), threshold)); - } - - for (; i < n; i++) { - y[i] = x[i] < a ? a : x[i]; - } -#else - vec_clip(n, a, x, y); -#endif -} - -template -inline void vec_add_bias(const int n, const T a, const T* x, T* y) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] + a; - } -} - -template <> -inline void vec_add_bias(const int n, - const float a, - const float* x, - float* y) { -#ifdef __AVX__ - constexpr int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_add_bias(n, a, x, y); - return; - } - const int rest = n % block; - const int end = n - rest; - int i = 0; - __m256 bias = _mm256_set1_ps(a); - __m256 tmp; -#define MOVE_ONE_STEP \ - tmp = _mm256_loadu_ps(x + i); \ - tmp = _mm256_add_ps(tmp, bias); \ - _mm256_storeu_ps(y + i, tmp) - for (i = 0; i < end; i += block) { - MOVE_ONE_STEP; - } -#undef MOVE_ONE_STEP - if (rest == 0) { - return; - } - // can not continue move step if src and dst are inplace - for (i = n - rest; i < n; ++i) { - y[i] = x[i] + a; - } -#else - vec_add_bias(n, a, x, y); -#endif -} - -template <> -inline void vec_add_bias(const int n, - const float a, - const float* x, - float* y) { - vec_add_bias(n, a, x, y); -} - -template <> -inline void vec_add_bias(const int n, - const float a, - const float* x, - float* y) { - // TODO(TJ): enable me - vec_add_bias(n, a, x, y); -} - -template -inline void vec_identity(const int n, const T* x, T* y) { - // do nothing - return; -} - -template -inline void vec_sigmoid(const int n, const T* x, T* y) { - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - y[i] = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); - y[i] = static_cast(0) - y[i]; - } - vec_exp(n, y, y); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(1) / (static_cast(1) + y[i]); - } -} - -template <> -inline void vec_sigmoid(const int n, - const float* x, - float* y) { -#ifdef __AVX__ - constexpr int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_sigmoid(n, x, y); - return; - } - const int rest = n % block; - const int end = n - rest; - int i = 0; - __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); - __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); - __m256 zeros = _mm256_setzero_ps(); - __m256 tmp; -#define MOVE_ONE_STEP \ - tmp = _mm256_loadu_ps(x + i); \ - tmp = _mm256_max_ps(tmp, min); \ - tmp = _mm256_min_ps(tmp, max); \ - tmp = _mm256_sub_ps(zeros, tmp); \ - _mm256_storeu_ps(y + i, tmp) - for (i = 0; i < end; i += block) { - MOVE_ONE_STEP; - } -#undef MOVE_ONE_STEP - if (rest != 0) { - // can not continue move step since the src and dst address could be equal - const float xmin = SIGMOID_THRESHOLD_MIN; - const float xmax = SIGMOID_THRESHOLD_MAX; - for (i = n - rest; i < n; ++i) { - y[i] = 0.f - ((x[i] < xmin) ? xmin : ((x[i] > xmax) ? xmax : x[i])); - } - } - - vec_exp(n, y, y); - - __m256 ones = _mm256_set1_ps(1.0f); -#define MOVE_ONE_STEP \ - tmp = _mm256_loadu_ps(y + i); \ - tmp = _mm256_add_ps(ones, tmp); \ - tmp = _mm256_div_ps(ones, tmp); \ - _mm256_storeu_ps(y + i, tmp) - for (i = 0; i < end; i += block) { - MOVE_ONE_STEP; - } -#undef MOVE_ONE_STEP - if (rest == 0) { - return; - } - // can not continue move step - for (i = n - rest; i < n; ++i) { - y[i] = 1.f / (1.f + y[i]); - } -#else - vec_sigmoid(n, x, y); -#endif -} - -template <> -inline void vec_sigmoid(const int n, - const float* x, - float* y) { - vec_sigmoid(n, x, y); -} - -template <> -inline void vec_sigmoid(const int n, - const float* x, - float* y) { - // TODO(TJ): enable me - vec_sigmoid(n, x, y); -} - -template -inline void vec_tanh(const int n, const T* x, T* y) { - vec_scal(n, static_cast(2), x, y); - vec_sigmoid(n, y, y); - vec_scal(n, static_cast(2), y); - vec_add_bias(n, static_cast(-1), y, y); -} - -// TODO(TJ): make relu clip -template -inline void vec_relu(const int n, const T* x, T* y) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] > 0 ? 
x[i] : 0; - } -} - -template <> -inline void vec_relu(const int n, - const float* x, - float* y) { -#ifdef __AVX__ - constexpr int block = YMM_FLOAT_BLOCK; - if (n < block * 4) { - vec_relu(n, x, y); - return; - } - - const int rest = n % block; - const int end = n - rest; - int i = 0; - __m256 zeros = _mm256_setzero_ps(); - __m256 tmp; -#define MOVE_ONE_STEP \ - tmp = _mm256_loadu_ps(x + i); \ - tmp = _mm256_max_ps(tmp, zeros); \ - _mm256_storeu_ps(y + i, tmp) - for (i = 0; i < end; i += block) { - MOVE_ONE_STEP; - } - if (rest == 0) { - return; - } - i = n - block; - MOVE_ONE_STEP; -#undef MOVE_ONE_STEP - -#else - vec_relu(n, x, y); -#endif -} - -template <> -inline void vec_relu(const int n, - const float* x, - float* y) { - vec_relu(n, x, y); -} - -template <> -inline void vec_relu(const int n, - const float* x, - float* y) { - // TODO(TJ): enable me - vec_relu(n, x, y); -} - -// TODO(TJ): optimize double of sigmoid, tanh and relu if necessary - -template -class VecActivations { - public: - std::function operator()( - const std::string& type) { - if (type == "sigmoid") { - return vec_sigmoid; - } else if (type == "relu") { - return vec_relu; - } else if (type == "tanh") { - return vec_tanh; - } else if (type == "identity" || type == "") { - return vec_identity; - } - PADDLE_THROW("Not support type: %s", type); - } -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/cross_entropy.cc b/lite/backends/x86/math/cross_entropy.cc deleted file mode 100644 index 366486924a..0000000000 --- a/lite/backends/x86/math/cross_entropy.cc +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
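
Note, for the cross_entropy.cc that follows: with hard labels the functor
reduces to loss[i] = -log(prob[i][label[i]]), zeroed when the label equals
ignore_index. A scalar sketch under those assumptions (axis_dim ==
num_classes, so num_remain == 1; helper name hypothetical):

    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Hard-label cross entropy: one loss value per batch row; rows whose
    // label equals ignore_index contribute zero, as in the code below.
    static std::vector<float> cross_entropy(
        const std::vector<std::vector<float>>& prob,
        const std::vector<int64_t>& label,
        const int64_t ignore_index) {
      std::vector<float> loss(label.size(), 0.f);
      for (size_t i = 0; i < label.size(); ++i) {
        if (label[i] == ignore_index) continue;  // skipped rows stay at 0
        loss[i] = -std::log(prob[i][static_cast<size_t>(label[i])]);
      }
      return loss;
    }
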
*/ - -#include "lite/backends/x86/math/cross_entropy.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -using EigenMatrix = lite::fluid::EigenMatrix; - -template -class CrossEntropyFunctor { - public: - void operator()(const lite::X86Context& ctx, - lite::Tensor* out, - const lite::Tensor* prob, - const lite::Tensor* labels, - const bool softLabel, - const int ignore_index, - const int axis_dim) { - const int batch_size = prob->dims()[0]; - const int num_classes = prob->dims()[1]; - const int num_remain = num_classes / axis_dim; - - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - - if (softLabel) { - auto in = EigenMatrix::From(*prob); - auto lbl = EigenMatrix::From(*labels); - auto loss = EigenMatrix::From(*out); - - loss.device(lite::fluid::EigenDeviceType()) = - -((lbl * in.log().unaryExpr(math::TolerableValue())) - .reshape(batch_axis_remain) - .sum(Eigen::DSizes(1))); - } else { - const T* prob_data = prob->data(); - T* loss_data = out->mutable_data(); - - const int64_t* label_data = labels->data(); - for (int i = 0; i < batch_size; ++i) { - for (int j = 0; j < num_remain; j++) { - int lbl = label_data[i * num_remain + j]; - PADDLE_ENFORCE((lbl >= 0 && lbl < axis_dim) || lbl == ignore_index); - int index = i * num_classes + lbl * num_remain + j; - int loss_idx = i * num_remain + j; - loss_data[loss_idx] = - lbl == ignore_index - ? 0 - : -math::TolerableValue()(std::log(prob_data[index])); - } - } - } - } -}; - -template class CrossEntropyFunctor; -template class CrossEntropyFunctor; -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/cross_entropy.h b/lite/backends/x86/math/cross_entropy.h deleted file mode 100644 index 6b66f0b085..0000000000 --- a/lite/backends/x86/math/cross_entropy.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "lite/core/context.h" -#include "lite/core/tensor.h" -#include "lite/fluid/eigen.h" -#include "lite/utils/macros.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -struct TolerableValue { - HOSTDEVICE T operator()(const T& x) const { - PADDLE_ENFORCE(static_cast(std::is_floating_point::value)); - const T kApproInf = 1e20; - - if (x == INFINITY) return kApproInf; - if (x == -INFINITY) return -kApproInf; - return x; - } -}; - -// NOTE(dzh): float16 value clip behave different. -// 1. Our ValueClipping has a hardcore threshold 1e20 -// for float number. 1e20 will resulting in overflow in float16. -// 2. float16 should expose the the real number overflow to python. -// because mixed-training depends the inf/nan value to determine -// if the scale value will be adjusted. -// Also. In standard implementation of cross entropy, other -// framework not has the ValueClipping. 
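
Note: concretely, the float path above clips only infinities to +/-1e20, while
the float16 specialization below saturates to the type's limits. Illustrative
values (comments only, not part of the patch):

    // TolerableValue<float>()(std::log(0.f))  -> -1e20 (was -inf)
    // TolerableValue<float>()(1e30f * 1e30f)  -> +1e20 (overflowed to +inf)
    // TolerableValue<float>()(0.5f)           -> 0.5f  (finite, unchanged)
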
-template <> -struct TolerableValue { - HOSTDEVICE lite::fluid::float16 operator()( - const lite::fluid::float16& x) const { - if (lite::fluid::isfinite(x)) - return x; - else if (x > static_cast(0)) - return std::numeric_limits::max(); - else - return std::numeric_limits::min(); - } -}; - -template -class CrossEntropyFunctor { - public: - void operator()(const lite::Context& context, - lite::Tensor* out, - const lite::Tensor* prob, - const lite::Tensor* labels, - const bool softLabel, - const int ignore_index, - const int axis_dim); -}; -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/detail/CMakeLists.txt b/lite/backends/x86/math/detail/CMakeLists.txt deleted file mode 100644 index 0df1c060f9..0000000000 --- a/lite/backends/x86/math/detail/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -cc_library(activation_functions SRCS avx_functions.cc) diff --git a/lite/backends/x86/math/detail/activation_functions.h b/lite/backends/x86/math/detail/activation_functions.h deleted file mode 100644 index cb215df722..0000000000 --- a/lite/backends/x86/math/detail/activation_functions.h +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "lite/backends/x86/cpu_info.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { -namespace detail { - -#define SIGMOID_THRESHOLD_MIN -40.0 -#define SIGMOID_THRESHOLD_MAX 13.0 -#define EXP_MAX_INPUT 40.0 - -enum ActivationType { - kSigmoid, - kReLU, - kTanh, - kIdentity, -}; - -inline ActivationType GetActivationType(const std::string &type) { - if (type == "sigmoid") { - return ActivationType::kSigmoid; - } else if (type == "relu") { - return ActivationType::kReLU; - } else if (type == "tanh") { - return ActivationType::kTanh; - } else if (type == "identity" || type == "") { - return ActivationType::kIdentity; - } - PADDLE_ENFORCE(false, "Not support type %s", type); - // PADDLE_THROW("Not support type %s.", type); -} - -namespace forward { - -template -T Identity(const T a) { - return a; -} - -template -T Relu(const T a) { - return a > static_cast(0.0) ? a : static_cast(0.0); -} - -template -T Sigmoid(const T a) { - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - T tmp = (a < min) ? min : ((a > max) ? max : a); - return static_cast(1.0) / (static_cast(1.0) + exp(-tmp)); -} - -template -T Tanh(const T a) { - T tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - return (2.0 / (1.0 + exp(tmp))) - 1.0; -} - -} // namespace forward - -namespace backward { - -template -T Identity(const T a, const T b) { - return a; -} - -template -T Relu(const T a, const T b) { - return a * (b > 0.0 ? 
1.0 : 0.0); -} - -template -T Sigmoid(const T a, const T b) { - return a * b * (1.0 - b); -} - -template -T Tanh(const T a, const T b) { - return a * (1.0 - b * b); -} - -} // namespace backward - -template -struct Active { - typedef T (*Act)(T); - typedef T (*ActGrad)(T, T); -}; - -static Active::Act kActFloat[] = {&forward::Sigmoid, - &forward::Relu, - &forward::Tanh, - &forward::Identity}; - -static Active::ActGrad kActGradFloat[] = {&backward::Sigmoid, - &backward::Relu, - &backward::Tanh, - &backward::Identity}; - -static Active::Act kActDouble[] = {&forward::Sigmoid, - &forward::Relu, - &forward::Tanh, - &forward::Identity}; - -static Active::ActGrad kActGradDouble[] = {&backward::Sigmoid, - &backward::Relu, - &backward::Tanh, - &backward::Identity}; - -namespace forward { -inline float activation(float a, int index) { return kActFloat[index](a); } - -inline double activation(double a, int index) { return kActDouble[index](a); } - -} // namespace forward - -namespace backward { -inline float activation(float a, float b, int index) { - return kActGradFloat[index](a, b); -} - -inline double activation(double a, double b, int index) { - return kActGradDouble[index](a, b); -} -} // namespace backward - -#ifdef __AVX__ -namespace forward { -namespace avx { -__m256 Relu(const __m256 a); -__m256 Sigmoid(const __m256 a); -__m256 Tanh(const __m256 a); -__m256 Identity(const __m256 a); -} // namespace avx -} // namespace forward - -namespace backward { -namespace avx { -__m256 Relu(const __m256 a, const __m256 b); -__m256 Sigmoid(const __m256 a, const __m256 b); -__m256 Tanh(const __m256 a, const __m256 b); -__m256 Identity(const __m256 a, const __m256 b); -} // namespace avx -} // namespace backward - -static Active<__m256>::Act kActAvx[] = {&forward::avx::Sigmoid, - &forward::avx::Relu, - &forward::avx::Tanh, - &forward::avx::Identity}; - -static Active<__m256>::ActGrad kActGradAvx[] = {&backward::avx::Sigmoid, - &backward::avx::Relu, - &backward::avx::Tanh, - &backward::avx::Identity}; - -namespace forward { -inline __m256 activation(__m256 a, int index) { return kActAvx[index](a); } -} // namespace forward - -namespace backward { -inline __m256 activation(__m256 a, __m256 b, int index) { - return kActGradAvx[index](a, b); -} -} // namespace backward - -#endif - -} // namespace detail -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/detail/avx_functions.cc b/lite/backends/x86/math/detail/avx_functions.cc deleted file mode 100644 index 0b0c5b977b..0000000000 --- a/lite/backends/x86/math/detail/avx_functions.cc +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef __AVX__ - -#include "lite/backends/x86/math/detail/activation_functions.h" -#include "lite/backends/x86/math/detail/avx_mathfun.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { -namespace detail { - -__m256 Exp(__m256 a) { return exp256_ps(a); } - -namespace forward { -namespace avx { -__m256 Relu(const __m256 a) { - __m256 tmp = _mm256_set1_ps(0.0f); - return _mm256_max_ps(a, tmp); -} - -__m256 Sigmoid(const __m256 a) { - __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); - __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); - __m256 tmp = _mm256_max_ps(a, min); - tmp = _mm256_min_ps(tmp, max); - tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); - tmp = Exp(tmp); - tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); - tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp); - return tmp; -} - -__m256 Tanh(const __m256 a) { - __m256 max = _mm256_set1_ps(EXP_MAX_INPUT); - __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a); - tmp = _mm256_min_ps(tmp, max); - tmp = Exp(tmp); - return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f), - _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), - _mm256_set1_ps(1.0f)); -} - -__m256 Identity(const __m256 a) { return a; } - -} // namespace avx -} // namespace forward - -namespace backward { -namespace avx { -__m256 Relu(const __m256 a, const __m256 b) { - return _mm256_mul_ps( - a, - _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS), - _mm256_set1_ps(1.0f))); -} - -__m256 Sigmoid(const __m256 a, const __m256 b) { - return _mm256_mul_ps(_mm256_mul_ps(a, b), - _mm256_sub_ps(_mm256_set1_ps(1.0f), b)); -} - -__m256 Tanh(const __m256 a, const __m256 b) { - return _mm256_mul_ps( - a, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b))); -} - -__m256 Identity(const __m256 a, const __m256 b) { return a; } -} // namespace avx -} // namespace backward - -} // namespace detail -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle - -#endif diff --git a/lite/backends/x86/math/detail/avx_mathfun.h b/lite/backends/x86/math/detail/avx_mathfun.h deleted file mode 100644 index c95c881512..0000000000 --- a/lite/backends/x86/math/detail/avx_mathfun.h +++ /dev/null @@ -1,731 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -/* - AVX implementation of sin, cos, sincos, exp and log - - Based on "sse_mathfun.h", by Julien Pommier - http://gruntthepeon.free.fr/ssemath/ - - Copyright (C) 2012 Giovanni Garberoglio - Interdisciplinary Laboratory for Computational Science (LISC) - Fondazione Bruno Kessler and University of Trento - via Sommarive, 18 - I-38123 Trento (Italy) - - This software is provided 'as-is', without any express or implied - warranty. In no event will the authors be held liable for any damages - arising from the use of this software. 
- - Permission is granted to anyone to use this software for any purpose, - including commercial applications, and to alter it and redistribute it - freely, subject to the following restrictions: - - 1. The origin of this software must not be misrepresented; you must not - claim that you wrote the original software. If you use this software - in a product, an acknowledgment in the product documentation would be - appreciated but is not required. - 2. Altered source versions must be plainly marked as such, and must not be - misrepresented as being the original software. - 3. This notice may not be removed or altered from any source distribution. - - (this is the zlib license) -*/ - -#include "lite/backends/x86/cpu_info.h" - -/* __m128 is ugly to write */ -typedef __m256 v8sf; // vector of 8 float (avx) -typedef __m256i v8si; // vector of 8 int (avx) -typedef __m128i v4si; // vector of 8 int (avx) - -#define _PI32AVX_CONST(Name, Val) \ - static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { \ - Val, Val, Val, Val} - -_PI32AVX_CONST(1, 1); -_PI32AVX_CONST(inv1, ~1); -_PI32AVX_CONST(2, 2); -_PI32AVX_CONST(4, 4); - -/* declare some AVX constants -- why can't I figure a better way to do that? */ -#define _PS256_CONST(Name, Val) \ - static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \ - Val, Val, Val, Val, Val, Val, Val, Val} -#define _PI32_CONST256(Name, Val) \ - static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \ - Val, Val, Val, Val, Val, Val, Val, Val} -#define _PS256_CONST_TYPE(Name, Type, Val) \ - static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \ - Val, Val, Val, Val, Val, Val, Val, Val} - -_PS256_CONST(1, 1.0f); -_PS256_CONST(0p5, 0.5f); -/* the smallest non denormalized float number */ -_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000); -_PS256_CONST_TYPE(mant_mask, int, 0x7f800000); -_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); - -_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000); -_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000); - -_PI32_CONST256(0, 0); -_PI32_CONST256(1, 1); -_PI32_CONST256(inv1, ~1); -_PI32_CONST256(2, 2); -_PI32_CONST256(4, 4); -_PI32_CONST256(0x7f, 0x7f); - -_PS256_CONST(cephes_SQRTHF, 0.707106781186547524); -_PS256_CONST(cephes_log_p0, 7.0376836292E-2); -_PS256_CONST(cephes_log_p1, -1.1514610310E-1); -_PS256_CONST(cephes_log_p2, 1.1676998740E-1); -_PS256_CONST(cephes_log_p3, -1.2420140846E-1); -_PS256_CONST(cephes_log_p4, +1.4249322787E-1); -_PS256_CONST(cephes_log_p5, -1.6668057665E-1); -_PS256_CONST(cephes_log_p6, +2.0000714765E-1); -_PS256_CONST(cephes_log_p7, -2.4999993993E-1); -_PS256_CONST(cephes_log_p8, +3.3333331174E-1); -_PS256_CONST(cephes_log_q1, -2.12194440e-4); -_PS256_CONST(cephes_log_q2, 0.693359375); - -#ifndef __AVX2__ - -typedef union imm_xmm_union { - v8si imm; - v4si xmm[2]; -} imm_xmm_union; - -#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \ - { \ - imm_xmm_union ALIGN32_BEG u ALIGN32_END; \ - u.imm = imm_; \ - xmm0_ = u.xmm[0]; \ - xmm1_ = u.xmm[1]; \ - } - -#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \ - { \ - imm_xmm_union ALIGN32_BEG u ALIGN32_END; \ - u.xmm[0] = xmm0_; \ - u.xmm[1] = xmm1_; \ - imm_ = u.imm; \ - } - -#define AVX2_BITOP_USING_SSE2(fn) \ - static inline v8si avx2_mm256_##fn(v8si x, int a) { \ - /* use SSE2 instruction to perform the bitop AVX2 */ \ - v4si x1, x2; \ - v8si ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - x1 = _mm_##fn(x1, a); \ - x2 = _mm_##fn(x2, a); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return (ret); \ - } - -//#warning "Using SSE2 to perform AVX2 
bitshift ops" -AVX2_BITOP_USING_SSE2(slli_epi32) -AVX2_BITOP_USING_SSE2(srli_epi32) - -#define AVX2_INTOP_USING_SSE2(fn) \ - static inline v8si avx2_mm256_##fn(v8si x, v8si y) { \ - /* use SSE2 instructions to perform the AVX2 integer operation */ \ - v4si x1, x2; \ - v4si y1, y2; \ - v8si ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - COPY_IMM_TO_XMM(y, y1, y2); \ - x1 = _mm_##fn(x1, y1); \ - x2 = _mm_##fn(x2, y2); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return (ret); \ - } - -//#warning "Using SSE2 to perform AVX2 integer ops" -AVX2_INTOP_USING_SSE2(and_si128) -AVX2_INTOP_USING_SSE2(andnot_si128) -AVX2_INTOP_USING_SSE2(cmpeq_epi32) -AVX2_INTOP_USING_SSE2(sub_epi32) -AVX2_INTOP_USING_SSE2(add_epi32) -#define avx2_mm256_and_si256 avx2_mm256_and_si128 -#define avx2_mm256_andnot_si256 avx2_mm256_andnot_si128 -#else -#define avx2_mm256_slli_epi32 _mm256_slli_epi32 -#define avx2_mm256_srli_epi32 _mm256_srli_epi32 -#define avx2_mm256_and_si256 _mm256_and_si256 -#define avx2_mm256_andnot_si256 _mm256_andnot_si256 -#define avx2_mm256_cmpeq_epi32 _mm256_cmpeq_epi32 -#define avx2_mm256_sub_epi32 _mm256_sub_epi32 -#define avx2_mm256_add_epi32 _mm256_add_epi32 -#endif /* __AVX2__ */ - -/* natural logarithm computed for 8 simultaneous float - return NaN for x <= 0 -*/ -v8sf log256_ps(v8sf x) { - v8si imm0; - v8sf one = *(v8sf *)_ps256_1; - - // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); - v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); - - x = _mm256_max_ps( - x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */ - - // can be done with AVX2 - imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23); - - /* keep only the fractional part */ - x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask); - x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5); - - // this is again another AVX2 instruction - imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f); - v8sf e = _mm256_cvtepi32_ps(imm0); - - e = _mm256_add_ps(e, one); - - /* part2: - if( x < SQRTHF ) { - e -= 1; - x = x + x - 1.0; - } else { x = x - 1.0; } - */ - // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); - v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS); - v8sf tmp = _mm256_and_ps(x, mask); - x = _mm256_sub_ps(x, one); - e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); - x = _mm256_add_ps(x, tmp); - - v8sf z = _mm256_mul_ps(x, x); - - v8sf y = *(v8sf *)_ps256_cephes_log_p0; - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8); - y = _mm256_mul_ps(y, x); - - y = _mm256_mul_ps(y, z); - - tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1); - y = _mm256_add_ps(y, tmp); - - tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); - y = _mm256_sub_ps(y, tmp); - - tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2); - x = _mm256_add_ps(x, y); - x = _mm256_add_ps(x, tmp); - x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN - return x; -} - -_PS256_CONST(exp_hi, 88.3762626647949f); 
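
Note: exp256_ps below uses the classic Cephes range reduction: write
x = n*ln(2) + g with n rounded to the nearest integer, evaluate a degree-5
polynomial for exp(g), and reassemble via 2^n. A scalar sketch of the same
reduction, with std::exp standing in for the polynomial (helper name made up):

    #include <cmath>

    // exp(x) = 2^n * exp(g), n = round(x / ln 2), g = x - n * ln 2.
    // ln 2 is subtracted in two parts to keep g accurate, exactly what the
    // cephes_exp_C1 / cephes_exp_C2 constants below are for (C2 < 0).
    static float exp_reduced(float x) {
      const float log2e = 1.44269504088896341f;  // 1 / ln 2
      const float n = std::floor(x * log2e + 0.5f);
      const float g = (x - n * 0.693359375f) + n * 2.12194440e-4f;
      return std::ldexp(std::exp(g), static_cast<int>(n));  // exp(g) * 2^n
    }
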
-_PS256_CONST(exp_lo, -88.3762626647949f); - -_PS256_CONST(cephes_LOG2EF, 1.44269504088896341); -_PS256_CONST(cephes_exp_C1, 0.693359375); -_PS256_CONST(cephes_exp_C2, -2.12194440e-4); - -_PS256_CONST(cephes_exp_p0, 1.9875691500E-4); -_PS256_CONST(cephes_exp_p1, 1.3981999507E-3); -_PS256_CONST(cephes_exp_p2, 8.3334519073E-3); -_PS256_CONST(cephes_exp_p3, 4.1665795894E-2); -_PS256_CONST(cephes_exp_p4, 1.6666665459E-1); -_PS256_CONST(cephes_exp_p5, 5.0000001201E-1); - -v8sf exp256_ps(v8sf x) { - v8sf tmp = _mm256_setzero_ps(), fx; - v8si imm0; - v8sf one = *(v8sf *)_ps256_1; - - x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi); - x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo); - - /* express exp(x) as exp(g + n*log(2)) */ - fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF); - fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5); - - /* how to perform a floorf with SSE: just below */ - // imm0 = _mm256_cvttps_epi32(fx); - // tmp = _mm256_cvtepi32_ps(imm0); - - tmp = _mm256_floor_ps(fx); - - /* if greater, substract 1 */ - // v8sf mask = _mm256_cmpgt_ps(tmp, fx); - v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); - mask = _mm256_and_ps(mask, one); - fx = _mm256_sub_ps(tmp, mask); - - tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1); - v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2); - x = _mm256_sub_ps(x, tmp); - x = _mm256_sub_ps(x, z); - - z = _mm256_mul_ps(x, x); - - v8sf y = *(v8sf *)_ps256_cephes_exp_p0; - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5); - y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, x); - y = _mm256_add_ps(y, one); - - /* build 2^n */ - imm0 = _mm256_cvttps_epi32(fx); - // another two AVX2 instructions - imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f); - imm0 = avx2_mm256_slli_epi32(imm0, 23); - v8sf pow2n = _mm256_castsi256_ps(imm0); - y = _mm256_mul_ps(y, pow2n); - return y; -} - -_PS256_CONST(minus_cephes_DP1, -0.78515625); -_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); -_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8); -_PS256_CONST(sincof_p0, -1.9515295891E-4); -_PS256_CONST(sincof_p1, 8.3321608736E-3); -_PS256_CONST(sincof_p2, -1.6666654611E-1); -_PS256_CONST(coscof_p0, 2.443315711809948E-005); -_PS256_CONST(coscof_p1, -1.388731625493765E-003); -_PS256_CONST(coscof_p2, 4.166664568298827E-002); -_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI - -/* evaluation of 8 sines at onces using AVX intrisics - - The code is the exact rewriting of the cephes sinf function. - Precision is excellent as long as x < 8192 (I did not bother to - take into account the special handling they have for greater values - -- it does not return garbage for arguments over 8192, though, but - the extra precision is missing). - - Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the - surprising but correct result. 
- -*/ -v8sf sin256_ps(v8sf x) { // any x - v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y; - v8si imm0, imm2; - -#ifndef __AVX2__ - v4si imm0_1, imm0_2; - v4si imm2_1, imm2_2; -#endif - - sign_bit = x; - /* take the absolute value */ - x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); - /* extract the sign bit (upper one) */ - sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask); - - /* scale by 4/Pi */ - y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); - -/* - Here we start a series of integer operations, which are in the - realm of AVX2. - If we don't have AVX, let's perform them using SSE2 directives -*/ - -#ifdef __AVX2__ - /* store the integer part of y in mm0 */ - imm2 = _mm256_cvttps_epi32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - // another two AVX2 instruction - imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); - y = _mm256_cvtepi32_ps(imm2); - - /* get the swap sign flag */ - imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); - imm0 = avx2_mm256_slli_epi32(imm0, 29); - /* get the polynom selection mask - there is one polynom for 0 <= x <= Pi/4 - and another one for Pi/4 -#include "lite/backends/x86/math/detail/activation_functions.h" -#include "lite/backends/x86/math/gru_compute.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { -namespace detail { - -#ifndef __NVCC__ - -template -void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, - T *gate_value, - T *reset_output_value, - T *prev_output_value, - int frame_size, - ActivationType active_gate) { - T r_value_update_gate; - T r_value_reset_gate; - T r_value_reset_output; - T r_prev_out = 0; - T *update_gate = gate_value; - T *reset_gate = gate_value + frame_size; - - for (int i = 0; i < frame_size; i++) { - r_value_update_gate = update_gate[i]; - r_value_reset_gate = reset_gate[i]; - if (prev_output_value) { - r_prev_out = prev_output_value[i]; - } - - op_reset_output(&r_value_update_gate, - &r_value_reset_gate, - &r_prev_out, - &r_value_reset_output, - active_gate); - - update_gate[i] = r_value_update_gate; - reset_gate[i] = r_value_reset_gate; - reset_output_value[i] = r_value_reset_output; - } -} - -template -void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, - T *gate_value, - T *prev_output_value, - T *output_value, - int frame_size, - ActivationType active_node, - bool origin_mode) { - T r_value_update_gate; - T r_value_frame_state; - T r_prev_out = 0; - T r_output; - T *update_gate = gate_value; - T *frame_state = gate_value + frame_size * 2; - - for (int i = 0; i < frame_size; i++) { - r_value_update_gate = update_gate[i]; - r_value_frame_state = frame_state[i]; - if (prev_output_value) { - r_prev_out = prev_output_value[i]; - } - - op_final_output(&r_value_update_gate, - &r_value_frame_state, - &r_prev_out, - &r_output, - active_node, - origin_mode); - - frame_state[i] = r_value_frame_state; - output_value[i] = r_output; - } -} - -template -void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, - T *gate_value, - T *reset_output_value, - T *prev_output_value, - int frame_size, - ActivationType active_gate) { -#ifdef __AVX__ - __m256 r_value_update_gate, r_value_update_gate_last = _mm256_set1_ps(0.0f); - __m256 r_value_reset_gate, r_value_reset_gate_last = _mm256_set1_ps(0.0f); - __m256 r_value_reset_output; - __m256 r_prev_out = _mm256_set1_ps(0.0f), - r_prev_out_last = _mm256_set1_ps(0.0f); - T *update_gate = gate_value; - T *reset_gate 
= gate_value + frame_size; - int block = 8; - const int n = frame_size; - const int rest = n % block; - const int end = n - rest; - int i = 0; - - if (rest > 0) { - i = n - block; - r_value_update_gate_last = - _mm256_loadu_ps((const float *)(update_gate + i)); - r_value_reset_gate_last = _mm256_loadu_ps((const float *)(reset_gate + i)); - if (prev_output_value) { - r_prev_out_last = _mm256_loadu_ps((const float *)(prev_output_value + i)); - } - } - - for (i = 0; i < end; i += block) { - r_value_update_gate = _mm256_loadu_ps((const float *)(update_gate + i)); - r_value_reset_gate = _mm256_loadu_ps((const float *)(reset_gate + i)); - if (prev_output_value) { - r_prev_out = _mm256_loadu_ps((const float *)(prev_output_value + i)); - } - - op_reset_output(&r_value_update_gate, - &r_value_reset_gate, - &r_prev_out, - &r_value_reset_output, - active_gate); - - _mm256_storeu_ps(reinterpret_cast(update_gate + i), - r_value_update_gate); - _mm256_storeu_ps(reinterpret_cast(reset_gate + i), - r_value_reset_gate); - _mm256_storeu_ps(reinterpret_cast(reset_output_value + i), - r_value_reset_output); - } - - if (rest > 0) { - i = n - block; - - op_reset_output(&r_value_update_gate_last, - &r_value_reset_gate_last, - &r_prev_out_last, - &r_value_reset_output, - active_gate); - - _mm256_storeu_ps(reinterpret_cast(update_gate + i), - r_value_update_gate_last); - _mm256_storeu_ps(reinterpret_cast(reset_gate + i), - r_value_reset_gate_last); - _mm256_storeu_ps(reinterpret_cast(reset_output_value + i), - r_value_reset_output); - } -#endif -} - -template -void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, - T *gate_value, - T *prev_output_value, - T *output_value, - int frame_size, - ActivationType active_node, - bool origin_mode) { -#ifdef __AVX__ - __m256 r_value_update_gate, r_value_update_gate_last = _mm256_set1_ps(0.0f); - __m256 r_value_frame_state, r_value_frame_state_last = _mm256_set1_ps(0.0f); - __m256 r_prev_out = _mm256_set1_ps(0.0f), - r_prev_out_last = _mm256_set1_ps(0.0f); - __m256 r_output; - T *update_gate = gate_value; - T *frame_state = gate_value + frame_size * 2; - int block = 8; - const int n = frame_size; - const int rest = n % block; - const int end = n - rest; - int i = 0; - - if (rest > 0) { - i = n - block; - r_value_update_gate_last = - _mm256_loadu_ps((const float *)(update_gate + i)); - r_value_frame_state_last = - _mm256_loadu_ps((const float *)(frame_state + i)); - if (prev_output_value) { - r_prev_out_last = _mm256_loadu_ps((const float *)(prev_output_value + i)); - } - } - - for (i = 0; i < end; i += block) { - r_value_update_gate = _mm256_loadu_ps((const float *)(update_gate + i)); - r_value_frame_state = _mm256_loadu_ps((const float *)(frame_state + i)); - if (prev_output_value) { - r_prev_out = _mm256_loadu_ps((const float *)(prev_output_value + i)); - } - - op_final_output(&r_value_update_gate, - &r_value_frame_state, - &r_prev_out, - &r_output, - active_node, - origin_mode); - - _mm256_storeu_ps(reinterpret_cast(frame_state + i), - r_value_frame_state); - _mm256_storeu_ps(reinterpret_cast(output_value + i), r_output); - } - - if (rest > 0) { - i = n - block; - op_final_output(&r_value_update_gate_last, - &r_value_frame_state_last, - &r_prev_out_last, - &r_output, - active_node, - origin_mode); - - _mm256_storeu_ps(reinterpret_cast(frame_state + i), - r_value_frame_state_last); - _mm256_storeu_ps(reinterpret_cast(output_value + i), r_output); - } - -#endif -} - -template -inline void forward_reset_output(OpResetOutput op_reset_output, - GRUMetaValue 
value, - int frame_size, - int batch_size, - ActivationType active_gate) { - for (int b = 0; b < batch_size; b++) { - if (OpResetOutput::avx && (frame_size > static_cast(8 - 1)) && - (sizeof(T) == 4)) { - hl_avx_gru_forward_reset_output(op_reset_output, - value.gate_value, - value.reset_output_value, - value.prev_out_value, - frame_size, - active_gate); - } else { - hl_naive_gru_forward_reset_output(op_reset_output, - value.gate_value, - value.reset_output_value, - value.prev_out_value, - frame_size, - active_gate); - } - - value.gate_value += frame_size * 3; - value.reset_output_value += frame_size; - if (value.prev_out_value) { - value.prev_out_value += frame_size; - } - } -} - -template -inline void forward_final_output(OpFinalOutput op_final_output, - GRUMetaValue value, - int frame_size, - int batch_size, - ActivationType active_node, - bool origin_mode) { - for (int b = 0; b < batch_size; b++) { - if (OpFinalOutput::avx && (frame_size > static_cast(8 - 1)) && - (sizeof(T) == 4)) { - hl_avx_gru_forward_final_output(op_final_output, - value.gate_value, - value.prev_out_value, - value.output_value, - frame_size, - active_node, - origin_mode); - } else { - hl_naive_gru_forward_final_output(op_final_output, - value.gate_value, - value.prev_out_value, - value.output_value, - frame_size, - active_node, - origin_mode); - } - - value.gate_value += frame_size * 3; - value.output_value += frame_size; - if (value.prev_out_value) { - value.prev_out_value += frame_size; - } - } -} - -template -void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, - T *gate_value, - T *gate_grad, - T *prev_out_value, - T *prev_out_grad, - T *output_grad, - int frame_size, - ActivationType active_node, - bool origin_mode) { - T r_update_gate_value; - T r_update_gate_grad; - T r_frame_state_value; - T r_frame_state_grad; - T r_out_grad; - T r_prev_out_value = 0; - T r_prev_out_grad = 0; - T *update_gate_value = gate_value; - T *update_gate_grad = gate_grad; - T *frame_state_value = gate_value + frame_size * 2; - T *frame_state_grad = gate_grad + frame_size * 2; - - for (int i = 0; i < frame_size; i++) { - r_update_gate_value = update_gate_value[i]; - r_frame_state_value = frame_state_value[i]; - r_out_grad = output_grad[i]; - if (prev_out_value) { - r_prev_out_value = prev_out_value[i]; - } - if (prev_out_grad) { - r_prev_out_grad = prev_out_grad[i]; - } - - op_state_grad(&r_update_gate_value, - &r_update_gate_grad, - &r_frame_state_value, - &r_frame_state_grad, - &r_prev_out_value, - &r_prev_out_grad, - &r_out_grad, - active_node, - origin_mode); - - update_gate_grad[i] = r_update_gate_grad; - frame_state_grad[i] = r_frame_state_grad; - if (prev_out_grad) { - prev_out_grad[i] = r_prev_out_grad; - } - } -} - -template -void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, - T *gate_value, - T *gate_grad, - T *prev_out_value, - T *prev_out_grad, - T *reset_output_grad, - int frame_size, - ActivationType active_gate) { - T r_update_gate_value; - T r_update_gate_grad; - T r_reset_gate_value; - T r_reset_gate_grad; - T r_reset_output_grad = 0; - T r_prev_out_value = 0; - T r_prev_out_grad = 0; - T *update_gate_value = gate_value; - T *update_gate_grad = gate_grad; - T *reset_gate_value = gate_value + frame_size; - T *reset_gate_grad = gate_grad + frame_size; - - for (int i = 0; i < frame_size; i++) { - r_update_gate_value = update_gate_value[i]; - r_update_gate_grad = update_gate_grad[i]; - r_reset_gate_value = reset_gate_value[i]; - - if (prev_out_value && prev_out_grad) { - r_reset_output_grad = 
reset_output_grad[i]; - } - if (prev_out_value) { - r_prev_out_value = prev_out_value[i]; - } - if (prev_out_grad) { - r_prev_out_grad = prev_out_grad[i]; - } - - op_reset_grad(&r_update_gate_value, - &r_update_gate_grad, - &r_reset_gate_value, - &r_reset_gate_grad, - &r_prev_out_value, - &r_prev_out_grad, - &r_reset_output_grad, - active_gate); - - update_gate_grad[i] = r_update_gate_grad; - reset_gate_grad[i] = r_reset_gate_grad; - if (prev_out_grad) { - prev_out_grad[i] = r_prev_out_grad; - } - } -} - -template -void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, - T *gate_value, - T *gate_grad, - T *prev_out_value, - T *prev_out_grad, - T *output_grad, - int frame_size, - ActivationType active_node, - bool origin_mode) { -#ifdef __AVX__ - __m256 r_update_gate_value; - __m256 r_update_gate_grad; - __m256 r_frame_state_value; - __m256 r_frame_state_grad; - __m256 r_out_grad; - __m256 r_prev_out_value = _mm256_set1_ps(0.0f); - __m256 r_prev_out_grad = _mm256_set1_ps(0.0f); - __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value); - __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad); - __m256 *frame_state_value = - reinterpret_cast<__m256 *>(gate_value + frame_size * 2); - __m256 *frame_state_grad = - reinterpret_cast<__m256 *>(gate_grad + frame_size * 2); - - for (int i = 0; i < frame_size / 8; i++) { - r_update_gate_value = update_gate_value[i]; - r_frame_state_value = frame_state_value[i]; - r_out_grad = (reinterpret_cast<__m256 *>(output_grad))[i]; - if (prev_out_value) { - r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i]; - } - if (prev_out_grad) { - r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i]; - } - - op_state_grad(&r_update_gate_value, - &r_update_gate_grad, - &r_frame_state_value, - &r_frame_state_grad, - &r_prev_out_value, - &r_prev_out_grad, - &r_out_grad, - active_node, - origin_mode); - - update_gate_grad[i] = r_update_gate_grad; - frame_state_grad[i] = r_frame_state_grad; - if (prev_out_grad) { - (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad; - } - } -#endif -} - -template -void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, - T *gate_value, - T *gate_grad, - T *prev_out_value, - T *prev_out_grad, - T *reset_output_grad, - int frame_size, - ActivationType active_gate) { -#ifdef __AVX__ - __m256 r_update_gate_value; - __m256 r_update_gate_grad; - __m256 r_reset_gate_value; - __m256 r_reset_gate_grad; - __m256 r_reset_output_grad = _mm256_set1_ps(0.0f); - __m256 r_prev_out_value = _mm256_set1_ps(0.0f); - __m256 r_prev_out_grad = _mm256_set1_ps(0.0f); - __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value); - __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad); - __m256 *reset_gate_value = - reinterpret_cast<__m256 *>(gate_value + frame_size); - __m256 *reset_gate_grad = reinterpret_cast<__m256 *>(gate_grad + frame_size); - - for (int i = 0; i < frame_size / 8; i++) { - r_update_gate_value = update_gate_value[i]; - r_update_gate_grad = update_gate_grad[i]; - r_reset_gate_value = reset_gate_value[i]; - - if (prev_out_value && prev_out_grad) { - r_reset_output_grad = (reinterpret_cast<__m256 *>(reset_output_grad))[i]; - } - if (prev_out_value) { - r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i]; - } - if (prev_out_grad) { - r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i]; - } - - op_reset_grad(&r_update_gate_value, - &r_update_gate_grad, - &r_reset_gate_value, - &r_reset_gate_grad, - &r_prev_out_value, - 
&r_prev_out_grad, - &r_reset_output_grad, - active_gate); - - update_gate_grad[i] = r_update_gate_grad; - reset_gate_grad[i] = r_reset_gate_grad; - if (prev_out_grad) { - (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad; - } - } -#endif -} - -template -inline void backward_state_grad(OpStateGrad op_state_grad, - GRUMetaValue value, - GRUMetaGrad grad, - int frame_size, - int batch_size, - ActivationType active_node, - bool origin_mode) { - for (int b = 0; b < batch_size; b++) { - if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { - hl_avx_gru_backward_state_grad(op_state_grad, - value.gate_value, - grad.gate_grad, - value.prev_out_value, - grad.prev_out_grad, - grad.output_grad, - frame_size, - active_node, - origin_mode); - } else { - hl_naive_gru_backward_state_grad(op_state_grad, - value.gate_value, - grad.gate_grad, - value.prev_out_value, - grad.prev_out_grad, - grad.output_grad, - frame_size, - active_node, - origin_mode); - } - - value.gate_value += frame_size * 3; - if (value.prev_out_value) { - value.prev_out_value += frame_size; - } - - grad.gate_grad += frame_size * 3; - grad.output_grad += frame_size; - if (grad.prev_out_grad) { - grad.prev_out_grad += frame_size; - } - } -} - -template -inline void backward_reset_grad(OpResetGrad op_reset_grad, - GRUMetaValue value, - GRUMetaGrad grad, - int frame_size, - int batch_size, - ActivationType active_gate) { - for (int b = 0; b < batch_size; b++) { - if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { - hl_avx_gru_backward_reset_grad(op_reset_grad, - value.gate_value, - grad.gate_grad, - value.prev_out_value, - grad.prev_out_grad, - grad.reset_output_grad, - frame_size, - active_gate); - } else { - hl_naive_gru_backward_reset_grad(op_reset_grad, - value.gate_value, - grad.gate_grad, - value.prev_out_value, - grad.prev_out_grad, - grad.reset_output_grad, - frame_size, - active_gate); - } - - value.gate_value += frame_size * 3; - if (value.prev_out_value) { - value.prev_out_value += frame_size; - } - - grad.gate_grad += frame_size * 3; - grad.reset_output_grad += frame_size; - if (grad.prev_out_grad) { - grad.prev_out_grad += frame_size; - } - } -} - -#endif - -} // namespace detail -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/detail/gru_kernel.h b/lite/backends/x86/math/detail/gru_kernel.h deleted file mode 100644 index 91c753c685..0000000000 --- a/lite/backends/x86/math/detail/gru_kernel.h +++ /dev/null @@ -1,222 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "lite/backends/x86/math/detail/activation_functions.h" -#include "lite/utils/macros.h" - -// TODO(guosheng): refine code style in gru_kernel -namespace paddle { -namespace lite { -namespace x86 { -namespace math { -namespace detail { - -namespace forward { - -template -class gru_resetOutput { - public: - HOSTDEVICE void operator()(T *value_update_gate, - T *value_reset_gate, - T *prev_out, - T *value_reset_output, - ActivationType act_gate) { - *value_update_gate = activation(*value_update_gate, act_gate); - *value_reset_gate = activation(*value_reset_gate, act_gate); - *value_reset_output = (*prev_out) * (*value_reset_gate); - } -#ifndef __AVX__ - static const bool avx = false; -#else - static const bool avx = true; - HOSTDEVICE void operator()(__m256 *value_update_gate, - __m256 *value_reset_gate, - __m256 *prev_out, - __m256 *value_reset_output, - ActivationType act_gate) { - *value_update_gate = activation(*value_update_gate, act_gate); - *value_reset_gate = activation(*value_reset_gate, act_gate); - *value_reset_output = _mm256_mul_ps(*prev_out, *value_reset_gate); - } -#endif -}; - -template -class gru_finalOutput { - public: - HOSTDEVICE void operator()(T *value_update_gate, - T *value_frame_state, - T *prev_out, - T *value_output, - ActivationType act_input, - bool origin_mode) { - *value_frame_state = activation(*value_frame_state, act_input); - if (origin_mode) { - *value_output = ((*value_update_gate) * (*prev_out)) + - *value_frame_state - - ((*value_update_gate) * (*value_frame_state)); - } else { - *value_output = *prev_out - ((*value_update_gate) * (*prev_out)) + - ((*value_update_gate) * (*value_frame_state)); - } - } -#ifndef __AVX__ - static const bool avx = false; -#else - static const bool avx = true; - HOSTDEVICE void operator()(__m256 *value_update_gate, - __m256 *value_frame_state, - __m256 *prev_out, - __m256 *value_output, - ActivationType act_input, - bool origin_mode) { - *value_frame_state = activation(*value_frame_state, act_input); - if (origin_mode) { - *value_output = _mm256_sub_ps( - _mm256_add_ps(_mm256_mul_ps(*value_update_gate, *prev_out), - *value_frame_state), - _mm256_mul_ps(*value_update_gate, *value_frame_state)); - } else { - *value_output = _mm256_add_ps( - _mm256_sub_ps(*prev_out, - _mm256_mul_ps(*value_update_gate, *prev_out)), - _mm256_mul_ps(*value_update_gate, *value_frame_state)); - } - } -#endif -}; -} // namespace forward - -namespace backward { - -template -class gru_stateGrad { - public: - HOSTDEVICE void operator()(T *value_update_gate, - T *grad_update_gate, - T *value_frame_state, - T *grad_frame_state, - T *value_prev_out, - T *grad_prev_out, - T *grad_output, - ActivationType act_input, - bool origin_mode) { - if (origin_mode) { - *grad_update_gate = - (*grad_output) * ((*value_prev_out) - (*value_frame_state)); - *grad_prev_out += (*grad_output * (*value_update_gate)); - *grad_frame_state = activation( - *grad_output * (static_cast(1.0) - (*value_update_gate)), - *value_frame_state, - act_input); - } else { - *grad_update_gate = - (*grad_output) * ((*value_frame_state) - (*value_prev_out)); - *grad_prev_out += - (*grad_output * (static_cast(1.0) - *value_update_gate)); - *grad_frame_state = activation( - *grad_output * (*value_update_gate), *value_frame_state, act_input); - } - } -#ifndef __AVX__ - static const bool avx = false; -#else - static const bool avx = true; - HOSTDEVICE void operator()(__m256 *value_update_gate, - __m256 *grad_update_gate, - __m256 *value_frame_state, - __m256 
*grad_frame_state, - __m256 *value_prev_out, - __m256 *grad_prev_out, - __m256 *grad_output, - ActivationType act_input, - bool origin_mode) { - if (origin_mode) { - *grad_update_gate = _mm256_mul_ps( - *grad_output, _mm256_sub_ps(*value_prev_out, *value_frame_state)); - *grad_prev_out = _mm256_add_ps( - *grad_prev_out, _mm256_mul_ps(*grad_output, *value_update_gate)); - *grad_frame_state = activation( - _mm256_mul_ps( - *grad_output, - _mm256_sub_ps(_mm256_set1_ps(1.0f), *value_update_gate)), - *value_frame_state, - act_input); - } else { - *grad_update_gate = _mm256_mul_ps( - *grad_output, _mm256_sub_ps(*value_frame_state, *value_prev_out)); - *grad_prev_out = _mm256_add_ps( - *grad_prev_out, - _mm256_mul_ps( - *grad_output, - _mm256_sub_ps(_mm256_set1_ps(1.0f), *value_update_gate))); - *grad_frame_state = - activation(_mm256_mul_ps(*grad_output, *value_update_gate), - *value_frame_state, - act_input); - } - } -#endif -}; - -template -class gru_resetGrad { - public: - HOSTDEVICE void operator()(T *value_update_gate, - T *grad_update_gate, - T *value_reset_gate, - T *grad_reset_gate, - T *value_prev_out, - T *grad_prev_out, - T *grad_reset_output, - ActivationType act_gate) { - *grad_reset_gate = (*grad_reset_output * (*value_prev_out)); - *grad_prev_out += (*grad_reset_output * (*value_reset_gate)); - *grad_update_gate = - activation(*grad_update_gate, *value_update_gate, act_gate); - *grad_reset_gate = - activation(*grad_reset_gate, *value_reset_gate, act_gate); - } -#ifndef __AVX__ - static const bool avx = false; -#else - static const bool avx = true; - HOSTDEVICE void operator()(__m256 *value_update_gate, - __m256 *grad_update_gate, - __m256 *value_reset_gate, - __m256 *grad_reset_gate, - __m256 *value_prev_out, - __m256 *grad_prev_out, - __m256 *grad_reset_output, - ActivationType act_gate) { - *grad_reset_gate = _mm256_mul_ps(*grad_reset_output, *value_prev_out); - *grad_prev_out = _mm256_add_ps( - *grad_prev_out, _mm256_mul_ps(*grad_reset_output, *value_reset_gate)); - *grad_update_gate = - activation(*grad_update_gate, *value_update_gate, act_gate); - *grad_reset_gate = - activation(*grad_reset_gate, *value_reset_gate, act_gate); - } -#endif -}; - -} // namespace backward - -} // namespace detail -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/gru_compute.cc b/lite/backends/x86/math/gru_compute.cc deleted file mode 100644 index b1fdfe18a5..0000000000 --- a/lite/backends/x86/math/gru_compute.cc +++ /dev/null @@ -1,181 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "lite/backends/x86/math/gru_compute.h" -#include "lite/backends/x86/math/blas.h" -#include "lite/backends/x86/math/detail/gru_cpu_kernel.h" -#include "lite/backends/x86/math/detail/gru_kernel.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -struct GRUUnitFunctor { - static void compute(const lite::X86Context &context, - GRUMetaValue value, - int frame_size, - int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate, - bool origin_mode) { -#ifndef __NVCC__ - auto blas = math::GetBlas(context); - if (value.prev_out_value) { - blas.GEMM(false, - false, - batch_size, - frame_size * 2, - frame_size, - 1, - value.prev_out_value, - frame_size, - value.gate_weight, - frame_size * 2, - 1, - value.gate_value, - frame_size * 3); - } - - detail::forward_reset_output(detail::forward::gru_resetOutput(), - value, - frame_size, - batch_size, - active_gate); - - if (value.prev_out_value) { - blas.GEMM(false, - false, - batch_size, - frame_size, - frame_size, - 1, - value.reset_output_value, - frame_size, - value.state_weight, - frame_size, - 1, - value.gate_value + frame_size * 2, - frame_size * 3); - } - - detail::forward_final_output(detail::forward::gru_finalOutput(), - value, - frame_size, - batch_size, - active_node, - origin_mode); -#endif - } -}; - -template -struct GRUUnitGradFunctor { - static void compute(const lite::X86Context &context, - GRUMetaValue value, - GRUMetaGrad grad, - int frame_size, - int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate, - bool origin_mode) { -#ifndef __NVCC__ - detail::backward_state_grad(detail::backward::gru_stateGrad(), - value, - grad, - frame_size, - batch_size, - active_node, - origin_mode); - auto blas = math::GetBlas(context); - if (value.prev_out_value && grad.prev_out_grad) { - blas.GEMM(false, - true, - batch_size, - frame_size, - frame_size, - 1, - grad.gate_grad + frame_size * 2, - frame_size * 3, - value.state_weight, - frame_size, - 0, - grad.reset_output_grad, - frame_size); - - if (grad.state_weight_grad) { - blas.GEMM(true, - false, - frame_size, - frame_size, - batch_size, - 1, - value.reset_output_value, - frame_size, - grad.gate_grad + frame_size * 2, - frame_size * 3, - 1, - grad.state_weight_grad, - frame_size); - } - } - - detail::backward_reset_grad(detail::backward::gru_resetGrad(), - value, - grad, - frame_size, - batch_size, - active_gate); - if (grad.prev_out_grad && value.prev_out_value) { - blas.GEMM(false, - true, - batch_size, - frame_size, - frame_size * 2, - 1, - grad.gate_grad, - frame_size * 3, - value.gate_weight, - frame_size * 2, - 1, - grad.prev_out_grad, - frame_size); - - if (grad.gate_weight_grad) { - blas.GEMM(true, - false, - frame_size, - frame_size * 2, - batch_size, - 1, - value.prev_out_value, - frame_size, - grad.gate_grad, - frame_size * 3, - 1, - grad.gate_weight_grad, - frame_size * 2); - } - } -#endif - } -}; - -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/gru_compute.h b/lite/backends/x86/math/gru_compute.h deleted file mode 100644 index 86b7a91f41..0000000000 --- a/lite/backends/x86/math/gru_compute.h +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
-Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "lite/backends/x86/math/detail/activation_functions.h" -#include "lite/core/context.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -struct GRUMetaValue { - T *gate_weight; - T *state_weight; - T *gate_value; - T *reset_output_value; - T *output_value; - T *prev_out_value; -}; - -template -struct GRUMetaGrad { - T *gate_weight_grad; - T *state_weight_grad; - T *gate_grad; - T *reset_output_grad; - T *output_grad; - T *prev_out_grad; -}; - -template -struct GRUUnitFunctor { - static void compute(const lite::Context &context, - GRUMetaValue value, - int frame_size, - int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate, - bool origin_mode); -}; - -template -struct GRUUnitGradFunctor { - static void compute(const lite::Context &context, - GRUMetaValue value, - GRUMetaGrad grad, - int frame_size, - int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate, - bool origin_mode); -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/im2col.cc b/lite/backends/x86/math/im2col.cc deleted file mode 100644 index 1c4c6a49f5..0000000000 --- a/lite/backends/x86/math/im2col.cc +++ /dev/null @@ -1,292 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "lite/backends/x86/math/im2col.h" -#include -#include "lite/backends/x86/math/im2col_cfo_cpu.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -/* - * im = [input_channels, input_height, input_width] - * col = - * [input_channels, filter_height, filter_width, output_height, output_width] - */ -template -class Im2ColFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& im, - const std::vector& dilation, - const std::vector& stride, - const std::vector& padding, - lite::Tensor* col) { - PADDLE_ENFORCE(im.dims().size() == 3); - PADDLE_ENFORCE(col->dims().size() == 5); - - if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 && - dilation[1] == 1) { - if (padding[0] == 0 && padding[1] == 0) { - im2col_sh1sw1dh1dw1ph0pw0(im, col); - return; - } else if (padding[0] == 1 && padding[1] == 1) { - im2col_sh1sw1dh1dw1ph1pw1(im, col); - return; - } - // TODO(TJ): complete padding >=2 - } - im2col_common(im, dilation, stride, padding, col); - } -}; - -/* - * im = [input_channels, input_height, input_width] - * col = - * [input_channels, filter_height, filter_width, output_height, output_width] - */ -template -class Col2ImFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& col, - const std::vector& dilation, - const std::vector& stride, - const std::vector& padding, - lite::Tensor* im) { - PADDLE_ENFORCE(im->dims().size() == 3); - PADDLE_ENFORCE(col.dims().size() == 5); - int im_channels = im->dims()[0]; - int im_height = im->dims()[1]; - int im_width = im->dims()[2]; - int filter_height = col.dims()[1]; - int filter_width = col.dims()[2]; - int col_height = col.dims()[3]; - int col_width = col.dims()[4]; - - PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - - ((dilation[0] * (filter_height - 1) + 1))) / - stride[0] + - 1, - col_height, - "Output_height and padding(padding_up, padding_down) are " - "inconsistent."); - PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - - ((dilation[1] * (filter_width - 1) + 1))) / - stride[1] + - 1, - col_width, - "Output_height and padding(padding_up, padding_down) are " - "inconsistent."); - - int channels_col = im_channels * filter_height * filter_width; - - T* im_data = im->mutable_data(); - const T* col_data = col.data(); - - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int c_im = c / (filter_width * filter_height); - for (int h = 0; h < col_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < col_width; ++w) { - int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; - if ((im_row_idx) >= 0 && (im_row_idx) < im_height && - (im_col_idx) >= 0 && (im_col_idx) < im_width) { - im_data[(im_row_idx + c_im * im_height) * im_width + im_col_idx] += - col_data[(c * col_height + h) * col_width + w]; - } - } - } - } - } -}; - -template class Im2ColFunctor; -template class Im2ColFunctor; -template class Col2ImFunctor; -template class Col2ImFunctor; - -/* - * im = [input_channels, input_height, input_width] - * col = - * [output_height, output_width, input_channels, filter_height, filter_width] - */ -template -class Im2ColFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& im, - const std::vector& dilation, - const std::vector& stride, - const std::vector& padding, - lite::Tensor* col) { - 
PADDLE_ENFORCE(im.dims().size() == 3); - PADDLE_ENFORCE(col->dims().size() == 5); - int im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[3]; - int filter_width = col->dims()[4]; - int col_height = col->dims()[0]; - int col_width = col->dims()[1]; - - const T* im_data = im.data(); - T* col_data = col->mutable_data(); - - for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { - for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { - for (int channel = 0; channel < im_channels; ++channel) { - for (int filter_row_idx = 0; filter_row_idx < filter_height; - ++filter_row_idx) { - int im_row_offset = - col_row_idx * stride[0] + filter_row_idx - padding[0]; - for (int filter_col_idx = 0; filter_col_idx < filter_width; - ++filter_col_idx) { - int im_col_offset = - col_col_idx * stride[1] + filter_col_idx - padding[1]; - - int col_offset = - ((((col_row_idx)*col_width + col_col_idx) * im_channels + - channel) * - filter_height + - filter_row_idx) * - filter_width + - filter_col_idx; - - int im_offset = (channel * im_height + im_row_offset) * im_width + - im_col_offset; - col_data[col_offset] = - (im_row_offset < 0 || im_row_offset >= im_height || - im_col_offset < 0 || im_col_offset >= im_width) - ? static_cast(0) - : im_data[im_offset]; - } - } - } - } - } - } -}; - -/* - * im = [input_channels, input_height, input_width] - * col = - * [output_height, output_width, input_channels, filter_height, filter_width] - */ -template -class Col2ImFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& col, - const std::vector& dilation, - const std::vector& stride, - const std::vector& padding, - lite::Tensor* im) { - PADDLE_ENFORCE(im->dims().size() == 3); - PADDLE_ENFORCE(col.dims().size() == 5); - int im_channels = im->dims()[0]; - int im_height = im->dims()[1]; - int im_width = im->dims()[2]; - int filter_height = col.dims()[3]; - int filter_width = col.dims()[4]; - int col_height = col.dims()[0]; - int col_width = col.dims()[1]; - - PADDLE_ENFORCE_EQ( - (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1, - col_height, - "Output_height and padding(padding_up, padding_down) are " - "inconsistent."); - PADDLE_ENFORCE_EQ( - (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1, - col_width, - "col_width and padding(padding_left, padding_right) are " - "inconsistent."); - - T* im_data = im->mutable_data(); - const T* col_data = col.data(); - - for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { - for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { - for (int channel = 0; channel < im_channels; ++channel) { - for (int filter_row_idx = 0; filter_row_idx < filter_height; - ++filter_row_idx) { - int im_row_offset = - col_row_idx * stride[0] + filter_row_idx - padding[0]; - for (int filter_col_idx = 0; filter_col_idx < filter_width; - ++filter_col_idx) { - int im_col_offset = - col_col_idx * stride[1] + filter_col_idx - padding[1]; - - int col_offset = - (((col_row_idx * col_width + col_col_idx) * im_channels + - channel) * - filter_height + - filter_row_idx) * - filter_width + - filter_col_idx; - - if (im_row_offset >= 0 && im_row_offset < im_height && - im_col_offset >= 0 && im_col_offset < im_width) { - int im_offset = - (channel * im_height + im_row_offset) * im_width + - im_col_offset; - im_data[im_offset] += col_data[col_offset]; - } - } - } - } - } - } - } -}; - -template class Im2ColFunctor; -template 
class Im2ColFunctor<ColFormat::kOCF, lite::TargetType::kX86, double>;
-template class Col2ImFunctor<ColFormat::kOCF, lite::TargetType::kX86, float>;
-template class Col2ImFunctor<ColFormat::kOCF, lite::TargetType::kX86, double>;
-
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/im2col.h b/lite/backends/x86/math/im2col.h
deleted file mode 100644
index 8fb89ccb5f..0000000000
--- a/lite/backends/x86/math/im2col.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "lite/core/context.h"
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */
-enum class ColFormat { kCFO = 0, kOCF = 1 };
-
-/*
- * \brief Converts the image data of three dimensions (CHW) into a colData of
- *        five dimensions in the Im2ColFunctor calculation, and in the
- *        Col2ImFunctor calculation it is reversed.
- *
- * \param imData    Image data.
- * \param imShape   The shape of imData,
- *                  [input_channels, input_height, input_width].
- * \param colData   Column data.
- * \param colShape  The shape of colData.
- *
- * \param dilations dilation data,
- *                  2-dimension [dilation_height, dilation_width].
- *
- * \param strides   stride data,
- *                  2-dimension [stride_height, stride_width].
- *
- * \param paddings  padding data,
- *                  4-dimension [up_pad, left_pad, down_pad, right_pad].
- *
- * If the template argument Format is kCFO, the shape of colData is:
- * [input_channels, filter_height, filter_width, output_height, output_width],
- * so it is easy to reshape into a convolution matrix for convolution
- * calculation based on matrix multiplication.
- * The shape of the convolution matrix is [height, width], where the height
- * equals input_channels * filter_height * filter_width and the width equals
- * output_height * output_width.
- *
- * Reshape:
- *     shape of colData           shape of convolution matrix
- *     [input_channels,
- *      filter_height,
- *      filter_width,      ======>      [height, width]
- *      output_height,
- *      output_width]
- *
- * If the template argument Format is kOCF, the shape of colData is:
- * [output_height, output_width, input_channels, filter_height, filter_width],
- * so it is easy to reshape into a sequence matrix for RNN calculation.
- * The shape of the sequence matrix is [seq_length, step_size], where the
- * seq_length equals output_height * output_width and the step_size equals
- * input_channels * filter_height * filter_width.
- *
- * Reshape:
- *     shape of colData             shape of sequence matrix
- *     [output_height,
- *      output_width,
- *      input_channels,    ======>      [seqLength, stepSize]
- *      filter_height,
- *      filter_width]
- *
- * \note The caller needs to ensure that imShape.inputChannels is equal to
- *       colShape.inputChannels.
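- *
- * A worked example (sizes chosen here only for illustration): with
- * imShape = [2, 4, 4], a 3 x 3 filter, stride 1 and zero padding, the output
- * is 2 x 2, so the kCFO colShape is [2, 3, 3, 2, 2] and the convolution
- * matrix is 18 x 4 (height = 2 * 3 * 3 = 18, width = 2 * 2 = 4); under kOCF
- * the same data reshapes into a 4 x 18 sequence matrix instead.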
- */ -template -class Im2ColFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& im, - const std::vector& dilation, - const std::vector& stride, - const std::vector& padding, - lite::Tensor* col); -}; - -template -class Col2ImFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& col, - const std::vector& dilation, - const std::vector& stride, - const std::vector& padding, - lite::Tensor* im); -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/im2col_cfo_cpu.h b/lite/backends/x86/math/im2col_cfo_cpu.h deleted file mode 100644 index 4623f045bb..0000000000 --- a/lite/backends/x86/math/im2col_cfo_cpu.h +++ /dev/null @@ -1,256 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -/** - * The most common im2col algorithm. - * Support dilation, stride and padding. - */ -template -inline void im2col_common(const lite::Tensor& im, - const std::vector& dilation, - const std::vector& stride, - const std::vector& padding, - lite::Tensor* col) { - int im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[1]; - int filter_width = col->dims()[2]; - int output_height = col->dims()[3]; - int output_width = col->dims()[4]; - int channels_col = im_channels * filter_height * filter_width; - - const T* im_data = im.data(); - T* col_data = col->mutable_data(); - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int c_im = c / (filter_width * filter_height); - for (int h = 0; h < output_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < output_width; ++w) { - int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; - int col_idx = (c * output_height + h) * output_width + w; - int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; - col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || - im_col_idx < 0 || im_col_idx >= im_width) - ? 
static_cast(0) - : im_data[im_idx]; - } - } - } -} - -/** - * im2col algorithm with strides == 1, dilations == 1, paddings == 0 - */ -template -inline void im2col_sh1sw1dh1dw1ph0pw0(const lite::Tensor& im, - lite::Tensor* col) { - int im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[1]; - int filter_width = col->dims()[2]; - int output_height = col->dims()[3]; - int output_width = col->dims()[4]; - - const T* im_data = im.data(); - T* col_data = col->mutable_data(); - int col_matrix_width = output_width * output_height; - int im_size = im_height * im_width; - size_t copy_size = sizeof(T) * output_width; - const T* im_data_oh = im_data; - T* dst_data_oh = col_data; - for (int oh = 0; oh < output_height; ++oh) { - const T* src_data_ic = im_data_oh; - T* dst_data = dst_data_oh; - for (int ic = 0; ic < im_channels; ++ic) { - const T* src_data = src_data_ic; - for (int kh = 0; kh < filter_height; ++kh) { - for (int kw = 0; kw < filter_width; ++kw) { - std::memcpy(dst_data, src_data + kw, copy_size); - dst_data = dst_data + col_matrix_width; - } - src_data = src_data + im_width; - } - src_data_ic = src_data_ic + im_size; - } - im_data_oh = im_data_oh + im_width; - dst_data_oh = dst_data_oh + output_width; - } -} - -/** - * im2col algorithm with strides == 1, dilations == 1, paddings == 1 - * and filter_width == 1 have a special implementation - */ -template -inline void im2col_sh1sw1dh1dw1ph1pw1(const lite::Tensor& im, - lite::Tensor* col) { - int im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[1]; - int filter_width = col->dims()[2]; - int output_height = col->dims()[3]; - int output_width = col->dims()[4]; - - constexpr int plh = 1; - constexpr int prh = 1; - constexpr int plw = 1; - constexpr int prw = 1; - - const T* im_data = im.data(); - T* col_data = col->mutable_data(); - int im_size = im_height * im_width; - int col_matrix_width = output_width * output_height; - int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow - int col_block_ic = filter_height * col_block_fh; // fh*fw*oh*ow - - // fill height padding - { - size_t copy_size = sizeof(T) * output_width; - T* col_start_l = col_data; - T* col_start_r = col_data + (filter_height - 1) * col_block_fh + - col_matrix_width - output_width; - for (int ic = 0; ic < im_channels; ++ic) { - T* dst_data_l = col_start_l; - T* dst_data_r = col_start_r; - for (int kw = 0; kw < filter_width; ++kw) { - std::memset(dst_data_l, 0, copy_size); - std::memset(dst_data_r, 0, copy_size); - dst_data_l = dst_data_l + col_matrix_width; - dst_data_r = dst_data_r + col_matrix_width; - } - col_start_l = col_start_l + col_block_ic; - col_start_r = col_start_r + col_block_ic; - } - } - - auto pad = static_cast(0); - if (filter_width == 1) { - // fill width padding - T* dst_data_ic = col_data; - for (int ic = 0; ic < im_channels; ++ic) { - T* dst_data_kh = dst_data_ic; - for (int kh = 0; kh < filter_height; ++kh) { - T* dst_data = dst_data_kh; - for (int oh = 0; oh < output_height; ++oh) { - *dst_data = pad; - dst_data = dst_data + output_width - 1; - *dst_data = pad; - ++dst_data; - } - dst_data_kh = dst_data_kh + col_block_fh; - } - dst_data_ic = dst_data_ic + col_block_ic; - } - // fill core - size_t copy_size = sizeof(T) * (output_width - plw - prw); - for (int oh = 0; oh < output_height; ++oh) { - const T* im_data_start = - im_data + (oh - plh > 0 ? 
oh - plh : 0) * im_width; - T* dst_data = col_data + oh * output_width; - for (int ic = 0; ic < im_channels; ++ic) { - const T* src_data = im_data_start + ic * im_size; - for (int kh = 0; kh < filter_height; ++kh) { - if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) && - kh > (filter_height - prh - 1))) { - dst_data = dst_data + col_matrix_width; - continue; - } - std::memcpy(dst_data + plw, src_data, copy_size); - dst_data = dst_data + col_matrix_width; - src_data = src_data + im_width; - } - } - } - return; - } - - // filter_width != 1 - // fill width padding - T* dst_data_ic = col_data; - for (int ic = 0; ic < im_channels; ++ic) { - T* dst_data_kh = dst_data_ic; - for (int kh = 0; kh < filter_height; ++kh) { - for (T* dst_data : - {dst_data_kh, - dst_data_kh + (filter_width - prw) * col_matrix_width + - output_width - 1}) { - // TODO(TJ): from plh, saving repeated assignment - for (int oh = 0; oh < output_height; ++oh) { - *dst_data = pad; - dst_data = dst_data + output_width; - } - } - dst_data_kh = dst_data_kh + col_block_fh; - } - dst_data_ic = dst_data_ic + col_block_ic; - } - - // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) * - // (output_width-1)} - // length of copy_size is equal kw. - for (int oh = 0; oh < output_height; ++oh) { - const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width; - T* dst_data = col_data + oh * output_width; - for (int ic = 0; ic < im_channels; ++ic) { - const T* src_data = im_data_start + ic * im_size; - for (int kh = 0; kh < filter_height; ++kh) { - if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) && - kh > (filter_height - prh - 1))) { - dst_data = dst_data + filter_width * col_matrix_width; - continue; - } - // TODO(TJ): reuse plw-kw outside this for - // try to unify - for (int kw = 0; kw < plw; ++kw) { - std::memcpy(dst_data + (plw - kw), - src_data, - sizeof(T) * (output_width - (plw - kw))); - dst_data = dst_data + col_matrix_width; - } - for (int kw = plw; kw < filter_width - prw; ++kw) { - std::memcpy( - dst_data, src_data + (kw - plw), sizeof(T) * output_width); - dst_data = dst_data + col_matrix_width; - } - int i = 1; - for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) { - std::memcpy( - dst_data, src_data + (kw - plw), sizeof(T) * (output_width - i)); - dst_data = dst_data + col_matrix_width; - } - src_data = src_data + im_width; - } - } - } -} - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/im2col_test.cc b/lite/backends/x86/math/im2col_test.cc deleted file mode 100644 index 3881d5ff33..0000000000 --- a/lite/backends/x86/math/im2col_test.cc +++ /dev/null @@ -1,331 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/math/im2col.h" -#include -#include -#include "paddle/fluid/operators/math/im2col_cfo_cpu.h" -#include "paddle/fluid/platform/port.h" - -template -void testIm2col() { - paddle::framework::Tensor input_tmp; - paddle::framework::Tensor input; - paddle::framework::Tensor output_cfo; - paddle::framework::Tensor output_ocf; - paddle::framework::Tensor output_tmp; - - /** - * input = [0, 1, 2, - * 3, 4, 5] - * - * output_cfo = [0, 1 - * 1, 2 - * 3, 4 - * 4, 5] - * - * output_ocf = [0, 1, 3, 4 - * 1, 2, 4, 5] - * - * col2im_cfo = [0, 2, 2 - * 3, 4, 5] - * - * col2im_ocf = [0, 2, 2 - * 3, 4, 5] - */ - int input_height = 2; - int input_width = 3; - int filter_size = 2; - std::vector stride({1, 1}); // stride_y, stride_x - std::vector padding( - {0, 0, 0, 0}); // up_pad, left_pad, down_pad, right_pad - std::vector dilation({1, 1}); // dilation_y, dilation_x - int output_height = - (input_height - filter_size + padding[0] + padding[1]) / stride[0] + 1; - int output_width = - (input_width - filter_size + padding[2] + padding[3]) / stride[1] + 1; - float* input_ptr = input_tmp.mutable_data( - {1, input_height, input_width}, paddle::platform::CPUPlace()); - float arr[6] = {0, 1, 2, 3, 4, 5}; - memcpy(input_ptr, arr, 6 * sizeof(float)); - - auto* place = new Place(); - DeviceContext* context = new DeviceContext(*place); - if (paddle::platform::is_cpu_place(*place)) { - input = input_tmp; - } else { - TensorCopySync(input_tmp, *place, &input); - } - output_cfo.mutable_data( - {1, filter_size, filter_size, output_height, output_width}, *place); - output_ocf.mutable_data( - {output_height, output_width, 1, filter_size, filter_size}, *place); - - // Im2Col - paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kCFO, - DeviceContext, - float> - im2col; - paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kOCF, - DeviceContext, - float> - im2col_ocf; - - im2col(*context, input, dilation, stride, padding, &output_cfo); - im2col_ocf(*context, input, dilation, stride, padding, &output_ocf); - - float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5}; - float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5}; - - float* out_cfo_ptr; - if (paddle::platform::is_cpu_place(*place)) { - out_cfo_ptr = output_cfo.data(); - } else { - TensorCopySync(output_cfo, paddle::platform::CPUPlace(), &output_tmp); - out_cfo_ptr = output_tmp.data(); - } - for (int i = 0; i < 6; ++i) { - EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]); - } - - float* out_ocf_ptr; - if (paddle::platform::is_cpu_place(*place)) { - out_ocf_ptr = output_ocf.data(); - } else { - TensorCopySync(output_ocf, paddle::platform::CPUPlace(), &output_tmp); - out_ocf_ptr = output_tmp.data(); - } - - for (int i = 0; i < 6; ++i) { - EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]); - } - - // Col2Im: kCFO - paddle::operators::math::Col2ImFunctor< - paddle::operators::math::ColFormat::kCFO, - DeviceContext, - float> - col2im; - paddle::operators::math::Col2ImFunctor< - paddle::operators::math::ColFormat::kOCF, - DeviceContext, - float> - col2im_ocf; - float col2im_data[] = {0, 2, 2, 3, 8, 5}; - - memset(input_ptr, 0, 6 * sizeof(float)); - if (paddle::platform::is_cpu_place(*place)) { - input = input_tmp; - } else { - TensorCopySync(input_tmp, *place, &input); - } - - col2im(*context, output_cfo, dilation, stride, padding, &input); - - float* in_ptr; - if (paddle::platform::is_cpu_place(*place)) { - in_ptr = input.data(); - } else { - TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp); - 
in_ptr = input_tmp.data(); - } - for (int i = 0; i < 6; ++i) { - EXPECT_EQ(in_ptr[i], col2im_data[i]); - } - - // Col2Im: kOCF - memset(input_ptr, 0, 6 * sizeof(float)); - if (paddle::platform::is_cpu_place(*place)) { - input = input_tmp; - } else { - TensorCopySync(input_tmp, *place, &input); - } - - col2im_ocf(*context, output_ocf, dilation, stride, padding, &input); - - if (paddle::platform::is_cpu_place(*place)) { - in_ptr = input.data(); - } else { - TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp); - in_ptr = input_tmp.data(); - } - for (int i = 0; i < 6; ++i) { - EXPECT_EQ(in_ptr[i], col2im_data[i]); - } - - delete place; - delete context; -} - -TEST(math, im2col) { - testIm2col(); -#ifdef PADDLE_WITH_CUDA - testIm2col(); -#endif -} - -#define PREPARE_IM2COL_CPU \ - paddle::platform::CPUPlace place; \ - paddle::platform::CPUDeviceContext context(place); \ - paddle::framework::Tensor input; \ - paddle::framework::Tensor out; \ - paddle::framework::Tensor ref; \ - std::vector padding({ph, pw}); \ - std::vector stride({1, 1}); \ - std::vector dilation({1, 1}); \ - float* input_ptr = input.mutable_data({ic, ih, iw}, place); \ - for (int i = 0; i < input.numel(); ++i) { \ - input_ptr[i] = static_cast(i + 1); \ - } \ - int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1; \ - int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1; \ - out.mutable_data({ic, fh, fw, output_height, output_width}, place); \ - ref.mutable_data({ic, fh, fw, output_height, output_width}, place); \ - paddle::operators::math::Im2ColFunctor< \ - paddle::operators::math::ColFormat::kCFO, \ - paddle::platform::CPUDeviceContext, \ - float> \ - im2col - -void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { - PREPARE_IM2COL_CPU; - - im2col(context, input, dilation, stride, padding, &out); - paddle::operators::math::im2col_common( - input, dilation, stride, padding, &ref); - - float* ref_data = ref.data(); - float* out_data = out.data(); - for (int i = 0; i < out.numel(); ++i) { - EXPECT_EQ(out_data[i], ref_data[i]); - } -} - -void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { - PREPARE_IM2COL_CPU; - constexpr int repeat = 100; - auto GetCurrentMs = []() -> double { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; - }; - auto t1 = GetCurrentMs(); - for (int i = 0; i < repeat; ++i) { - im2col(context, input, dilation, stride, padding, &out); - } - auto t2 = GetCurrentMs(); - - for (int i = 0; i < repeat; ++i) { - paddle::operators::math::im2col_common( - input, dilation, stride, padding, &ref); - } - auto t3 = GetCurrentMs(); - - LOG(INFO) << "before: " << (t3 - t2) / repeat - << ",after: " << (t2 - t1) / repeat - << ",boost: " << ((t3 - t2) / (t2 - t1) - 1) * 100 << "%"; -} - -TEST(math, im2col_cputest) { - // padding_h == padding_w - for (int p = 0; p < 4; ++p) { - // width == height - testIm2colCPU(/*ic*/ 2, - /*ih*/ 5, - /*iw*/ 5, - /*fh*/ 4, - /*fw*/ 4, - /*ph*/ p, - /*pw*/ p); - testIm2colCPU(/*ic*/ 2, - /*ih*/ 4, - /*iw*/ 4, - /*fh*/ 3, - /*fw*/ 3, - /*ph*/ p, - /*pw*/ p); - testIm2colCPU(/*ic*/ 2, - /*ih*/ 4, - /*iw*/ 4, - /*fh*/ 2, - /*fw*/ 2, - /*ph*/ p, - /*pw*/ p); - - // height != width - testIm2colCPU(/*ic*/ 2, - /*ih*/ 5, - /*iw*/ 4, - /*fh*/ 2, - /*fw*/ 3, - /*ph*/ p, - /*pw*/ p); - testIm2colCPU(/*ic*/ 2, - /*ih*/ 5, - /*iw*/ 4, - /*fh*/ 1, - /*fw*/ 3, - /*ph*/ p, - /*pw*/ p); - testIm2colCPU(/*ic*/ 2, - /*ih*/ 4, - /*iw*/ 5, - /*fh*/ 3, - /*fw*/ 1, - /*ph*/ p, - /*pw*/ p); 
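-    // Each case above checks the specialized kernels against im2col_common;
-    // output sizes follow (ih - fh + 2 * ph) / stride + 1 per dimension: the
-    // ih=4, iw=5, fh=3, fw=1 case with p=0, for example, yields a 2 x 5 grid.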
- - // filter == 1 - testIm2colCPU(/*ic*/ 3, - /*ih*/ 4, - /*iw*/ 4, - /*fh*/ 1, - /*fw*/ 1, - /*ph*/ p, - /*pw*/ p); - testIm2colCPU(/*ic*/ 3, - /*ih*/ 3, - /*iw*/ 4, - /*fh*/ 1, - /*fw*/ 1, - /*ph*/ p, - /*pw*/ p); - } - - // padding_h != padding_w - testIm2colCPU(/*ic*/ 2, - /*ih*/ 4, - /*iw*/ 4, - /*fh*/ 2, - /*fw*/ 3, - /*ph*/ 1, - /*pw*/ 2); - - // benchmark - for (int p : {0, 1}) { - for (int k : {1, 3, 5}) { - LOG(INFO) << "padding == " << p << ", filter == " << k; - benchIm2col(/*ic*/ 3, - /*ih*/ 224, - /*iw*/ 224, - /*fh*/ k, - /*fw*/ k, - /*ph*/ p, - /*pw*/ p); - } - } -} diff --git a/lite/backends/x86/math/math_function.cc b/lite/backends/x86/math/math_function.cc deleted file mode 100644 index 822b7df936..0000000000 --- a/lite/backends/x86/math/math_function.cc +++ /dev/null @@ -1,158 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/x86/math/math_function.h" - -#ifdef PADDLE_WITH_MKLML -#include "lite/backends/x86/mklml.h" -#endif - -#ifdef PADDLE_USE_OPENBLAS -#include -#endif - -#include -#include "lite/backends/x86/math/math_function_impl.h" -#include "lite/fluid/data_type.h" -#include "lite/fluid/float16.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; - -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; - -DEFINE_CPU_TRANS(1); -DEFINE_CPU_TRANS(2); -DEFINE_CPU_TRANS(3); -DEFINE_CPU_TRANS(4); -DEFINE_CPU_TRANS(5); -DEFINE_CPU_TRANS(6); - -struct TensorSetConstantCPU { - TensorSetConstantCPU(lite::Tensor* tensor, float value) - : tensor_(tensor), value_(value) {} - template - void apply() const { - auto* begin = tensor_->mutable_data(lite::TargetType::kX86); - std::fill(begin, begin + tensor_->numel(), static_cast(value_)); - } - lite::Tensor* tensor_; - float value_; -}; - -template <> -void set_constant_with_place( - const lite::Context& context, - lite::Tensor* tensor, - float value) { - // lite::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); - TensorSetConstantCPU(tensor, value).apply(); -} - -// template <> -// void set_constant_with_place( -// const platform::DeviceContext& context, framework::Tensor* tensor, -// float value) { -// framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, -// value)); -//} - -template -struct TensorSetConstantWithTarget /*: public boost::static_visitor*/ { - TensorSetConstantWithTarget(const lite::Context& context, - lite::Tensor* tensor, - float value) - : context_(context), tensor_(tensor), value_(value) {} - - void operator()() 
const { - set_constant_with_place(context_, tensor_, value_); - } - - const lite::Context& context_; - lite::Tensor* tensor_; - float value_; -}; - -template -void set_constant(const lite::Context& context, - lite::Tensor* tensor, - float value) { - TensorSetConstantWithTarget func(context, tensor, value); - //#ifdef PADDLE_WITH_CUDA - // tensor->target().apply_visitor(func); - //#else - func(); - //#endif -} - -template -struct RowwiseAdd { - void operator()(const lite::Context& context, - const lite::Tensor& input, - const lite::Tensor& vector, - lite::Tensor* output) { - auto in_dims = input.dims(); - auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector.numel(), size); - PADDLE_ENFORCE_EQ(output->dims(), in_dims); - - auto in = lite::fluid::EigenMatrix::From(input); - auto vec = lite::fluid::EigenVector::Flatten(vector); - auto out = lite::fluid::EigenMatrix::From(*output); - - for (int64_t i = 0; i < in_dims[0]; ++i) { - out.chip(i, 0) = in.chip(i, 0) + vec; - } - } -}; - -template struct RowwiseAdd; -template struct RowwiseAdd; - -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; - -template struct RowwiseSum; -template struct RowwiseSum; - -template struct RowwiseMean; -template struct RowwiseMean; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/math_function.h b/lite/backends/x86/math/math_function.h deleted file mode 100644 index 8f629b5f17..0000000000 --- a/lite/backends/x86/math/math_function.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include - -#include "lite/core/context.h" -#include "lite/core/op_lite.h" -#include "lite/core/tensor.h" -#include "lite/fluid/float16.h" -#include "lite/utils/paddle_enforce.h" -//#include "lite/tensor_util.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -// template -// struct Transpose { -// void operator()(const lite::Context &context) -// }; - -template -struct Transpose { - void operator()(const lite::Context& context, - const lite::Tensor& in, - lite::Tensor* out, - const std::vector& axis); -}; - -template -struct SetConstant { - void operator()(const lite::Context& context, - lite::Tensor* tensor, - T num); -}; - -template -void set_constant_with_place(const lite::Context& context, - lite::Tensor* tensor, - float value); - -template -void set_constant(const lite::Context& context, - lite::Tensor* tensor, - float value); - -template -struct RowwiseAdd { - void operator()(const lite::Context& context, - const lite::Tensor& input, - const lite::Tensor& vec, - lite::Tensor* output); -}; - -template -struct ColwiseSum { - void operator()(const lite::Context& context, - const lite::Tensor& input, - lite::Tensor* vec); -}; - -template -struct RowwiseSum { - void operator()(const lite::Context& context, - const lite::Tensor& input, - lite::Tensor* vec); -}; - -template -struct RowwiseMean { - void operator()(const lite::Context& context, - const lite::Tensor& input, - lite::Tensor* vec); -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/math_function_impl.h b/lite/backends/x86/math/math_function_impl.h deleted file mode 100644 index 3aaca2e593..0000000000 --- a/lite/backends/x86/math/math_function_impl.h +++ /dev/null @@ -1,192 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "lite/backends/x86/math/math_function.h" -#include "lite/fluid/data_type.h" -#include "lite/fluid/eigen.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -void SetConstant::operator()(const lite::Context& context, - lite::Tensor* tensor, - T num) { - auto t = lite::fluid::EigenVector::Flatten(*tensor); - - // t.device(*Eigen::DefaultDevice()) = t.constant(static_cast(num)); - // t.device(*context.eigen_device()) = t.constant(static_cast(num)); - t.device(typename lite::fluid::EigenDevice::Type()) = - t.constant(static_cast(num)); -} - -template -void Transpose::operator()( - const lite::Context& context, - const lite::TensorLite& in, - lite::TensorLite* out, - const std::vector& axis) { - Eigen::array permute; - for (int i = 0; i < Rank; i++) { - permute[i] = axis[i]; - } - auto eigen_in = lite::fluid::EigenTensor::From(in); - auto eigen_out = lite::fluid::EigenTensor::From(*out); - // auto* dev = context.eigen_device(); - // eigen_out.device(*dev) = eigen_in.shuffle(permute); - eigen_out.device(typename lite::fluid::EigenDevice::Type()) = - eigen_in.shuffle(permute); -} - -template -void ColwiseSum::operator()(const lite::Context& context, - const lite::TensorLite& input, - lite::TensorLite* out) { - auto in_dims = input.dims(); - auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(out->numel(), size); - - auto in = lite::fluid::EigenMatrix::From(input); - auto vec = lite::fluid::EigenVector::Flatten(*out); - - // vec.device(*context.eigen_device()) = in.sum(Eigen::array({{0}})); - vec.device(typename lite::fluid::EigenDevice::Type()) = - in.sum(Eigen::array({{0}})); -} - -// Specialize for CPU, since Eigen implement a general reduce. However, -// colwise-sum can be easily implemented. General reduce has a huge overhead in -// CPU -template -class ColwiseSum { - public: - void operator()(const lite::X86Context& context, - const lite::TensorLite& input, - lite::TensorLite* out) { - auto& in_dims = input.dims(); - auto height = in_dims[0]; - auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), size); - - T* out_buf = out->mutable_data(out->target()); - const T* in_buf = input.data(); - - for (size_t i = 0; i < static_cast(height); ++i) { - for (size_t j = 0; j < static_cast(size); ++j) { - if (i == 0) { - out_buf[j] = in_buf[i * size + j]; - } else { - out_buf[j] += in_buf[i * size + j]; - } - } - } - } -}; - -template -void RowwiseMean::operator()(const lite::Context& context, - const lite::TensorLite& input, - lite::TensorLite* out) { - auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); - - auto in = lite::fluid::EigenMatrix::From(input); - auto vec = lite::fluid::EigenVector::Flatten(*out); - - // vec.device(*context.eigen_device()) = in.mean(Eigen::array({{1}})); - vec.device(typename lite::fluid::EigenDevice::Type()) = - in.mean(Eigen::array({{1}})); -} -// TODO(zcd): Following ColwiseSum format, need to confirm. -// Specialize for CPU, since Eigen implement a general reduce. However, -// rowwise-sum can be easily implemented. 
General reduce has a huge overhead in -// CPU -template -class RowwiseMean { - public: - void operator()(const lite::X86Context& context, - const lite::TensorLite& input, - lite::TensorLite* out) { - auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - auto height = in_dims[0]; - auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), height); - auto inv_size = 1.0 / size; - T* out_buf = out->mutable_data(out->target()); - const T* in_buf = input.data(); - - for (size_t i = 0; i < static_cast(height); ++i) { - T sum = 0; - for (size_t j = 0; j < static_cast(size); ++j) { - sum += in_buf[i * size + j]; - } - out_buf[i] = sum * inv_size; - } - } -}; - -template -void RowwiseSum::operator()(const lite::Context& context, - const lite::TensorLite& input, - lite::TensorLite* out) { - auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); - - auto in = lite::fluid::EigenMatrix::From(input); - auto vec = lite::fluid::EigenVector::Flatten(*out); - - // vec.device(*context.eigen_device()) = in.sum(Eigen::array({{1}})); - vec.device(typename lite::fluid::EigenDevice::Type()) = - in.sum(Eigen::array({{1}})); -} -// TODO(zcd): Following ColwiseSum format, need to confirm. -// Specialize for CPU, since Eigen implement a general reduce. However, -// rowwise-sum can be easily implemented. General reduce has a huge overhead in -// CPU -template -class RowwiseSum { - public: - void operator()(const lite::X86Context& context, - const lite::TensorLite& input, - lite::TensorLite* out) { - auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - auto height = in_dims[0]; - auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), height); - - T* out_buf = out->mutable_data(out->target()); - const T* in_buf = input.data(); - - for (size_t i = 0; i < static_cast(height); ++i) { - T sum = 0; - for (size_t j = 0; j < static_cast(size); ++j) { - sum += in_buf[i * size + j]; - } - out_buf[i] = sum; - } - } -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/math_function_test.cc b/lite/backends/x86/math/math_function_test.cc deleted file mode 100644 index 19122a6169..0000000000 --- a/lite/backends/x86/math/math_function_test.cc +++ /dev/null @@ -1,344 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
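-//
-// Note on the GEMM checks below: the expected values are hand-computable.
-// With A = [[0, 1, 2], [3, 4, 5]], the 3x3 window of B starting at
-// input2_ptr + 1 (ldb = 4), and beta = 1, row 0 of C becomes
-// [23, 26, 29] + [1, 2, 3] = [24, 28, 32] and row 1 becomes
-// [68, 80, 92] + [5, 6, 7] = [73, 86, 99], matching the EXPECT_EQ values.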
-#include "paddle/fluid/operators/math/math_function.h" -#include "gtest/gtest.h" -#include "paddle/fluid/operators/math/blas.h" - -template -inline paddle::operators::math::BlasT -GetBlas(const paddle::platform::CPUDeviceContext& context) { - return paddle::operators::math::GetBlas(context); -} - -TEST(math_function, gemm_notrans_cblas) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input2; - paddle::framework::Tensor input3; - - int m = 2; - int n = 3; - int k = 3; - auto* cpu_place = new paddle::platform::CPUPlace(); - float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); - float arr1[6] = {0, 1, 2, 3, 4, 5}; - memcpy(input1_ptr, arr1, 6 * sizeof(float)); - float* input2_ptr = input2.mutable_data({3, 4}, *cpu_place); - float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - memcpy(input2_ptr, arr2, 12 * sizeof(float)); - float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); - float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - memcpy(input3_ptr, arr3, 8 * sizeof(float)); - - paddle::platform::CPUDeviceContext context(*cpu_place); - GetBlas(context).GEMM(false, - false, - m, - n, - k, - 1, - input1_ptr, - 3, - input2_ptr + 1, - 4, - 1, - input3_ptr + 1, - 4); - - EXPECT_EQ(input3_ptr[0], 0); - EXPECT_EQ(input3_ptr[1], 24); - EXPECT_EQ(input3_ptr[2], 28); - EXPECT_EQ(input3_ptr[3], 32); - EXPECT_EQ(input3_ptr[4], 4); - EXPECT_EQ(input3_ptr[5], 73); - EXPECT_EQ(input3_ptr[6], 86); - EXPECT_EQ(input3_ptr[7], 99); -} -#ifdef PADDLE_WITH_LIBXSMM -template -void MklSmmCompare(int m, int n, int k) { - paddle::framework::Tensor mat_a; - paddle::framework::Tensor mat_b; - paddle::framework::Tensor mat_c_smm; - paddle::framework::Tensor mat_c_mkl; - auto* cpu_place = new paddle::platform::CPUPlace(); - - T* A = mat_a.mutable_data({m, k}, *cpu_place); - T* B = mat_b.mutable_data({k, n}, *cpu_place); - T* CSMM = mat_c_smm.mutable_data({m, n}, *cpu_place); - T* CMKL = mat_c_mkl.mutable_data({m, n}, *cpu_place); - T alpha = static_cast(1); - T beta = static_cast(0); - for (int i = 0; i < mat_a.numel(); ++i) { - A[i] = static_cast(i); - } - for (int i = 0; i < mat_b.numel(); ++i) { - B[i] = static_cast(i); - } - // lda,ldb,ldc follow RowMajor - int lda = k; - int ldb = n; - int ldc = n; - - auto smm = [&, m, n, k, lda, ldb, ldc, alpha, beta]() { - const char transa = 'N'; - const char transb = 'N'; - paddle::operators::math::CBlas::SMM_GEMM(&transa, - &transb, - &n, - &m, - &k, - &alpha, - B, - &ldb, - A, - &lda, - &beta, - CSMM, - &ldc); - }; - - auto mkl = [&, m, n, k, lda, ldb, ldc, alpha, beta]() { - paddle::operators::math::CBlas::GEMM(CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - m, - n, - k, - alpha, - A, - lda, - B, - ldb, - beta, - CMKL, - ldc); - }; - - smm(); - mkl(); - ASSERT_EQ(mat_c_mkl.numel(), mat_c_smm.numel()); - for (int i = 0; i < mat_c_mkl.numel(); ++i) { - EXPECT_FLOAT_EQ(CSMM[i], CMKL[i]); - } -} -TEST(math_function, gemm_mkl_vs_smm) { - MklSmmCompare(1, 2, 3); - MklSmmCompare(1, 2, 3); - MklSmmCompare(3, 2, 1); - MklSmmCompare(3, 2, 1); - MklSmmCompare(3, 8, 5); - MklSmmCompare(3, 8, 5); -} -#endif - -TEST(math_function, gemm_trans_cblas) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input2; - paddle::framework::Tensor input3; - - int m = 2; - int n = 3; - int k = 3; - auto* cpu_place = new paddle::platform::CPUPlace(); - float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); - float arr1[6] = {0, 1, 2, 3, 4, 5}; - memcpy(input1_ptr, arr1, 6 * sizeof(float)); - float* input2_ptr = input2.mutable_data({4, 3}, *cpu_place); - 
-
-TEST(math_function, gemm_trans_cblas) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input2;
-  paddle::framework::Tensor input3;
-
-  int m = 2;
-  int n = 3;
-  int k = 3;
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
-  float arr1[6] = {0, 1, 2, 3, 4, 5};
-  memcpy(input1_ptr, arr1, 6 * sizeof(float));
-  float* input2_ptr = input2.mutable_data<float>({4, 3}, *cpu_place);
-  float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
-  memcpy(input2_ptr, arr2, 12 * sizeof(float));
-  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
-  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
-  memcpy(input3_ptr, arr3, 8 * sizeof(float));
-
-  paddle::platform::CPUDeviceContext context(*cpu_place);
-  GetBlas<float>(context).GEMM(
-      false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1,
-      input3_ptr + 1, 4);
-  delete cpu_place;
-  cpu_place = NULL;
-
-  EXPECT_EQ(input3_ptr[0], 0);
-  EXPECT_EQ(input3_ptr[1], 24);
-  EXPECT_EQ(input3_ptr[2], 28);
-  EXPECT_EQ(input3_ptr[3], 32);
-  EXPECT_EQ(input3_ptr[4], 4);
-  EXPECT_EQ(input3_ptr[5], 73);
-  EXPECT_EQ(input3_ptr[6], 86);
-  EXPECT_EQ(input3_ptr[7], 99);
-}
-
-TEST(math_function, zero) {
-  paddle::framework::Tensor tensor;
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  float* t = tensor.mutable_data<float>({2, 2}, *cpu_place);
-  paddle::platform::CPUDeviceContext context(*cpu_place);
-  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
-                                       float>
-      functor;
-  functor(context, &tensor, 0);
-  EXPECT_EQ(t[0], 0);
-  EXPECT_EQ(t[1], 0);
-  EXPECT_EQ(t[2], 0);
-  EXPECT_EQ(t[3], 0);
-
-  functor(context, &tensor, 1);
-
-  EXPECT_EQ(t[0], 1);
-  EXPECT_EQ(t[1], 1);
-  EXPECT_EQ(t[2], 1);
-  EXPECT_EQ(t[3], 1);
-}
-
-template <typename T>
-void GemvTest(int m, int n, bool trans) {
-  paddle::framework::Tensor mat_a;
-  paddle::framework::Tensor vec_b;
-  paddle::framework::Tensor vec_c;
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  int b_num = trans ? m : n;
-  int c_num = trans ? n : m;
-
-  T* data_a = mat_a.mutable_data<T>({m, n}, *cpu_place);
-  T* data_b = vec_b.mutable_data<T>({b_num}, *cpu_place);
-  T* data_c = vec_c.mutable_data<T>({c_num}, *cpu_place);
-  for (int i = 0; i < mat_a.numel(); ++i) {
-    data_a[i] = static_cast<T>(i);
-  }
-  for (int i = 0; i < vec_b.numel(); ++i) {
-    data_b[i] = static_cast<T>(i);
-  }
-
-  paddle::platform::CPUDeviceContext context(*cpu_place);
-  GetBlas<T>(context).GEMV(trans, static_cast<int>(m), static_cast<int>(n),
-                           1., data_a, data_b, 0., data_c);
-
-  if (!trans) {
-    for (int i = 0; i < m; ++i) {
-      T sum = 0.0;
-      for (int j = 0; j < n; ++j) {
-        sum += data_a[i * n + j] * data_b[j];
-      }
-      ASSERT_FLOAT_EQ(data_c[i], sum);
-    }
-  } else {
-    for (int i = 0; i < n; ++i) {
-      T sum = 0.0;
-      for (int j = 0; j < m; ++j) {
-        sum += data_a[j * n + i] * data_b[j];
-      }
-      ASSERT_FLOAT_EQ(data_c[i], sum);
-    }
-  }
-}
-
-TEST(math_function, gemv) {
-  GemvTest<float>(3, 13, false);
-  GemvTest<double>(4, 5, false);
-  GemvTest<float>(12, 7, true);
-  GemvTest<double>(7, 9, true);
-}
-
-TEST(math_function, set_constant) {
-  paddle::framework::Tensor t;
-  t.Resize({10, 10});
-  t.mutable_data<float>(paddle::platform::CPUPlace());
-  auto* ctx = new paddle::platform::CPUDeviceContext();
-  paddle::operators::math::set_constant(*ctx, &t, 10);
-  for (int64_t i = 0; i < t.numel(); ++i) {
-    PADDLE_ENFORCE_EQ(10, t.data<float>()[i]);
-  }
-  delete ctx;
-}
-
-template <typename T>
-void GemmWarpTest(int m, int n, int k, T alpha, T beta) {
-  paddle::framework::Tensor mat_a;
-  paddle::framework::Tensor mat_b;
-  paddle::framework::Tensor mat_c_ref;
-  paddle::framework::Tensor mat_c_mkl;
-  auto* cpu_place = new paddle::platform::CPUPlace();
-
-  T* A = mat_a.mutable_data<T>({m, k}, *cpu_place);
-  T* B = mat_b.mutable_data<T>({k, n}, *cpu_place);
-  T* CREF = mat_c_ref.mutable_data<T>({m, n}, *cpu_place);
-  T* CMKL = mat_c_mkl.mutable_data<T>({m, n}, *cpu_place);
-
-  ASSERT_EQ(mat_c_mkl.numel(), mat_c_ref.numel());
-  for (int i = 0; i < mat_a.numel(); ++i) {
-    A[i] = static_cast<T>(i);
-  }
-  for (int i = 0; i < mat_b.numel(); ++i) {
-    B[i] = static_cast<T>(i + 1);
-  }
-  for (int i = 0; i < mat_c_ref.numel(); ++i) {
-    CREF[i] = static_cast<T>(i + 2);
-    CMKL[i] = CREF[i];
-  }
-
-  // this would call gemm_warp
-  paddle::platform::CPUDeviceContext context(*cpu_place);
-  GetBlas<T>(context).GEMM(
-      CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, B, beta, CREF);
-
-  // lda,ldb,ldc follow RowMajor
-  int lda = k;
-  int ldb = n;
-  int ldc = n;
-  paddle::operators::math::CBlas<T>::GEMM(
-      CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, lda, B,
-      ldb, beta, CMKL, ldc);
-
-  for (int i = 0; i < mat_c_mkl.numel(); ++i) {
-    EXPECT_FLOAT_EQ(CREF[i], CMKL[i]);
-  }
-}
-
-TEST(math_function, gemm_warp) {
-  GemmWarpTest<float>(3, 2, 5, 1.f, 0.f);
-  GemmWarpTest<float>(3, 2, 5, 2.f, 1.f);
-  GemmWarpTest<float>(8, 5, 6, 1.f, 0.f);
-  GemmWarpTest<float>(8, 5, 6, 2.f, 1.f);
-  GemmWarpTest<double>(3, 2, 5, 1.0, 0.0);
-  GemmWarpTest<double>(3, 2, 5, 2.0, 1.0);
-  GemmWarpTest<double>(8, 5, 6, 1.0, 0.0);
-  GemmWarpTest<double>(8, 5, 6, 2.0, 1.0);
-}
diff --git a/lite/backends/x86/math/maxouting.cc b/lite/backends/x86/math/maxouting.cc
deleted file mode 100644
index 20b40fe7c5..0000000000
--- a/lite/backends/x86/math/maxouting.cc
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "lite/backends/x86/math/maxouting.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-// All tensors are in NCHW format, and the groups must be greater than 1
-template <typename T>
-class MaxOutFunctor<lite::TargetType::kX86, T> {
- public:
-  void operator()(const lite::X86Context& context,
-                  const lite::Tensor& input,
-                  lite::Tensor* output,
-                  int groups) {
-    const int batch_size = input.dims()[0];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output->dims()[1];
-    int fea_size = input_height * input_width;
-    // c_size means the output size of each sample
-    int c_size = fea_size * output_channels;
-    const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(lite::TargetType::kX86);
-
-    for (int i = 0; i < batch_size; ++i) {
-      int new_bindex = c_size * i;
-      for (int c = 0; c < output_channels; ++c) {
-        int new_cindex = fea_size * c;
-        for (int f = 0; f < fea_size; ++f) {
-          T ele = static_cast<T>(-FLT_MAX);
-          for (int ph = 0; ph < groups; ++ph) {
-            T x = input_data[(new_bindex + new_cindex) * groups +
-                             ph * fea_size + f];
-            ele = ele > x ?
ele : x; - } - output_data[(new_bindex + new_cindex + f)] = ele; - } - } - } - } -}; - -template -class MaxOutGradFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - lite::Tensor* input_grad, - const lite::Tensor& output, - const lite::Tensor& output_grad, - int groups) { - const int batch_size = input.dims()[0]; - const int input_height = input.dims()[2]; - const int input_width = input.dims()[3]; - const int output_channels = output.dims()[1]; - int fea_size = input_height * input_width; - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); - - for (int i = 0; i < batch_size; ++i) { - int blen = fea_size * output_channels * i; - for (int c = 0; c < output_channels; ++c) { - int clen = fea_size * c; - for (int f = 0; f < fea_size; ++f) { - int input_idx0 = (blen + clen) * groups + f; - bool continue_match = true; - int output_idx = blen + clen + f; - for (int g = 0; g < groups && continue_match; ++g) { - int input_idx = input_idx0 + fea_size * g; - if (input_data[input_idx] == output_data[output_idx]) { - input_grad_data[input_idx] += output_grad_data[output_idx]; - continue_match = false; - } - } - } - } - } - } -}; - -template class MaxOutGradFunctor; -template class MaxOutGradFunctor; -template class MaxOutFunctor; -template class MaxOutFunctor; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/maxouting.h b/lite/backends/x86/math/maxouting.h deleted file mode 100644 index f84d2f6c9d..0000000000 --- a/lite/backends/x86/math/maxouting.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "lite/core/context.h" -#include "lite/core/tensor.h" -#include "lite/utils/macros.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -class MaxOutFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - lite::Tensor* output, - int groups); -}; - -template -class MaxOutGradFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - lite::Tensor* input_grad, - const lite::Tensor& output, - const lite::Tensor& output_grad, - int groups); -}; -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/pooling.cc b/lite/backends/x86/math/pooling.cc deleted file mode 100644 index e700c5f7c7..0000000000 --- a/lite/backends/x86/math/pooling.cc +++ /dev/null @@ -1,906 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
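The maxout kernel above collapses every `groups` consecutive input channels into one output channel by an element-wise max, so an input with output_channels * groups channels yields output_channels channels of the same spatial size. A self-contained sketch of the same NCHW indexing (maxout_ref is a hypothetical name):

#include <cfloat>

// Output channel c takes the max over input channels
// c * groups .. c * groups + groups - 1 at each spatial position.
void maxout_ref(const float* in, float* out, int batch, int out_channels,
                int groups, int height, int width) {
  const int fea = height * width;
  for (int n = 0; n < batch; ++n) {
    for (int c = 0; c < out_channels; ++c) {
      for (int f = 0; f < fea; ++f) {
        float best = -FLT_MAX;
        for (int g = 0; g < groups; ++g) {
          float x = in[((n * out_channels + c) * groups + g) * fea + f];
          best = best > x ? best : x;
        }
        out[(n * out_channels + c) * fea + f] = best;
      }
    }
  }
}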
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/x86/math/pooling.h" -#include -#include - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -/* - * All tensors are in NCHW format. - * Ksize, strides, paddings are two elements. These two elements represent - * height and width, respectively. - */ -template -class Pool2dFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - PoolProcess pool_process, - bool exclusive, - bool adaptive, - lite::Tensor* output) { - const int batch_size = input.dims()[0]; - const int input_height = input.dims()[2]; - const int input_width = input.dims()[3]; - const int output_channels = output->dims()[1]; - const int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; - const int ksize_height = ksize[0]; - const int ksize_width = ksize[1]; - const int stride_height = strides[0]; - const int stride_width = strides[1]; - const int padding_height = paddings[0]; - const int padding_width = paddings[1]; - - const int input_stride = input_height * input_width; - const int output_stride = output_height * output_width; - - const T* input_data = input.data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); - - int hstart, hend; - int wstart, wend; - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - for (int ph = 0; ph < output_height; ++ph) { - if (adaptive) { - hstart = AdaptStartIndex(ph, input_height, output_height); - hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - } - for (int pw = 0; pw < output_width; ++pw) { - if (adaptive) { - wstart = AdaptStartIndex(pw, input_width, output_width); - wend = AdaptEndIndex(pw, input_width, output_width); - } else { - wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - } - - T ele = pool_process.initial(); - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - pool_process.compute(input_data[h * input_width + w], &ele); - } - } - int pool_size = (exclusive || adaptive) - ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; - pool_process.finalize(static_cast(pool_size), &ele); - output_data[ph * output_width + pw] = ele; - } - } - input_data += input_stride; - output_data += output_stride; - } - } - } -}; - -/* -* All tensors are in NCHW format. -* Ksize, strides, paddings are two elements. These two elements represent height -* and width, respectively. 
-*/ -template -class Pool2dGradFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - const lite::Tensor& output, - const lite::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - PoolProcess pool_grad_process, - bool exclusive, - bool adaptive, - lite::Tensor* input_grad) { - const int batch_size = input.dims()[0]; - const int input_height = input.dims()[2]; - const int input_width = input.dims()[3]; - const int output_channels = output.dims()[1]; - const int output_height = output.dims()[2]; - const int output_width = output.dims()[3]; - const int ksize_height = ksize[0]; - const int ksize_width = ksize[1]; - const int stride_height = strides[0]; - const int stride_width = strides[1]; - const int padding_height = paddings[0]; - const int padding_width = paddings[1]; - const int input_stride = input_height * input_width; - const int output_stride = output_height * output_width; - - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); - - int hstart, hend; - int wstart, wend; - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - for (int ph = 0; ph < output_height; ++ph) { - if (adaptive) { - hstart = AdaptStartIndex(ph, input_height, output_height); - hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - } - for (int pw = 0; pw < output_width; ++pw) { - if (adaptive) { - wstart = AdaptStartIndex(pw, input_width, output_width); - wend = AdaptEndIndex(pw, input_width, output_width); - } else { - wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - } - int pool_size = (exclusive || adaptive) - ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; - float scale = 1.0 / pool_size; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - pool_grad_process.compute( - input_data[h * input_width + w], - output_data[ph * output_width + pw], - output_grad_data[ph * output_width + pw], - static_cast(scale), - input_grad_data + h * input_width + w); - } - } - } - } - input_data += input_stride; - output_data += output_stride; - input_grad_data += input_stride; - output_grad_data += output_stride; - } - } - } -}; - -/* - * All tensors are in NCHW format. - * Ksize, strides, paddings are two elements. These two elements represent - * height and width, respectively. 
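The pool_size computed above is the whole difference between exclusive and inclusive average pooling: exclusive divides by the number of in-bounds elements in the (possibly clipped) window, while inclusive always divides by ksize_height * ksize_width, so zero padding drags outputs near the border toward zero. One window in isolation (avg_at is an illustrative name):

#include <algorithm>

// Average over the kh x kw window of output cell (ph, pw) on an h x w map
// with stride s and padding p, clipped to the input bounds.
float avg_at(const float* in, int h, int w, int ph, int pw, int kh, int kw,
             int s, int p, bool exclusive) {
  int hstart = std::max(ph * s - p, 0);
  int hend = std::min(ph * s - p + kh, h);
  int wstart = std::max(pw * s - p, 0);
  int wend = std::min(pw * s - p + kw, w);
  float sum = 0.f;
  for (int i = hstart; i < hend; ++i)
    for (int j = wstart; j < wend; ++j) sum += in[i * w + j];
  int pool_size = exclusive ? (hend - hstart) * (wend - wstart) : kh * kw;
  return sum / pool_size;
}

For a 3x3 window at the top-left corner with padding 1, only four elements are in bounds: exclusive divides their sum by 4, inclusive by 9.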
- */ -template -class MaxPool2dGradFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - const lite::Tensor& output, - const lite::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - lite::Tensor* input_grad) { - const int batch_size = input.dims()[0]; - const int input_height = input.dims()[2]; - const int input_width = input.dims()[3]; - const int output_channels = output.dims()[1]; - const int output_height = output.dims()[2]; - const int output_width = output.dims()[3]; - const int ksize_height = ksize[0]; - const int ksize_width = ksize[1]; - const int stride_height = strides[0]; - const int stride_width = strides[1]; - const int padding_height = paddings[0]; - const int padding_width = paddings[1]; - const int input_stride = input_height * input_width; - const int output_stride = output_height * output_width; - - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); - - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - - bool stop = false; - for (int h = hstart; h < hend && !stop; ++h) { - for (int w = wstart; w < wend && !stop; ++w) { - int input_idx = h * input_width + w; - int output_idx = ph * output_width + pw; - if (input_data[input_idx] == output_data[output_idx]) { - input_grad_data[input_idx] += output_grad_data[output_idx]; - stop = true; - } - } - } - } - } - input_data += input_stride; - output_data += output_stride; - input_grad_data += input_stride; - output_grad_data += output_stride; - } - } - } -}; - -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; - -template class Pool2dFunctor, - float>; -template class Pool2dFunctor, - float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dFunctor, - double>; -template class Pool2dFunctor, - double>; -template class Pool2dGradFunctor, - double>; -template class Pool2dGradFunctor, - double>; - -/* - * All tensors are in NCDHW format. - * Ksize, strides, paddings are three elements. These three elements represent - * depth, height and width, respectively. 
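The stop flag in MaxPool2dGradFunctor above implements the first-maximum rule: when several window elements tie with the pooled output, only the first in scan order receives gradient, keeping the backward pass consistent with a single forward argmax. The routing for one window, as an isolated sketch:

// Adds dy to the first element of the window that equals the pooled value y,
// then stops, so duplicated maxima are credited only once.
void maxpool_window_grad(const float* x, float y, float dy, float* dx,
                         int hstart, int hend, int wstart, int wend, int w) {
  for (int i = hstart; i < hend; ++i) {
    for (int j = wstart; j < wend; ++j) {
      if (x[i * w + j] == y) {
        dx[i * w + j] += dy;
        return;
      }
    }
  }
}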
- */ -template -class Pool3dFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - PoolProcess pool_process, - bool exclusive, - bool adaptive, - lite::Tensor* output) { - const int batch_size = input.dims()[0]; - const int input_depth = input.dims()[2]; - const int input_height = input.dims()[3]; - const int input_width = input.dims()[4]; - const int output_channels = output->dims()[1]; - const int output_depth = output->dims()[2]; - const int output_height = output->dims()[3]; - const int output_width = output->dims()[4]; - const int ksize_depth = ksize[0]; - const int ksize_height = ksize[1]; - const int ksize_width = ksize[2]; - const int stride_depth = strides[0]; - const int stride_height = strides[1]; - const int stride_width = strides[2]; - const int padding_depth = paddings[0]; - const int padding_height = paddings[1]; - const int padding_width = paddings[2]; - - const int input_stride = input_depth * input_height * input_width; - const int output_stride = output_depth * output_height * output_width; - - const T* input_data = input.data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); - - int dstart, dend; - int hstart, hend; - int wstart, wend; - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - for (int pd = 0; pd < output_depth; ++pd) { - if (adaptive) { - dstart = AdaptStartIndex(pd, input_depth, output_depth); - dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); - } - for (int ph = 0; ph < output_height; ++ph) { - if (adaptive) { - hstart = AdaptStartIndex(ph, input_height, output_height); - hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - } - for (int pw = 0; pw < output_width; ++pw) { - if (adaptive) { - wstart = AdaptStartIndex(pw, input_width, output_width); - wend = AdaptEndIndex(pw, input_width, output_width); - } else { - wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - } - int output_idx = (pd * output_height + ph) * output_width + pw; - T ele = pool_process.initial(); - for (int d = dstart; d < dend; ++d) { - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - pool_process.compute( - input_data[(d * input_height + h) * input_width + w], - &ele); - } - } - } - int pool_size = - (exclusive || adaptive) - ? (dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; - pool_process.finalize(static_cast(pool_size), &ele); - output_data[output_idx] = ele; - } - } - } - input_data += input_stride; - output_data += output_stride; - } - } - } -}; - -/* - * All tensors are in NCDHW format. - * Ksize, strides, paddings are three elements. These three elements represent - * depth, height and width, respectively. 
- */ -template -class Pool3dGradFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - const lite::Tensor& output, - const lite::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - PoolProcess pool_grad_process, - bool exclusive, - bool adaptive, - lite::Tensor* input_grad) { - const int batch_size = input.dims()[0]; - const int input_depth = input.dims()[2]; - const int input_height = input.dims()[3]; - const int input_width = input.dims()[4]; - const int output_channels = output.dims()[1]; - const int output_depth = output.dims()[2]; - const int output_height = output.dims()[3]; - const int output_width = output.dims()[4]; - const int ksize_depth = ksize[0]; - const int ksize_height = ksize[1]; - const int ksize_width = ksize[2]; - const int stride_depth = strides[0]; - const int stride_height = strides[1]; - const int stride_width = strides[2]; - const int padding_depth = paddings[0]; - const int padding_height = paddings[1]; - const int padding_width = paddings[2]; - const int input_stride = input_depth * input_height * input_width; - const int output_stride = output_depth * output_height * output_width; - - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); - - int dstart, dend; - int hstart, hend; - int wstart, wend; - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - for (int pd = 0; pd < output_depth; ++pd) { - if (adaptive) { - dstart = AdaptStartIndex(pd, input_depth, output_depth); - dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); - } - for (int ph = 0; ph < output_height; ++ph) { - if (adaptive) { - hstart = AdaptStartIndex(ph, input_height, output_height); - hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - } - for (int pw = 0; pw < output_width; ++pw) { - if (adaptive) { - wstart = AdaptStartIndex(pw, input_width, output_width); - wend = AdaptEndIndex(pw, input_width, output_width); - } else { - wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - } - - int pool_size = - (exclusive || adaptive) - ? (dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; - float scale = 1.0 / pool_size; - for (int d = dstart; d < dend; ++d) { - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - int input_idx = (d * input_height + h) * input_width + w; - int output_idx = - (pd * output_height + ph) * output_width + pw; - pool_grad_process.compute(input_data[input_idx], - output_data[output_idx], - output_grad_data[output_idx], - static_cast(scale), - input_grad_data + input_idx); - } - } - } - } - } - } - input_data += input_stride; - output_data += output_stride; - input_grad_data += input_stride; - output_grad_data += output_stride; - } - } - } -}; - -/* - * All tensors are in NCDHW format. - * Ksize, strides, paddings are three elements. These three elements represent - * depth, height and width, respectively. 
- */ -template -class MaxPool3dGradFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - const lite::Tensor& output, - const lite::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - lite::Tensor* input_grad) { - const int batch_size = input.dims()[0]; - const int input_depth = input.dims()[2]; - const int input_height = input.dims()[3]; - const int input_width = input.dims()[4]; - const int output_channels = output.dims()[1]; - const int output_depth = output.dims()[2]; - const int output_height = output.dims()[3]; - const int output_width = output.dims()[4]; - const int ksize_depth = ksize[0]; - const int ksize_height = ksize[1]; - const int ksize_width = ksize[2]; - const int stride_depth = strides[0]; - const int stride_height = strides[1]; - const int stride_width = strides[2]; - const int padding_depth = paddings[0]; - const int padding_height = paddings[1]; - const int padding_width = paddings[2]; - const int input_stride = input_depth * input_height * input_width; - const int output_stride = output_depth * output_height * output_width; - - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); - - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - for (int pd = 0; pd < output_depth; ++pd) { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); - for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - bool stop = false; - for (int d = dstart; d < dend && !stop; ++d) { - for (int h = hstart; h < hend && !stop; ++h) { - for (int w = wstart; w < wend && !stop; ++w) { - int input_idx = (d * input_height + h) * input_width + w; - int output_idx = - (pd * output_height + ph) * output_width + pw; - - if (input_data[input_idx] == output_data[output_idx]) { - input_grad_data[input_idx] += - output_grad_data[output_idx]; - stop = true; - } - } - } - } - } - } - } - input_data += input_stride; - output_data += output_stride; - input_grad_data += input_stride; - output_grad_data += output_stride; - } - } - } -}; - -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; - -template class Pool3dFunctor, - float>; -template class Pool3dFunctor, - float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dFunctor, - double>; -template class Pool3dFunctor, - double>; -template class Pool3dGradFunctor, - double>; -template class Pool3dGradFunctor, - double>; - -/* - * All tensors are in NCHW format. - * Ksize, strides, paddings are two elements. These two elements represent - * height and width, respectively. 
- */ -template -class MaxPool2dWithIndexFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - bool adaptive, - lite::Tensor* output, - lite::Tensor* mask) { - const int batch_size = input.dims()[0]; - const int input_height = input.dims()[2]; - const int input_width = input.dims()[3]; - const int output_channels = output->dims()[1]; - const int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; - const int ksize_height = ksize[0]; - const int ksize_width = ksize[1]; - const int stride_height = strides[0]; - const int stride_width = strides[1]; - const int padding_height = paddings[0]; - const int padding_width = paddings[1]; - const int input_stride = input_height * input_width; - const int output_stride = output_height * output_width; - - const T1* input_data = input.data(); - T1* output_data = output->mutable_data(lite::TargetType::kX86); - T2* mask_data = mask->mutable_data(lite::TargetType::kX86); - - int hstart, hend; - int wstart, wend; - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - for (int ph = 0; ph < output_height; ++ph) { - if (adaptive) { - hstart = AdaptStartIndex(ph, input_height, output_height); - hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - } - for (int pw = 0; pw < output_width; ++pw) { - if (adaptive) { - wstart = AdaptStartIndex(pw, input_width, output_width); - wend = AdaptEndIndex(pw, input_width, output_width); - } else { - wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - } - - T1 ele = static_cast(-FLT_MAX); - int index = -1; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - if (ele < input_data[h * input_width + w]) { - ele = input_data[h * input_width + w]; - index = h * input_width + w; - } - } - } - output_data[ph * output_width + pw] = ele; - mask_data[ph * output_width + pw] = index; - } - } - // offset - input_data += input_stride; - output_data += output_stride; - mask_data += output_stride; - } - } - } -}; - -/* - * All tensors are in NCHW format. - * Ksize, strides, paddings are two elements. These two elements represent - * height and width, respectively. 
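Because the forward functor above stores in mask the flat argmax index of every output cell within its (n, c) plane, the backward pass needs no window search at all; the gradient functor defined next is, per plane, just this scatter-add (standalone sketch):

// dY flows straight to the recorded argmax positions: dX[mask[o]] += dY[o].
void scatter_grad(const float* dy, const int* mask, float* dx, int out_elems) {
  for (int o = 0; o < out_elems; ++o) {
    dx[mask[o]] += dy[o];
  }
}

The same mask is what makes max-unpooling possible later, since the argmax positions would otherwise be lost after pooling.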
- */ -template -class MaxPool2dWithIndexGradFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& output_grad, - const lite::Tensor& mask, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - bool adaptive, - lite::Tensor* input_grad) { - const int batch_size = input_grad->dims()[0]; - const int input_height = input_grad->dims()[2]; - const int input_width = input_grad->dims()[3]; - const int output_channels = output_grad.dims()[1]; - const int output_height = output_grad.dims()[2]; - const int output_width = output_grad.dims()[3]; - const int input_stride = input_height * input_width; - const int output_stride = output_height * output_width; - - const T2* mask_data = mask.data(); - const T1* output_grad_data = output_grad.data(); - T1* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); - - for (int n = 0; n < batch_size; ++n) { - for (int c = 0; c < output_channels; ++c) { - for (int ph = 0; ph < output_height; ++ph) { - for (int pw = 0; pw < output_width; ++pw) { - const int output_idx = ph * output_width + pw; - const int input_idx = static_cast(mask_data[output_idx]); - input_grad_data[input_idx] += output_grad_data[output_idx]; - } - } - // offset - input_grad_data += input_stride; - output_grad_data += output_stride; - mask_data += output_stride; - } - } - } -}; - -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; - -/* - * All tensors are in NCDHW format. - * Ksize, strides, paddings are three elements. These three elements represent - * depth, height and width, respectively. - */ -template -class MaxPool3dWithIndexFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - bool adaptive, - lite::Tensor* output, - lite::Tensor* mask) { - const int batch_size = input.dims()[0]; - const int input_depth = input.dims()[2]; - const int input_height = input.dims()[3]; - const int input_width = input.dims()[4]; - const int output_channels = output->dims()[1]; - const int output_depth = output->dims()[2]; - const int output_height = output->dims()[3]; - const int output_width = output->dims()[4]; - const int ksize_depth = ksize[0]; - const int ksize_height = ksize[1]; - const int ksize_width = ksize[2]; - const int stride_depth = strides[0]; - const int stride_height = strides[1]; - const int stride_width = strides[2]; - const int padding_depth = paddings[0]; - const int padding_height = paddings[1]; - const int padding_width = paddings[2]; - const int input_stride = input_depth * input_height * input_width; - const int output_stride = output_depth * output_height * output_width; - - const T1* input_data = input.data(); - T1* output_data = output->mutable_data(lite::TargetType::kX86); - T2* mask_data = mask->mutable_data(lite::TargetType::kX86); - - int dstart, dend; - int hstart, hend; - int wstart, wend; - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - for (int pd = 0; pd < output_depth; ++pd) { - if (adaptive) { - dstart = AdaptStartIndex(pd, input_depth, output_depth); - dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); - } - for (int ph = 0; ph < 
output_height; ++ph) { - if (adaptive) { - hstart = AdaptStartIndex(ph, input_height, output_height); - hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - } - for (int pw = 0; pw < output_width; ++pw) { - if (adaptive) { - wstart = AdaptStartIndex(pw, input_width, output_width); - wend = AdaptEndIndex(pw, input_width, output_width); - } else { - wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - } - - int output_idx = (pd * output_height + ph) * output_width + pw; - T1 ele = static_cast(-FLT_MAX); - int index = -1; - for (int d = dstart; d < dend; ++d) { - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - int input_idx = (d * input_height + h) * input_width + w; - if (ele < input_data[input_idx]) { - index = input_idx; - ele = input_data[input_idx]; - } - } - } - } - output_data[output_idx] = ele; - mask_data[output_idx] = index; - } - } - } - // offset - input_data += input_stride; - output_data += output_stride; - mask_data += output_stride; - } - } - } -}; - -/* - * All tensors are in NCDHW format. - * Ksize, strides, paddings are three elements. These three elements represent - * depth, height and width, respectively. - */ -template -class MaxPool3dWithIndexGradFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& output_grad, - const lite::Tensor& mask, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - bool adaptive, - lite::Tensor* input_grad) { - const int batch_size = input_grad->dims()[0]; - const int input_depth = input_grad->dims()[2]; - const int input_height = input_grad->dims()[3]; - const int input_width = input_grad->dims()[4]; - const int output_channels = output_grad.dims()[1]; - const int output_depth = output_grad.dims()[2]; - const int output_height = output_grad.dims()[3]; - const int output_width = output_grad.dims()[4]; - const int input_stride = input_depth * input_height * input_width; - const int output_stride = output_depth * output_height * output_width; - - const T2* mask_data = mask.data(); - const T1* output_grad_data = output_grad.data(); - T1* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); - - for (int n = 0; n < batch_size; ++n) { - for (int c = 0; c < output_channels; ++c) { - for (int pd = 0; pd < output_depth; ++pd) { - for (int ph = 0; ph < output_height; ++ph) { - for (int pw = 0; pw < output_width; ++pw) { - const int output_idx = - (pd * output_height + ph) * output_width + pw; - const int input_idx = static_cast(mask_data[output_idx]); - input_grad_data[input_idx] += output_grad_data[output_idx]; - } - } - } - // offset - input_grad_data += input_stride; - output_grad_data += output_stride; - mask_data += output_stride; - } - } - } -}; - -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/pooling.h b/lite/backends/x86/math/pooling.h deleted file mode 100644 index 64015e32c8..0000000000 --- a/lite/backends/x86/math/pooling.h +++ /dev/null @@ -1,258 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "lite/core/context.h"
-#include "lite/core/tensor.h"
-#include "lite/fluid/eigen.h"
-#include "lite/utils/macros.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-/*
- * \brief Extracting simple operations from pooling.
- * Both MaxPool and AvgPool need "initial", "compute" and "finalize"
- * operations.
- * MaxPool initializes the temp variable to the negative maximum to find the
- * maximum value in the pooling field.
- * AvgPool initializes the temp variable to zero to accumulate all values
- * in the pooling window, and finally takes the average.
- * MaxPoolGrad and AvgPoolGrad are the corresponding gradient operations.
- */
-template <class T>
-class MaxPool {
- public:
-  DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
-  DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; }
-  DEVICE inline void finalize(const T& pool_field, T* y) {}
-};
-
-template <class T>
-class AvgPool {
- public:
-  DEVICE inline T initial() { return static_cast<T>(0); }
-  DEVICE inline void compute(const T& x, T* y) { *y += x; }
-  DEVICE inline void finalize(const T& pool_field, T* y) { *y /= pool_field; }
-};
-
-template <class T>
-class MaxPoolGrad {
- public:
-  DEVICE inline void compute(
-      const T& x, const T& y, const T& dy, T scale, T* dx) {
-    *dx += dy * (x == y);
-  }
-};
-
-template <class T>
-class AvgPoolGrad {
- public:
-  DEVICE inline void compute(
-      const T& x, const T& y, const T& dy, T scale, T* dx) {
-    *dx += (scale * dy);
-  }
-};
-
-/* used for adaptive pool to calculate start and end index of each divided grid
- */
-HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) {
-  return static_cast<int>(
-      floor(static_cast<double>(ph * input_size) / output_size));
-}
-
-HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) {
-  return static_cast<int>(
-      ceil(static_cast<double>((ph + 1) * input_size) / output_size));
-}
-
-/*
- * \brief Getting pooling results, and calculating gradient.
- *
- * In pool2d, all tensors are in NCHW format. Where N is batch size, C is the
- * number of channels, H and W is the height and width of feature.
- * In pool3d, all tensors are in NCDHW format. Where N is batch size, C is the
- * number of channels, D, H and W is the depth, height and width of feature.
- *
- * In max pooling, it is possible that the pooling region has multiple maximum
- * elements. In this case, we should compute the gradient of the first maximum
- * element.
- * This is different from average pooling. So we rewrite the max_pool_grad:
- * MaxPool2dGradFunctor, MaxPool3dGradFunctor.
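AdaptStartIndex and AdaptEndIndex divide an input extent into output_size nearly equal spans, start = floor(i * in / out) and end = ceil((i + 1) * in / out); together the spans cover every input element and may overlap by one at the seams. A quick standalone check:

#include <cmath>
#include <cstdio>

int main() {
  int in = 10, out = 3;
  for (int i = 0; i < out; ++i) {
    int s = static_cast<int>(std::floor(static_cast<double>(i) * in / out));
    int e = static_cast<int>(std::ceil(static_cast<double>(i + 1) * in / out));
    std::printf("grid %d -> [%d, %d)\n", i, s, e);  // [0,4) [3,7) [6,10)
  }
}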
- */ -//#ifdef PADDLE_WITH_CUDA -// template -// class Pool2dDirectCUDAFunctor { -// public: -// void operator()(const T* input, const std::vector& input_shape, -// const std::vector& output_shape, -// const std::vector& ksize, -// const std::vector& strides, -// const std::vector& paddings, PoolProcess pool_compute, -// bool exclusive, T* output, cudaStream_t stream); -//}; -//#endif - -template -class Pool2dFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - PoolProcess pool_compute, - bool exclusive, - bool adaptive, - lite::Tensor* output); -}; - -template -class Pool2dGradFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - const lite::Tensor& output, - const lite::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - PoolProcess pool_compute, - bool exclusive, - bool adaptive, - lite::Tensor* input_grad); -}; - -template -class MaxPool2dGradFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - const lite::Tensor& output, - const lite::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - lite::Tensor* input_grad); -}; - -template -class Pool3dFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - PoolProcess pool_compute, - bool exclusive, - bool adaptive, - lite::Tensor* output); -}; - -template -class Pool3dGradFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - const lite::Tensor& output, - const lite::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - PoolProcess pool_compute, - bool exclusive, - bool adaptive, - lite::Tensor* input_grad); -}; - -template -class MaxPool3dGradFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - const lite::Tensor& output, - const lite::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - lite::Tensor* input_grad); -}; - -/* - * \brief Getting max pooling results and corresponding max index, and - * calculating gradient. - * In up-sampling-pooling, it is necessary to know max element index. - * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in - * NCDHW format. 
- */ -template -class MaxPool2dWithIndexFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - bool adaptive, - lite::Tensor* output, - lite::Tensor* mask); -}; - -template -class MaxPool2dWithIndexGradFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& output_grad, - const lite::Tensor& mask, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - bool adaptive, - lite::Tensor* input_grad); -}; - -template -class MaxPool3dWithIndexFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - bool adaptive, - lite::Tensor* output, - lite::Tensor* mask); -}; - -template -class MaxPool3dWithIndexGradFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& output_grad, - const lite::Tensor& mask, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - bool adaptive, - lite::Tensor* input_grad); -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/prelu.h b/lite/backends/x86/math/prelu.h deleted file mode 100644 index 049397c72c..0000000000 --- a/lite/backends/x86/math/prelu.h +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "lite/backends/x86/math/math_function.h" -// #include "paddle/fluid/platform/cudnn_helper.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -// #ifdef PADDLE_WITH_CUDA -// template -// class PreluChannelWiseDirectCUDAFunctor { -// public: -// void operator()(cudaStream_t stream, const T *input, const T *alpha, -// T *output, std::vector input_shape); -// }; -// -// template -// class PreluElementWiseDirectCUDAFunctor { -// public: -// void operator()(cudaStream_t stream, const T *input, const T *alpha, -// T *output, std::vector input_shape); -// }; -// -// template -// class PreluScalarDirectCUDAFunctor { -// public: -// void operator()(cudaStream_t stream, const T *input, const T *alpha, -// T *output, std::vector input_shape); -// }; -// #endif - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/sample_prob.cc b/lite/backends/x86/math/sample_prob.cc deleted file mode 100644 index ecf1ca8e1a..0000000000 --- a/lite/backends/x86/math/sample_prob.cc +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/x86/math/sample_prob.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template class SampleWithProb; -template class SampleWithProb; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/sample_prob.h b/lite/backends/x86/math/sample_prob.h deleted file mode 100644 index 5312b3df10..0000000000 --- a/lite/backends/x86/math/sample_prob.h +++ /dev/null @@ -1,128 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "lite/backends/x86/math/sampler.h" -#include "lite/core/context.h" -#include "lite/core/tensor.h" -#include "lite/fluid/eigen.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -/* UNDERSTAND: utility function to adjust probability for unique sampling, -return whatever as it is if not using unique samping */ -template -static T adjust_prob(const T prob, const int num_samples, const int num_tries) { - if (num_samples == num_tries) { - return prob * num_samples; - } else { - return -expm1(num_tries * log1p(-prob)); - } -} - -template -class SampleWithProb { - public: - void operator()(const lite::Context& context, - const Sampler& sampler, - const std::size_t num_samples, - const lite::Tensor* L, - lite::Tensor* S, - lite::Tensor* P) { - // UNDERSTAND: dimension issues - const auto lbl_dim = L->dims(); - const int batch_size = lbl_dim[0]; - const int num_true = lbl_dim[1]; - const int num_sampled_classes = num_true + num_samples; - // std::vector ret_dim_vec = {batch_size, num_sampled_classes}; - // lite::DDim ret_dim(ret_dim_vec); - - // UNDERSTAND: raw data view - const int64_t* label_data = L->data(); - // int64_t* samples_data = - // S->mutable_data(ret_dim, Target); - // T* probabilities_data = P->mutable_data(ret_dim, Target); - S->Resize({batch_size, num_sampled_classes}); - auto* samples_data = S->mutable_data(Target); - P->Resize({batch_size, num_sampled_classes}); - auto* probabilities_data = P->mutable_data(Target); - - // temp sets for unique sampling - std::unordered_set tmp_samples; - int j = 0; // column index - // add true labels, not that efficient - while (j < num_true) { - for (int i = 0; i < batch_size; ++i) { - auto samples_index = i * num_sampled_classes + j; - auto v = label_data[i * num_true + j]; - samples_data[samples_index] = v; - probabilities_data[samples_index] = sampler.Probability(v); - } - ++j; - } - - // sample num_samles unique samples for an example, note that they are not - // all negative samples 
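adjust_prob above converts a single-draw probability into what sampled softmax needs: with no rejections (num_samples == num_tries) the expected count prob * num_samples, otherwise -expm1(num_tries * log1p(-prob)), i.e. 1 - (1 - prob)^num_tries, the probability of hitting the class at least once in num_tries draws, evaluated stably for tiny prob. A standalone numeric check of why the stable form matters:

#include <cmath>
#include <cstdio>

int main() {
  double p = 1e-12;
  int tries = 1000;
  // The naive form cancels catastrophically near 1; expm1/log1p does not.
  std::printf("naive : %.17g\n", 1.0 - std::pow(1.0 - p, tries));
  std::printf("stable: %.17g\n", -std::expm1(tries * std::log1p(-p)));
}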
- tmp_samples.clear(); - int num_tries = 0; - while (j < num_sampled_classes) { - ++num_tries; - auto v = sampler.Sample(); - auto insert_ok = tmp_samples.insert(v).second; - if (!insert_ok) { - continue; - } - auto p = sampler.Probability(v); - for (int i = 0; i < batch_size; ++i) { - auto samples_index = i * num_sampled_classes + j; - samples_data[samples_index] = v; - probabilities_data[samples_index] = p; - } - ++j; - } - - // compute Q(y|x), because of unique sampling, probabilities need to be - // adjusted - for (int k = 0; k < num_sampled_classes; ++k) { - for (int i = 0; i < batch_size; ++i) { - auto samples_index = i * num_sampled_classes + k; - probabilities_data[samples_index] = adjust_prob( - probabilities_data[samples_index], num_samples, num_tries); - } - } - } -}; - -// #ifdef PADDLE_WITH_CUDA -// template -// class GPUSampleWithProb { -// public: -// void operator()(const platform::CUDAlite::Context& context, const -// int seed, -// const int dict_size, const bool uniq, -// const std::size_t num_samples, const lite::Tensor* L, -// lite::Tensor* S, -// lite::Tensor* P); -// }; -// #endif -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/sampler.cc b/lite/backends/x86/math/sampler.cc deleted file mode 100644 index 1246806372..0000000000 --- a/lite/backends/x86/math/sampler.cc +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/x86/math/sampler.h" -#include -#include -#include -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -Sampler::~Sampler() {} - -UniformSampler::UniformSampler(int64_t range, unsigned int seed) - : Sampler(range, seed), inv_range_(1.0 / (range + 1)) { - random_engine_ = std::make_shared(seed_); - dist_ = std::make_shared>(0, range); -} - -int64_t UniformSampler::Sample() const { return (*dist_)(*random_engine_); } - -float UniformSampler::Probability(int64_t value) const { return inv_range_; } - -LogUniformSampler::LogUniformSampler(int64_t range, unsigned int seed) - : Sampler(range, seed), log_range_(log(range + 1)) { - random_engine_ = std::make_shared(seed_); - dist_ = std::make_shared>(0, 1); -} - -int64_t LogUniformSampler::Sample() const { - // Got Log Uniform distribution from uniform distribution by - // inverse_transform_sampling method - // More details: - // https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler/ - const int64_t value = - static_cast(exp((*dist_)(*random_engine_) * log_range_)) - 1; - // Mathematically, value should be <= range_, but might not be due to some - // floating point roundoff, so we mod by range_. 
- return value % range_; -} - -float LogUniformSampler::Probability(int64_t value) const { - // Given f(x) = 1/[(x+1) * log_range_] - // The value's probability is integral of f(x) from value to (value + 1) - // More details: - // https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler - return (log((value + 2.0) / (value + 1.0))) / log_range_; -} - -CustomSampler::CustomSampler(int64_t range, - const float *probabilities, - const int *alias, - const float *alias_probabilities, - unsigned int seed) - : Sampler(range, seed) { - random_engine_ = std::make_shared(seed_); - real_dist_ = std::make_shared>(0, 1); - int_dist_ = std::make_shared>(0, range); - - alias_probs_ = alias_probabilities; - probs_ = probabilities; - alias_ = alias; -} - -int64_t CustomSampler::Sample() const { - auto index = (*int_dist_)(*random_engine_); - auto p = (*real_dist_)(*random_engine_); - if (p > alias_probs_[index]) { - int alias = alias_[index]; - - if (alias == exceptional_val) { - LOG(WARNING) << "WARNING: CustomSampler get alias " << exceptional_val; - return index; - } - - return alias; - } else { - return index; - } -} - -float CustomSampler::Probability(int64_t value) const { return probs_[value]; } - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/sampler.h b/lite/backends/x86/math/sampler.h deleted file mode 100644 index efd9e48e54..0000000000 --- a/lite/backends/x86/math/sampler.h +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -// TODO(wanghaoshuang): Support for GPU - -/** -* Sample integers from [0, range). -*/ -class Sampler { - public: - explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) { - // PADDLE_ENFORCE_GT(range, 0, "Range should be greater than 0."); - if (seed == 0) { - std::random_device r; - seed_ = r(); - } else { - seed_ = seed; - } - } - - virtual ~Sampler(); - - // Sample a single value - virtual int64_t Sample() const = 0; - - // The probability that a single call to Sample() returns the given value. - virtual float Probability(int64_t value) const = 0; - - int64_t range() { return range_; } - - protected: - const int64_t range_; - unsigned int seed_; -}; - -/** - * Sample integers from [0, range). - * And the distribution function is: - * P(x) = 1 / range - */ -class UniformSampler : public Sampler { - public: - explicit UniformSampler(int64_t range, unsigned int seed = 0UL); - - ~UniformSampler() override {} - - int64_t Sample() const override; - - float Probability(int64_t value) const override; - - private: - const float inv_range_; - std::shared_ptr random_engine_; - std::shared_ptr> dist_; -}; - -/** - * Sample integers from [0, range). 
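CustomSampler::Sample above is one draw of the Walker/Vose alias method: pick a column uniformly, then let a biased coin against alias_probs_ choose between the column and its stored alias, which yields O(1) samples from an arbitrary discrete distribution once the two tables exist (building them happens outside this class). The same draw as a sketch over prebuilt tables:

#include <cstdint>
#include <random>
#include <vector>

int64_t alias_draw(const std::vector<float>& alias_probs,
                   const std::vector<int>& alias, std::mt19937_64* rng) {
  std::uniform_int_distribution<int64_t> pick_col(0, alias_probs.size() - 1);
  std::uniform_real_distribution<float> coin(0.f, 1.f);
  int64_t col = pick_col(*rng);
  // Keep the column with probability alias_probs[col], else take its alias.
  return coin(*rng) <= alias_probs[col] ? col : alias[col];
}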
- * And the distribution function is: - * P(x) = (1/ln(range+1)) * ln(1 + 1/(x + 1)) - */ -class LogUniformSampler : public Sampler { - public: - explicit LogUniformSampler(int64_t range, unsigned int seed = 0UL); - - ~LogUniformSampler() override {} - - int64_t Sample() const override; - - float Probability(int64_t value) const override; - - private: - const float log_range_; - std::shared_ptr random_engine_; - std::shared_ptr> dist_; -}; - -/** - * Sample integers from [0, range) from custom distribution. - */ -class CustomSampler : public Sampler { - public: - explicit CustomSampler(int64_t range, - const float* probabilities, - const int* alias, - const float* alias_probabilities, - unsigned int seed = 0UL); - - ~CustomSampler() override {} - - int64_t Sample() const override; - - float Probability(int64_t value) const override; - - private: - const float* alias_probs_; - const int* alias_; - const float* probs_; - const int exceptional_val = -1; - std::shared_ptr random_engine_; - std::shared_ptr> real_dist_; - std::shared_ptr> int_dist_; -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/sequence2batch.cc b/lite/backends/x86/math/sequence2batch.cc deleted file mode 100644 index ff215781f1..0000000000 --- a/lite/backends/x86/math/sequence2batch.cc +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "lite/backends/x86/math/sequence2batch.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -class CopyMatrixRowsFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& src, - std::vector index_lod, - lite::Tensor* dst, - bool is_src_index) { - size_t* index = index_lod.data(); - auto src_dims = src.dims(); - auto dst_dims = dst->dims(); - PADDLE_ENFORCE_EQ( - src_dims.size(), 2UL, "The src must be matrix with rank 2."); - PADDLE_ENFORCE_EQ( - dst_dims.size(), 2UL, "The dst must be matrix with rank 2."); - PADDLE_ENFORCE_EQ( - src_dims[1], dst_dims[1], "The width of src and dst must be same."); - auto height = dst_dims[0]; - auto width = dst_dims[1]; - auto* src_data = src.data(); - auto* dst_data = dst->mutable_data(); - const int sz = width * sizeof(T); - if (is_src_index) { - for (int i = 0; i < height; ++i) { - memcpy(dst_data + i * width, src_data + index[i] * width, sz); - } - } else { - for (int i = 0; i < height; ++i) { - memcpy(dst_data + index[i] * width, src_data + i * width, sz); - } - } - } -}; - -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; - -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template class Batch2LoDTensorFunctor; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/sequence2batch.h b/lite/backends/x86/math/sequence2batch.h deleted file mode 100644 index 807558e9d8..0000000000 --- a/lite/backends/x86/math/sequence2batch.h +++ /dev/null @@ -1,190 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include - -#include "lite/core/context.h" -#include "lite/core/tensor.h" -#include "lite/fluid/eigen.h" -#include "lite/fluid/lod.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -using EigenMatrix = lite::fluid::EigenMatrix; - -template -class CopyMatrixRowsFunctor { - public: - // If is_src_index is true, - // copy the indexed rows of input src to the output dst. - // If is_src_index is false, - // copy the input src to the indexed rows of output dst. - // The indexed rows are based on the input index. - void operator()(const lite::Context& context, - const lite::Tensor& src, - std::vector index_lod, - lite::Tensor* dst, - bool is_src_index); -}; - -template -class LoDTensor2BatchFunctor { - // Calculate the length of each sequence and - // sort sequence index by the length. 
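-  // Each SeqInfo entry below is a (start, length, seq_idx) triple, and
-  // seq_info is sorted by length in descending order, so the longest
-  // sequence comes first.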
-  //  example: sequences = {s0, s1, s2}
-  //           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
-  //           seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
-  //
-  struct SeqInfo {
-    SeqInfo(int start, int length, int seq_idx)
-        : start(start), length(length), seq_idx(seq_idx) {}
-    int start;
-    int length;
-    int seq_idx;
-  };
-
- public:
-  void operator()(const lite::Context<Target>& context,
-                  const lite::Tensor& lod_tensor,
-                  lite::Tensor* batch,
-                  bool is_cal_batch_lod,
-                  bool is_reverse = false) const {
-    if (!is_cal_batch_lod) {
-      auto lods = batch->lod();
-      PADDLE_ENFORCE_GT(lods.size(),
-                        2UL,
-                        "The LoD of LoDTensor should include at least 2-level "
-                        "sequence information.");
-      PADDLE_ENFORCE_EQ(
-          lods[1].size(),
-          static_cast<size_t>(lod_tensor.dims()[0]),
-          "The LoD information should be consistent with the dims.");
-      CopyMatrixRowsFunctor<Target, T> to_batch;
-      to_batch(context, lod_tensor, lods[1], batch, true);
-      return;
-    }
-
-    auto lods = lod_tensor.lod();
-    PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
-
-    const auto& lod = lods[0];
-
-    std::vector<SeqInfo> seq_info;
-    for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
-      int length = lod[seq_id + 1] - lod[seq_id];
-      seq_info.emplace_back(lod[seq_id], length, seq_id);
-    }
-
-    std::sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) {
-      return a.length > b.length;
-    });
-
-    // Calculate the start position of each batch.
-    // example: sequences = {s0, s1, s2}
-    //          s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
-    //          max_seqlen = 5,
-    //          batchIndex = {b0, b1, b2, b3, b4}
-    //          b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
-    //          batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
-    //             batch_start_positions[0] = 0
-    //             batch_start_positions[1] = len(b0)
-    //             batch_start_positions[2] = len(b0) + len(b1)
-    //             batch_start_positions[3] = len(b0) + len(b1) + len(b2)
-    //             ...
-    //          seq2batch_idx[12] = {4, 0, 9,
-    //                               5, 1, 10,
-    //                               6, 2, 11,
-    //                               7, 3,
-    //                               8}
-    //          seq_order = {1, 0, 2}, the sort order.
-    //              where 1 is the second sequence,
-    //                    0 is the first sequence,
-    //                    2 is the third sequence.
-    // The max_seqlen represents batch size after rearranging the
-    // input LoDTensor. It is also the maximum length of input sequence.
-
-    lite::LoD batch_lods;
-    batch_lods.emplace_back(std::vector<size_t>{0});
-    batch_lods.emplace_back(std::vector<size_t>{0});
-    batch_lods.emplace_back(std::vector<size_t>{0});
-
-    // batch_lods[0] is the start positions for batch LoDTensor
-    int max_seqlen = seq_info[0].length;
-    batch_lods[0].resize(static_cast<size_t>(max_seqlen + 1));
-    // batch_lods[1] is the raw index in the input LoDTensor
-    batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
-    // batch_lods[2] is the sort order for the input LoDTensor.
-    batch_lods[2].resize(seq_info.size());
-
-    size_t* batch_starts = batch_lods[0].data();
-    size_t* seq2batch_idx = batch_lods[1].data();
-    batch_starts[0] = 0;
-    for (int n = 0; n < max_seqlen; n++) {
-      auto batch_id = static_cast<int>(batch_starts[n]);
-      for (size_t i = 0; i < seq_info.size(); ++i) {
-        int seq_len = seq_info[i].length;
-        int start = seq_info[i].start;
-        if (n < seq_len) {
-          seq2batch_idx[batch_id] =
-              is_reverse ? start + seq_len - 1 - n : start + n;
-          batch_id++;
-        } else {
-          break;
-        }
-      }
-      batch_starts[n + 1] = static_cast<size_t>(batch_id);
-    }
-    size_t* seq_order = batch_lods[2].data();
-    for (size_t i = 0; i < seq_info.size(); ++i) {
-      seq_order[i] = seq_info[i].seq_idx;
-    }
-    batch->set_lod(batch_lods);
-
-    CopyMatrixRowsFunctor<Target, T> to_batch;
-    to_batch(context, lod_tensor, batch_lods[1], batch, true);
-  }
-};
-
-template <lite::TargetType Target, typename T>
-class Batch2LoDTensorFunctor {
- public:
-  void operator()(const lite::Context<Target>& context,
-                  const lite::Tensor& batch,
-                  lite::Tensor* lod_tensor) const {
-    auto in_lod = batch.lod();
-    PADDLE_ENFORCE_GT(in_lod.size(),
-                      2UL,
-                      "The LoD of LoDTensor should include at least 2-level "
-                      "sequence information.");
-    PADDLE_ENFORCE_EQ(
-        in_lod[1].size(),
-        static_cast<size_t>(lod_tensor->dims()[0]),
-        "The LoD information should be consistent with the dims.");
-    CopyMatrixRowsFunctor<Target, T> to_seq;
-    to_seq(context, batch, in_lod[1], lod_tensor, false);
-  }
-};
-
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/sequence_padding.cc b/lite/backends/x86/math/sequence_padding.cc
deleted file mode 100644
index fbb6c11a5f..0000000000
--- a/lite/backends/x86/math/sequence_padding.cc
+++ /dev/null
@@ -1,187 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "lite/backends/x86/math/sequence_padding.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-template <typename T>
-void CopyValidData(lite::Tensor* dst_tensor,
-                   const lite::Tensor* src_tensor,
-                   const std::vector<size_t>& seq_offsets,
-                   int pad_seq_len,
-                   int step_width,
-                   bool norm_by_len,
-                   CopyType type,
-                   PadLayout layout) {
-  int seq_num = seq_offsets.size() - 1;
-  const T* src_data = src_tensor->data<T>();
-  T* dst_data = dst_tensor->mutable_data<T>();
-
-  int seq_cpy_gap = step_width;
-  int pad_cpy_gap =
-      layout == kBatchLengthWidth ? step_width : seq_num * step_width;
-  for (int seq_idx = 0; seq_idx < seq_num; ++seq_idx) {
-    int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
-    PADDLE_ENFORCE_GE(
-        pad_seq_len,
-        valid_seq_len,
-        "The padded sequence length can not be less than its original length.");
-    int seq_data_offset = seq_offsets[seq_idx] * step_width;
-    int pad_data_offset = layout == kBatchLengthWidth
-                              ? seq_idx * pad_seq_len * step_width
-                              : seq_idx * step_width;
-    float scale = 1.0f / static_cast<float>(valid_seq_len);
-
-    for (int step_idx = 0; step_idx < valid_seq_len; ++step_idx) {
-      const T* src =
-          src_data + (type == kSeqToPad ? seq_data_offset : pad_data_offset);
-      T* dst =
-          dst_data + (type == kSeqToPad ?
pad_data_offset : seq_data_offset); - memcpy(dst, src, step_width * sizeof(T)); - if (norm_by_len) { - for (int i = 0; i < step_width; ++i) { - *(dst + i) *= scale; - } - } - seq_data_offset += seq_cpy_gap; - pad_data_offset += pad_cpy_gap; - } - } -} - -template -static void fast_mem_init(void* dest, - size_t dest_size, - const T* src, - size_t num_bytes) { - if (dest == nullptr || dest_size == 0 || src == nullptr) return; - - memcpy(dest, src, num_bytes); - - dest_size *= num_bytes; - while (dest_size > num_bytes) { - size_t remaining = dest_size - num_bytes; - size_t count = (remaining > num_bytes) ? num_bytes : remaining; - memcpy((unsigned char*)dest + num_bytes, dest, count); - num_bytes += count; - } -} - -template -class PaddingLoDTensorFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& seq_tensor, - lite::Tensor* pad_tensor, - const lite::Tensor& pad_value, - int pad_seq_len = -1, - int lod_level = 0, - bool norm_by_times = false, - const PadLayout layout = kBatchLengthWidth) { - auto seq_lod = seq_tensor.lod(); - const auto seq_offsets = lite::fluid::ToAbsOffset(seq_lod)[lod_level]; - const auto& seq_tensor_dims = seq_tensor.dims(); - const auto& pad_tensor_dims = pad_tensor->dims(); - if (pad_seq_len == -1) { - pad_seq_len = MaximumSequenceLength(seq_offsets); - } - int step_width = seq_tensor.numel() / seq_tensor_dims[0]; - - CheckDims(seq_tensor_dims, - pad_tensor_dims, - seq_offsets, - pad_seq_len, - step_width, - layout); - PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width, - "The numel of 'pad_value' can only be 1 or be equal to the " - "'step_width'."); - - // fill padding value - T* pad_data = pad_tensor->mutable_data(); - const T* pad_value_data = pad_value.data(); - if (pad_value.numel() == 1) { - fast_mem_init( - pad_data, pad_tensor->numel(), pad_value_data, sizeof(T)); - } else { - for (int i = 0; i < pad_tensor->numel(); i += step_width) { - memcpy(pad_data + i, pad_value_data, step_width * sizeof(T)); - } - } - - CopyValidData(pad_tensor, - &seq_tensor, - seq_offsets, - pad_seq_len, - step_width, - norm_by_times, - kSeqToPad, - layout); - } -}; - -template -class UnpaddingLoDTensorFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& pad_tensor, - lite::Tensor* seq_tensor, - int pad_seq_len = -1, - int lod_level = 0, - bool norm_by_times = false, - const PadLayout layout = kBatchLengthWidth) { - auto seq_offsets = lite::fluid::ToAbsOffset(seq_tensor->lod())[lod_level]; - const auto& seq_tensor_dims = seq_tensor->dims(); - const auto& pad_tensor_dims = pad_tensor.dims(); - if (pad_seq_len == -1) { - pad_seq_len = MaximumSequenceLength(seq_offsets); - } - int step_width = seq_tensor->numel() / seq_tensor_dims[0]; - - CheckDims(seq_tensor_dims, - pad_tensor_dims, - seq_offsets, - pad_seq_len, - step_width, - layout); - - CopyValidData(seq_tensor, - &pad_tensor, - seq_offsets, - pad_seq_len, - step_width, - norm_by_times, - kPadToSeq, - layout); - } -}; - -template class PaddingLoDTensorFunctor; -template class PaddingLoDTensorFunctor; -template class PaddingLoDTensorFunctor; -template class PaddingLoDTensorFunctor; - -template class UnpaddingLoDTensorFunctor; -template class UnpaddingLoDTensorFunctor; -template class UnpaddingLoDTensorFunctor; -template class UnpaddingLoDTensorFunctor; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/sequence_padding.h b/lite/backends/x86/math/sequence_padding.h 
deleted file mode 100644 index a3f4512042..0000000000 --- a/lite/backends/x86/math/sequence_padding.h +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "lite/core/context.h" -#include "lite/core/tensor.h" -#include "lite/fluid/lod.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth }; - -enum CopyType { kSeqToPad, kPadToSeq }; - -inline static size_t MaximumSequenceLength( - const std::vector& seq_offset) { - size_t seq_num = seq_offset.size() - 1; - size_t max_seq_len = 0; - for (size_t i = 0; i < seq_num; ++i) { - max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]); - } - return max_seq_len; -} - -inline static void CheckDims(const lite::DDim& seq_tensor_dims, - const lite::DDim& pad_tensor_dims, - const std::vector& seq_offset, - int64_t padded_seq_len, - int64_t step_width, - const PadLayout& layout) { - PADDLE_ENFORCE_EQ(static_cast(seq_tensor_dims[0]), - seq_offset.back(), - "Value of 1st dimension of the sequence tensor should be " - "equal to sum of lengths of all sequences."); - - PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() || - seq_tensor_dims.size() == pad_tensor_dims.size(), - "pad_tensor's rank should be 1 greater than seq_tensor's " - "rank, or be equal with it."); -} - -/* - * \brief Padding/Unpadding LoDTensor to/from normal Tensor of the shape - * [max_sequence_length, num_sequences, sequence_width]. - * - * Padding sequence: - * padding[i] = seq[lod[level][i]] - * Unpadding sequence: - * seq[lod[level][i]] = padding[i] - * - * All sequences will be padded to the same length and stored in a transposed - * shape. - * Example: - * seq (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3) - * padding (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0) - * - * \param context device context of this functor. - * \param seq LoDTensor which is stored in sequence format, the shape - * is [total_sequence_length, sequence_width] where - * total_sequence_length is the sum of all sequences' - * length. - * \param padding Tensor which is padded to the same length, the shape is - * [max_sequence_length, num_sequences, sequence_width]. - * \param norm_by_times whether dividing sequence's length. - * - * \note transposition is also done in this functor. 
- */
-template <lite::TargetType Target, typename T>
-class PaddingLoDTensorFunctor {
- public:
-  void operator()(const lite::Context<Target>& context,
-                  const lite::Tensor& seq_tensor,
-                  lite::Tensor* pad_tensor,
-                  const lite::Tensor& pad_value,
-                  int pad_seq_len = -1,
-                  int lod_level = 0,
-                  bool norm_by_times = false,
-                  const PadLayout layout = kBatchLengthWidth);
-};
-
-template <lite::TargetType Target, typename T>
-class UnpaddingLoDTensorFunctor {
- public:
-  void operator()(const lite::Context<Target>& context,
-                  const lite::Tensor& pad_tensor,
-                  lite::Tensor* seq_tensor,
-                  int pad_seq_len = -1,
-                  int lod_level = 0,
-                  bool norm_by_times = false,
-                  const PadLayout layout = kBatchLengthWidth);
-};
-
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/sequence_pooling.cc b/lite/backends/x86/math/sequence_pooling.cc
deleted file mode 100644
index 186b8b5543..0000000000
--- a/lite/backends/x86/math/sequence_pooling.cc
+++ /dev/null
@@ -1,406 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-
-#include "lite/backends/x86/jit/kernels.h"
-#include "lite/backends/x86/legacy_place.h"
-#include "lite/backends/x86/math/blas.h"
-#include "lite/backends/x86/math/math_function.h"
-#include "lite/backends/x86/math/sequence_pooling.h"
-#include "lite/fluid/eigen.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-template <typename T,
-          int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = lite::fluid::EigenVector<T, MajorType, IndexType>;
-template <typename T,
-          int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = lite::fluid::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T, bool is_test>
-class MaxSeqPoolFunctor {
- public:
-  void operator()(const lite::X86Context& context,
-                  const lite::Tensor& input,
-                  T pad_value,
-                  lite::Tensor* output,
-                  lite::Tensor* index) {
-    auto in_dims = input.dims();
-    auto out_dims = output->dims();
-    auto idx_dims = index->dims();
-    PADDLE_ENFORCE_GT(in_dims.size(), 1);
-    PADDLE_ENFORCE_GT(out_dims.size(), 1);
-    for (int64_t i = 1; i < in_dims.size(); ++i) {
-      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
-    }
-    PADDLE_ENFORCE_EQ(idx_dims, out_dims);
-
-    auto starts = input.lod()[0];
-    const T* in_data = input.data<T>();
-    T* out_data = output->mutable_data<T>();
-    int* max_index = index->mutable_data<int>();
-
-    int64_t num_seq = out_dims[0];
-    int64_t dim = output->numel() / num_seq;
-    for (int64_t i = 0; i < num_seq; ++i) {
-      if (starts[i] == starts[i + 1]) {
-        for (int64_t k = 0; k < dim; ++k) {
-          out_data[i * dim + k] = pad_value;
-          max_index[i * dim + k] = -1;
-        }
-        continue;
-      }
-      for (int64_t k = 0; k < dim; ++k) {
-        out_data[i * dim + k] = in_data[starts[i] * dim + k];
-        max_index[i * dim + k] = starts[i];
-      }
-      for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
-        for (int64_t k = 0; k < dim; ++k) {
-          if (in_data[j * dim + k] > out_data[i * dim + k]) {
-            out_data[i * dim + k] = in_data[j * dim + k];
-            max_index[i * dim + k] = j;
-          }
-        }
-      }
-    }
-  }
-};
-// Instantiation of Max Sequence Pooling for the test phase, e.g.
no need to fill -// index buffer -template -class MaxSeqPoolFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - T pad_value, - lite::Tensor* output, - lite::Tensor* index) { - auto in_dims = input.dims(); - auto out_dims = output->dims(); - PADDLE_ENFORCE_GT(in_dims.size(), 1); - PADDLE_ENFORCE_GT(out_dims.size(), 1); - for (int64_t i = 1; i < in_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); - } - - auto starts = input.lod()[0]; - const T* in_data = input.data(); - T* out_data = output->mutable_data(); - - int64_t num_seq = out_dims[0]; - int64_t dim = output->numel() / num_seq; - for (int64_t i = 0; i < num_seq; ++i) { - if (starts[i] == starts[i + 1]) { - for (int64_t k = 0; k < dim; ++k) { - out_data[i * dim + k] = pad_value; - } - continue; - } - std::memcpy( - &out_data[i * dim], &in_data[starts[i] * dim], dim * sizeof(T)); - for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) { - for (int64_t k = 0; k < dim; ++k) { - if (in_data[j * dim + k] > out_data[i * dim + k]) { - out_data[i * dim + k] = in_data[j * dim + k]; - } - } - } - } - } -}; -template -class MaxSeqPoolGradFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& out_grad, - const lite::Tensor& index, - lite::Tensor* in_grad) { - auto og_dims = out_grad.dims(); - auto ig_dims = in_grad->dims(); - auto idx_dims = index.dims(); - PADDLE_ENFORCE_GT(og_dims.size(), 1); - PADDLE_ENFORCE_GT(ig_dims.size(), 1); - for (int64_t i = 1; i < og_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); - } - PADDLE_ENFORCE_EQ(idx_dims, og_dims); - - const T* og_data = out_grad.data(); - const int* max_index = index.data(); - T* ig_data = in_grad->mutable_data(); - - SetConstant set_zero; - set_zero(context, in_grad, static_cast(0.0)); - int64_t num_seq = og_dims[0]; - int64_t dim = out_grad.numel() / num_seq; - for (int64_t i = 0; i < num_seq; ++i) { - for (int64_t j = 0; j < dim; ++j) { - int step_id = max_index[i * dim + j]; - if (step_id == -1) continue; - ig_data[step_id * dim + j] = og_data[i * dim + j]; - } - } - } -}; - -template -class LastSeqPoolFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - T pad_value, - lite::Tensor* output) { - // Create pointers to input and output data - auto* in_data = input.data(); - auto* out_data = output->mutable_data(); - - // Calculate the size of each item in sequence - int64_t item_size = input.numel() / input.dims()[0]; - auto lod = input.lod()[0]; - int seq_num = static_cast(lod.size()) - 1; - for (int i = 0; i < seq_num; ++i) { - // Calculate the length of each sequence - int64_t seq_len = static_cast(lod[i + 1] - lod[i]); - if (seq_len == 0) { - for (int j = 0; j < item_size; ++j) { - out_data[j] = pad_value; - } - } else { - // Point to the begin of next sequence - in_data += seq_len * item_size; - // Copy the last item of sequence to output - std::memcpy(out_data, (in_data - item_size), item_size * sizeof(T)); - } - out_data += item_size; - } - } -}; - -template -class FirstSeqPoolFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - T pad_value, - lite::Tensor* output) { - // Create pointers to input and output data - auto* in_data = input.data(); - auto* out_data = output->mutable_data(); - - // Calculate the size of each item in sequence - int64_t item_size = input.numel() / input.dims()[0]; - auto lod = input.lod()[0]; - int seq_num = static_cast(lod.size()) - 1; - for 
(int i = 0; i < seq_num; ++i) { - // Calculate the length of each sequence - int64_t seq_len = static_cast(lod[i + 1] - lod[i]); - if (seq_len == 0) { - for (int j = 0; j < item_size; ++j) { - out_data[j] = pad_value; - } - } else { - // Copy the first item of sequence to output - std::memcpy(out_data, in_data, item_size * sizeof(T)); - // Point to the next sequence - in_data += seq_len * item_size; - } - out_data += item_size; - } - } -}; - -template -class SumSeqPoolGradFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& out_grad, - lite::Tensor* in_grad) { - auto lod = in_grad->lod()[0]; - int64_t out_w = out_grad.numel() / out_grad.dims()[0]; - int64_t in_w = in_grad->numel() / in_grad->dims()[0]; - PADDLE_ENFORCE(in_w == out_w); - const T* out_g_data = out_grad.data(); - T* in_g_data = in_grad->mutable_data(TARGET(kX86)); - auto blas = math::GetBlas(context); - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - int64_t h = static_cast(lod[i + 1] - lod[i]); - if (h == 0) continue; - int64_t in_offset = lod[i] * in_w; - const T* out_pos = out_g_data + i * out_w; - T* in_pos = in_g_data + in_offset; - for (int r = 0; r != h; ++r) { - blas.VCOPY(in_w, out_pos, in_pos + r * in_w); - } - } - } -}; - -template -class SequencePoolFunctor { - public: - /* max pool has index output */ - void operator()(const lite::X86Context& context, - const std::string pooltype, - T pad_value, - const lite::Tensor& input, - lite::Tensor* output, - bool is_test, - lite::Tensor* index = nullptr) { - if (pooltype == "MAX") { - if (is_test) { - math::MaxSeqPoolFunctor max_pool; - max_pool(context, input, pad_value, output, index); - } else { - math::MaxSeqPoolFunctor max_pool; - max_pool(context, input, pad_value, output, index); - } - return; - } - if (pooltype == "LAST") { - math::LastSeqPoolFunctor last_pool; - last_pool(context, input, pad_value, output); - return; - } - if (pooltype == "FIRST") { - math::FirstSeqPoolFunctor first_pool; - first_pool(context, input, pad_value, output); - return; - } - - auto lod = input.lod()[0]; - if (pooltype == "SUM") { - const T* src = input.data(); - T* dst = output->mutable_data(TARGET(kX86)); - jit::seq_pool_attr_t attr( - static_cast(input.numel() / input.dims()[0]), - jit::SeqPoolType::kSum); - auto seqpool = - jit::KernelFuncs, lite::fluid::CPUPlace>::Cache() - .At(attr); - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - attr.h = static_cast(lod[i + 1] - lod[i]); - if (attr.h == 0) { - for (int j = 0; j < attr.w; ++j) { - dst[j] = pad_value; - } - } else { - seqpool(src, dst, &attr); - } - dst += attr.w; - src += attr.h * attr.w; - } - return; - } - auto eigen_device = lite::fluid::EigenDeviceType(); - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - Tensor out_t = output->Slice(i, i + 1); - int64_t w = input.numel() / input.dims()[0]; - if (lod[i] == lod[i + 1]) { - for (int j = 0; j < w; ++j) { - out_t.mutable_data()[j] = pad_value; - } - continue; - } - Tensor in_t = input.Slice(static_cast(lod[i]), - static_cast(lod[i + 1])); - int64_t h = static_cast(lod[i + 1] - lod[i]); - auto in_e = EigenMatrix::From(in_t, lite::DDim({h, w})); - auto out_e = EigenVector::Flatten(out_t); - if (pooltype == "AVERAGE") { - out_e.device(eigen_device) = in_e.mean(Eigen::array({{0}})); - } else if (pooltype == "SQRT") { - out_e.device(eigen_device) = in_e.sum(Eigen::array({{0}})) / - std::sqrt(static_cast(h)); - } else { - PADDLE_THROW("unsupported pooling pooltype"); - } - } - } -}; - -template -class 
SequencePoolGradFunctor { - public: - void operator()(const lite::X86Context& context, - const std::string pooltype, - const lite::Tensor& out_grad, - lite::Tensor* in_grad, - /* max pool has index */ - const lite::Tensor* index = nullptr) { - if (pooltype == "MAX") { - math::MaxSeqPoolGradFunctor max_pool_grad; - max_pool_grad(context, out_grad, *index, in_grad); - return; - } - - if (pooltype == "LAST" || pooltype == "FIRST") { - // set X@Grad be zero at first when pooltype is LAST/FIRST - math::SetConstant functor; - functor(context, in_grad, 0); - } - - if (pooltype == "SUM") { - math::SumSeqPoolGradFunctor sum_pool_grad; - sum_pool_grad(context, out_grad, in_grad); - return; - } - - auto lod = in_grad->lod()[0]; - - auto eigen_device = lite::fluid::EigenDeviceType(); - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - if (lod[i] == lod[i + 1]) continue; - auto in_g_t = in_grad->Slice(static_cast(lod[i]), - static_cast(lod[i + 1])); - auto out_g_t = out_grad.Slice(i, i + 1); - int64_t h = static_cast(lod[i + 1] - lod[i]); - int64_t w = in_grad->numel() / in_grad->dims()[0]; - auto in_g_e = EigenMatrix::From(in_g_t, DDim({h, w})); - auto out_g_e = EigenMatrix::From(out_g_t, DDim({1, w})); - auto out_g_e_v = EigenVector::Flatten(out_g_t); - Eigen::DSizes bcast(h, 1); - - if (pooltype == "AVERAGE") { - in_g_e.device(eigen_device) = - (out_g_e / static_cast(h)).broadcast(bcast); - } else if (pooltype == "SQRT") { - in_g_e.device(eigen_device) = - (out_g_e / std::sqrt(static_cast(h))).broadcast(bcast); - } else if (pooltype == "LAST") { - in_g_e.chip(h - 1, 0).device(eigen_device) = out_g_e_v; - } else if (pooltype == "FIRST") { - in_g_e.chip(0, 0).device(eigen_device) = out_g_e_v; - } else { - PADDLE_THROW("unsupported pooling pooltype"); - } - } - } -}; - -template class SequencePoolFunctor; -template class SequencePoolFunctor; -template class SequencePoolGradFunctor; -template class SequencePoolGradFunctor; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/sequence_pooling.h b/lite/backends/x86/math/sequence_pooling.h deleted file mode 100644 index d1a9f88f62..0000000000 --- a/lite/backends/x86/math/sequence_pooling.h +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
 */
-
-#pragma once
-#include <string>
-#include "lite/core/context.h"
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-template <lite::TargetType Target, typename T>
-class SequencePoolFunctor {
- public:
-  /* max pool has index output */
-  void operator()(const lite::Context<Target>& context,
-                  const std::string pooltype,
-                  T pad_value,
-                  const lite::Tensor& input,
-                  lite::Tensor* output,
-                  bool is_test = false,
-                  lite::Tensor* index = nullptr);
-};
-
-template <lite::TargetType Target, typename T>
-class SequencePoolGradFunctor {
- public:
-  void operator()(const lite::Context<Target>& context,
-                  const std::string pooltype,
-                  const lite::Tensor& out_grad,
-                  lite::Tensor* in_grad,
-                  /* max pool has index */
-                  const lite::Tensor* index = nullptr);
-};
-
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/sequence_pooling_test.cc b/lite/backends/x86/math/sequence_pooling_test.cc
deleted file mode 100644
index a730147673..0000000000
--- a/lite/backends/x86/math/sequence_pooling_test.cc
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/sequence_pooling.h"
-#include <gtest/gtest.h>
-#include <vector>
-
-template <typename DeviceContext, typename Place, typename T>
-void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
-  paddle::framework::LoDTensor cpu_out_grad;
-  paddle::framework::LoDTensor cpu_in_grad;
-  paddle::framework::LoDTensor out_grad;
-  paddle::framework::LoDTensor in_grad;
-  const size_t second_dim = 128u;
-
-  // construct out_grad's tensor in cpu
-  const size_t out_first_dim = lod[0].size() - 1;
-  auto out_dims = paddle::framework::make_ddim(
-      {static_cast<int64_t>(out_first_dim), static_cast<int64_t>(second_dim)});
-
-  cpu_out_grad.mutable_data<T>(out_dims, paddle::platform::CPUPlace());
-  for (int64_t i = 0; i < cpu_out_grad.numel(); ++i) {
-    cpu_out_grad.data<T>()[i] = static_cast<T>(i);
-  }
-
-  // copy to dst out_grad
-  auto* place = new Place();
-  DeviceContext* context = new DeviceContext(*place);
-  if (paddle::platform::is_cpu_place(*place)) {
-    out_grad = cpu_out_grad;
-  } else {
-    TensorCopySync(cpu_out_grad, *place, &out_grad);
-  }
-
-  // construct in_grad
-  in_grad.set_lod(lod);
-  auto in_dims = paddle::framework::make_ddim(
-      {static_cast<int64_t>(lod[0].back()), static_cast<int64_t>(second_dim)});
-  in_grad.mutable_data<T>(in_dims, context->GetPlace());
-
-  // check tensor construction result
-  PADDLE_ENFORCE_EQ(in_grad.dims().size(), out_grad.dims().size());
-  for (int64_t i = 1; i < out_grad.dims().size(); ++i) {
-    PADDLE_ENFORCE_EQ(in_grad.dims()[i], out_grad.dims()[i]);
-  }
-
-  // call functor
-  paddle::operators::math::SequencePoolGradFunctor<DeviceContext, T>()(
-      *context, "SUM", out_grad, &in_grad);
-
-  if (paddle::platform::is_cpu_place(*place)) {
-    cpu_in_grad = in_grad;
-  } else {
-    TensorCopySync(in_grad, paddle::platform::CPUPlace(), &cpu_in_grad);
-    cpu_in_grad.set_lod(in_grad.lod());
-  }
-
-  EXPECT_EQ(in_grad.numel(), static_cast<int64_t>(lod[0].back() * second_dim));
-  EXPECT_EQ(in_grad.lod(), lod);
-
-  if (paddle::platform::is_cpu_place(*place)) {
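-    // For SUM pooling, the gradient of every timestep inside sequence i is
-    // a copy of row i of out_grad, so each row of the sliced sub-tensor is
-    // compared against the matching out_grad row.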
for (size_t i = 0; i < in_grad.lod()[0].size() - 1; ++i) { - int64_t begin = in_grad.lod()[0][i]; - int64_t end = in_grad.lod()[0][i + 1]; - paddle::framework::Tensor tmp = in_grad.Slice(begin, end); - for (size_t j = 0; j != tmp.numel() / second_dim; ++j) { - for (int64_t m = 0; m != second_dim; ++m) { - EXPECT_EQ(tmp.data()[m + j * second_dim], - out_grad.data()[m + i * second_dim]); - } - } - } - } else { - for (size_t i = 0; i < cpu_in_grad.lod()[0].size() - 1; ++i) { - int64_t begin = cpu_in_grad.lod()[0][i]; - int64_t end = cpu_in_grad.lod()[0][i + 1]; - paddle::framework::Tensor tmp = cpu_in_grad.Slice(begin, end); - for (size_t j = 0; j != tmp.numel() / second_dim; ++j) { - for (int64_t m = 0; m != second_dim; ++m) { - EXPECT_EQ(tmp.data()[m + j * second_dim], - cpu_out_grad.data()[m + i * second_dim]); - } - } - } - } - - delete place; - delete context; -} - -TEST(SequencePoolingGrad, CPU_SUM) { - paddle::framework::LoD lod1; - lod1.push_back(std::vector{0, 10}); - TestSequencePoolingSum(lod1); - - paddle::framework::LoD lod2; - lod2.push_back(std::vector{0, 2, 7, 10}); - TestSequencePoolingSum(lod2); -} - -#ifdef PADDLE_WITH_CUDA -TEST(SequencePoolingGrad, CUDA_SUM) { - paddle::framework::LoD lod1; - lod1.push_back(std::vector{0, 10}); - TestSequencePoolingSum(lod1); - - paddle::framework::LoD lod2; - lod2.push_back(std::vector{0, 2, 7, 10}); - TestSequencePoolingSum(lod2); -} -#endif diff --git a/lite/backends/x86/math/sequence_scale.cc b/lite/backends/x86/math/sequence_scale.cc deleted file mode 100644 index fad0628de1..0000000000 --- a/lite/backends/x86/math/sequence_scale.cc +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/x86/math/sequence_scale.h" -#include "lite/fluid/lod.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -class ScaleLoDTensorFunctor { - public: - void operator()(const lite::Context& context, - const T* scales, - lite::Tensor* seq) { - const size_t level = 0; - auto lod = seq->lod(); - const size_t num_seq = lod[level].size() - 1; - size_t seq_width = seq->dims()[1]; - lite::LoD abs_offset_lod = lite::fluid::ToAbsOffset(lod); - - T* seq_data = seq->mutable_data(lite::TargetType::kX86); - for (size_t i = 0; i < num_seq; ++i) { - for (size_t j = lod[level][i] * seq_width; - j < lod[level][i + 1] * seq_width; - ++j) { - seq_data[j] *= scales[i]; - } - } - } -}; - -template class ScaleLoDTensorFunctor; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/sequence_scale.h b/lite/backends/x86/math/sequence_scale.h deleted file mode 100644 index 44c1a233d9..0000000000 --- a/lite/backends/x86/math/sequence_scale.h +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "lite/core/context.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -/* - * \brief Scale a sequence. - * - * All sequences will be padded to the same length and stored in a transposed - * shape. - * Example: - * Given: - * seq = (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3) - * scales = (2, 3, 4, 5) - * then: - * result = (2*s0, 2*s0, 2*s0, 2*s0; 3*s1, 3*s1; 4*s2, 4*s2, 4*s2; 5*s3) - - * - * \param context Device context of this functor. - * \param seq LoDTensor which is stored in sequence format, the shape - * is [total_sequence_length, sequence_width] where - * total_sequence_length is the sum of all sequences' - * length. - * \param scales Array. The i-th sequence will be scaled by scales[i]. - * \param num_seq Number of sequence - * - */ - -template -class ScaleLoDTensorFunctor { - public: - void operator()(const lite::Context& context, - const T* scales, - lite::Tensor* seq); -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/softmax.cc b/lite/backends/x86/math/softmax.cc deleted file mode 100644 index 1f7144dd8b..0000000000 --- a/lite/backends/x86/math/softmax.cc +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/x86/math/softmax.h" -#include "lite/backends/x86/math/softmax_impl.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxGradFunctor; -template class SoftmaxGradFunctor; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/softmax.h b/lite/backends/x86/math/softmax.h deleted file mode 100644 index 299ccef58a..0000000000 --- a/lite/backends/x86/math/softmax.h +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "lite/core/context.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -class SoftmaxFunctor { - public: - void operator()(const lite::Context& context, - const int axis_dim, - const lite::Tensor* X, - lite::Tensor* Y); -}; - -template -class SoftmaxGradFunctor { - public: - void operator()(const lite::Context& context, - const int axis_dim, - const lite::TensorLite* y, - const lite::TensorLite* y_grad, - lite::TensorLite* x_grad); -}; - -//#ifdef PADDLE_WITH_CUDA -// template -// class SoftmaxCUDNNFunctor { -// public: -// void operator()(const platform::CUDADeviceContext& context, -// const lite::TensorLite* X, lite::TensorLite* Y); -//}; -// -// template -// class SoftmaxGradCUDNNFunctor { -// public: -// void operator()(const platform::CUDADeviceContext& context, -// const lite::TensorLite* Y, const lite::TensorLite* y_grad, -// lite::TensorLite* x_grad); -//}; -// -//#endif - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/softmax_impl.h b/lite/backends/x86/math/softmax_impl.h deleted file mode 100644 index ae997a8680..0000000000 --- a/lite/backends/x86/math/softmax_impl.h +++ /dev/null @@ -1,245 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "lite/backends/x86/cpu_info.h" -#include "lite/backends/x86/jit/helper.h" -#include "lite/backends/x86/jit/kernel_base.h" -#include "lite/backends/x86/jit/kernels.h" -#include "lite/backends/x86/math/cpu_vec.h" -#include "lite/core/tensor.h" -#include "lite/fluid/eigen.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -using EigenMatrix = lite::fluid::EigenMatrix; - -template -struct ValueClip { - HOSTDEVICE T operator()(const T& x) const { - const T kThreshold = static_cast(-64.); - return x < kThreshold ? 
kThreshold : x; - } -}; - -template -void SoftmaxEigen(const lite::Context& context, - const int axis_dim, - const lite::Tensor* X, - lite::Tensor* Y) { - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - - auto logits = EigenMatrix::From(*X); - auto softmax = EigenMatrix::From(*Y); - - const int batch_size = logits.dimension(kBatchDim); - const int num_classes = logits.dimension(kClassDim); - const int num_remain = num_classes / axis_dim; - - Eigen::DSizes along_class(kClassDim); - Eigen::DSizes batch_by_one(batch_size, 1); - Eigen::DSizes one_by_class(1, num_classes); - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - Eigen::DSizes one_axis(1, axis_dim); - - auto shifted_logits = (logits - - logits.maximum(along_class) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)) - .unaryExpr(ValueClip()); - - softmax.device(typename lite::fluid::EigenDevice::Type()) = - shifted_logits.exp(); - softmax.device(typename lite::fluid::EigenDevice::Type()) = - (softmax * - softmax.reshape(batch_axis_remain) - .sum(along_class) - .inverse() - .eval() - .broadcast(one_axis)); -} - -template -void SoftmaxFunctor::operator()( - const lite::Context& context, - const int axis_dim, - const lite::Tensor* X, - lite::Tensor* Y) { - SoftmaxEigen, T, is_test>(context, axis_dim, X, Y); -} - -template -using enable_if_CPU = typename std::enable_if< - std::is_same, lite::X86Context>::value>::type; - -template -class SoftmaxFunctor> { - public: - void operator()(const lite::Context& context, - const int axis_dim, - const lite::Tensor* X, - lite::Tensor* Y) { - auto in_dims = X->dims(); - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - - const int num_classes = in_dims[kClassDim]; - const int batch_size = in_dims[kBatchDim]; - const int num_remain = num_classes / axis_dim; - - if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { - const T* in_data = X->data(); - auto* out_data = Y->mutable_data(); - for (int bs = 0; bs < batch_size; ++bs) { - T max_val = *std::max_element(in_data, in_data + num_classes); - max_val *= static_cast(-1); - vec_add_bias( - num_classes, max_val, in_data, out_data); - vec_clip( - num_classes, static_cast(-64), out_data, out_data); - vec_exp(num_classes, out_data, out_data); - - T sum = 0; - vec_sum(num_classes, out_data, &sum); - sum = static_cast(1) / sum; - vec_scal(num_classes, sum, out_data, out_data); - - in_data += num_classes; - out_data += num_classes; - } - } else { - SoftmaxEigen(context, axis_dim, X, Y); - } - } -}; - -template -class SoftmaxFunctor> { - public: - void operator()(const lite::Context& context, - const int axis_dim, - const lite::Tensor* X, - lite::Tensor* Y) { - auto in_dims = X->dims(); - const float* in_data = X->data(); - float* out_data = Y->mutable_data(); - const int kBatchDim = 0; - const int kClassDim = 1; - // 2D data. 
Batch x C - auto compute_softmax = - lite::jit::KernelFuncs, - fluid::CPUPlace>::Cache() - .At(in_dims[kClassDim]); - compute_softmax(in_data, - out_data, - in_dims[kClassDim], - in_dims[kBatchDim], - in_dims[kClassDim] / axis_dim); - } -}; - -template -void SoftmaxGradEigen(const lite::Context& context, - const int axis_dim, - const lite::Tensor* y, - const lite::Tensor* y_grad, - lite::Tensor* x_grad) { - auto softmax = EigenMatrix::From(*y); - auto softmax_grad = EigenMatrix::From(*y_grad); - auto logits_grad = EigenMatrix::From(*x_grad); - - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - - const int batch_size = softmax.dimension(kBatchDim); - const int num_classes = softmax.dimension(kClassDim); - const int num_remain = num_classes / axis_dim; - - Eigen::DSizes along_class(kClassDim); - Eigen::DSizes batch_by_one(batch_size, 1); - Eigen::DSizes one_by_class(1, num_classes); - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - Eigen::DSizes one_axis(1, axis_dim); - - auto dot = (softmax * softmax_grad) - .reshape(batch_axis_remain) - .sum(along_class) - .eval() - .broadcast(one_axis); - // logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * - // softmax; - logits_grad.device(typename lite::fluid::EigenDevice::Type()) = - (softmax_grad - dot) * softmax; -} - -template -void SoftmaxGradFunctor::operator()( - const lite::Context& context, - const int axis_dim, - const lite::Tensor* y, - const lite::Tensor* y_grad, - lite::Tensor* x_grad) { - SoftmaxGradEigen, T>( - context, axis_dim, y, y_grad, x_grad); -} - -template -class SoftmaxGradFunctor> { - public: - void operator()(const lite::Context& context, - const int axis_dim, - const lite::Tensor* y, - const lite::Tensor* y_grad, - lite::Tensor* x_grad) { - auto out_dims = y->dims(); - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - const int num_classes = out_dims[kClassDim]; - const int batch_size = out_dims[kBatchDim]; - const int num_remain = num_classes / axis_dim; - - if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { - const T* out_data = y->data(); - const T* out_grad = y_grad->data(); - T* in_grad = x_grad->mutable_data(); - for (int bs = 0; bs < batch_size; ++bs) { - T scalar; - vec_mul_reduce( - num_classes, out_grad, out_data, &scalar); - scalar *= static_cast(-1); - vec_add_bias(num_classes, scalar, out_grad, in_grad); - vec_mul(num_classes, out_data, in_grad, in_grad); - out_data += num_classes; - out_grad += num_classes; - in_grad += num_classes; - } - } else { - SoftmaxGradEigen(context, axis_dim, y, y_grad, x_grad); - } - } -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/tree2col.cc b/lite/backends/x86/math/tree2col.cc deleted file mode 100644 index 8a34bebef0..0000000000 --- a/lite/backends/x86/math/tree2col.cc +++ /dev/null @@ -1,204 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
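All of the softmax paths above (the Eigen path, the AVX-vectorized path, and the JIT kernel) compute the same numerically stable recipe for the axis_dim == num_classes case: subtract the per-row maximum before exponentiating (the Eigen and AVX paths additionally clip shifted logits at -64), then normalize by the row sum. A minimal standalone sketch of that recipe in plain C++; the function name and the flat [batch, classes] layout here are illustrative, not part of the deleted sources:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Row-wise numerically stable softmax over a flat [batch, classes] buffer,
// mirroring the max-subtraction + clip(-64) + exp + normalize steps above.
void softmax_2d(const float* in, float* out, int batch, int classes) {
  const float kThreshold = -64.f;  // same clip value as ValueClip above
  for (int b = 0; b < batch; ++b) {
    const float* row_in = in + b * classes;
    float* row_out = out + b * classes;
    float max_val = *std::max_element(row_in, row_in + classes);
    float sum = 0.f;
    for (int c = 0; c < classes; ++c) {
      float shifted = std::max(row_in[c] - max_val, kThreshold);
      row_out[c] = std::exp(shifted);
      sum += row_out[c];
    }
    for (int c = 0; c < classes; ++c) row_out[c] /= sum;
  }
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 1000.f, 1001.f, 1002.f};
  std::vector<float> y(x.size());
  softmax_2d(x.data(), y.data(), /*batch=*/2, /*classes=*/3);
  // Both rows print the same distribution: large logits do not overflow.
  for (float v : y) std::printf("%.4f ", v);
  std::printf("\n");
  return 0;
}
```

Subtracting the row maximum keeps every exponent non-positive, so even the second row's huge logits cannot overflow; the clip only bounds how small the shifted logits may get.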
- -#include "lite/backends/x86/math/tree2col.h" -#include -#include - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { -std::vector Tree2ColUtil::construct_patch( - size_t root, int max_depth, const std::vector> &tr) { - std::stack> stack; - std::unordered_map visited; - std::vector patch; - - stack.push(TreeNode(root, 1, 1, 0)); - patch.emplace_back(TreeNode(root, 1, 1, 0)); - visited[root] = true; - - while (!stack.empty()) { - TreeNode &u = stack.top(); - bool end = true; - size_t node = u.get_node(), sz = tr[node].size(); - visited[node] = true; - for (size_t i = 0; i < sz; i++) { - size_t v = tr[node][i]; - if (!visited[v] && static_cast(u.get_depth()) + 1 < max_depth) { - visited[v] = true; - stack.push(TreeNode(v, i, sz, u.get_depth() + 1)); - patch.push_back(TreeNode(v, i + 1, sz, u.get_depth() + 1)); - end = false; - } - } - if (end) { - stack.pop(); - } - } - return patch; -} - -void Tree2ColUtil::construct_tree(const lite::Tensor &EdgeSet, - std::vector> *tr, - size_t *node_count) { - auto edge_set_dims = EdgeSet.dims(); - PADDLE_ENFORCE_EQ(edge_set_dims[1], 2); - int64_t edge_count = EdgeSet.numel(); - - const int *edge_data = EdgeSet.data(); - - for (int64_t i = 0; i < edge_count; i += 2) { - int u = edge_data[i], v = edge_data[i + 1]; - if (u != 0 && v != 0) (*node_count)++; - } - (*node_count)++; - - tr->resize(static_cast(*node_count + 1)); - - for (int64_t i = 0; i < edge_count; i += 2) { - int u = edge_data[i], v = edge_data[i + 1]; - if (u != 0 && v != 0) { - tr->at(u).push_back(v); - } else { - break; - } - } -} - -template -class Tree2ColFunctor { - public: - void operator()(const lite::X86Context &context, - const lite::Tensor &EdgeSet, - const lite::Tensor &node_features, - lite::Tensor *patch, - int max_depth) { - std::vector> tr; - auto feature_dims = node_features.dims(); - math::SetConstant constant; - int64_t feature_size = feature_dims[1]; - size_t patch_elem_size = 3 * static_cast(feature_size); - size_t node_count = 0, patch_count = 0, patch_size; - Tree2ColUtil::construct_tree(EdgeSet, &tr, &node_count); - std::vector> processing_list; - for (size_t u = 1; u <= node_count; u++) { - std::vector temp_patch = - Tree2ColUtil::construct_patch(u, max_depth, tr); - if (!temp_patch.empty()) { - processing_list.emplace_back(temp_patch); - } - } - patch_size = processing_list.size(); - - // T *patch_data = - // patch->mutable_data({static_cast(patch_size), - // static_cast(patch_elem_size)}, - // cpu_place); - patch->Resize({static_cast(patch_size, patch_elem_size)}); - auto *patch_data = patch->mutable_data(lite::TargetType::kX86); - constant(context, patch, 0); - const T *features = node_features.data(); - - for (auto &patch_item : processing_list) { - size_t pointer_base = patch_count * patch_elem_size; - for (auto &v : patch_item) { - T eta_l = v.eta_l(max_depth), eta_r = v.eta_r(max_depth), - eta_t = v.eta_t(max_depth); - size_t id = v.get_node() - 1; - for (int i = 0; i < feature_size; i++) { - patch_data[pointer_base + i * 3] += - eta_l * features[id * feature_size + i]; - patch_data[pointer_base + i * 3 + 1] += - eta_r * features[id * feature_size + i]; - patch_data[pointer_base + i * 3 + 2] += - eta_t * features[id * feature_size + i]; - } - } - patch_count++; - } - patch->Resize({static_cast(patch_count), - static_cast(patch_elem_size)}); - } -}; -template -class Col2TreeFunctor { - public: - void operator()(const lite::X86Context &context, - const lite::Tensor &EdgeSet, - const lite::Tensor &out_grad, - lite::Tensor *in_grad, - int 
max_depth) { - std::vector> tr; - auto output_dims = out_grad.dims(); - // auto cpu_place = boost::get(context.GetPlace()); - math::SetConstant constant; - int64_t output_size = output_dims[1]; - size_t grad_elem_size = 3 * static_cast(output_size); - size_t node_count = 0, grad_count = 0; - Tree2ColUtil::construct_tree(EdgeSet, &tr, &node_count); - std::vector> processing_list; - std::vector> grad_list; - grad_list.resize(node_count); - for (size_t u = 1; u <= node_count; u++) { - std::vector tmp = - Tree2ColUtil::construct_patch(u, max_depth, tr); - if (!tmp.empty()) { - processing_list.push_back(tmp); - } - } - for (size_t patch_id = 0; patch_id < processing_list.size(); patch_id++) { - for (auto v : processing_list[patch_id]) { - grad_list[v.get_node() - 1].push_back(v.change_node(patch_id + 1)); - } - } - // T *grad_data = - // in_grad->mutable_data({static_cast(node_count), - // static_cast(grad_elem_size)}, - // cpu_place); - in_grad->Resize({static_cast(node_count), - static_cast(grad_elem_size)}); - auto *grad_data = in_grad->mutable_data(lite::TargetType::kX86); - - constant(context, in_grad, 0); - const T *out_g = out_grad.data(); - for (auto &patch_item : grad_list) { - size_t pointer_base = grad_count * grad_elem_size; - for (auto &v : patch_item) { - T eta_l = v.eta_l(max_depth), eta_r = v.eta_r(max_depth), - eta_t = v.eta_t(max_depth); - size_t id = v.get_node() - 1; - for (int i = 0; i < output_size; i++) { - grad_data[pointer_base + i * 3] += - eta_l * out_g[id * output_size + i]; - grad_data[pointer_base + i * 3 + 1] += - eta_r * out_g[id * output_size + i]; - grad_data[pointer_base + i * 3 + 2] += - eta_t * out_g[id * output_size + i]; - } - } - grad_count++; - } - } -}; - -template class Tree2ColFunctor; -template class Tree2ColFunctor; -template class Col2TreeFunctor; -template class Col2TreeFunctor; -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/tree2col.h b/lite/backends/x86/math/tree2col.h deleted file mode 100644 index 3a48c2f40a..0000000000 --- a/lite/backends/x86/math/tree2col.h +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
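The patch construction above weights each node's feature vector by three coefficients, eta_t, eta_l, and eta_r, computed from the node's depth, its index among its siblings, and the sibling count pclen; these are the "top", "left", and "right" weights of tree-based convolution. A self-contained sketch that restates the TreeNode formulas from the header below as free functions and evaluates them for one node (the sample values are worked out by hand):

```cpp
#include <cstdio>

// The same weighting formulas as TreeNode::eta_t / eta_l / eta_r below,
// restated as free functions (illustrative sketch only).
double eta_t(double depth, double filter_depth) {
  return (filter_depth - depth) / filter_depth;
}

double eta_l(double depth, double index, double pclen, double filter_depth) {
  // An only child sits in the middle; otherwise interpolate by sibling index.
  double pos = (pclen == 1.0) ? 0.5 : (index - 1.0) / (pclen - 1.0);
  return (1.0 - eta_t(depth, filter_depth)) * pos;
}

double eta_r(double depth, double index, double pclen, double filter_depth) {
  return (1.0 - eta_t(depth, filter_depth)) *
         (1.0 - eta_l(depth, index, pclen, filter_depth));
}

int main() {
  // A node at depth 1, second of three siblings, with filter depth 2.
  std::printf("eta_t=%g eta_l=%g eta_r=%g\n",
              eta_t(1, 2),         // (2 - 1) / 2      = 0.5
              eta_l(1, 2, 3, 2),   // 0.5 * (1 / 2)    = 0.25
              eta_r(1, 2, 3, 2));  // 0.5 * (1 - 0.25) = 0.375
  return 0;
}
```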
-
-#pragma once
-
-#include <stack>
-#include <unordered_map>
-#include <vector>
-#include "lite/backends/x86/math/math_function.h"
-#include "lite/core/context.h"
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-class TreeNode {
- public:
-  size_t node;
-  explicit TreeNode(size_t node = 0,
-                    size_t index = 0,
-                    size_t pclen = 0,
-                    size_t depth = 0)
-      : node(node), index(index), pclen(pclen), depth(depth) {}
-  template <typename T>
-  T eta_t(T filter_depth) {
-    return ((filter_depth - this->depth) / filter_depth);
-  }
-  template <typename T>
-  T eta_l(T filter_depth) {
-    T temp;
-    if (this->pclen == 1) {
-      temp = 0.5;
-    } else {
-      temp = (this->index - 1.0) / (this->pclen - 1.0);
-    }
-    return (1.0 - this->eta_t<T>(filter_depth)) * temp;
-  }
-  template <typename T>
-  T eta_r(T filter_depth) {
-    return (1.0 - this->eta_t<T>(filter_depth)) *
-           (1.0 - this->eta_l<T>(filter_depth));
-  }
-  TreeNode change_node(size_t v) {
-    return TreeNode(v, this->index, this->pclen, this->depth);
-  }
-  size_t get_node() { return this->node; }
-  size_t get_depth() { return this->depth; }
-
- private:
-  size_t index, pclen, depth;
-};
-class Tree2ColUtil {
- public:
-  static std::vector<TreeNode> construct_patch(
-      size_t root, int max_depth, const std::vector<std::vector<size_t>> &tr);
-
-  static void construct_tree(const lite::Tensor &EdgeSet,
-                             std::vector<std::vector<size_t>> *tr,
-                             size_t *node_count);
-};
-
-template <lite::TargetType Target, typename T>
-class Tree2ColFunctor {
- public:
-  void operator()(const lite::Context<Target> &context,
-                  const lite::Tensor &EdgeSet,
-                  const lite::Tensor &node_features,
-                  lite::Tensor *patch,
-                  int max_depth);
-};
-template <lite::TargetType Target, typename T>
-class Col2TreeFunctor {
- public:
-  void operator()(const lite::Context<Target> &context,
-                  const lite::Tensor &EdgeSet,
-                  const lite::Tensor &out_grad,
-                  lite::Tensor *in_grad,
-                  int max_depth);
-};
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/unpooling.cc b/lite/backends/x86/math/unpooling.cc
deleted file mode 100644
index 568f9952ca..0000000000
--- a/lite/backends/x86/math/unpooling.cc
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-#include "lite/backends/x86/math/unpooling.h"
-#include "lite/utils/paddle_enforce.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-template <typename T>
-class Unpool2dMaxFunctor<lite::TargetType::kX86, T> {
- public:
-  void operator()(const lite::X86Context& context,
-                  const lite::Tensor& input,
-                  const lite::Tensor& indices,
-                  lite::Tensor* output) {
-    const int batch_size = input.dims()[0];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output->dims()[1];
-    const int output_height = output->dims()[2];
-    const int output_width = output->dims()[3];
-    int input_feasize = input_height * input_width;
-    int output_feasize = output_height * output_width;
-    const T* input_data = input.data<T>();
-    const int* indices_data = indices.data<int>();
-    T* output_data = output->mutable_data<T>(lite::TargetType::kX86);
-    for (int b = 0; b < batch_size; ++b) {
-      for (int c = 0; c < output_channels; ++c) {
-        for (int i = 0; i < input_feasize; ++i) {
-          int index = indices_data[i];
-          PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
-          output_data[index] = input_data[i];
-        }
-        input_data += input_feasize;
-        indices_data += input_feasize;
-        output_data += output_feasize;
-      }
-    }
-  }
-};
-template <typename T>
-class Unpool2dMaxGradFunctor<lite::TargetType::kX86, T> {
- public:
-  void operator()(const lite::X86Context& context,
-                  const lite::Tensor& input,
-                  const lite::Tensor& indices,
-                  const lite::Tensor& output,
-                  const lite::Tensor& output_grad,
-                  lite::Tensor* input_grad) {
-    const int batch_size = input.dims()[0];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
-    int input_feasize = input_height * input_width;
-    int output_feasize = output_height * output_width;
-    const int* indices_data = indices.data<int>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86);
-
-    for (int b = 0; b < batch_size; ++b) {
-      for (int c = 0; c < output_channels; ++c) {
-        for (int i = 0; i < input_feasize; ++i) {
-          int index = indices_data[i];
-          PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
-          input_grad_data[i] = output_grad_data[index];
-        }
-        input_grad_data += input_feasize;
-        indices_data += input_feasize;
-        output_grad_data += output_feasize;
-      }
-    }
-  }
-};
-template class Unpool2dMaxGradFunctor<lite::TargetType::kX86, float>;
-template class Unpool2dMaxGradFunctor<lite::TargetType::kX86, double>;
-template class Unpool2dMaxFunctor<lite::TargetType::kX86, float>;
-template class Unpool2dMaxFunctor<lite::TargetType::kX86, double>;
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/unpooling.h b/lite/backends/x86/math/unpooling.h
deleted file mode 100644
index 18948465f3..0000000000
--- a/lite/backends/x86/math/unpooling.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-#pragma once
-#include "lite/core/context.h"
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-template <lite::TargetType Target, typename T>
-class Unpool2dMaxFunctor {
- public:
-  void operator()(const lite::Context<Target>& context,
-                  const lite::Tensor& input,
-                  const lite::Tensor& indices,
-                  lite::Tensor* output);
-};
-template <lite::TargetType Target, typename T>
-class Unpool2dMaxGradFunctor {
- public:
-  void operator()(const lite::Context<Target>& context,
-                  const lite::Tensor& input,
-                  const lite::Tensor& indices,
-                  const lite::Tensor& output,
-                  const lite::Tensor& output_grad,
-                  lite::Tensor* input_grad);
-};
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/vol2col.cc b/lite/backends/x86/math/vol2col.cc
deleted file mode 100644
index 8fd5e8954e..0000000000
--- a/lite/backends/x86/math/vol2col.cc
+++ /dev/null
@@ -1,204 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "lite/backends/x86/math/vol2col.h"
-#include <vector>
-#include "lite/utils/paddle_enforce.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-/*
- * vol = [input_channels, input_depth, input_height, input_width]
- * col =
- *   [input_channels, filter_depth, filter_height, filter_width,
- *    output_depth, output_height, output_width]
- */
-template <typename T>
-class Vol2ColFunctor<lite::TargetType::kX86, T> {
- public:
-  void operator()(const lite::X86Context& context,
-                  const lite::Tensor& vol,
-                  const std::vector<int>& dilations,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  lite::Tensor* col) const {
-    PADDLE_ENFORCE(vol.dims().size() == 4);
-    PADDLE_ENFORCE(col->dims().size() == 7);
-
-    int input_channels = vol.dims()[0];
-    int input_depth = vol.dims()[1];
-    int input_height = vol.dims()[2];
-    int input_width = vol.dims()[3];
-    int filter_depth = col->dims()[1];
-    int filter_height = col->dims()[2];
-    int filter_width = col->dims()[3];
-    int output_depth = col->dims()[4];
-    int output_height = col->dims()[5];
-    int output_width = col->dims()[6];
-    int channels_col =
-        input_channels * filter_depth * filter_height * filter_width;
-
-    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
-                       ((dilations[0] * (filter_depth - 1) + 1))) /
-                              strides[0] +
-                          1,
-                      output_depth,
-                      "input_depth and output_depth are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
-                       ((dilations[1] * (filter_height - 1) + 1))) /
-                              strides[1] +
-                          1,
-                      output_height,
-                      "input_height and output_height are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
-                       ((dilations[2] * (filter_width - 1) + 1))) /
-                              strides[2] +
-                          1,
-                      output_width,
-                      "input_width and output_width are "
-                      "mismatching.");
-
-    const T* vol_data = vol.data<T>();
-    T* col_data = col->mutable_data<T>();
-
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int d_offset = (c / filter_width / filter_height) % filter_depth;
-      int c_in = c / filter_width / filter_height / filter_depth;
-      for (int d = 0; d < output_depth; ++d) {
-        int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0];
-        for (int h = 0; h < output_height; ++h) {
-          int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1];
-          for (int w = 0; w < output_width; ++w) {
-            int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2];
-
-            int col_idx =
-                ((c * output_depth + d) * output_height + h) * output_width +
-                w;
-            int vol_idx =
-                ((c_in * input_depth + d_pad) * input_height + h_pad) *
-                    input_width +
-                w_pad;
-            col_data[col_idx] =
-                (h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
-                 w_pad >= input_width || d_pad < 0 || d_pad >= input_depth)
-                    ? static_cast<T>(0)
-                    : vol_data[vol_idx];
-          }
-        }
-      }
-    }
-  }
-};
-
-/*
- * vol = [input_channels, input_depth, input_height, input_width]
- * col =
- *   [input_channels, filter_depth, filter_height, filter_width,
- *    output_depth, output_height, output_width]
- */
-template <typename T>
-class Col2VolFunctor<lite::TargetType::kX86, T> {
- public:
-  void operator()(const lite::X86Context& context,
-                  const lite::Tensor& col,
-                  const std::vector<int>& dilations,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  lite::Tensor* vol) const {
-    PADDLE_ENFORCE(vol->dims().size() == 4);
-    PADDLE_ENFORCE(col.dims().size() == 7);
-
-    int input_channels = vol->dims()[0];
-    int input_depth = vol->dims()[1];
-    int input_height = vol->dims()[2];
-    int input_width = vol->dims()[3];
-    int filter_depth = col.dims()[1];
-    int filter_height = col.dims()[2];
-    int filter_width = col.dims()[3];
-    int output_depth = col.dims()[4];
-    int output_height = col.dims()[5];
-    int output_width = col.dims()[6];
-    int channels_col =
-        input_channels * filter_depth * filter_height * filter_width;
-
-    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
-                       ((dilations[0] * (filter_depth - 1) + 1))) /
-                              strides[0] +
-                          1,
-                      output_depth,
-                      "input_depth and output_depth are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
-                       ((dilations[1] * (filter_height - 1) + 1))) /
-                              strides[1] +
-                          1,
-                      output_height,
-                      "input_height and output_height are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
-                       ((dilations[2] * (filter_width - 1) + 1))) /
-                              strides[2] +
-                          1,
-                      output_width,
-                      "input_width and output_width are "
-                      "mismatching.");
-    T* vol_data = vol->mutable_data<T>();
-    const T* col_data = col.data<T>();
-
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int d_offset = (c / filter_width / filter_height) % filter_depth;
-      int cIm = c / filter_width / filter_height / filter_depth;
-      for (int d = 0; d < output_depth; ++d) {
-        int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0];
-        for (int h = 0; h < output_height; ++h) {
-          int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1];
-          for (int w = 0; w < output_width; ++w) {
-            int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2];
-
-            if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 &&
-                w_pad < input_width && d_pad >= 0 && d_pad < input_depth) {
-              int vol_idx =
-                  ((cIm * input_depth + d_pad) * input_height + h_pad) *
-                      input_width +
-                  w_pad;
-
-              int col_idx =
-                  ((c * output_depth + d) * output_height + h) * output_width +
-                  w;
-              vol_data[vol_idx] += col_data[col_idx];
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-template class Vol2ColFunctor<lite::TargetType::kX86, float>;
-template class Vol2ColFunctor<lite::TargetType::kX86, double>;
-template class Col2VolFunctor<lite::TargetType::kX86, float>;
-template class Col2VolFunctor<lite::TargetType::kX86, double>;
-
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/vol2col.h b/lite/backends/x86/math/vol2col.h
deleted file mode 100644
index 4583fde6b2..0000000000
--- a/lite/backends/x86/math/vol2col.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "lite/core/context.h"
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-/*
- * \brief Converts the feature data of four dimensions (CDHW) into colData of
- *        seven dimensions in the Vol2ColFunctor calculation, and in the
- *        Col2VolFunctor calculation it is reversed.
- *
- * \param volData   Vol data.
- * \param volShape  The shape of volData,
- *                  [input_channels, input_depth, input_height, input_width].
- * \param colData   Column data.
- * \param colShape  The shape of colData.
- *
- * \param dilations    dilation data.
- * \param 3-dimension  [dilation_depth, dilation_height, dilation_width].
- *
- * \param strides      stride data.
- * \param 3-dimension  [stride_depth, stride_height, stride_width].
- *
- * \param paddings     padding data.
- * \param 3-dimension  [d_pad, h_pad, w_pad].
- *
- * The shape of colData is:
- * [input_channels, filter_depth, filter_height, filter_width, output_depth,
- *  output_height, output_width]
- * So, it is easy to reshape into a convolution matrix for a convolution
- * calculation based on matrix multiplication.
- * The shape of the convolution matrix is [height, width], where the height is
- * equal to input_channels * filter_depth * filter_height * filter_width, and
- * the width is equal to output_depth * output_height * output_width.
- *
- * Reshape:
- *     shape of colData          shape of convolution matrix
- *     [input_channels,
- *      filter_depth,
- *      filter_height,
- *      filter_width,     ======>      [height, width]
- *      output_depth,
- *      output_height,
- *      output_width]
- *
- * \note The caller needs to ensure that volShape.inputChannels is equal to
- *       colShape.inputChannels.
- */
-template <lite::TargetType Target, typename T>
-class Vol2ColFunctor {
- public:
-  void operator()(const lite::Context<Target>& context,
                  const lite::Tensor& vol,
-                  const std::vector<int>& dilations,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  lite::Tensor* col) const;
-};
-
-template <lite::TargetType Target, typename T>
-class Col2VolFunctor {
- public:
-  void operator()(const lite::Context<Target>& context,
-                  const lite::Tensor& col,
-                  const std::vector<int>& dilations,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  lite::Tensor* vol) const;
-};
-
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/mklml.cc b/lite/backends/x86/mklml.cc
deleted file mode 100644
index 1c3c3c3bde..0000000000
--- a/lite/backends/x86/mklml.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "lite/backends/x86/mklml.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-
-std::once_flag mklml_dso_flag;
-void* mklml_dso_handle = nullptr;
-
-#define DEFINE_WRAP(__name) DynLoad__##__name __name
-
-MKLML_ROUTINE_EACH(DEFINE_WRAP);
-
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/mklml.h b/lite/backends/x86/mklml.h
deleted file mode 100644
index 753c42f295..0000000000
--- a/lite/backends/x86/mklml.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <mkl.h>
-#include <mutex>  // NOLINT
-#include "lite/backends/x86/dynamic_loader.h"
-#include "lite/backends/x86/port.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-
-extern std::once_flag mklml_dso_flag;
-extern void* mklml_dso_handle;
-
-/**
- * The following macro definition can generate structs
- * (one for each function) that dynamically load the MKLML routine
- * via operator overloading.
- */
-#define DYNAMIC_LOAD_MKLML_WRAP(__name)                            \
-  struct DynLoad__##__name {                                       \
-    template <typename... Args>                                    \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
-      using mklmlFunc = decltype(&::__name);                       \
-      std::call_once(mklml_dso_flag, []() {                        \
-        mklml_dso_handle = paddle::lite::x86::GetMKLMLDsoHandle(); \
-      });                                                          \
-      static void* p_##_name = dlsym(mklml_dso_handle, #__name);   \
-      return reinterpret_cast<mklmlFunc>(p_##_name)(args...);      \
-    }                                                              \
-  };                                                               \
-  extern DynLoad__##__name __name
-
-#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name)
-
-#define MKLML_ROUTINE_EACH(__macro) \
-  __macro(cblas_sgemm);             \
-  __macro(cblas_dgemm);             \
-  __macro(cblas_saxpy);             \
-  __macro(cblas_daxpy);             \
-  __macro(cblas_scopy);             \
-  __macro(cblas_dcopy);             \
-  __macro(cblas_sgemv);             \
-  __macro(cblas_dgemv);             \
-  __macro(cblas_sgemm_alloc);       \
-  __macro(cblas_dgemm_alloc);       \
-  __macro(cblas_sgemm_pack);        \
-  __macro(cblas_dgemm_pack);        \
-  __macro(cblas_sgemm_compute);     \
-  __macro(cblas_dgemm_compute);     \
-  __macro(cblas_sgemm_free);        \
-  __macro(cblas_dgemm_free);        \
-  __macro(cblas_sgemm_batch);       \
-  __macro(cblas_dgemm_batch);       \
-  __macro(cblas_sdot);              \
-  __macro(cblas_ddot);              \
-  __macro(cblas_sasum);             \
-  __macro(cblas_dasum);             \
-  __macro(cblas_isamax);            \
-  __macro(cblas_idamax);            \
-  __macro(cblas_sscal);             \
-  __macro(cblas_dscal);             \
-  __macro(vsAdd);                   \
-  __macro(vdAdd);                   \
-  __macro(vsMul);                   \
-  __macro(vdMul);                   \
-  __macro(vsExp);                   \
-  __macro(vdExp);                   \
-  __macro(vsSqr);                   \
-  __macro(vdSqr);                   \
-  __macro(vsPowx);                  \
-  __macro(vdPowx);                  \
-  __macro(vsInv);                   \
-  __macro(vdInv);                   \
-  __macro(vmsErf);                  \
-  __macro(vmdErf);                  \
-  __macro(MKL_Set_Num_Threads)
-
-MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
-
-#undef DYNAMIC_LOAD_MKLML_WRAP
-
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/port.h b/lite/backends/x86/port.h
deleted file mode 100644
index c1b81159ac..0000000000
--- a/lite/backends/x86/port.h
+++ /dev/null
@@ -1,175 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
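Each entry in `MKLML_ROUTINE_EACH` above expands, via `DYNAMIC_LOAD_MKLML_WRAP`, into a functor object that lazily `dlopen`s the MKLML shared library on first use, caches the `dlsym` result, and forwards all arguments, so call sites read like ordinary CBLAS calls. A hedged usage sketch (assumes a build with MKLML enabled; the matrix sizes are illustrative):

```cpp
#include <vector>
#include "lite/backends/x86/mklml.h"

int main() {
  // 2x2 row-major SGEMM: C = 1.0 * A * B + 0.0 * C. The first call through
  // the wrapper resolves cblas_sgemm from libmklml; later calls go direct.
  std::vector<float> A{1, 2, 3, 4}, B{5, 6, 7, 8}, C(4, 0.f);
  paddle::lite::x86::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                                 2, 2, 2, 1.f, A.data(), 2, B.data(), 2,
                                 0.f, C.data(), 2);
  return 0;  // C == {19, 22, 43, 50}
}
```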
-
-#pragma once
-
-#include <cstdio>
-#include <stdexcept>
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
-#include "glog/logging.h"
-
-#if !defined(_WIN32)
-#include <dlfcn.h>     // dladdr
-#include <execinfo.h>  // backtrace
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <numeric>  // std::accumulate
-#else
-#define NOMINMAX  // msvc max/min macro conflict with std::min/max
-// solve static linking error in windows
-// https://github.com/google/glog/issues/301
-#define GOOGLE_GLOG_DLL_DECL
-#include <io.h>  // _popen, _pclose
-#include <stdio.h>
-#include <windows.h>
-#include <numeric>  // std::accumulate in msvc
-#ifndef S_ISDIR  // windows port for sys/stat.h
-#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
-#endif  // S_ISDIR
-
-static void *dlsym(void *handle, const char *symbol_name) {
-  FARPROC found_symbol;
-  found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
-
-  if (found_symbol == NULL) {
-    throw std::runtime_error(std::string(symbol_name) + " not found.");
-  }
-  return reinterpret_cast<void *>(found_symbol);
-}
-
-static void *dlopen(const char *filename, int flag) {
-  std::string file_name(filename);
-  HMODULE hModule = LoadLibrary(file_name.c_str());
-  if (!hModule) {
-    throw std::runtime_error(file_name + " not found.");
-  }
-  return reinterpret_cast<void *>(hModule);
-}
-
-static int gettimeofday(struct timeval *tp, void *tzp) {
-  time_t clock;
-  struct tm tm;
-  SYSTEMTIME wtm;
-
-  GetLocalTime(&wtm);
-  tm.tm_year = wtm.wYear - 1900;
-  tm.tm_mon = wtm.wMonth - 1;
-  tm.tm_mday = wtm.wDay;
-  tm.tm_hour = wtm.wHour;
-  tm.tm_min = wtm.wMinute;
-  tm.tm_sec = wtm.wSecond;
-  tm.tm_isdst = -1;
-  clock = mktime(&tm);
-  tp->tv_sec = clock;
-  tp->tv_usec = wtm.wMilliseconds * 1000;
-
-  return (0);
-}
-#endif  // !_WIN32
-
-static void ExecShellCommand(const std::string &cmd, std::string *message) {
-  char buffer[128];
-#if !defined(_WIN32)
-  std::shared_ptr<FILE> pipe(popen(cmd.c_str(), "r"), pclose);
-#else
-  std::shared_ptr<FILE> pipe(_popen(cmd.c_str(), "r"), _pclose);
-#endif  // _WIN32
-  if (!pipe) {
-    LOG(ERROR) << "error running command: " << cmd;
-    return;
-  }
-  while (!feof(pipe.get())) {
-    if (fgets(buffer, 128, pipe.get()) != nullptr) {
-      *message += buffer;
-    }
-  }
-}
-
-static bool PathExists(const std::string &path) {
-#if !defined(_WIN32)
-  struct stat statbuf;
-  if (stat(path.c_str(), &statbuf) != -1) {
-    if (S_ISDIR(statbuf.st_mode)) {
-      return true;
-    }
-  }
-#else
-  struct _stat statbuf;
-  if (_stat(path.c_str(), &statbuf) != -1) {
-    if (S_ISDIR(statbuf.st_mode)) {
-      return true;
-    }
-  }
-#endif  // !_WIN32
-  return false;
-}
-
-// TODO(yuyang18): If the functions below are needed by other files, move them
-// to paddle::filesystem namespace.
-#if !defined(_WIN32)
-constexpr char kSEP = '/';
-#else
-constexpr char kSEP = '\\';
-#endif  // _WIN32
-
-static bool FileExists(const std::string &filepath) {
-#if !defined(_WIN32)
-  struct stat buffer;
-  return (stat(filepath.c_str(), &buffer) == 0);
-#else
-  struct _stat buffer;
-  return (_stat(filepath.c_str(), &buffer) == 0);
-#endif  // !_WIN32
-}
-
-static std::string DirName(const std::string &filepath) {
-  auto pos = filepath.rfind(kSEP);
-  if (pos == std::string::npos) {
-    return "";
-  }
-  return filepath.substr(0, pos);
-}
-
-static void MkDir(const char *path) {
-  std::string path_error(path);
-  path_error += " mkdir failed!";
-#if !defined(_WIN32)
-  if (mkdir(path, 0755)) {
-    if (errno != EEXIST) {
-      throw std::runtime_error(path_error);
-    }
-  }
-#else
-  BOOL return_value = CreateDirectory(path, NULL);
-  if (!return_value) {
-    auto errorno = GetLastError();
-    if (errorno != ERROR_ALREADY_EXISTS) {
-      throw std::runtime_error(path_error);
-    }
-  }
-#endif  // !_WIN32
-}
-
-static void MkDirRecursively(const char *fullpath) {
-  if (*fullpath == '\0') return;  // empty string
-  if (FileExists(fullpath)) return;
-
-  MkDirRecursively(DirName(fullpath).c_str());
-  MkDir(fullpath);
-}
diff --git a/lite/backends/x86/target_wrapper.cc b/lite/backends/x86/target_wrapper.cc
deleted file mode 100644
index 34227abd98..0000000000
--- a/lite/backends/x86/target_wrapper.cc
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/target_wrapper.h"
-#include <algorithm>
-#include "lite/backends/x86/target_wrapper.h"
-#include "lite/utils/all.h"
-
-namespace paddle {
-namespace lite {
-
-template <>
-void TargetWrapper<TARGET(kX86)>::MemcpySync(void *dst,
-                                             const void *src,
-                                             size_t size,
-                                             IoDirection dir) {
-  std::copy_n(reinterpret_cast<const char *>(src),
-              size,
-              reinterpret_cast<char *>(dst));
-}
-
-template class TargetWrapper<TARGET(kX86)>;
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/target_wrapper.h b/lite/backends/x86/target_wrapper.h
deleted file mode 100644
index a57f51d8f1..0000000000
--- a/lite/backends/x86/target_wrapper.h
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
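A note on `TargetWrapper<TARGET(kX86)>::MemcpySync` above: on x86 the host and the "device" share one address space, so every `IoDirection` degenerates to a plain byte copy. A small illustrative call site (hypothetical buffers; the `IoDirection::HtoD` member name is assumed from lite/core/target_wrapper.h):

```cpp
#include <cstring>
#include "lite/core/target_wrapper.h"

int main() {
  char src[16] = "hello, lite";
  char dst[16] = {0};
  // The direction argument exists for interface symmetry with CUDA/OpenCL
  // targets; the x86 specialization ignores it.
  paddle::lite::TargetWrapper<TARGET(kX86)>::MemcpySync(
      dst, src, sizeof(src), paddle::lite::IoDirection::HtoD);
  return std::strcmp(dst, src);  // 0 on success
}
```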
- -#pragma once -#include "lite/core/target_wrapper.h" - -namespace paddle { -namespace lite { -namespace x86 {} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/warpctc_lib_path.h.in b/lite/backends/x86/warpctc_lib_path.h.in deleted file mode 100644 index dc5064f457..0000000000 --- a/lite/backends/x86/warpctc_lib_path.h.in +++ /dev/null @@ -1,17 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#define WARPCTC_LIB_PATH "@WARPCTC_INSTALL_DIR@/lib/" diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt deleted file mode 100644 index ff80accbb7..0000000000 --- a/lite/core/CMakeLists.txt +++ /dev/null @@ -1,124 +0,0 @@ -if (WITH_TESTING) - lite_cc_library(lite_gtest_main SRCS lite_gtest_main.cc DEPS gtest gflags) -endif() -lite_cc_library(target_wrapper SRCS target_wrapper.cc - DEPS target_wrapper_host place - X86_DEPS target_wrapper_x86 - CUDA_DEPS target_wrapper_cuda - CL_DEPS cl_target_wrapper - FPGA_DEPS fpga_target_wrapper) - -lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper) - -set(tensor_extra_deps "") -if (LITE_WITH_FPGA) - set(tensor_extra_deps lite_tensor_fpga) -endif() -lite_cc_library(tensor SRCS tensor.cc DEPS memory ${tensor_extra_deps}) - - -if (NOT LITE_ON_TINY_PUBLISH) - proto_library(framework_proto SRCS framework.proto) -endif() - -if (LITE_WITH_X86) -lite_cc_library(variable SRCS variable.cc DEPS tensor) -lite_cc_library(types SRCS types.cc) -else() -lite_cc_library(variable SRCS variable.cc DEPS tensor) -lite_cc_library(types SRCS types.cc) -endif() -lite_cc_library(op_registry SRCS op_registry.cc DEPS kernel) -lite_cc_library(scope SRCS scope.cc DEPS tensor) -lite_cc_library(device_info SRCS device_info.cc DEPS tensor) - -if (LITE_WITH_ARM) -lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context gflags NPU_DEPS ${npu_ddk_libs}) -else() -lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context gflags) -endif() - -#----------------------------------------------- NOT CHANGE ----------------------------------------------- -# A trick to generate the paddle_use_kernels.h -add_custom_command( - COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_kernel_registry.py - ${kernels_src_list} - ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h - OUTPUT ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h - ) -# A trick to generate the paddle_use_ops.h -add_custom_command( - COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_op_registry.py - ${ops_src_list} - ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h - OUTPUT ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h - ) -# generate fake kernels for memory_optimize_tool -add_custom_command( - COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/create_fake_kernel_registry.py - ${kernels_src_list} - ${CMAKE_BINARY_DIR}/all_kernel_faked.cc - OUTPUT ${CMAKE_BINARY_DIR}/all_kernel_faked.cc - ) 
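The `add_custom_command` "tricks" above exist because ops and kernels register themselves through static initializers that the linker would otherwise drop; the generated headers force every registration to be referenced and therefore linked in. The generated `paddle_use_kernels.h` is, in essence, a list of `USE_LITE_KERNEL` statements; a hypothetical excerpt is shown below (the real contents depend on `${kernels_src_list}` at build time):

```cpp
// paddle_use_kernels.h -- generated by parse_kernel_registry.py (illustrative)
#pragma once
#include "paddle_lite_factory_helper.h"

USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
```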
-add_custom_target(op_list_h DEPENDS ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h) -add_custom_target(kernel_list_h DEPENDS ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h) -add_custom_target(all_kernel_faked_cc DEPENDS ${CMAKE_BINARY_DIR}/all_kernel_faked.cc) - -#----------------------------------------------- NOT CHANGE ----------------------------------------------- -lite_cc_library(kernel SRCS kernel.cc DEPS context type_system target_wrapper any op_params tensor - ) -lite_cc_library(op SRCS op_lite.cc DEPS scope op_registry target_wrapper kernel - cpp_op_desc tensor - ) - -add_dependencies(kernel kernel_list_h) -add_dependencies(op op_list_h) - - -lite_cc_library(type_system SRCS type_system.cc DEPS tensor target_wrapper) - -lite_cc_library(program SRCS program.cc - DEPS op kernel model_parser ${ops} ${cpp_wrapper} - PROFILE_DEPS basic_profiler) - -if (NOT LITE_ON_TINY_PUBLISH) - lite_cc_library(optimizer SRCS optimizer.cc DEPS mir_pass_manager model_parser program) - add_subdirectory(mir) - add_subdirectory(profile) - add_subdirectory(arena) -endif() - -# for mobile, unnecessary to compile the following testings. -if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - return() -endif() - -# lite_cc_library(program_fake_utils SRCS program_fake_utils.cc DEPS mir_ssa_graph -# scope op_registry proto_desc op -# ${ops} -# ${host_kernels} -# ) - -lite_cc_test(test_scope SRCS scope_test.cc DEPS scope) -lite_cc_test(test_kernel SRCS kernel_test.cc DEPS kernel target_wrapper any) -lite_cc_test(test_op SRCS op_lite_test.cc DEPS op) -lite_cc_test(test_tensor SRCS lite_tensor_test.cc DEPS tensor) -lite_cc_test(test_type_system SRCS type_system_test.cc DEPS type_system utils) -#lite_cc_test(test_optimizer SRCS optimizer_test.cc DEPS mir_pass_manager program_fake_utils mir_passes optimizer fc_op) -lite_cc_test(test_types SRCS types_test.cc DEPS types) -lite_cc_test(test_memory SRCS memory_test.cc DEPS memory) -lite_cc_test(test_context SRCS context_test.cc DEPS context) - - -# # A trick to generate the paddle_use_kernels.h -# execute_process( -# COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_kernel_registry.py -# ${kernels_src_list} -# ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h -# ) -# # A trick to generate the paddle_use_ops.h -# execute_process( -# COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_op_registry.py -# ${ops_src_list} -# ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h -# ) diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt deleted file mode 100644 index 854d2f4172..0000000000 --- a/lite/core/arena/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -# To make sure the test framework is only actived in TESTING mode. -if(NOT WITH_TESTING) - return() -endif() - -lite_cc_library(arena_framework SRCS framework.cc DEPS program) - -if(NOT LITE_WITH_OPENCL AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${x86_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) -endif() diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc deleted file mode 100644 index c59c078787..0000000000 --- a/lite/core/arena/framework.cc +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/arena/framework.h"
-#include "lite/core/context.h"
-
-namespace paddle {
-namespace lite {
-namespace arena {
-
-void TestCase::CreateInstruction() {
-  auto op = LiteOpRegistry::Global().Create(op_desc().Type());
-  CHECK(op) << "no op for " << op_desc().Type();
-  op->Attach(*op_desc_, inst_scope_);
-  auto kernels = op->CreateKernels({place_});
-  // filter out the target kernel
-  CHECK(!kernels.empty()) << "No kernel found for place "
-                          << place_.DebugString();
-  auto it = std::remove_if(
-      kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& k) {
-        return k->alias() == alias_;
-      });
-  CHECK(it != kernels.end()) << "failed to create the kernel in "
-                             << place_.DebugString()
-                             << " with alias: " << alias_;
-  // prepare context
-  (*it)->SetContext(std::move(ctx_));
-  instruction_.reset(new Instruction(op, std::move(*it)));
-}
-
-void TestCase::PrepareInputsForInstruction() {
-  for (auto& arg : op_desc().InputArgumentNames()) {
-    for (auto& var : op_desc().Input(arg)) {
-      std::string kernel_key = instruction_->kernel()->key_with_alias();
-      const auto* param_type = ParamTypeRegistry::Global().RetrieveInArgument(
-          place_, kernel_key, arg);
-
-      const auto* inst_type = Type::GetTensorTy(TARGET(kHost));
-      CHECK(scope_->FindVar(var));
-      const auto* shared_tensor = scope_->FindTensor(var);
-      if (!TargetCompatibleTo(*inst_type, *param_type->type)) {
-        /// Create a tensor in the instruction's scope, alloc memory and then
-        /// copy data there.
-        auto* target_tensor = inst_scope_->NewTensor(var);
-        CHECK(!shared_tensor->dims().empty()) << "shared_tensor is empty yet";
-        target_tensor->Resize(shared_tensor->dims());
-        TargetCopy(param_type->type->target(),
-                   target_tensor->mutable_data(param_type->type->target(),
-                                               shared_tensor->memory_size()),
-                   shared_tensor->raw_data(),
-                   shared_tensor->memory_size());
-      }
-    }
-  }
-}
-
-}  // namespace arena
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/arena/framework.h b/lite/core/arena/framework.h
deleted file mode 100644
index 48a8571a19..0000000000
--- a/lite/core/arena/framework.h
+++ /dev/null
@@ -1,258 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
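`CreateInstruction` above relies on `std::remove_if` semantics: elements for which the predicate is false are compacted toward the front, and the returned iterator marks where the "removed" tail begins, which is why the code dereferences `*it` as the matching kernel. Note that the standard leaves the elements at and after that iterator in a valid but unspecified state, so this pattern deserves care. A tiny self-contained illustration of the partitioning (plain ints instead of kernels):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> v{1, 5, 2, 5, 3};
  // Non-matching elements are shifted forward; `it` points at the first
  // slot past them, i.e. where the "removed" tail starts.
  auto it = std::remove_if(v.begin(), v.end(), [](int x) { return x == 5; });
  std::printf("%zu elements kept\n",
              static_cast<size_t>(it - v.begin()));  // 3 elements kept
  return 0;
}
```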
-
-#pragma once
-#include <gtest/gtest.h>
-#include <algorithm>
-#include <cmath>
-#include <chrono>  // NOLINT
-#include <cstring>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-#include "lite/core/op_registry.h"
-#include "lite/core/program.h"
-#include "lite/core/scope.h"
-#include "lite/core/types.h"
-#include "lite/model_parser/cpp/op_desc.h"
-
-namespace paddle {
-namespace lite {
-namespace arena {
-
-/*
- * Init data and prepare the op.
- */
-class TestCase {
- public:
-  explicit TestCase(const Place& place, const std::string& alias)
-      : place_(place), scope_(new Scope), alias_(alias) {
-    ctx_ = ContextScheduler::Global().NewContext(place_.target);
-  }
-
-  void Prepare() {
-    PrepareScopes();
-    PrepareData();
-    op_desc_.reset(new cpp::OpDesc);
-    PrepareOpDesc(op_desc_.get());
-
-    PrepareOutputsForInstruction();
-    CreateInstruction();
-    PrepareInputsForInstruction();
-  }
-
-  /// Run the target instruction, that is, run the test operator.
-  void RunInstruction() { instruction_->Run(); }
-
-  KernelContext* context() { return ctx_.get(); }
-
-  /// The baseline should be implemented; it acts like an operator, taking
-  /// several tensors as input and producing several tensors as output.
-  virtual void RunBaseline(Scope* scope) = 0;
-
-  /// Check the precision of the output tensors. It will compare the same
-  /// tensor in two scopes, one from the instruction execution and the other
-  /// from the baseline.
-  template <typename T>
-  bool CheckPrecision(const std::string& var_name, float abs_error);
-
-  const cpp::OpDesc& op_desc() { return *op_desc_; }
-
-  // Check whether the output tensor is consistent with the output definition
-  // in the kernel registry.
-  void CheckKernelConsistWithDefinition() {}
-
-  Scope& scope() { return *scope_; }
-
-  Scope* baseline_scope() { return base_scope_; }
-  Scope* inst_scope() { return inst_scope_; }
-
- protected:
-  // Prepare inputs in scope() for the Tester.
-  virtual void PrepareData() = 0;
-
-  /// Prepare a tensor on the host. The tensors will be created in scope_.
-  /// Need to specify the targets other than X86 or ARM.
-  template <typename T>
-  void SetCommonTensor(const std::string& var_name,
-                       const DDim& ddim,
-                       const T* data,
-                       const LoD& lod = {}) {
-    auto* tensor = scope_->NewTensor(var_name);
-    tensor->Resize(ddim);
-    auto* d = tensor->mutable_data<T>();
-    memcpy(d, data, ddim.production() * sizeof(T));
-
-    // set lod
-    if (!lod.empty()) *tensor->mutable_lod() = lod;
-  }
-
-  // Prepare for the operator.
-  virtual void PrepareOpDesc(cpp::OpDesc* op_desc) = 0;
-
- public:
-  const Instruction& instruction() { return *instruction_; }
-
- private:
-  std::unique_ptr<KernelContext> ctx_;
-  void CreateInstruction();
-
-  void PrepareScopes() {
-    inst_scope_ = &scope_->NewScope();
-    base_scope_ = &scope_->NewScope();
-  }
-
-  // Check shape
-  // TODO(Superjomn) Move this method to utils or DDim?
-  bool ShapeEquals(const DDim& a, const DDim& b) {
-    if (a.size() != b.size()) return false;
-    for (int i = 0; i < a.size(); i++) {
-      if (a[i] != b[i]) return false;
-    }
-    return true;
-  }
-
-  /// Copy the input tensors to the target devices needed by the instruction.
-  void PrepareInputsForInstruction();
-
-  // Create output tensors and variables.
-  void PrepareOutputsForInstruction() {
-    for (auto x : op_desc().output_vars()) {
-      inst_scope_->NewTensor(x);
-      base_scope_->NewTensor(x);
-    }
-  }
-
- private:
-  std::shared_ptr<Scope> scope_;
-  // The workspace for the Instruction.
-  Scope* inst_scope_{};
-  // The workspace for the baseline implementation.
-  Scope* base_scope_{};
-  std::unique_ptr<cpp::OpDesc> op_desc_;
-  std::unique_ptr<Instruction> instruction_;
-  Place place_;
-  std::string alias_;
-};
-
-class Arena {
-  float abs_error_{};
-
- public:
-  Arena(std::unique_ptr<TestCase>&& tester,
-        const Place& place,
-        float abs_error = 1e-5)
-      : tester_(std::move(tester)), place_(place), abs_error_(abs_error) {
-    tester_->Prepare();
-  }
-
-  bool TestPrecision() {
-    tester_->RunBaseline(tester_->baseline_scope());
-    tester_->RunInstruction();
-
-    bool success = true;
-    for (auto& out : tester_->op_desc().OutputArgumentNames()) {
-      for (auto& var : tester_->op_desc().Output(out)) {
-        success = success && CompareTensor(out, var);
-      }
-    }
-    LOG(INFO) << "done";
-    return success;
-  }
-
-  void TestPerformance(int times = 100) {
-    auto timer = std::chrono::high_resolution_clock::now();
-    for (int i = 0; i < times; i++) {
-      tester_->RunInstruction();
-    }
-    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
-        std::chrono::high_resolution_clock::now() - timer);
-    LOG(INFO) << "average duration: " << duration.count() << " ms";
-  }
-
- private:
-  // input_name: X
-  bool CompareTensor(const std::string& arg_name,
-                     const std::string& var_name) {
-    // get tensor type.
-    const Type* type =
-        tester_->instruction().kernel()->GetOutputDeclType(arg_name);
-
-    switch (type->precision()) {
-      case PRECISION(kFloat):
-        return tester_->CheckPrecision<float>(var_name, abs_error_);
-      case PRECISION(kInt8):
-        return tester_->CheckPrecision<int8_t>(var_name, abs_error_);
-      case PRECISION(kInt32):
-        return tester_->CheckPrecision<int32_t>(var_name, abs_error_);
-      case PRECISION(kBool):
-        return tester_->CheckPrecision<bool>(var_name, abs_error_);
-
-      default:
-        LOG(FATAL) << "not support type " << PrecisionToStr(type->precision());
-    }
-  }
-
- private:
-  std::unique_ptr<TestCase> tester_;
-  Place place_;
-};
-
-template <typename T>
-bool TestCase::CheckPrecision(const std::string& var_name, float abs_error) {
-  auto a_tensor = inst_scope_->FindTensor(var_name);
-  auto b_tensor = base_scope_->FindTensor(var_name);
-  CHECK(a_tensor);
-  CHECK(b_tensor);
-
-  CHECK(ShapeEquals(a_tensor->dims(), b_tensor->dims()));
-
-  CHECK(a_tensor->lod() == b_tensor->lod()) << "lod not match";
-
-  // The baseline should output in host devices.
-  CHECK(b_tensor->target() == TARGET(kHost) ||
-        b_tensor->target() == TARGET(kX86) ||
-        b_tensor->target() == TARGET(kARM));
-
-  const T* a_data{};
-  switch (a_tensor->target()) {
-    case TARGET(kX86):
-    case TARGET(kHost):
-    case TARGET(kARM):
-      a_data = static_cast<const T*>(a_tensor->raw_data());
-      break;
-
-    default:
-      // Before comparing, data needs to be copied from the `target` device
-      // to the host.
-      LOG(FATAL) << "Not supported";
-  }
-
-  CHECK(a_data);
-
-  const T* b_data = static_cast<const T*>(b_tensor->raw_data());
-
-  bool success = true;
-  for (int i = 0; i < a_tensor->dims().production(); i++) {
-    EXPECT_NEAR(a_data[i], b_data[i], abs_error);
-    if (fabsf(a_data[i] - b_data[i]) > abs_error) {
-      success = false;
-    }
-  }
-  return success;
-}
-
-}  // namespace arena
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/arena/framework_test.cc b/lite/core/arena/framework_test.cc
deleted file mode 100644
index 411ab26a71..0000000000
--- a/lite/core/arena/framework_test.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/arena/framework.h" -#include -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" - -namespace paddle { -namespace lite { - -class ScaleComputeTester : public arena::TestCase { - // common attributes for this op. - std::string input_ = "x"; - std::string output_ = "out"; - float scale_ = 1.2f; - float bias_ = 0.f; - DDim dims_{{3, 2, 10}}; - - public: - explicit ScaleComputeTester(const Place& place, const std::string& alias) - : TestCase(place, alias) {} - - void RunBaseline(Scope* scope) override { - auto* out = scope->NewTensor(output_); - CHECK(out); - out->Resize(dims_); - auto* out_data = out->mutable_data(); - - auto* x = scope->FindTensor(input_); - const auto* x_data = x->data(); - - for (int i = 0; i < dims_.production(); i++) { - out_data[i] = x_data[i] * scale_ + bias_; - } - } - - void PrepareOpDesc(cpp::OpDesc* op_desc) { - op_desc->SetType("scale"); - op_desc->SetInput("X", {input_}); - op_desc->SetOutput("Out", {output_}); - op_desc->SetAttr("scale", scale_); - op_desc->SetAttr("bias", bias_); - op_desc->SetAttr("bias_after_scale", false); - } - - void PrepareData() override { - std::vector data(dims_.production()); - - for (int i = 0; i < dims_.production(); i++) { - data[i] = i * 1.1; - } - - SetCommonTensor(input_, dims_, data.data()); - } -}; - -TEST(scale, basic) { -#ifdef LITE_WITH_X86 - Place place(TARGET(kX86)); -#endif -#ifdef LITE_WITH_ARM - Place place(TARGET(kARM)); -#endif - std::unique_ptr tester(new ScaleComputeTester(place, "def")); - arena::Arena arena(std::move(tester), place); - - arena.TestPrecision(); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/context.cc b/lite/core/context.cc deleted file mode 100644 index 948aac0c79..0000000000 --- a/lite/core/context.cc +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/context.h" - -#ifdef LITE_WITH_OPENCL -DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path."); -#endif - -namespace paddle { -namespace lite {} // namespace lite -} // namespace paddle diff --git a/lite/core/context.h b/lite/core/context.h deleted file mode 100644 index bac0e3a627..0000000000 --- a/lite/core/context.h +++ /dev/null @@ -1,400 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "lite/utils/any.h" -#ifdef LITE_WITH_CUDA -#include "lite/backends/cuda/blas.h" -#include "lite/backends/cuda/cuda_utils.h" -#endif -#ifdef LITE_WITH_OPENCL -#include -#include -#include "lite/backends/opencl/cl_context.h" -#include "lite/backends/opencl/cl_runtime.h" -#endif -#ifdef LITE_WITH_NPU -#include "lite/backends/npu/npu_helper.h" -#endif - -#include -#include -#include -#include -#include -#include -#include "lite/core/device_info.h" -#include "lite/core/target_wrapper.h" -#include "lite/core/tensor.h" -#include "lite/utils/all.h" - -#ifdef LITE_WITH_OPENCL -DECLARE_string(cl_path); -#endif - -namespace paddle { -namespace lite { - -template -class Context; - -using HostContext = Context; -using X86Context = Context; -using CUDAContext = Context; -using ARMContext = Context; -using NPUContext = Context; -using OpenCLContext = Context; -using FPGAContext = Context; - -template <> -class Context { - public: - // NOTE: InitOnce should only be used by ContextScheduler - void InitOnce() {} - - void CopySharedTo(HostContext* ctx) {} - - std::string name() const { return "HostContext"; } -}; - -#ifdef LITE_WITH_NPU -template <> -class Context { - public: - Context() {} - explicit Context(const NPUContext& ctx); - // NOTE: InitOnce should only be used by ContextScheduler - void InitOnce() {} - void CopySharedTo(NPUContext* ctx) {} - - NPUContext& operator=(const NPUContext& ctx) {} - std::string name() const { return "NPUContext"; } - hiai::AiModelMngerClient* client(const std::string& model_name) const { - return npu::DeviceInfo::Global().client(model_name); - } -}; -#endif - -#ifdef LITE_WITH_ARM -template <> -class Context { - public: - Context() {} - explicit Context(const ARMContext& ctx); - - ARMContext& operator=(const ARMContext& ctx) {} - - // NOTE: InitOnce should only be used by ContextScheduler - void InitOnce() { DeviceInfo::Init(); } - - void CopySharedTo(ARMContext* ctx) {} - - void SetRunMode(lite_api::PowerMode mode, int threads) { - return DeviceInfo::Global().SetRunMode(mode, threads); - } - void SetCache(int l1size, int l2size, int l3size) { - return DeviceInfo::Global().SetCache(l1size, l2size, l3size); - } - void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); } - - lite_api::PowerMode mode() const { return DeviceInfo::Global().mode(); } - int threads() const { return DeviceInfo::Global().threads(); } - ARMArch arch() const { return DeviceInfo::Global().arch(); } - int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); } - int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); } - int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); } - int llc_size() const { return DeviceInfo::Global().llc_size(); } - bool has_dot() const { return DeviceInfo::Global().has_dot(); } - bool has_fp16() const { return DeviceInfo::Global().has_fp16(); } - - template - T* workspace_data() { - return DeviceInfo::Global().workspace_data(); - } - - bool ExtendWorkspace(size_t size) { - return DeviceInfo::Global().ExtendWorkspace(size); - } - - std::string name() const { 
return "ARMContext"; } -}; -#endif - -#ifdef LITE_WITH_FPGA -// TODO(tianxiaogang): add needed implementation to context -template <> -class Context { - public: - Context() {} - void InitOnce() {} - - FPGAContext& operator=(const FPGAContext& ctx) {} - - void CopySharedTo(FPGAContext* ctx) {} - - std::string name() const { return "FPGAContext"; } -}; -#endif - -#ifdef LITE_WITH_CUDA -// Only works with CUDA kernels. -template <> -class Context { - public: - typename Env::Devs& devs = - Env::Global(); - // NOTE: InitOnce should only be used by ContextScheduler - void InitOnce() { - cublas_fp32_ = std::make_shared>(); - } - void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) { - CHECK_GT(devs.size(), 0) - << "Env is not initialized or current target is not exit!"; - if (dev_id >= devs.size()) { - LOG(WARNING) << "device index exceeds the number of devices, set to " - "default device(0)!"; - device_id_ = 0; - } else { - device_id_ = dev_id; - } - if (io_stream_id >= devs[dev_id].max_stream()) { - LOG(WARNING) << "data stream index exceeds the maximum stream number, " - "set to default stream(0)!"; - io_stream_id = 0; - } - if (exec_stream_id >= devs[dev_id].max_stream()) { - LOG(WARNING) << "exec stream index exceeds the maximum stream number, " - "set to default stream(0)!"; - exec_stream_id = 0; - } - - exec_stream_ = devs[dev_id].exec_streams()[exec_stream_id]; - io_stream_ = devs[dev_id].io_streams()[io_stream_id]; - - exec_stream_id_ = exec_stream_id; - io_stream_id_ = io_stream_id; - } - void CopySharedTo(CUDAContext* ctx) { - CHECK(ctx); - CHECK(cublas_fp32_) << "cublas_fp32 should be set first"; - ctx->cublas_fp32_ = cublas_fp32_; - } - - const cudaStream_t exec_stream() { return exec_stream_; } - void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; } - - const cudaStream_t io_stream() { return io_stream_; } - void SetIoStream(cudaStream_t stream) { io_stream_ = stream; } - - std::shared_ptr> cublas_fp32() { return cublas_fp32_; } - void SetCuBlasFP32(std::shared_ptr> cublas_fp32) { - cublas_fp32_ = cublas_fp32; - } - - const std::vector& input_events() { return input_events_; } - void SetInputEvents(const std::vector& input_events) { - input_events_.clear(); - input_events_.assign(input_events.begin(), input_events.end()); - } - - const std::vector& output_events() { return output_events_; } - void SetOutputEvents(const std::vector& output_events) { - output_events_.clear(); - output_events_.assign(output_events.begin(), output_events.end()); - } - - std::string name() const { return "CUDAContext"; } - - private: - int device_id_; - // overall information - int exec_stream_id_; - int io_stream_id_; - cudaStream_t exec_stream_; - cudaStream_t io_stream_; - - // not thread-safe, should allocate for each thread. 
- std::shared_ptr> cublas_fp32_; - - // kernel information - std::vector input_events_; - std::vector output_events_; -}; -#endif - -#ifdef LITE_WITH_X86 -template <> -class Context { - public: - Context() {} - - Context(Context&& ctx) {} - - // NOTE: InitOnce should only be used by ContextScheduler - void InitOnce() {} - - void CopySharedTo(X86Context* ctx) {} - - std::string name() const { return "X86Context"; } - - private: - // overall information - // - // kernel information -}; -#endif - -#ifdef LITE_WITH_OPENCL -template <> -class Context { - std::shared_ptr cl_context_; - using WaitListType = - std::unordered_map(nullptr)), - std::shared_ptr>; - std::shared_ptr cl_wait_list_; - - public: - CLContext* cl_context() { return cl_context_.get(); } - WaitListType* cl_wait_list() { return cl_wait_list_.get(); } - - void InitOnce() { - // Init cl runtime. - CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed"; - CLRuntime::Global()->set_cl_path(FLAGS_cl_path); - - cl_context_ = std::make_shared(); - cl_wait_list_ = std::make_shared(); - } - - void CopySharedTo(OpenCLContext* ctx) { - ctx->cl_context_ = cl_context_; - ctx->cl_wait_list_ = cl_wait_list_; - } -}; -#endif - -// Context for running a kernel. -// Holds the necessary resource and information. -class KernelContext { - public: - template - ContextT& As() { - if (!ctx_.valid()) { - ctx_.set(); - } - return *ctx_.get_mutable(); - } - - private: - Any ctx_; -}; - -// The ContextScheduler helps to assign different context for each kernel. -class ContextScheduler { - public: - static ContextScheduler& Global() { - static auto* x = new ContextScheduler; - return *x; - } - - std::unique_ptr NewContext(TargetType target) { - std::unique_ptr ctx(new KernelContext); - switch (target) { - case TARGET(kHost): - kernel_contexts_[TargetType::kHost].As().CopySharedTo( - &ctx->As()); - break; -#ifdef LITE_WITH_X86 - case TARGET(kX86): - kernel_contexts_[TargetType::kX86].As().CopySharedTo( - &ctx->As()); - break; -#endif -#ifdef LITE_WITH_CUDA - case TARGET(kCUDA): { - int dev_id = TargetWrapper::GetCurDevice(); - auto& context = ctx->As(); - context.Init(dev_id); - kernel_contexts_[TargetType::kCUDA].As().CopySharedTo( - &context); - } break; -#endif -#ifdef LITE_WITH_ARM - case TARGET(kARM): - kernel_contexts_[TargetType::kARM].As().CopySharedTo( - &ctx->As()); - break; -#endif -#ifdef LITE_WITH_NPU - case TARGET(kNPU): - kernel_contexts_[TargetType::kNPU].As().CopySharedTo( - &ctx->As()); - break; -#endif -#ifdef LITE_WITH_OPENCL - case TARGET(kOpenCL): - kernel_contexts_[TargetType::kOpenCL].As().CopySharedTo( - &ctx->As()); - break; -#endif -#ifdef LITE_WITH_FPGA - case TARGET(kFPGA): - kernel_contexts_[TargetType::kFPGA].As().CopySharedTo( - &ctx->As()); - break; -#endif - default: -#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL - LOG(FATAL) << "unsupported target " << TargetToStr(target); -#endif - break; - } - return ctx; - } - - private: - template - void InitContext() { - kernel_contexts_[Type].As().InitOnce(); - } - - ContextScheduler() { - InitContext(); -#ifdef LITE_WITH_X86 - InitContext(); -#endif -#ifdef LITE_WITH_CUDA - InitContext(); -#endif -#ifdef LITE_WITH_ARM - InitContext(); -#endif -#ifdef LITE_WITH_OPENCL - InitContext(); -#endif -#ifdef LITE_WITH_FPGA - InitContext(); -#endif -#ifdef LITE_WITH_NPU - InitContext(); -#endif - } - - private: - std::map kernel_contexts_; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/core/context_test.cc b/lite/core/context_test.cc deleted file mode 100644 
index 80b642bfad..0000000000
--- a/lite/core/context_test.cc
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/context.h"
-#include <gtest/gtest.h>
-
-namespace paddle {
-namespace lite {
-
-// #ifdef LITE_WITH_X86
-// TEST(ContextScheduler, NewContext) {
-//   auto ctx1_p = ContextScheduler::Global().NewContext(TargetType::kX86);
-//   auto ctx2_p = ContextScheduler::Global().NewContext(TargetType::kX86);
-//   ASSERT_FALSE(ctx1_p.get() == ctx2_p.get());
-
-//   auto& ctx1 = ctx1_p->As<X86Context>();
-//   auto& ctx2 = ctx2_p->As<X86Context>();
-
-//   ASSERT_EQ(ctx1.name(), "X86Context");
-//   ASSERT_EQ(ctx2.name(), "X86Context");
-
-//   ASSERT_FALSE(ctx1.x86_device_context() == nullptr ||
-//                ctx2.x86_device_context() == nullptr);
-//   ASSERT_FALSE(ctx1.x86_execution_context() == nullptr ||
-//                ctx2.x86_execution_context() == nullptr);
-
-//   ASSERT_TRUE(ctx1.x86_device_context() != ctx2.x86_device_context());
-//   ASSERT_TRUE(ctx1.x86_execution_context() != ctx2.x86_execution_context());
-
-//   using device_ctx_t = ::paddle::platform::CPUDeviceContext;
-//   using exec_ctx_t = ::paddle::framework::ExecutionContext;
-//   auto* device_ctx = new device_ctx_t;
-//   ctx1.SetX86DeviceContext(std::unique_ptr<device_ctx_t>(device_ctx));
-//   ctx1.SetX86ExecutionContext(
-//       std::unique_ptr<exec_ctx_t>(new exec_ctx_t(*device_ctx)));
-// }
-// #endif
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc
deleted file mode 100644
index de53d9ba67..0000000000
--- a/lite/core/device_info.cc
+++ /dev/null
@@ -1,1151 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Parts of the following code in this file refs to
-// https://github.com/Tencent/ncnn/blob/master/src/cpu.cpp
-// Tencent is pleased to support the open source community by making ncnn
-// available.
-//
-// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License"); you may not use this
-// file except in compliance with the License. You may obtain a copy of the
-// License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the
-// License for the specific language governing permissions and limitations under
-// the License.
-
-#ifdef LITE_WITH_LINUX
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-#if __APPLE__
-#include "TargetConditionals.h"
-#if LITE_WITH_IPHONE
-#include <mach/machine.h>
-#include <sys/sysctl.h>
-#include <sys/types.h>
-#endif  // LITE_WITH_IPHONE
-#endif  // __APPLE__
-
-#ifdef ARM_WITH_OMP
-#include <omp.h>
-#endif
-
-#include <algorithm>
-#include <limits>
-#include "lite/core/device_info.h"
-
-namespace paddle {
-namespace lite {
-
-#ifdef LITE_WITH_ARM
-
-#ifdef TARGET_IOS
-const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
-const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
-const int DEFAULT_L3_CACHE_SIZE = 0;
-#else
-const int DEFAULT_L1_CACHE_SIZE = 32 * 1024;
-const int DEFAULT_L2_CACHE_SIZE = 512 * 1024;
-const int DEFAULT_L3_CACHE_SIZE = 0;
-#endif
-
-int get_cpu_num() {
-#ifdef LITE_WITH_LINUX
-  // get cpu count from /sys/devices/system/cpu/cpu%d/uevent
-  int max_cpu_num = 20;
-  int cpu_num = 0;
-  for (int i = 0; i < max_cpu_num; ++i) {
-    char path[256];
-    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i);
-    FILE* fp = fopen(path, "rb");
-    if (!fp) {
-      break;
-    }
-    cpu_num++;
-    fclose(fp);
-  }
-  if (cpu_num < 1) {
-    cpu_num = 1;
-  }
-  return cpu_num;
-#elif defined(TARGET_IOS)
-  int cpu_num = 0;
-  size_t len = sizeof(cpu_num);
-  sysctlbyname("hw.ncpu", &cpu_num, &len, NULL, 0);
-  if (cpu_num < 1) {
-    cpu_num = 1;
-  }
-  return cpu_num;
-#else
-  return 1;
-#endif
-}
-
-size_t get_mem_size() {
-#ifdef LITE_WITH_LINUX
-  // get total memory size (in kB) from /proc/meminfo
-  FILE* fp = fopen("/proc/meminfo", "rb");
-  if (!fp) {
-    return 1;
-  }
-  size_t memsize = 0;
-  char line[1024];
-  while (!feof(fp)) {
-    char* s = fgets(line, 1024, fp);
-    if (!s) {
-      break;
-    }
-    sscanf(s, "MemTotal: %zu kB", &memsize);
-  }
-  fclose(fp);
-  return memsize;
-#elif defined(TARGET_IOS)
-  // to be implemented
-  printf("not implemented, set to default 4GB\n");
-  return 4096 * 1024;
-#endif
-  return 0;
-}
-
-void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
-  archs->resize(cpu_num);
-  for (int i = 0; i < cpu_num; ++i) {
-    archs->at(i) = kARMArch_UNKOWN;
-  }
-#ifdef LITE_WITH_LINUX
-  //! get CPU ARCH
-  FILE* fp = fopen("/proc/cpuinfo", "rb");
-  if (!fp) {
-    return;
-  }
-  int cpu_idx = 0;
-  char line[1024];
-  while (!feof(fp)) {
-    char* s = fgets(line, 1024, fp);
-    if (!s) {
-      break;
-    }
-    if (strstr(line, "part") != NULL) {
-      ARMArch arch_type = kARMArch_UNKOWN;
-      int arch_id = 0;
-      sscanf(s, "CPU part\t: %x", &arch_id);
-      switch (arch_id) {
-        case 0xd03:
-          arch_type = kA53;
-          break;
-        case 0xd05:
-          arch_type = kA55;
-          break;
-        case 0xd07:
-          arch_type = kA57;
-          break;
-        case 0xd08:
-          arch_type = kA72;
-          break;
-        case 0xd09:
-          arch_type = kA73;
-          break;
-        case 0xd0a:
-          arch_type = kA75;
-          break;
-        case 0xd40:
-          arch_type = kA76;
-          break;
-        case 0x804:
-          // 855
-          arch_type = kA76;
-          break;
-        case 0x805:
-          // 855
-          arch_type = kA55;
-          break;
-        case 0x802:
-          // 845
-          arch_type = kA75;
-          break;
-        case 0x803:
-          // 845
-          arch_type = kA55;
-          break;
-        case 0x801:
-          // 835
-          arch_type = kA73;
-          break;
-        case 0x800:
-          // 835
-          arch_type = kA73;
-          break;
-        case 0x205:
-          // 820
-          arch_type = kA72;
-          break;
-        default:
-          LOG(ERROR) << "Unknown cpu arch: " << arch_id;
-      }
-      archs->at(cpu_idx) = arch_type;
-      cpu_idx++;
-    }
-  }
-  fclose(fp);
-  for (; cpu_idx > 0 && cpu_idx < cpu_num; ++cpu_idx) {
-    archs->at(cpu_idx) = archs->at(cpu_idx - 1);
-  }
-#elif defined(TARGET_IOS)
-  for (int i = 0; i < cpu_num; ++i) {
-    archs->at(i) = kAPPLE;
-  }
-#endif
-}
-
-#ifdef LITE_WITH_LINUX
-
-std::string get_cpu_name() {
-  FILE* fp = fopen("/proc/cpuinfo", "rb");
-  if (!fp) {
-    return "";
-  }
-  char line[1024];
-  while (!feof(fp)) {
-    char* s = fgets(line, 1024, fp);
-    if (!s) {
-      break;
-    }
-    if (strstr(line, "Hardware") != NULL) {
-      fclose(fp);
-      return std::string(line);
-    }
-  }
-  fclose(fp);
-  return "";
-}
-
-void get_cpu_max_min_freq(int cpu_id, int* max_freq, int* min_freq) {
-  *max_freq = 0;
-  *min_freq = 0;
-  // first try, for all possible cpu
-  char path[256];
-  snprintf(path,
-           sizeof(path),
-           "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state",
-           cpu_id);
-  FILE* fp = fopen(path, "rb");
-  if (!fp) {
-    // second try, for online cpu
-    snprintf(path,
-             sizeof(path),
-             "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",
-             cpu_id);
-    fp = fopen(path, "rb");
-    if (!fp) {
-      // third try, for online cpu
-      // get max_freq
-      snprintf(path,
-               sizeof(path),
-               "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
-               cpu_id);
-      fp = fopen(path, "rb");
-      if (!fp) {
-        return;
-      }
-      fscanf(fp, "%d", max_freq);
-      fclose(fp);
-      // get min_freq
-      snprintf(path,
-               sizeof(path),
-               "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_min_freq",
-               cpu_id);
-      fp = fopen(path, "rb");
-      if (!fp) {
-        return;
-      }
-      fscanf(fp, "%d", min_freq);
-      fclose(fp);
-      return;
-    }
-  }
-  *min_freq = std::numeric_limits<int>::max();
-  while (!feof(fp)) {
-    int freq = 0;
-    int nscan = fscanf(fp, "%d %*d", &freq);
-    if (nscan != 1) {
-      break;
-    }
-    if (freq > *max_freq) {
-      *max_freq = freq;
-    }
-    if (freq < *min_freq) {
-      *min_freq = freq;
-    }
-  }
-  fclose(fp);
-}
-
-void sort_cpuid_by_max_freq(const std::vector<int>& max_freqs,
-                            std::vector<int>* cpu_ids,
-                            std::vector<int>* cluster_ids) {
-  int cpu_num = max_freqs.size();
-  if (cpu_num == 0) {
-    return;
-  }
-  cpu_ids->resize(cpu_num);
-  cluster_ids->resize(cpu_num);
-  for (int i = 0; i < cpu_num; i++) {
-    cpu_ids->at(i) = i;
-  }
-  // sort cpuid as big core first
-  // simple bubble sort
-  for (int i = 0; i < cpu_num; i++) {
-    for (int j = i + 1; j < cpu_num; j++) {
-      if (max_freqs[i] < max_freqs[j]) {
-        // swap
-        int tmp = cpu_ids->at(i);
-        cpu_ids->at(i) = cpu_ids->at(j);
-        cpu_ids->at(j) = tmp;
-      }
-    }
-  }
-  // SMP
-  int mid_max_freq =
-      (max_freqs[cpu_ids->at(0)] + max_freqs[cpu_ids->at(cpu_num - 1)]) / 2;
-
-  for (int i = 0; i < cpu_num; i++) {
-    cpu_ids->at(i) = i;
-    if (max_freqs[i] >= mid_max_freq) {
-      cluster_ids->at(i) = 0;
-    } else {
-      cluster_ids->at(i) = 1;
-    }
-  }
-}
-
-void get_cpu_cache_size(int cpu_id,
-                        int* l1_cache_size,
-                        int* l2_cache_size,
-                        int* l3_cache_size) {
-  int max_cache_idx_num = 10;
-  *l1_cache_size = DEFAULT_L1_CACHE_SIZE;
-  *l2_cache_size = DEFAULT_L2_CACHE_SIZE;
-  *l3_cache_size = DEFAULT_L3_CACHE_SIZE;
-  for (int i = 0; i < max_cache_idx_num; i++) {
-    char path[256];
-    snprintf(path,
-             sizeof(path),
-             "/sys/devices/system/cpu/cpu%d/cache/index%d/level",
-             cpu_id,
-             i);
-    FILE* fp = fopen(path, "rb");
-    if (fp) {
-      int level = -1;
-      fscanf(fp, "%d", &level);
-      fclose(fp);
-      snprintf(path,
-               sizeof(path),
-               "/sys/devices/system/cpu/cpu%d/cache/index%d/size",
-               cpu_id,
-               i);
-      fp = fopen(path, "rb");
-      if (fp) {
-        int size = -1;
-        fscanf(fp, "%d", &size);
-        fclose(fp);
-        if (size >= 0) {
-          if (level == 1) {
-            *l1_cache_size = size * 1024;
-          } else if (level == 2) {
-            *l2_cache_size = size * 1024;
-          } else if (level == 3) {
-            *l3_cache_size = size * 1024;
-          }
-        }
-      }
-    }
-  }
-}
-
-bool check_cpu_online(const std::vector<int>& cpu_ids) {
-  if (cpu_ids.size() == 0) {
-    return false;
-  }
-  char path[256];
-  bool all_online = true;
-  for (int i = 0; i < cpu_ids.size(); ++i) {
-    snprintf(
-        path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online", cpu_ids[i]);
-    FILE* fp = fopen(path, "rb");
-    int is_online = 0;
-    if (fp) {
-      fscanf(fp, "%d", &is_online);
-      fclose(fp);
-    } else {
-      LOG(ERROR) << "Failed to query the online status of CPU id:"
-                 << cpu_ids[i];
-    }
-    if (is_online == 0) {
-      all_online = false;
-      LOG(ERROR) << "CPU id:" << cpu_ids[i] << " is offline";
-    }
-  }
-  return all_online;
-}
-
-int set_sched_affinity(const std::vector<int>& cpu_ids) {
-// #define CPU_SETSIZE 1024
-// #define __NCPUBITS  (8 * sizeof (unsigned long))
-// typedef struct
-// {
-//    unsigned long __bits[CPU_SETSIZE / __NCPUBITS];
-// } cpu_set_t;
-
-// set affinity for thread
-#ifdef __GLIBC__
-  pid_t pid = syscall(SYS_gettid);
-#else
-  pid_t pid = gettid();
-#endif
-  cpu_set_t mask;
-  CPU_ZERO(&mask);
-  for (int i = 0; i < cpu_ids.size(); ++i) {
-    CPU_SET(cpu_ids[i], &mask);
-  }
-  int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
-  if (syscallret) {
-    return -1;
-  }
-  return 0;
-}
-
-bool bind_threads(const std::vector<int> cpu_ids) {
-#ifdef ARM_WITH_OMP
-  int thread_num = cpu_ids.size();
-  omp_set_num_threads(thread_num);
-  std::vector<int> ssarets;
-  for (int i = 0; i < thread_num; ++i) {
-    ssarets.push_back(0);
-  }
-#pragma omp parallel for
-  for (int i = 0; i < thread_num; i++) {
-    ssarets[i] = set_sched_affinity(cpu_ids);
-  }
-  for (int i = 0; i < thread_num; i++) {
-    if (ssarets[i] != 0) {
-      LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[i];
-      return false;
-    }
-  }
-#else   // ARM_WITH_OMP
-  std::vector<int> first_cpu_id;
-  first_cpu_id.push_back(cpu_ids[0]);
-  int ssaret = set_sched_affinity(first_cpu_id);
-  if (ssaret != 0) {
-    LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[0];
-    return false;
-  }
-#endif  // ARM_WITH_OMP
-  return true;
-}
-
-#endif  // LITE_WITH_LINUX
-
-void DeviceInfo::SetDotInfo(int argc, ...)
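// [Editor's annotation, not in the original file] SetDotInfo below and its
// siblings (SetFP16Info, SetFP32Info, SetCacheInfo, SetArchInfo) share one
// variadic convention: argc == 1 means a single value applied uniformly to
// every core, while argc == 2 means a (big-core value, little-core value)
// pair. For example, SetArchInfo(2, kA76, kA55) tags the big cluster as
// Cortex-A76 and the little cluster as Cortex-A55, as in the Snapdragon 855
// branch of SetCPUInfoByName further down.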
{ - va_list arg_ptr; - va_start(arg_ptr, argc); - dot_.resize(core_num_); - if (argc == 1) { - bool flag = va_arg(arg_ptr, int) > 0; - for (int i = 0; i < core_num_; ++i) { - dot_[i] = flag; - } - } else { - bool flag_big_core = va_arg(arg_ptr, int) > 0; - bool flag_little_core = va_arg(arg_ptr, int) > 0; - int big_core_num = big_core_ids_.size(); - int little_core_num = little_core_ids_.size(); - for (int i = 0; i < big_core_num; ++i) { - dot_[big_core_ids_[i]] = flag_big_core; - } - for (int i = 0; i < little_core_num; ++i) { - dot_[little_core_ids_[i]] = flag_little_core; - } - } - va_end(arg_ptr); -} - -void DeviceInfo::SetFP16Info(int argc, ...) { - va_list arg_ptr; - va_start(arg_ptr, argc); - fp16_.resize(core_num_); - if (argc == 1) { - bool flag = va_arg(arg_ptr, int) > 0; - for (int i = 0; i < core_num_; ++i) { - fp16_[i] = flag; - } - } else { - bool flag_big_core = va_arg(arg_ptr, int) > 0; - bool flag_little_core = va_arg(arg_ptr, int) > 0; - int big_core_num = big_core_ids_.size(); - int little_core_num = little_core_ids_.size(); - for (int i = 0; i < big_core_num; ++i) { - fp16_[big_core_ids_[i]] = flag_big_core; - } - for (int i = 0; i < little_core_num; ++i) { - fp16_[little_core_ids_[i]] = flag_little_core; - } - } - va_end(arg_ptr); -} - -void DeviceInfo::SetFP32Info(int argc, ...) { - va_list arg_ptr; - va_start(arg_ptr, argc); - fp32_.resize(core_num_); - if (argc == 1) { - bool flag = va_arg(arg_ptr, int) > 0; - for (int i = 0; i < core_num_; ++i) { - fp32_[i] = flag; - } - } else { - bool flag_big_core = va_arg(arg_ptr, int) > 0; - bool flag_little_core = va_arg(arg_ptr, int) > 0; - int big_core_num = big_core_ids_.size(); - int little_core_num = little_core_ids_.size(); - for (int i = 0; i < big_core_num; ++i) { - fp32_[big_core_ids_[i]] = flag_big_core; - } - for (int i = 0; i < little_core_num; ++i) { - fp32_[little_core_ids_[i]] = flag_little_core; - } - } - va_end(arg_ptr); -} - -// cache_id : 0 -> L1, 1 -> L2, 2 -> L3 -void DeviceInfo::SetCacheInfo(int cache_id, int argc, ...) { - va_list arg_ptr; - va_start(arg_ptr, argc); - std::vector* cache; - switch (cache_id) { - case 0: - cache = &L1_cache_; - break; - case 1: - cache = &L2_cache_; - break; - case 2: - cache = &L3_cache_; - break; - default: - break; - } - cache->resize(core_num_); - if (argc == 1) { - int cache_size = va_arg(arg_ptr, int); - for (int i = 0; i < core_num_; ++i) { - (*cache)[i] = cache_size; - } - } else { - int big_core_num = big_core_ids_.size(); - int little_core_num = little_core_ids_.size(); - int big_core_cache_size = va_arg(arg_ptr, int); - int little_core_cache_size = va_arg(arg_ptr, int); - for (int i = 0; i < big_core_num; ++i) { - (*cache)[big_core_ids_[i]] = big_core_cache_size; - } - for (int i = 0; i < little_core_num; ++i) { - (*cache)[little_core_ids_[i]] = little_core_cache_size; - } - } - va_end(arg_ptr); -} - -void DeviceInfo::SetArchInfo(int argc, ...) 
{ - va_list arg_ptr; - va_start(arg_ptr, argc); - archs_.resize(core_num_); - if (argc == 1) { - ARMArch arch = (ARMArch)va_arg(arg_ptr, int); - for (int i = 0; i < core_num_; ++i) { - archs_[i] = arch; - } - } else { - ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int); - ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int); - int big_core_num = big_core_ids_.size(); - int little_core_num = little_core_ids_.size(); - for (int i = 0; i < big_core_num; ++i) { - archs_[big_core_ids_[i]] = big_core_arch; - } - for (int i = 0; i < little_core_num; ++i) { - archs_[little_core_ids_[i]] = little_core_arch; - } - } - va_end(arg_ptr); -} - -bool DeviceInfo::SetCPUInfoByName() { - /* Snapdragon */ - if (dev_name_.find("SM8150") != std::string::npos) { // 855 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {4, 5, 6, 7}; - little_core_ids_ = {0, 1, 2, 3}; - cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - SetArchInfo(2, kA76, kA55); - SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); - SetCacheInfo(1, 2, 256 * 1024, 128 * 1024); - SetCacheInfo(2, 1, 2048 * 1024); - SetFP16Info(1, 1); - SetDotInfo(1, 1); - return true; - } else if (dev_name_.find("SDM845") != std::string::npos) { // 845 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {4, 5, 6, 7}; - little_core_ids_ = {0, 1, 2, 3}; - cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - SetArchInfo(2, kA75, kA55); - SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); - SetCacheInfo(1, 2, 256 * 1024, 128 * 1024); - SetCacheInfo(2, 1, 2048 * 1024); - SetFP16Info(1, 1); - return true; - } else if (dev_name_.find("SDM710") != std::string::npos) { // 710 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {6, 7}; - little_core_ids_ = {0, 1, 2, 3, 4, 5}; - cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0}; - SetArchInfo(2, kA75, kA55); - SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); - SetCacheInfo(1, 2, 256 * 1024, 128 * 1024); - SetCacheInfo(2, 1, 1024 * 1024); - return true; - } else if (dev_name_.find("MSM8998") != std::string::npos) { // 835 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {4, 5, 6, 7}; - little_core_ids_ = {0, 1, 2, 3}; - cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - SetArchInfo(2, kA73, kA53); - SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); - SetCacheInfo(1, - 2, - 1024 * 1024, - /*real cache size is 2M, while that will get bad performace - on conv3x3s1 or gemm, set to 1M or 512K*/ - 1024 * 1024); - return true; - } else if (dev_name_.find("MSM8996") != std::string::npos) { // 820 - core_num_ = 4; - core_ids_ = {0, 1, 2, 3}; - big_core_ids_ = {2, 3}; - little_core_ids_ = {0, 1}; - cluster_ids_ = {1, 1, 0, 0}; - SetArchInfo(1, kA72); - SetCacheInfo(0, 1, 24 * 1024); - SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024); - return true; - } else if (dev_name_.find("SDM660") != std::string::npos || - dev_name_.find("SDM636") != std::string::npos) { // 660, 636 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {4, 5, 6, 7}; - little_core_ids_ = {0, 1, 2, 3}; - cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - SetArchInfo(1, kA73); - SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); - SetCacheInfo(1, 1, 1024 * 1024); - return true; - } else if (dev_name_.find("MSM8976") != std::string::npos) { // 652,653 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {4, 5, 6, 7}; - little_core_ids_ = {0, 1, 2, 3}; - cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - SetArchInfo(2, kA72, kA53); - SetCacheInfo(0, 1, 32 * 1024); - SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024); - return 
true; - } else if (dev_name_.find("MSM8953") != std::string::npos) { // 625 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - little_core_ids_ = {}; - cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; - SetArchInfo(1, kA53); - SetCacheInfo(0, 1, 32 * 1024); - SetCacheInfo(1, 1, 1024 * 1024); - return true; - } else if (dev_name_.find("MSM8939") != std::string::npos) { // 615 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {0, 1, 2, 3}; - little_core_ids_ = {4, 5, 6, 7}; - cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1}; - SetArchInfo(1, kA53); - SetCacheInfo(0, 1, 32 * 1024); - SetCacheInfo(1, 2, 512 * 1024, 256 * 1024); - return true; - /* MediaTek */ - } else if (dev_name_.find("MT6797") != - std::string::npos) { // X20/X23/X25/X27 - core_num_ = 10; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - big_core_ids_ = {8, 9}; - little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; - SetArchInfo(2, kA72, kA53); - SetCacheInfo(0, 1, 32 * 1024); - SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024); - return true; - } else if (dev_name_.find("MT6799") != std::string::npos) { // X30 - core_num_ = 10; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - big_core_ids_ = {8, 9}; - little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; - SetArchInfo(2, kA73, kA53); - return true; - } else if (dev_name_.find("MT6795") != std::string::npos || - dev_name_.find("MT6762") != std::string::npos || - dev_name_.find("MT6755T") != std::string::npos || - dev_name_.find("MT6755S") != std::string::npos || - dev_name_.find("MT6753") != std::string::npos || - dev_name_.find("MT6752") != std::string::npos || - dev_name_.find("MT6750") != std::string::npos) { - // X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - little_core_ids_ = {}; - cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; - SetArchInfo(1, kA53); - return true; - } else if (dev_name_.find("MT6758") != std::string::npos || - dev_name_.find("MT6757") != std::string::npos || - dev_name_.find("MT6763") != std::string::npos || - dev_name_.find("MT6755M") != std::string::npos || - dev_name_.find("MT6755") != - std::string::npos) { // P30, P20/P25, P23, P10 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {4, 5, 6, 7}; - little_core_ids_ = {0, 1, 2, 3}; - cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - SetArchInfo(1, kA53); - return true; - } else if (dev_name_.find("MT6771") != std::string::npos) { // P60 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {4, 5, 6, 7}; - little_core_ids_ = {0, 1, 2, 3}; - cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - SetArchInfo(2, kA73, kA53); - return true; - } else if (dev_name_.find("MT6765") != std::string::npos || - dev_name_.find("MT6739") != std::string::npos || - dev_name_.find("MT6738") != std::string::npos || - dev_name_.find("MT6737") != - std::string::npos) { // A22, MT6739, MT6738, MT6767 - core_num_ = 4; - core_ids_ = {0, 1, 2, 3}; - big_core_ids_ = {0, 1, 2, 3}; - little_core_ids_ = {}; - cluster_ids_ = {0, 0, 0, 0}; - SetArchInfo(1, kA53); - return true; - } else if (dev_name_.find("KIRIN980") != std::string::npos) { // Kirin 980 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {4, 5, 6, 7}; - little_core_ids_ = {0, 1, 2, 3}; - cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - SetArchInfo(2, kA76, kA55); - SetCacheInfo(0, 2, 64 * 
1024, 32 * 1024);
-    SetCacheInfo(1, 2, 512 * 1024, 128 * 1024);
-    SetCacheInfo(2, 1, 4096 * 1024);
-    SetFP16Info(1, 1);
-    SetDotInfo(1, 1);
-    return true;
-  }
-  return false;
-}
-
-void DeviceInfo::SetCPUInfoByProb() {
-#ifdef LITE_WITH_LINUX
-  // get big.LITTLE cores by sorting CPU frequency
-  sort_cpuid_by_max_freq(max_freqs_, &core_ids_, &cluster_ids_);
-  big_core_ids_.clear();
-  little_core_ids_.clear();
-  for (int i = 0; i < cluster_ids_.size(); ++i) {
-    if (cluster_ids_[i] == 0) {
-      big_core_ids_.push_back(core_ids_[i]);
-    } else {
-      little_core_ids_.push_back(core_ids_[i]);
-    }
-  }
-  // get l1, l2, l3 cache size for each core
-  for (int i = 0; i < core_num_; i++) {
-    get_cpu_cache_size(i, &(L1_cache_[i]), &(L2_cache_[i]), &(L3_cache_[i]));
-  }
-#endif  // LITE_WITH_LINUX
-}
-
-void DeviceInfo::RequestPowerFullMode(int thread_num) {
-  int big_core_size = big_core_ids_.size();
-  int little_core_size = little_core_ids_.size();
-  active_ids_.clear();
-  for (int i = 0; i < thread_num; ++i) {
-    if (i < big_core_size) {
-      active_ids_.push_back(big_core_ids_[i]);
-    } else if (i < big_core_size + little_core_size) {
-      active_ids_.push_back(little_core_ids_[i - big_core_size]);
-    }
-  }
-  mode_ = lite_api::PowerMode::LITE_POWER_FULL;
-}
-
-void DeviceInfo::RequestPowerHighMode(int thread_num) {
-  int big_core_size = big_core_ids_.size();
-  int little_core_size = little_core_ids_.size();
-  active_ids_.clear();
-  if (big_core_size > 0) {
-    mode_ = lite_api::PowerMode::LITE_POWER_HIGH;
-    if (thread_num > big_core_size) {
-      LOG(ERROR) << "Request thread num: " << thread_num
-                 << ", exceed the big cores size: " << big_core_size
-                 << ", truncate thread num to " << big_core_size;
-      active_ids_ = big_core_ids_;
-    } else {
-      for (int i = 0; i < thread_num; ++i) {
-        active_ids_.push_back(big_core_ids_[i]);
-      }
-    }
-  } else {
-    mode_ = lite_api::PowerMode::LITE_POWER_LOW;
-    LOG(ERROR) << "HIGH POWER MODE is not supported, switch to little cores.";
-    if (thread_num > little_core_size) {
-      active_ids_ = little_core_ids_;
-    } else {
-      for (int i = 0; i < thread_num; ++i) {
-        active_ids_.push_back(little_core_ids_[i]);
-      }
-    }
-  }
-}
-
-void DeviceInfo::RequestPowerLowMode(int thread_num) {
-  int big_core_size = big_core_ids_.size();
-  int little_core_size = little_core_ids_.size();
-  active_ids_.clear();
-  if (little_core_size > 0) {
-    mode_ = lite_api::PowerMode::LITE_POWER_LOW;
-    if (thread_num > little_core_size) {
-      LOG(WARNING) << "Request thread num: " << thread_num
-                   << ", exceed the little cores size: " << little_core_size
-                   << ", truncate thread num to " << little_core_size;
-      active_ids_ = little_core_ids_;
-    } else {
-      for (int i = 0; i < thread_num; i++) {
-        active_ids_.push_back(little_core_ids_[i]);
-      }
-    }
-  } else {
-    mode_ = lite_api::PowerMode::LITE_POWER_HIGH;
-    LOG(WARNING) << "LOW POWER MODE is not supported, switch to big cores";
-    if (thread_num > big_core_size) {
-      active_ids_ = big_core_ids_;
-    } else {
-      for (int i = 0; i < thread_num; i++) {
-        active_ids_.push_back(big_core_ids_[i]);
-      }
-    }
-  }
-}
-
-void DeviceInfo::RequestPowerNoBindMode(int thread_num) {
-  active_ids_.clear();
-  if (thread_num > core_ids_.size()) {
-    active_ids_ = core_ids_;
-  } else {
-    active_ids_.resize(thread_num);
-    for (int i = 0; i < thread_num; ++i) {
-      if (i < big_core_ids_.size()) {
-        active_ids_[i] = big_core_ids_[i];
-      } else {
-        active_ids_[i] = little_core_ids_[i - big_core_ids_.size()];
-      }
-    }
-  }
-  mode_ = lite_api::PowerMode::LITE_POWER_NO_BIND;
-}
-
-void DeviceInfo::RequestPowerRandHighMode(int shift_num, int thread_num) {
-  int big_core_size = big_core_ids_.size();
-  int little_core_size = little_core_ids_.size();
-  active_ids_.clear();
-  if (big_core_size > 0) {
-    mode_ = lite_api::PowerMode::LITE_POWER_RAND_HIGH;
-    if (thread_num > big_core_size) {
-      LOG(WARNING) << "Request thread num: " << thread_num
-                   << ", exceed the big cores size: " << big_core_size
-                   << ", truncate thread num to " << big_core_size;
-      active_ids_ = big_core_ids_;
-    } else {
-      for (int i = 0; i < thread_num; ++i) {
-        active_ids_.push_back(big_core_ids_[(i + shift_num) % big_core_size]);
-      }
-    }
-  } else {
-    mode_ = lite_api::PowerMode::LITE_POWER_LOW;
-    LOG(WARNING) << "HIGH POWER MODE is not supported, switch to little cores.";
-    if (thread_num > little_core_size) {
-      active_ids_ = little_core_ids_;
-    } else {
-      for (int i = 0; i < thread_num; ++i) {
-        active_ids_.push_back(little_core_ids_[i]);
-      }
-    }
-  }
-}
-
-void DeviceInfo::RequestPowerRandLowMode(int shift_num, int thread_num) {
-  int big_core_size = big_core_ids_.size();
-  int little_core_size = little_core_ids_.size();
-  active_ids_.clear();
-  if (little_core_size > 0) {
-    mode_ = lite_api::PowerMode::LITE_POWER_RAND_LOW;
-    if (thread_num > little_core_size) {
-      LOG(WARNING) << "Request thread num: " << thread_num
-                   << ", exceed the little cores size: " << little_core_size
-                   << ", truncate thread num to " << little_core_size;
-      active_ids_ = little_core_ids_;
-    } else {
-      for (int i = 0; i < thread_num; ++i) {
-        active_ids_.push_back(
-            little_core_ids_[(i + shift_num) % little_core_size]);
-      }
-    }
-  } else {
-    mode_ = lite_api::PowerMode::LITE_POWER_HIGH;
-    LOG(WARNING) << "LOW POWER MODE is not supported, switch to big cores.";
-    if (thread_num > big_core_size) {
-      active_ids_ = big_core_ids_;
-    } else {
-      for (int i = 0; i < thread_num; ++i) {
-        active_ids_.push_back(big_core_ids_[i]);
-      }
-    }
-  }
-}
-
-int DeviceInfo::Setup() {
-  core_num_ = get_cpu_num();
-  mem_size_ = get_mem_size();
-  get_cpu_arch(&archs_, core_num_);
-  // set default CPU info
-  SetCacheInfo(0, 1, DEFAULT_L1_CACHE_SIZE);
-  SetCacheInfo(1, 1, DEFAULT_L2_CACHE_SIZE);
-  SetCacheInfo(2, 1, DEFAULT_L3_CACHE_SIZE);
-  SetFP32Info(1, 1);
-  SetFP16Info(1, 0);
-  SetDotInfo(1, 0);
-  max_freqs_.resize(core_num_);
-  min_freqs_.resize(core_num_);
-#ifdef LITE_WITH_LINUX
-  // get max&min freq
-  for (int i = 0; i < core_num_; ++i) {
-    int max_freq, min_freq;
-    get_cpu_max_min_freq(i, &max_freq, &min_freq);
-    max_freqs_[i] = max_freq / 1000;
-    min_freqs_[i] = min_freq / 1000;
-  }
-  // get cache size and big.LITTLE core ids
-  dev_name_ = get_cpu_name();
-  if (!SetCPUInfoByName()) {
-    SetCPUInfoByProb();
-  }
-  core_ids_.resize(core_num_);
-  cluster_ids_.resize(core_num_);
-  for (int i = 0; i < core_num_; ++i) {
-    max_freqs_[i] = 1000000;
-    min_freqs_[i] = 1000000;
-    cluster_ids_[i] = 0;
-  }
-#else
-#ifdef TARGET_IOS
-  dev_name_ = "Apple";
-#else
-  dev_name_ = "Unknown";
-#endif
-  core_ids_.resize(core_num_);
-  cluster_ids_.resize(core_num_);
-  big_core_ids_.resize(core_num_);
-  for (int i = 0; i < core_num_; ++i) {
-    max_freqs_[i] = 1000000;
-    min_freqs_[i] = 1000000;
-    cluster_ids_[i] = 0;
-    core_ids_[i] = i;
-    big_core_ids_[i] = i;
-  }
-#endif
-  // output info
-  LOG(INFO) << "ARM multiprocessors name: " << dev_name_;
-  LOG(INFO) << "ARM multiprocessors number: " << core_num_;
-  for (int i = 0; i < core_num_; ++i) {
-    LOG(INFO) << "ARM multiprocessors ID: " << core_ids_[i]
-              << ", max freq: " << max_freqs_[i]
-              << ", min freq: " << min_freqs_[i]
-              << ", cluster ID: " << cluster_ids_[core_ids_[i]]
-              << ", CPU ARCH: A" << archs_[i];
-  }
-  LOG(INFO) << "L1 DataCache size is: ";
-  for (int i = 0; i < core_num_; ++i) {
-    LOG(INFO) << L1_cache_[i] / 1024 << " KB";
-  }
-  LOG(INFO) << "L2 Cache size is: ";
-  for (int i = 0; i < core_num_; ++i) {
-    LOG(INFO) << L2_cache_[i] / 1024 << " KB";
-  }
-  LOG(INFO) << "L3 Cache size is: ";
-  for (int i = 0; i < core_num_; ++i) {
-    LOG(INFO) << L3_cache_[i] / 1024 << " KB";
-  }
-  LOG(INFO) << "Total memory: " << mem_size_ << "KB";
-  // set default run mode
-  SetRunMode(lite_api::PowerMode::LITE_POWER_NO_BIND,
-             1);  // use single thread by default
-  return 0;
-}
-
-void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
-#ifdef ARM_WITH_OMP
-  thread_num = std::min(thread_num, core_num_);
-#else
-  thread_num = 1;  // force thread_num to 1 if OpenMP is disabled
-#endif
-#ifdef LITE_WITH_LINUX
-  int big_core_size = big_core_ids_.size();
-  int little_core_size = little_core_ids_.size();
-  int big_little_core_size = big_core_size + little_core_size;
-  thread_num = std::min(thread_num, big_little_core_size);
-  count_++;
-  int shift_num = (count_ / 10) % big_core_size;
-  switch (mode) {
-    case lite_api::LITE_POWER_FULL:
-      RequestPowerFullMode(thread_num);
-      break;
-    case lite_api::LITE_POWER_HIGH:
-      RequestPowerHighMode(thread_num);
-      break;
-    case lite_api::LITE_POWER_LOW:
-      RequestPowerLowMode(thread_num);
-      break;
-    case lite_api::LITE_POWER_NO_BIND:
-      RequestPowerNoBindMode(thread_num);
-      break;
-    case lite_api::LITE_POWER_RAND_HIGH:
-      RequestPowerRandHighMode(shift_num, thread_num);
-      break;
-    case lite_api::LITE_POWER_RAND_LOW:
-      RequestPowerRandLowMode(shift_num, thread_num);
-      break;
-    default:
-      LOG(FATAL) << "Unsupported power mode: " << mode;
-      break;
-  }
-  if (active_ids_.empty()) {
-    active_ids_.push_back(0);
-  }
-#ifdef ARM_WITH_OMP
-  omp_set_num_threads(active_ids_.size());
-#endif
-  if (mode_ != lite_api::LITE_POWER_NO_BIND) {
-    if (check_cpu_online(active_ids_)) {
-      bind_threads(active_ids_);
-    } else {
-      LOG(WARNING) << "Some cores are offline, switch to NO BIND MODE";
-      mode_ = lite_api::LITE_POWER_NO_BIND;
-    }
-  }
-#else   // LITE_WITH_LINUX
-  // only LITE_POWER_NO_BIND is supported in other OS
-  RequestPowerNoBindMode(thread_num);
-#ifdef ARM_WITH_OMP
-  omp_set_num_threads(active_ids_.size());
-#endif
-#endif  // LITE_WITH_LINUX
-  //! alloc memory for sgemm in this context
-  workspace_.Resize({llc_size()});
-  workspace_.mutable_data<float>();
-  arch_ = archs_[active_ids_[0]];
-}
-
-void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
-  SetCacheInfo(0, 1, l1size);
-  SetCacheInfo(1, 1, l2size);
-  SetCacheInfo(2, 1, l3size);
-  workspace_.Resize({2 * (l1size + l2size)});
-}
-
-bool DeviceInfo::ExtendWorkspace(int size) {
-  workspace_.Resize({size + llc_size()});
-  workspace_.mutable_data<float>();
-  return true;
-}
-
-#endif  // LITE_WITH_ARM
-
-#ifdef LITE_WITH_CUDA
-
-void Device<TARGET(kCUDA)>::Init() {
-  GetInfo();
-  CreateStream();
-}
-
-void Device<TARGET(kCUDA)>::GetInfo() {
-  cudaGetDeviceProperties(&device_prop_, idx_);
-  cudaRuntimeGetVersion(&runtime_version_);
-  sm_version_ = (device_prop_.major << 8 | device_prop_.minor);
-  has_hmma_ =
-      (sm_version_ == 0x0700 || sm_version_ == 0x0702 || sm_version_ == 0x0705);
-  has_fp16_ = (sm_version_ == 0x0602 || sm_version_ == 0x0600 ||
-               sm_version_ == 0x0503 || has_hmma_);
-  has_imma_ = (sm_version_ == 0x0702 || sm_version_ == 0x0705);
-  has_int8_ = (sm_version_ == 0x0601 || sm_version_ == 0x0700 || has_imma_);
-}
-
-void Device<TARGET(kCUDA)>::CreateStream() {
-  exec_stream_.clear();
-  io_stream_.clear();
-  for (int i = 0; i < max_stream_; i++) {
-    cudaStream_t exec_stream;
-    cudaStream_t io_stream;
-    cudaStreamCreate(&exec_stream);
-    cudaStreamCreate(&io_stream);
-    exec_stream_.push_back(exec_stream);
-    io_stream_.push_back(io_stream);
-  }
-}
-
-#endif
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/device_info.h b/lite/core/device_info.h
deleted file mode 100644
index 96f4680135..0000000000
--- a/lite/core/device_info.h
+++ /dev/null
@@ -1,209 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <cstdarg>
-#include <string>
-#include <vector>
-#include "lite/core/tensor.h"
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-
-#ifdef LITE_WITH_ARM
-
-typedef enum {
-  kAPPLE = 0,
-  kA53 = 53,
-  kA55 = 55,
-  kA57 = 57,
-  kA72 = 72,
-  kA73 = 73,
-  kA75 = 75,
-  kA76 = 76,
-  kARMArch_UNKOWN = -1
-} ARMArch;
-
-class DeviceInfo {
- public:
-  static DeviceInfo& Global() {
-    static auto* x = new DeviceInfo;
-    return *x;
-  }
-
-  static int Init() {
-    static int ret = Global().Setup();
-    return ret;
-  }
-
-  int Setup();
-
-  void SetRunMode(lite_api::PowerMode mode, int thread_num);
-  void SetCache(int l1size, int l2size, int l3size);
-  void SetArch(ARMArch arch) { arch_ = arch; }
-
-  lite_api::PowerMode mode() const { return mode_; }
-  int threads() const { return active_ids_.size(); }
-  ARMArch arch() const { return arch_; }
-  int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
-  int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
-  int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
-  int llc_size() const {
-    auto size = L3_cache_[active_ids_[0]] > 0 ? L3_cache_[active_ids_[0]]
-                                              : L2_cache_[active_ids_[0]];
-    return size > 0 ? size : 512 * 1024;
-  }
-  bool has_dot() const { return dot_[active_ids_[0]]; }
-  bool has_fp16() const { return fp16_[active_ids_[0]]; }
-
-  template <typename T>
-  T* workspace_data() {
-    return reinterpret_cast<T*>(workspace_.mutable_data<float>());
-  }
-  bool ExtendWorkspace(int size);
-
- private:
-  int core_num_;
-  std::vector<int> max_freqs_;
-  std::vector<int> min_freqs_;
-  int mem_size_;
-  std::string dev_name_;
-
-  std::vector<int> L1_cache_;
-  std::vector<int> L2_cache_;
-  std::vector<int> L3_cache_;
-  std::vector<int> core_ids_;
-  std::vector<int> big_core_ids_;
-  std::vector<int> little_core_ids_;
-  std::vector<int> cluster_ids_;
-  std::vector<ARMArch> archs_;
-  std::vector<bool> fp32_;
-  std::vector<bool> fp16_;
-  std::vector<bool> dot_;
-
-  ARMArch arch_;
-  // LITE_POWER_HIGH stands for using big cores,
-  // LITE_POWER_LOW stands for using little cores,
-  // LITE_POWER_FULL stands for using all cores
-  lite_api::PowerMode mode_;
-  std::vector<int> active_ids_;
-  TensorLite workspace_;
-  int64_t count_{0};
-
-  void SetDotInfo(int argc, ...);
-  void SetFP16Info(int argc, ...);
-  void SetFP32Info(int argc, ...);
-  void SetCacheInfo(int cache_id, int argc, ...);
-  void SetArchInfo(int argc, ...);
-  bool SetCPUInfoByName();
-  void SetCPUInfoByProb();
-  void RequestPowerFullMode(int thread_num);
-  void RequestPowerHighMode(int thread_num);
-  void RequestPowerLowMode(int thread_num);
-  void RequestPowerNoBindMode(int thread_num);
-  void RequestPowerRandHighMode(int shift_num, int thread_num);
-  void RequestPowerRandLowMode(int shift_num, int thread_num);
-
-  DeviceInfo() = default;
-};
-
-#endif  // LITE_WITH_ARM
-
-template <TargetType Type>
-class Device;
-
-template <TargetType Type>
-class Env {
- public:
-  typedef TargetWrapper<Type> API;
-  typedef std::vector<Device<Type>> Devs;
-  static Devs& Global() {
-    static Devs* devs = new Devs();
-    return *devs;
-  }
-  static void Init(int max_stream = 4) {
-    Devs& devs = Global();
-    if (devs.size() > 0) {
-      return;
-    }
-    int count = 0;
-    // Get device count
-    count = API::num_devices();
-    if (count == 0) {
-      CHECK(false) << "No device found!";
-    } else {
-      LOG(INFO) << "Found " << count << " device(s)";
-    }
-    // create all devices
-    for (int i = 0; i < count; i++) {
-      auto dev = Device<Type>(i, max_stream);
-      dev.Init();
-      devs.push_back(dev);
-    }
-    LOG(INFO) << "dev size = " << devs.size();
-  }
-};
-
-#ifdef LITE_WITH_CUDA
-template <>
-class Device<TARGET(kCUDA)> {
- public:
-  Device(int dev_id, int max_stream = 1)
-      : idx_(dev_id), max_stream_(max_stream) {}
-  void Init();
-
-  int id() { return idx_; }
-  int max_stream() { return max_stream_; }
-  void SetId(int idx) { idx_ = idx; }
-  std::string name() { return device_prop_.name; }
-  int core_num() { return device_prop_.multiProcessorCount; }
-  float max_memory() { return device_prop_.totalGlobalMem / 1048576.; }
-  std::vector<cudaStream_t> exec_streams() { return exec_stream_; }
-  std::vector<cudaStream_t> io_streams() { return io_stream_; }
-
-  int sm_version() { return sm_version_; }
-  bool has_fp16() { return has_fp16_; }
-  bool has_int8() { return has_int8_; }
-  bool has_hmma() { return has_hmma_; }
-  bool has_imma() { return has_imma_; }
-  int runtime_version() { return runtime_version_; }
-
- private:
-  void CreateStream();
-  void GetInfo();
-
- private:
-  int max_stream_;
-  int idx_{0};
-  cudaDeviceProp device_prop_;
-  std::string device_name_;
-  float max_memory_;
-
-  int sm_version_;
-  bool has_fp16_;
-  bool has_int8_;
-  bool has_hmma_;
-  bool has_imma_;
-  int runtime_version_;
-  std::vector<cudaStream_t> exec_stream_;
-  std::vector<cudaStream_t> io_stream_;
-};
-
-template class Env<TARGET(kCUDA)>;
-#endif
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/framework.proto b/lite/core/framework.proto
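Editor's sketch (not part of the original patch): typical process-level setup
with the two facilities above. The surrounding main() is illustrative only;
all of the called names come from the headers shown here.

  #include "lite/core/device_info.h"

  int main() {
    using namespace paddle::lite;
  #ifdef LITE_WITH_ARM
    // Probe core count, frequencies, caches and big.LITTLE topology once.
    DeviceInfo::Init();
    // Bind two worker threads to the big cluster; this also sizes the shared
    // sgemm workspace from llc_size().
    DeviceInfo::Global().SetRunMode(lite_api::PowerMode::LITE_POWER_HIGH, 2);
    LOG(INFO) << "threads: " << DeviceInfo::Global().threads();
  #endif
  #ifdef LITE_WITH_CUDA
    // Enumerate CUDA devices and pre-create exec/io streams on each one.
    Env<TARGET(kCUDA)>::Init();
  #endif
    return 0;
  }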
deleted file mode 100644
index 6c60a041a1..0000000000
--- a/lite/core/framework.proto
+++ /dev/null
@@ -1,188 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-syntax = "proto2";
-// option optimize_for = LITE_RUNTIME;
-package paddle.framework.proto;
-
-// Any incompatible changes to ProgramDesc and its dependencies should
-// raise the version defined in version.h.
-//
-// Serialization and Deserialization codes should be modified in a way
-// that supports old versions following the version and compatibility policy.
-message Version { optional int64 version = 1 [ default = 0 ]; }
-
-enum AttrType {
-  INT = 0;
-  FLOAT = 1;
-  STRING = 2;
-  INTS = 3;
-  FLOATS = 4;
-  STRINGS = 5;
-  BOOLEAN = 6;
-  BOOLEANS = 7;
-  BLOCK = 8;
-  LONG = 9;
-  BLOCKS = 10;
-  LONGS = 11;
-}
-
-// OpDesc describes an instance of a C++ framework::OperatorBase
-// derived class type.
-message OpDesc {
-
-  message Attr {
-    required string name = 1;
-    required AttrType type = 2;
-    optional int32 i = 3;
-    optional float f = 4;
-    optional string s = 5;
-    repeated int32 ints = 6;
-    repeated float floats = 7;
-    repeated string strings = 8;
-    optional bool b = 10;
-    repeated bool bools = 11;
-    optional int32 block_idx = 12;
-    optional int64 l = 13;
-    repeated int32 blocks_idx = 14;
-    repeated int64 longs = 15;
-  };
-
-  message Var {
-    required string parameter = 1;
-    repeated string arguments = 2;
-  };
-
-  required string type = 3;
-  repeated Var inputs = 1;
-  repeated Var outputs = 2;
-  repeated Attr attrs = 4;
-  optional bool is_target = 5 [ default = false ];
-};
-
-// OpProto describes a C++ framework::OperatorBase derived class.
-message OpProto {
-
-  // VarProto describes the C++ type framework::Variable.
-  message Var {
-    required string name = 1;
-    required string comment = 2;
-
-    optional bool duplicable = 3 [ default = false ];
-    optional bool intermediate = 4 [ default = false ];
-    optional bool dispensable = 5 [ default = false ];
-  }
-
-  // AttrProto describes the C++ type Attribute.
-  message Attr {
-    required string name = 1;
-    required AttrType type = 2;
-    required string comment = 3;
-    // If that attribute is generated, it means the Paddle third
-    // language binding has responsibility to fill that
-    // attribute. End-User should not set that attribute.
-    optional bool generated = 4 [ default = false ];
-  }
-
-  required string type = 1;
-  repeated Var inputs = 2;
-  repeated Var outputs = 3;
-  repeated Attr attrs = 4;
-  required string comment = 5;
-}
-
-message VarType {
-  enum Type {
-    // Pod Types
-    BOOL = 0;
-    INT16 = 1;
-    INT32 = 2;
-    INT64 = 3;
-    FP16 = 4;
-    FP32 = 5;
-    FP64 = 6;
-    // Tensor<size_t> is used in C++.
- SIZE_T = 19; - UINT8 = 20; - INT8 = 21; - - // Other types that may need additional descriptions - LOD_TENSOR = 7; - SELECTED_ROWS = 8; - FEED_MINIBATCH = 9; - FETCH_LIST = 10; - STEP_SCOPES = 11; - LOD_RANK_TABLE = 12; - LOD_TENSOR_ARRAY = 13; - PLACE_LIST = 14; - READER = 15; - // Any runtime decided variable type is raw - // raw variables should manage their own allocations - // in operators like nccl_op - RAW = 17; - TUPLE = 18; - } - - required Type type = 1; - - message TensorDesc { - // Should only be PODType. Is enforced in C++ - required Type data_type = 1; - repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] - } - optional TensorDesc selected_rows = 2; - - message LoDTensorDesc { - required TensorDesc tensor = 1; - optional int32 lod_level = 2 [ default = 0 ]; - } - optional LoDTensorDesc lod_tensor = 3; - - message LoDTensorArrayDesc { - required TensorDesc tensor = 1; - optional int32 lod_level = 2 [ default = 0 ]; - } - optional LoDTensorArrayDesc tensor_array = 4; - - message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; } - optional ReaderDesc reader = 5; - - message Tuple { repeated Type element_type = 1; } - optional Tuple tuple = 7; -} - -message VarDesc { - required string name = 1; - required VarType type = 2; - optional bool persistable = 3 [ default = false ]; -} - -message BlockDesc { - required int32 idx = 1; - required int32 parent_idx = 2; - repeated VarDesc vars = 3; - repeated OpDesc ops = 4; - optional int32 forward_block_idx = 5 [ default = -1 ]; -} - -// Please refer to -// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md -// for more details. -// TODO(panyx0718): A model can have multiple programs. Need a -// way to distinguish them. Maybe ID or name? -message ProgramDesc { - repeated BlockDesc blocks = 1; - - optional Version version = 2; -} diff --git a/lite/core/kernel.cc b/lite/core/kernel.cc deleted file mode 100644 index 7ec718cb38..0000000000 --- a/lite/core/kernel.cc +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
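Editor's sketch (not part of the original patch): the framework.proto schema
above is what a serialized Paddle `__model__` file contains. A hedged sketch of
walking one with the generated protobuf bindings; it assumes protoc has been
run on framework.proto to produce framework.pb.h, and omits error handling.

  #include <cstdio>
  #include <fstream>
  #include <string>
  #include "framework.pb.h"  // assumed output of protoc on framework.proto

  void DumpOps(const std::string& model_path) {
    paddle::framework::proto::ProgramDesc program;
    std::ifstream fin(model_path, std::ios::binary);
    program.ParseFromIstream(&fin);  // proto2 binary wire format
    // Block 0 is the entry block; ops are stored in execution order.
    for (const auto& op : program.blocks(0).ops()) {
      std::printf("op: %s\n", op.type().c_str());
    }
  }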
- -#include "lite/core/kernel.h" -#include -#include "lite/utils/string.h" - -namespace paddle { -namespace lite { - -std::string KernelBase::summary() const { - STL::stringstream ss; - ss << op_type() << ":" << TargetToStr(target()) << "/" - << PrecisionToStr(precision()) << "/" << DataLayoutToStr(layout()) << "(" - << alias() << ")"; - return ss.str(); -} - -const Type *KernelBase::GetInputDeclType(const std::string &arg_name) const { - CHECK(!op_type_.empty()) << "op_type should be set first"; - const auto *type = ParamTypeRegistry::Global().RetrieveInArgument( - place(), GenParamTypeKey(), arg_name); - CHECK(type) << "no type registered for kernel [" << op_type_ - << "] input argument [" << arg_name << "]" - << " with key " << GenParamTypeKey(); - return type->type; -} - -const Type *KernelBase::GetOutputDeclType(const std::string &arg_name) const { - CHECK(!op_type_.empty()) << "op_type should be set first"; - const auto *type = ParamTypeRegistry::Global().RetrieveOutArgument( - place(), GenParamTypeKey(), arg_name); - CHECK(type) << "no type registered for kernel [" << GenParamTypeKey() - << "] output argument [" << arg_name << "]"; - return type->type; -} - -std::string KernelBase::GenParamTypeKey() const { - STL::stringstream ss; - ss << op_type() << "/" << alias_; - return ss.str(); -} - -void KernelBase::ParseKernelType(const std::string &kernel_type, - std::string *op_type, - std::string *alias, - Place *place) { - auto parts = Split(kernel_type, "/"); - CHECK_EQ(parts.size(), 5); - *op_type = parts[0]; - *alias = parts[1]; - - std::string target, precision, layout; - - target = parts[2]; - precision = parts[3]; - layout = parts[4]; - - place->target = static_cast(std::atoi(target.c_str())); - place->precision = static_cast(std::atoi(precision.c_str())); - place->layout = static_cast(std::atoi(layout.c_str())); -} - -std::string KernelBase::SerializeKernelType(const std::string &op_type, - const std::string &alias, - const Place &place) { - STL::stringstream ss; - ss << op_type << "/"; - ss << alias << "/"; - // We serialize the place value not the string representation here for - // easier deserialization. - ss << static_cast(place.target) << "/"; - ss << static_cast(place.precision) << "/"; - ss << static_cast(place.layout); - return ss.str(); -} - -bool ParamTypeRegistry::KeyCmp::operator()( - const ParamTypeRegistry::key_t &a, - const ParamTypeRegistry::key_t &b) const { - return a.hash() < b.hash(); -} - -STL::ostream &operator<<(STL::ostream &os, - const ParamTypeRegistry::KernelIdTy &other) { - std::string io_s = other.io == ParamTypeRegistry::IO::kInput ? "in" : "out"; - os << other.kernel_type << ":" << other.arg_name << ":" << io_s << ":" - << other.place.DebugString(); - return os; -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/kernel.h b/lite/core/kernel.h deleted file mode 100644 index 92eca6af54..0000000000 --- a/lite/core/kernel.h +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
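Editor's sketch (not part of the original patch): SerializeKernelType in
kernel.cc above writes the Place enums numerically so that ParseKernelType can
round-trip them; the expected key below matches the assertion in kernel_test.cc
later in this patch.

  #include <string>
  #include "lite/core/kernel.h"

  void KernelTypeRoundTripDemo() {
    using namespace paddle::lite;
    Place place(TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
    // "fc/def/1/1/1": op type, alias, then numeric target/precision/layout.
    std::string key = KernelBase::SerializeKernelType("fc", "def", place);
    std::string op_type, alias;
    Place parsed;
    KernelBase::ParseKernelType(key, &op_type, &alias, &parsed);
    // parsed now equals place; op_type == "fc", alias == "def".
  }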
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <functional>
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-#include <vector>
-#include "lite/backends/arm/math/type_trans.h"
-#include "lite/core/context.h"
-#include "lite/core/target_wrapper.h"
-#include "lite/core/type_system.h"
-#include "lite/core/types.h"
-#include "lite/core/workspace.h"
-#include "lite/operators/op_params.h"
-#include "lite/utils/all.h"
-#include "lite/utils/replace_stl/stream.h"
-
-namespace paddle {
-namespace lite {
-
-// A base class with virtual functions to unify all the kernel implementations
-// on different targets.
-class KernelBase {
- public:
-  // type_infer_handler is used to infer an output type by considering the
-  // input types in the type system.
-  using type_infer_handler_t = std::function<const Type*(
-      const std::map<std::string, const Type*>& input_types,
-      const std::string& out_arg)>;
-
- protected:
-  /// Run some initialization before `Run`; it is invoked after `SetParam` and
-  /// `SetContext`, so that both param_ and context_ are valid.
-  virtual void PrepareForRun() {}
-
-  /// Run the kernel. Before Run, both the param_ and context_ should be valid.
-  virtual void Run() = 0;
-
- public:
-  void Launch() {
-    if (is_first_epoch_) {
-      PrepareForRun();
-      is_first_epoch_ = false;
-    }
-
-    // Reset the workspace so that every kernel in the same thread shares the
-    // temporary memory.
-    WorkSpace::Global_Host().AllocReset();
-#if defined(LITE_WITH_X86)
-    WorkSpace::Global_X86().AllocReset();
-#endif
-#if defined(LITE_WITH_CUDA)
-    WorkSpace::Global_CUDA().AllocReset();
-#endif
-    Run();
-  }
-
-  void SetContext(std::unique_ptr<KernelContext>&& ctx) {
-    ctx_ = std::move(ctx);
-  }
-  template <typename T>
-  void SetParam(T param) {
-    param_.set<T>(param);
-  }
-  template <typename P>
-  P& Param() const {
-    return *param_.get_mutable<P>();
-  }
-
-  // This is used in the kernels that take 'kAny' places and infer the
-  // output place. For `ScaleCompute` and `IoCopyCompute`, their input types are
-  // declared as 'kAny' in some Place field, and the output is also `kAny`, but
-  // in real execution, when some non-kAny type is taken as input, the
-  // output's kAny fields can be determined. For example, when the
-  // `ScaleCompute` takes `TensorFp32NCHWTy` as input, its output should also be
-  // `TensorFp32NCHWTy`. This type inference rule is different for each kernel,
-  // so we make it a virtual method.
-  // One can customize this handler to make a specific type inference rule for
-  // a kernel, or leave the default to force the kernel to use the system's
-  // type-inference rules.
-  virtual std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() {
-    return nullptr;
-  }
-
-  void set_op_type(const std::string& type) { op_type_ = type; }
-  const std::string& op_type() const { return op_type_; }
-
-  // Get input declaration Type.
-  const Type* GetInputDeclType(const std::string& arg_name) const;
-
-  // Get output declaration Type.
-  const Type* GetOutputDeclType(const std::string& arg_name) const;
-
-  void set_alias(const std::string& x) { alias_ = x; }
-  const std::string& alias() const { return alias_; }
-
-  virtual Place place() const = 0;
-  virtual TargetType target() const = 0;
-  virtual PrecisionType precision() const = 0;
-  virtual DataLayoutType layout() const = 0;
-  const KernelContext* context() const { return ctx_.get(); }
-  KernelContext* mutable_context() { return ctx_.get(); }
-  virtual std::string name() const = 0;
-
-  // Short human-readable document.
-  std::string summary() const;
-  // Long human-readable document.
-  virtual std::string doc() const { return ""; }
-  // Generate the key of the parameter type.
-  std::string GenParamTypeKey() const;
-
-  // Used to serialize the kernel.
-  std::string SerializedKernelType() const {
-    return SerializeKernelType(op_type(), alias(), place());
-  }
-
-  static std::string SerializeKernelType(const std::string& op_type,
-                                         const std::string& alias,
-                                         const Place& place);
-
-  static void ParseKernelType(const std::string& kernel_type,
-                              std::string* op_type,
-                              std::string* alias,
-                              Place* place);
-
-  std::string key_with_alias() const { return op_type() + "/" + alias(); }
-
-  virtual ~KernelBase() = default;
-  void Torch() {}
-
- protected:
-  std::unique_ptr<KernelContext> ctx_{nullptr};
-  mutable operators::param_t param_;
-  // The corresponding op type.
-  std::string op_type_{};
-  // The extra identity to help differentiate a specific kernel; op_type_ +
-  // alias_ is the unique ID for the kernel.
-  std::string alias_{};
-  bool is_first_epoch_{true};
-};
-
-// Light-weight kernel implementation.
-// The OpKernel is designed to implement the specific algorithm on a target
-// device.
-// TODO(Superjomn) Consider to add a Platform type to differentiate CUDNN,
-// MKLDNN, plain CUDA C implementations.
-template <TargetType Target,
-          PrecisionType Precision,
-          DataLayoutType DataLayout = DATALAYOUT(kNCHW)>
-class KernelLite : public KernelBase {
- public:
-  // Run the kernel.
-  virtual void Run() { CHECK(false) << "Not Implemented"; }
-
-  TargetType target() const override { return Target; }
-  PrecisionType precision() const override { return Precision; }
-  DataLayoutType layout() const override { return DataLayout; }
-  Place place() const override { return Place{Target, Precision, DataLayout}; }
-  std::string name() const override;
-
-  void Touch() {}
-
-  KernelLite() = default;
-  virtual ~KernelLite() = default;
-};
-
-template <TargetType Target,
-          PrecisionType Precision,
-          DataLayoutType DataLayout>
-std::string KernelLite<Target, Precision, DataLayout>::name() const {
-  return op_type() + ":" + TargetToStr(Target) + "/" +
-         PrecisionToStr(Precision) + "/" + DataLayoutToStr(DataLayout);
-}
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/kernel_test.cc b/lite/core/kernel_test.cc
deleted file mode 100644
index 8ad8b47744..0000000000
--- a/lite/core/kernel_test.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/kernel.h"
-#include <gtest/gtest.h>
-#include "lite/core/op_lite.h"
-
-namespace paddle {
-namespace lite {
-namespace core {
-
-int test_code{-1};
-class SomeKernel : public KernelLite<TARGET(kHost), PRECISION(kFloat)> {
- public:
-  void Run() override {
-    LOG(INFO) << "SomeKernel executed";
-    LOG(INFO) << Param<operators::FcParam>().in_num_col_dims;
-    test_code = Param<operators::FcParam>().in_num_col_dims;
-  }
-
-  TargetType target() const override { return TARGET(kHost); }
-  PrecisionType precision() const override { return PRECISION(kFloat); }
-};
-
-TEST(Kernel, test) {
-  SomeKernel kernel;
-  operators::FcParam param;
-  param.in_num_col_dims = 100;
-  kernel.SetParam(param);
-  kernel.Run();
-  ASSERT_EQ(test_code, 100);
-}
-
-TEST(Kernel, kernel_type) {
-  const std::string op_type = "fc";
-  const std::string alias = "def";
-  Place place(TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
-  auto kernel_type = KernelBase::SerializeKernelType(op_type, alias, place);
-  LOG(INFO) << "kernel_type: " << kernel_type;
-  ASSERT_EQ(kernel_type, "fc/def/1/1/1");
-
-  std::string op_type1, alias1;
-  Place place1;
-  KernelBase::ParseKernelType(kernel_type, &op_type1, &alias1, &place1);
-  ASSERT_EQ(op_type, op_type1);
-  ASSERT_EQ(alias, alias1);
-  ASSERT_EQ(place, place1);
-}
-
-}  // namespace core
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/lite.map b/lite/core/lite.map
deleted file mode 100644
index 31adae4219..0000000000
--- a/lite/core/lite.map
+++ /dev/null
@@ -1,6 +0,0 @@
-{
-  global:
-    *paddle*;
-  local:
-    *;
-};
diff --git a/lite/core/lite_gtest_main.cc b/lite/core/lite_gtest_main.cc
deleted file mode 100644
index 9784fc7994..0000000000
--- a/lite/core/lite_gtest_main.cc
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
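Editor's sketch (not part of the original patch): in production code Launch()
is the entry point, and PrepareForRun() fires exactly once before the first
Run(). A hedged sketch of a kernel using that hook; HypotheticalHostKernel and
its use of FcParam are made-up for illustration.

  #include "lite/core/kernel.h"

  namespace paddle {
  namespace lite {

  class HypotheticalHostKernel
      : public KernelLite<TARGET(kHost), PRECISION(kFloat)> {
   public:
    void PrepareForRun() override {
      // Runs once, after SetParam/SetContext: pre-compute packed weights,
      // workspace sizes, etc., so Run() stays on the fast path.
    }
    void Run() override {
      // Per-invocation compute; Param<operators::FcParam>() is valid here.
    }
  };

  }  // namespace lite
  }  // namespace paddle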
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gflags/gflags.h>
-#include <gtest/gtest.h>
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  google::ParseCommandLineFlags(&argc, &argv, false);
-
-  return RUN_ALL_TESTS();
-}
diff --git a/lite/core/lite_tensor_test.cc b/lite/core/lite_tensor_test.cc
deleted file mode 100644
index d667a9f885..0000000000
--- a/lite/core/lite_tensor_test.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gtest/gtest.h>
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-
-TEST(tensor, test) {
-  TensorLite tensor;
-  DDimLite ddim({1, 8});
-  tensor.Resize(ddim);
-
-  for (int i = 0; i < 8; i++) {
-    tensor.mutable_data<int>()[i] = i;
-  }
-}
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/memory.cc b/lite/core/memory.cc
deleted file mode 100644
index 463e10b9f9..0000000000
--- a/lite/core/memory.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "lite/core/memory.h" - -namespace paddle { -namespace lite { - -void* TargetMalloc(TargetType target, size_t size) { - void* data{nullptr}; - switch (target) { - case TargetType::kHost: - case TargetType::kX86: - case TargetType::kARM: - data = TargetWrapper::Malloc(size); - break; -#ifdef LITE_WITH_CUDA - case TargetType::kCUDA: - data = TargetWrapper::Malloc(size); - break; -#endif // LITE_WITH_CUDA -#ifdef LITE_WITH_OPENCL - case TargetType::kOpenCL: - data = TargetWrapperCL::Malloc(size); - break; -#endif // LITE_WITH_OPENCL -#ifdef LITE_WITH_FPGA - case TargetType::kFPGA: - data = TargetWrapper::Malloc(size); - break; -#endif // LITE_WITH_OPENCL - default: - LOG(FATAL) << "Unknown supported target " << TargetToStr(target); - } - return data; -} - -void TargetFree(TargetType target, void* data) { - switch (target) { - case TargetType::kHost: - case TargetType::kX86: - case TargetType::kARM: - TargetWrapper::Free(data); - break; - -#ifdef LITE_WITH_CUDA - case TargetType::kCUDA: - TargetWrapper::Free(data); - break; -#endif // LITE_WITH_CUDA -#ifdef LITE_WITH_OPENCL - case TargetType::kOpenCL: - TargetWrapperCL::Free(data); - break; -#endif // LITE_WITH_OPENCL -#ifdef LITE_WITH_FPGA - case TargetType::kFPGA: - TargetWrapper::Free(data); - break; -#endif // LITE_WITH_CUDA - default: - LOG(FATAL) << "Unknown type"; - } -} - -void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { - switch (target) { - case TargetType::kHost: - case TargetType::kX86: - case TargetType::kARM: - TargetWrapper::MemcpySync( - dst, src, size, IoDirection::DtoD); - break; - -#ifdef LITE_WITH_CUDA - case TargetType::kCUDA: - TargetWrapper::MemcpySync( - dst, src, size, IoDirection::DtoD); - break; -#endif -#ifdef LITE_WITH_FPGA - case TargetType::kFPGA: - TargetWrapper::MemcpySync( - dst, src, size, IoDirection::DtoD); - break; -#endif -#ifdef LITE_WITH_OPENCL - case TargetType::kOpenCL: - TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD); - break; -#endif // LITE_WITH_OPENCL - default: - LOG(FATAL) << "unsupported type"; - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/memory.h b/lite/core/memory.h deleted file mode 100644 index 31d7fd34e1..0000000000 --- a/lite/core/memory.h +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "lite/api/paddle_place.h" -#include "lite/core/target_wrapper.h" -#include "lite/utils/macros.h" - -#ifdef LITE_WITH_OPENCL -#include "lite/backends/opencl/target_wrapper.h" -#endif // LITE_WITH_OPENCL - -#ifdef LITE_WITH_CUDA -#include "lite/backends/cuda/target_wrapper.h" -#endif // LITE_WITH_CUDA - -namespace paddle { -namespace lite { - -// Malloc memory for a specific Target. All the targets should be an element in -// the `switch` here. -LITE_API void* TargetMalloc(TargetType target, size_t size); - -// Free memory for a specific Target. 
-
-template <TargetType Target>
-void CopySync(void* dst, const void* src, size_t size, IoDirection dir) {
-  switch (Target) {
-    case TARGET(kX86):
-    case TARGET(kHost):
-    case TARGET(kARM):
-      TargetWrapper<TARGET(kHost)>::MemcpySync(
-          dst, src, size, IoDirection::HtoH);
-      break;
-#ifdef LITE_WITH_CUDA
-    case TARGET(kCUDA):
-      TargetWrapperCuda::MemcpySync(dst, src, size, dir);
-      break;
-#endif
-#ifdef LITE_WITH_OPENCL
-    case TARGET(kOpenCL):
-      TargetWrapperCL::MemcpySync(dst, src, size, dir);
-      break;
-#endif  // LITE_WITH_OPENCL
-#ifdef LITE_WITH_FPGA
-    case TARGET(kFPGA):
-      TargetWrapper<TARGET(kFPGA)>::MemcpySync(dst, src, size, dir);
-      break;
-#endif
-  }
-}
-
-// Memory buffer manager.
-class Buffer {
- public:
-  Buffer() = default;
-  Buffer(TargetType target, size_t size) : space_(size), target_(target) {}
-
-  void* data() const { return data_; }
-  TargetType target() const { return target_; }
-  size_t space() const { return space_; }
-
-  void ResetLazy(TargetType target, size_t size) {
-    if (target != target_ || space_ < size) {
-      Free();
-      data_ = TargetMalloc(target, size);
-      target_ = target;
-      space_ = size;
-    }
-  }
-
-  void ResizeLazy(size_t size) { ResetLazy(target_, size); }
-
-  void Free() {
-    if (space_ > 0) {
-      TargetFree(target_, data_);
-    }
-    target_ = TargetType::kHost;
-    space_ = 0;
-  }
-
-  void CopyDataFrom(const Buffer& other, size_t nbytes) {
-    target_ = other.target_;
-    ResizeLazy(nbytes);
-    // TODO(Superjomn) support copy between different targets.
-    TargetCopy(target_, data_, other.data_, nbytes);
-  }
-
-  ~Buffer() { Free(); }
-
- private:
-  // The size of the memory actually allocated.
-  size_t space_{0};
-  void* data_{nullptr};
-  TargetType target_{TargetType::kHost};
-};
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/memory_test.cc b/lite/core/memory_test.cc
deleted file mode 100644
index cd9062afca..0000000000
--- a/lite/core/memory_test.cc
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "lite/core/memory.h" -#include - -namespace paddle { -namespace lite { - -TEST(memory, test) { - auto* buf = TargetMalloc(TARGET(kX86), 10); - ASSERT_TRUE(buf); - TargetFree(TARGET(kX86), buf); - -#ifdef LITE_WITH_CUDA - auto* buf_cuda = TargetMalloc(TARGET(kCUDA), 10); - ASSERT_TRUE(buf_cuda); - TargetFree(TARGET(kCUDA), buf_cuda); -#endif -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt deleted file mode 100644 index 6dfc2bd295..0000000000 --- a/lite/core/mir/CMakeLists.txt +++ /dev/null @@ -1,109 +0,0 @@ -lite_cc_library(mir_node SRCS node.cc DEPS kernel) -lite_cc_library(mir_ssa_graph SRCS ssa_graph.cc DEPS mir_node program) -lite_cc_library(mir_pass SRCS pass.cc DEPS mir_ssa_graph) -lite_cc_library(mir_pass_manager SRCS pass_manager.cc DEPS mir_pass mir_ssa_graph mir_passes) -lite_cc_library(mir_pass_registry SRCS pass_registry.cc DEPS mir_pass_manager) - -add_subdirectory(fusion) -add_subdirectory(elimination) -add_subdirectory(subgraph) - -lite_cc_library(mir_passes - SRCS - fusion/fc_fuse_pass.cc - fusion/shuffle_channel_fuse_pass.cc - fusion/transpose_softmax_transpose_fuse_pass.cc - fusion/interpolate_fuse_pass.cc - fusion/conv_elementwise_fuse_pass.cc - fusion/conv_activation_fuse_pass.cc - fusion/conv_bn_fuse_pass.cc - fusion/elementwise_add_activation_fuse_pass.cc - fusion/quant_dequant_fuse_pass.cc - elimination/identity_scale_eliminate_pass.cc - static_kernel_pick_pass.cc - variable_place_inference_pass.cc - type_target_cast_pass.cc - type_layout_cast_pass.cc - type_precision_cast_pass.cc - io_copy_kernel_pick_pass.cc - graph_visualize_pass.cc - generate_program_pass.cc - argument_type_display_pass.cc - demo_pass.cc - runtime_context_assign_pass.cc - DEPS mir_pass types context ${mir_fusers} ${subgraph_passes}) - -# lite_cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS - #mir_ssa_graph scope op - #fc_op - #${host_kernels} - #mir_passes - #mir_pass_manager - #program_fake_utils - #) -# lite_cc_test(test_variable_place_infrence_pass SRCS variable_place_inference_pass_test.cc -# DEPS -# mul_op -# feed_op -# fetch_op -# io_copy_op -# ${host_kernels} -# mir_passes -# mir_pass_manager -# optimizer -# program_fake_utils -# target_wrapper_host -# PROFILE_DEPS basic_profiler -# CUDA_DEPS target_wrapper_cuda kernels_cuda -# ARM_DEPS mul_compute_arm -# X86_DEPS mul_compute_x86 -# ) - -set(pattern_deps mir_node mir_ssa_graph op) -if (WITH_TESTING) - list(APPEND pattern_deps gtest) -endif() -lite_cc_library(pattern_matcher SRCS pattern_matcher.cc DEPS ${pattern_deps}) -lite_cc_test(test_pattern_matcher SRCS pattern_matcher_test.cc DEPS pattern_matcher) - -lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc DEPS pattern_matcher) - - -# for mobile, unnecessary to compile the following testings. -if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - return() -endif() -lite_cc_test(test_mir_pass_manager SRCS pass_manager_test.cc DEPS mir_pass_manager mir_passes) - - -# TODO(wz) replace framework/proto to lite proto. -if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - # it depends on the fluid/framework/proto, that is too heavy for mobile execution. - # TODO(wz) enable it latter. 
- # lite_cc_test(test_pattern_matcher_high_api SRCS pattern_matcher_high_api_test.cc DEPS - # pattern_matcher_high_api proto_desc mir_pass_manager fc_op mul_op elementwise_ops - # mir_passes compatible_pb program ${ops}) -endif() - -message(STATUS "----> Ops lite: ${ops}") -message(STATUS "----> Host kernels: ${host_kernels}") -message(STATUS "----> X86 kernels: ${x86_kernels}") - -# lite_cc_test(test_lite_fc_fuse SRCS fusion/fc_fuse_pass_test.cc -# DEPS cxx_api mir_passes -# ${ops} ${host_kernels} ${x86_kernels} ${arm_kernels} -# ARGS --model_dir=${LITE_MODEL_DIR}/lite_fc_model -# --optimized_model=${LITE_MODEL_DIR}/lite_fc_model_opt SERIAL) - -# lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "lite_fc_model.tar.gz") -# add_dependencies(test_lite_fc_fuse extern_lite_download_lite_fc_model_tar_gz) - - -# lite_cc_test(test_lite_conv_elementwise_add_activation_fuse -# SRCS fusion/conv_elementwise_add_activation_fuse_pass_test.cc -# DEPS cxx_api mir_passes -# ${ops} ${host_kernels} ${x86_kernels}) -# lite_cc_test(test_lite_elementwise_add_activation_fuse -# SRCS fusion/elementwise_add_activation_fuse_pass_test.cc -# DEPS cxx_api mir_passes -# ${ops} ${host_kernels} ${x86_kernels}) diff --git a/lite/core/mir/argument_type_display_pass.cc b/lite/core/mir/argument_type_display_pass.cc deleted file mode 100644 index ea44245225..0000000000 --- a/lite/core/mir/argument_type_display_pass.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/pass.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -class ArgumentTypeDisplayPass : public DebugPass { - public: - void Apply(const std::unique_ptr& graph) override { - VLOG(3) << "== Argument types =="; - for (auto& node : graph->mutable_nodes()) { - if (!node.IsArg()) continue; - - auto* type = node.AsArg().type; - if (type) { - VLOG(3) << "* ARG " << node.AsArg().name << " type: " << *type; - } else { - VLOG(3) << "* ARG " << node.AsArg().name << " type: UNK"; - } - } - VLOG(3) << "---------------------"; - } -}; - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(argument_type_display_pass, - paddle::lite::mir::ArgumentTypeDisplayPass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/demo_pass.cc b/lite/core/mir/demo_pass.cc deleted file mode 100644 index b92a2b0751..0000000000 --- a/lite/core/mir/demo_pass.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/pass.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -class DemoPass : public mir::DebugPass { - public: - void Apply(const std::unique_ptr &graph) override {} -}; - -/* -bool RegisterDemoPass() { - return PassManager::Global().AddNewPass("demo", new DemoPass); -} - */ - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(demo, paddle::lite::mir::DemoPass).SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/dot.h b/lite/core/mir/dot.h deleted file mode 100644 index df70565c07..0000000000 --- a/lite/core/mir/dot.h +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* - * This file implements some helper classes and methods for DOT programming - * support. It will give a visualization of the graph and that helps to debug - * the logics of each Pass. - */ -#pragma once - -#include -#include -#include -#include "lite/utils/cp_logging.h" -#include "lite/utils/replace_stl/stream.h" -#include "lite/utils/string.h" - -namespace paddle { -namespace inference { -namespace analysis { - -static size_t dot_node_counter{0}; - -/* - * A Dot template that helps to build a DOT graph definition. - */ -class Dot { - public: - struct Attr { - std::string key; - std::string value; - - Attr(const std::string& key, const std::string& value) - : key(key), value(value) {} - - std::string repr() const { - STL::stringstream ss; - ss << key << "=" << '"' << value << '"'; - return ss.str(); - } - }; - - struct Node { - std::string name; - std::vector attrs; - - Node(const std::string& name, const std::vector& attrs) - : name(name), attrs(attrs) { - STL::stringstream ss; - ss << "node_" << dot_node_counter++; - id_ = ss.str(); - } - - std::string id() const { return id_; } - - std::string repr() const { - STL::stringstream ss; - CHECK(!name.empty()); - ss << id_; - if (attrs.empty()) { - ss << "[label=" << '"' << name << '"' << "]"; - return ss.str(); - } - for (size_t i = 0; i < attrs.size(); i++) { - if (i == 0) { - ss << "[label=" << '"' << name << '"' << " "; - } - ss << attrs[i].repr(); - ss << ((i < attrs.size() - 1) ? 
" " : "]"); - } - return ss.str(); - } - - private: - std::string id_; - }; - - struct Edge { - std::string source; - std::string target; - std::vector attrs; - - Edge(const std::string& source, - const std::string& target, - const std::vector& attrs) - : source(source), target(target), attrs(attrs) {} - - std::string repr() const { - STL::stringstream ss; - CHECK(!source.empty()); - CHECK(!target.empty()); - ss << source << "->" << target; - for (size_t i = 0; i < attrs.size(); i++) { - if (i == 0) { - ss << "["; - } - ss << attrs[i].repr(); - ss << ((i < attrs.size() - 1) ? " " : "]"); - } - return ss.str(); - } - }; - - Dot() = default; - - explicit Dot(const std::vector& attrs) : attrs_(attrs) {} - - void AddNode(const std::string& id, - const std::vector& attrs, - std::string label = "") { - CHECK(!nodes_.count(id)) << "duplicate Node '" << id << "'"; - if (label.empty()) label = id; - nodes_.emplace(id, Node{label, attrs}); - } - - void AddEdge(const std::string& source, - const std::string& target, - const std::vector& attrs) { - CHECK(!source.empty()); - CHECK(!target.empty()); - auto sid = nodes_.at(source).id(); - auto tid = nodes_.at(target).id(); - edges_.emplace_back(sid, tid, attrs); - } - - // Compile to DOT language codes. - std::string Build() const { - STL::stringstream ss; - const std::string indent = " "; - ss << "digraph G {" << '\n'; - - // Add graph attrs - for (const auto& attr : attrs_) { - ss << indent << attr.repr() << '\n'; - } - // add nodes - for (auto& item : nodes_) { - ss << indent << item.second.repr() << '\n'; - } - // add edges - for (auto& edge : edges_) { - ss << indent << edge.repr() << '\n'; - } - ss << "} // end G"; - return ss.str(); - } - - private: - std::unordered_map nodes_; - std::vector edges_; - std::vector attrs_; -}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/lite/core/mir/elimination/CMakeLists.txt b/lite/core/mir/elimination/CMakeLists.txt deleted file mode 100644 index 9b6598630b..0000000000 --- a/lite/core/mir/elimination/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - # NOTE disabled for the proto_desc is not valid yet. - # TODO(Superjomn) enable them if valid latter. - # lite_cc_test(test_identity_scale_eliminate_pass - # SRCS identity_scale_eliminate_pass_test.cc - # DEPS mir_passes program proto_desc cpp_op_desc - # ${ops} - # ) -endif() - diff --git a/lite/core/mir/elimination/identity_scale_eliminate_pass.cc b/lite/core/mir/elimination/identity_scale_eliminate_pass.cc deleted file mode 100644 index 00290937b2..0000000000 --- a/lite/core/mir/elimination/identity_scale_eliminate_pass.cc +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/core/mir/pass.h" -#include "lite/core/mir/pass_registry.h" -#include "lite/core/mir/pattern_matcher_high_api.h" - -namespace paddle { -namespace lite { -namespace mir { - -namespace { - -class Eliminator : public FuseBase { - public: - void BuildPattern() override { - auto* pre_op = OpNode("preop"); // the previous op's output need update - // TODO(Superjomn) check has only one output - auto* x = VarNode("x")->assert_is_op_input("scale", "X"); - auto* scale_op = OpNode("scale", "scale") - ->assert_op_attr("scale", 1.) - ->assert_op_attr("bias", 0.); - auto* out = VarNode("out")->assert_is_op_output("scale", "Out"); - - *pre_op >> *x >> *scale_op >> *out; - - // The pre_op will be eliminated, and a new output-updated op will insert. - x->AsIntermediate(); // x is pre_op's output, need to update - } - - private: - void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { - auto& pre_op = matched.at("preop")->AsStmt(); - auto op_info = *pre_op.op_info(); - - op_info.UpdateAllOutputs(matched.at("x")->AsArg().name, - matched.at("out")->AsArg().name); - pre_op.ResetOp(op_info, graph->valid_places()); - - GraphSafeRemoveNodes(graph, {matched.at("scale")}); - - IR_NODE_LINK_TO(matched.at("preop"), matched.at("out")); - } -}; - -} // namespace - -class IdentityScaleEliminatePass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override { - Eliminator eliminator; - eliminator(graph.get()); - } -}; - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(identity_scale_eliminate_pass, - paddle::lite::mir::IdentityScaleEliminatePass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/elimination/identity_scale_eliminate_pass_test.cc b/lite/core/mir/elimination/identity_scale_eliminate_pass_test.cc deleted file mode 100644 index 7130a13c47..0000000000 --- a/lite/core/mir/elimination/identity_scale_eliminate_pass_test.cc +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-
-#include <gtest/gtest.h>
-#include "lite/core/mir/graph_visualize_pass.h"
-#include "lite/core/mir/pass_registry.h"
-#include "lite/core/mir/ssa_graph.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-std::unique_ptr<SSAGraph> BuildGraph(framework::ProgramDesc* program_desc,
-                                     const std::shared_ptr<Scope>& scope,
-                                     const std::vector<Place>& valid_places) {
-  // Op list:
-  //   (x)->feed -> (feed) -> scale -> (scale_out) -> fetch->(fetch)
-  // After the pass:
-  //   (x)->feed->(scale_out)->fetch->(fetch)
-  auto* main_block = program_desc->MutableBlock(0);
-  auto* feed_op = main_block->AppendOp();
-  auto* scale_op = main_block->AppendOp();
-  auto* fetch_op = main_block->AppendOp();
-  main_block->Var("x");
-  main_block->Var("feed");
-  main_block->Var("scale_out");
-  main_block->Var("fetch_out");
-
-  scope->Var("x")->GetMutable<lite::Tensor>();
-  scope->Var("feed")->GetMutable<lite::Tensor>();
-  scope->Var("scale_out")->GetMutable<lite::Tensor>();
-  scope->Var("fetch_out")->GetMutable<lite::Tensor>();
-
-  feed_op->SetType("feed");
-  feed_op->SetInput("X", {"x"});
-  feed_op->SetAttr("col", 1);
-  feed_op->SetOutput("Out", {"feed"});
-
-  scale_op->SetType("scale");
-  scale_op->SetInput("X", {"feed"});
-  scale_op->SetOutput("Out", {"scale_out"});
-  scale_op->SetAttr("scale", 1.f);
-  scale_op->SetAttr("bias", 0.f);
-  scale_op->SetAttr("bias_after_scale", true);
-
-  fetch_op->SetType("fetch");
-  fetch_op->SetInput("X", {"scale_out"});
-  fetch_op->SetOutput("Out", {"fetch"});
-  fetch_op->SetAttr("col", 1);
-
-  program_desc->Flush();
-
-  lite::Program program(*program_desc->Proto(), scope, valid_places);
-  auto graph = std::unique_ptr<SSAGraph>(new SSAGraph());
-  graph->Build(program, valid_places);
-
-  VLOG(5) << Visualize(graph.get());
-
-  return graph;
-}
-
-TEST(identity_test, test) {
-  framework::ProgramDesc program_desc;
-  std::vector<Place> places{{TARGET(kHost), PRECISION(kFloat)}};
-  auto scope = std::make_shared<Scope>();
-  auto graph = BuildGraph(&program_desc, scope, places);
-  const int num_nodes = graph->nodes().size();
-  auto pass = PassManager::Global().LookUp("identity_scale_eliminate_pass");
-  ASSERT_TRUE(pass);
-  pass->Apply(graph);
-  ASSERT_EQ(graph->nodes().size(), num_nodes - 2UL);
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-USE_LITE_OP(feed)
-USE_LITE_OP(fetch)
-USE_LITE_OP(scale)
-USE_MIR_PASS(identity_scale_eliminate_pass)
diff --git a/lite/core/mir/fusion/CMakeLists.txt b/lite/core/mir/fusion/CMakeLists.txt
deleted file mode 100644
index 5ac5283755..0000000000
--- a/lite/core/mir/fusion/CMakeLists.txt
+++ /dev/null
@@ -1,48 +0,0 @@
-lite_cc_library(fuse_fc
-    SRCS fc_fuser.cc
-    DEPS pattern_matcher_high_api)
-lite_cc_library(fuse_shuffle_channel
-    SRCS shuffle_channel_fuser.cc
-    DEPS pattern_matcher_high_api)
-lite_cc_library(fuse_conv_elementwise
-    SRCS conv_elementwise_fuser.cc
-    DEPS pattern_matcher_high_api)
-lite_cc_library(fuse_conv_activation
-    SRCS conv_activation_fuser.cc
-    DEPS pattern_matcher_high_api)
-lite_cc_library(fuse_conv_bn
-    SRCS conv_bn_fuser.cc
-    DEPS pattern_matcher_high_api)
-lite_cc_library(fuse_elementwise_add_activation
-    SRCS elementwise_add_activation_fuser.cc
-    DEPS pattern_matcher_high_api)
-lite_cc_library(fuse_quant_dequant
-    SRCS quant_dequant_op_fuser.cc
-    DEPS pattern_matcher_high_api)
-lite_cc_library(fuse_transpose_softmax_transpose
-    SRCS transpose_softmax_transpose_fuser.cc
-    DEPS pattern_matcher_high_api)
-lite_cc_library(fuse_interpolate
-    SRCS interpolate_fuser.cc
-    DEPS pattern_matcher_high_api)
-
-set(mir_fusers
-    fuse_fc
-    fuse_shuffle_channel
-    fuse_conv_elementwise
-    fuse_conv_activation
-    fuse_conv_bn
-    fuse_quant_dequant
-    fuse_elementwise_add_activation
-    fuse_transpose_softmax_transpose
-    fuse_interpolate
-    CACHE INTERNAL "fusers")
-
-if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
-  return()
-endif()
-
-# TODO(Superjomn) Enable it later.
-# NOTE disabled because the proto_desc is not valid yet.
-# lite_cc_test(test_lite_conv_bn_fuse SRCS conv_bn_fuse_pass_test.cc
-#   DEPS elementwise_ops batch_norm_op conv_op proto_desc compatible_pb program mir_pass mir_pass_manager pattern_matcher_high_api)
diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc
deleted file mode 100644
index c6939e1983..0000000000
--- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/fusion/conv_activation_fuse_pass.h"
-#include <memory>
-#include <vector>
-#include "lite/core/mir/fusion/conv_activation_fuser.h"
-#include "lite/core/mir/pass_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-void ConvActivationFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
-  for (auto conv_type : {"conv2d", "depthwise_conv2d"}) {
-    for (auto act_type : {"relu"}) {
-      for (auto has_bias : {true, false}) {
-        fusion::ConvActivationFuser fuser(conv_type, act_type, has_bias);
-        fuser(graph.get());
-      }
-    }
-  }
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_MIR_PASS(lite_conv_activation_fuse_pass,
-                  paddle::lite::mir::ConvActivationFusePass)
    .SetTargets({TARGET(kAny)});
diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.h b/lite/core/mir/fusion/conv_activation_fuse_pass.h
deleted file mode 100644
index e6f0f34be0..0000000000
--- a/lite/core/mir/fusion/conv_activation_fuse_pass.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "lite/core/mir/pass.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-class ConvActivationFusePass : public ProgramPass {
- public:
-  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
-};
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/conv_activation_fuser.cc b/lite/core/mir/fusion/conv_activation_fuser.cc
deleted file mode 100644
index 8e18b368f4..0000000000
--- a/lite/core/mir/fusion/conv_activation_fuser.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/fusion/conv_activation_fuser.h"
-#include <memory>
-#include <vector>
-
-namespace paddle {
-namespace lite {
-namespace mir {
-namespace fusion {
-
-void ConvActivationFuser::BuildPattern() {
-  // create nodes.
-  auto* input =
-      VarNode("input")->assert_is_op_input(conv_type_, "Input")->AsInput();
-  auto* filter =
-      VarNode("filter")->assert_is_op_input(conv_type_, "Filter")->AsInput();
-  PMNode* bias = nullptr;
-  if (has_bias_) {
-    bias = VarNode("bias")->assert_is_op_input(conv_type_, "Bias")->AsInput();
-  }
-  auto* conv2d = OpNode("conv2d", conv_type_)->AsIntermediate();
-
-  auto* act = OpNode("act", act_type_)->AsIntermediate();
-
-  auto* conv2d_out = VarNode("conv2d_out")
-                         ->assert_is_op_output(conv_type_, "Output")
-                         ->assert_is_op_input(act_type_, "X")
-                         ->AsIntermediate();
-
-  auto* out =
-      VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput();
-
-  // create topology.
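  // Editorial note (not in the original file): the overloaded >> used below is
  // the pattern-matcher DSL; "inputs >> *op >> *output" declares directed
  // edges in the pattern graph. The chain built next therefore reads:
  // {filter, input} feed conv2d, conv2d produces conv2d_out, conv2d_out feeds
  // act, and act produces out. Nodes marked AsIntermediate() are removed from
  // the graph once the fused op is inserted.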
-  std::vector<PMNode*> conv2d_inputs{filter, input};
-  conv2d_inputs >> *conv2d >> *conv2d_out >> *act >> *out;
-  if (has_bias_) {
-    *bias >> *conv2d;
-  }
-}
-
-void ConvActivationFuser::InsertNewNode(SSAGraph* graph,
-                                        const key2nodes_t& matched) {
-  auto op_desc = GenOpDesc(matched);
-  auto conv_op = LiteOpRegistry::Global().Create(conv_type_);
-  auto conv_old = matched.at("conv2d")->stmt()->op();
-  auto* scope = conv_old->scope();
-  auto& valid_places = conv_old->valid_places();
-  conv_op->Attach(op_desc, scope);
-
-  auto* new_op_node = graph->GraphCreateInstructNode(conv_op, valid_places);
-
-  IR_NODE_LINK_TO(matched.at("input"), new_op_node);
-  IR_NODE_LINK_TO(matched.at("filter"), new_op_node);
-  if (has_bias_) {
-    IR_NODE_LINK_TO(matched.at("bias"), new_op_node);
-  }
-  IR_NODE_LINK_TO(new_op_node, matched.at("output"));
-}
-
-cpp::OpDesc ConvActivationFuser::GenOpDesc(const key2nodes_t& matched) {
-  cpp::OpDesc op_desc = *matched.at("conv2d")->stmt()->op_info();
-  op_desc.SetOutput("Output", {matched.at("output")->arg()->name});
-  op_desc.SetAttr("fuse_relu", true);
-  return op_desc;
-}
-
-}  // namespace fusion
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/conv_activation_fuser.h b/lite/core/mir/fusion/conv_activation_fuser.h
deleted file mode 100644
index 0d09c9dce2..0000000000
--- a/lite/core/mir/fusion/conv_activation_fuser.h
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "lite/core/mir/pattern_matcher_high_api.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-namespace fusion {
-
-class ConvActivationFuser : public FuseBase {
- public:
-  explicit ConvActivationFuser(const std::string& conv_type,
-                               const std::string& act_type,
-                               bool has_bias) {
-    CHECK(act_type == "relu") << "Only the relu activation is supported now";
-    conv_type_ = conv_type;
-    act_type_ = act_type;
-    has_bias_ = has_bias;
-  }
-
-  void BuildPattern() override;
-  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
-
- private:
-  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
-  std::string conv_type_;
-  std::string act_type_;
-  bool has_bias_;
-};
-
-}  // namespace fusion
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/conv_bn_fuse_pass.cc b/lite/core/mir/fusion/conv_bn_fuse_pass.cc
deleted file mode 100644
index 2e962017bc..0000000000
--- a/lite/core/mir/fusion/conv_bn_fuse_pass.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/fusion/conv_bn_fuse_pass.h"
-#include <memory>
-#include <vector>
-#include "lite/core/mir/fusion/conv_bn_fuser.h"
-#include "lite/core/mir/pass_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-void ConvBNFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
-  fusion::ConvBNFuser fuser("conv2d");
-  fuser(graph.get());
-
-  fusion::ConvBNFuser fuser2("depthwise_conv2d");
-  fuser2(graph.get());
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_MIR_PASS(lite_conv_bn_fuse_pass, paddle::lite::mir::ConvBNFusePass)
    .SetTargets({TARGET(kAny)});
diff --git a/lite/core/mir/fusion/conv_bn_fuse_pass.h b/lite/core/mir/fusion/conv_bn_fuse_pass.h
deleted file mode 100644
index b2c56d1802..0000000000
--- a/lite/core/mir/fusion/conv_bn_fuse_pass.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "lite/core/mir/pass.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-class ConvBNFusePass : public ProgramPass {
- public:
-  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
-};
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/conv_bn_fuse_pass_test.cc b/lite/core/mir/fusion/conv_bn_fuse_pass_test.cc
deleted file mode 100644
index 7e720bcc3d..0000000000
--- a/lite/core/mir/fusion/conv_bn_fuse_pass_test.cc
+++ /dev/null
@@ -1,140 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "lite/core/mir/fusion/conv_bn_fuse_pass.h" -#include -#include -#include -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/program.h" -#include "lite/core/tensor.h" -#include "paddle/fluid/framework/program_desc.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -std::unique_ptr BuildGraph(framework::ProgramDesc* program_desc, - const std::shared_ptr& scope, - const std::vector& valid_places) { - auto* main_block = program_desc->MutableBlock(0); - auto* conv_op = main_block->AppendOp(); - auto* bn_op = main_block->AppendOp(); - main_block->Var("conv_i"); - main_block->Var("conv_param"); - main_block->Var("conv_out"); - - main_block->Var("bn_scale"); - main_block->Var("bn_bias"); - main_block->Var("bn_mean"); - main_block->Var("bn_var"); - main_block->Var("bn_out"); - main_block->Var("bn_mean_out"); - main_block->Var("bn_var_out"); - main_block->Var("bn_saved_mean"); - main_block->Var("bn_saved_var"); - - scope->Var("conv_i")->GetMutable(); - auto conv_param_t = scope->Var("conv_param")->GetMutable(); - std::vector conv_param_shape = {3, 1, 2, 2}; - conv_param_t->Resize(lite::DDim(conv_param_shape)); - conv_param_t->mutable_data(); - scope->Var("conv_out")->GetMutable(); - auto bn_scale_t = scope->Var("bn_scale")->GetMutable(); - std::vector bn_scale_shape = {3}; - bn_scale_t->Resize(lite::DDim(bn_scale_shape)); - bn_scale_t->mutable_data(); - - auto bn_bias_t = scope->Var("bn_bias")->GetMutable(); - std::vector bn_bias_shape = {3}; - bn_bias_t->Resize(lite::DDim(bn_bias_shape)); - bn_bias_t->mutable_data(); - - auto bn_mean_t = scope->Var("bn_mean")->GetMutable(); - bn_mean_t->Resize(lite::DDim(bn_bias_shape)); - bn_mean_t->mutable_data(); - - auto bn_var_t = scope->Var("bn_var")->GetMutable(); - bn_var_t->Resize(lite::DDim(bn_bias_shape)); - bn_var_t->mutable_data(); - - scope->Var("bn_out")->GetMutable(); - scope->Var("bn_mean_out")->GetMutable(); - scope->Var("bn_var_out")->GetMutable(); - scope->Var("bn_saved_mean")->GetMutable(); - scope->Var("bn_saved_var")->GetMutable(); - - conv_op->SetType("conv2d"); - conv_op->SetInput("Input", {"conv_i"}); - conv_op->SetInput("Filter", {"conv_param"}); - conv_op->SetOutput("Output", {"conv_out"}); - const std::vector strides({1, 1}); - const std::vector paddings({1, 1}); - const std::vector dilations({1, 1}); - const int groups = 1; - conv_op->SetAttr("strides", strides); - conv_op->SetAttr("paddings", paddings); - conv_op->SetAttr("dilations", dilations); - conv_op->SetAttr("groups", groups); - conv_op->SetAttr("fuse_relu", false); - - bn_op->SetType("batch_norm"); - bn_op->SetInput("X", {"conv_out"}); - bn_op->SetInput("Bias", {"bn_bias"}); - bn_op->SetInput("Mean", {"bn_mean"}); - bn_op->SetInput("Scale", {"bn_scale"}); - bn_op->SetInput("Variance", {"bn_var"}); - - bn_op->SetOutput("Y", {"bn_out"}); - bn_op->SetOutput("MeanOut", {"bn_mean_out"}); - bn_op->SetOutput("VarianceOut", {"bn_var_out"}); - bn_op->SetOutput("SavedMean", {"bn_saved_mean"}); - bn_op->SetOutput("SavedVariance", {"bn_saved_var"}); - float eps = 1e-5; - bn_op->SetAttr("epsilon", eps); - bn_op->SetAttr("is_test", static_cast(1)); - bn_op->SetAttr("use_global_stats", false); - bn_op->SetAttr("momentum", 0.9f); - bn_op->SetAttr("data_layout", std::string("NCHW")); - - program_desc->Flush(); - - lite::Program program(*program_desc->Proto(), scope, valid_places); - auto graph = std::unique_ptr(new SSAGraph()); - graph->Build(program, valid_places); - - return graph; -} - -TEST(pattern_matcher2, test) { - 
framework::ProgramDesc program_desc;
-  std::vector<Place> places{{TARGET(kHost), PRECISION(kFloat)}};
-  auto scope = std::make_shared<Scope>();
-  auto graph = BuildGraph(&program_desc, scope, places);
-  const int num_nodes = graph->nodes().size();
-  auto* fuser = new ConvBNFusePass;
-  fuser->Apply(graph);
-  ASSERT_EQ(graph->nodes().size(),
-            num_nodes - 8UL /* nodes removed */ + 1UL /* eltwise_add node */);
-}
-
-}  // namespace fusion
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-USE_LITE_OP(conv2d);
-USE_LITE_OP(batch_norm);
-USE_LITE_OP(elementwise_add);
diff --git a/lite/core/mir/fusion/conv_bn_fuser.cc b/lite/core/mir/fusion/conv_bn_fuser.cc
deleted file mode 100644
index 77ad8237fe..0000000000
--- a/lite/core/mir/fusion/conv_bn_fuser.cc
+++ /dev/null
@@ -1,163 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/fusion/conv_bn_fuser.h"
-#include <memory>
-#include <vector>
-
-namespace paddle {
-namespace lite {
-namespace mir {
-namespace fusion {
-
-void ConvBNFuser::BuildPattern() {
-  auto* conv_input =
-      VarNode("conv_input")->assert_is_op_input(conv_type_, "Input")->AsInput();
-  auto* conv_weight = VarNode("conv_weight")
-                          ->assert_is_op_input(conv_type_, "Filter")
-                          ->AsInput();
-  auto* conv = OpNode("conv2d", conv_type_)->assert_is_op(conv_type_);
-  auto* conv_out = VarNode("conv_out")
-                       ->assert_is_op_output(conv_type_, "Output")
-                       ->assert_is_op_input("batch_norm", "X");
-
-  auto* bn_scale = VarNode("bn_scale")
-                       ->assert_is_op_input("batch_norm", "Scale")
-                       ->AsIntermediate();
-  auto* bn_bias =
-      VarNode("bn_bias")->assert_is_op_input("batch_norm", "Bias")->AsInput();
-  auto* bn_mean = VarNode("bn_mean")
-                      ->assert_is_op_input("batch_norm", "Mean")
-                      ->AsIntermediate();
-  auto* bn_var = VarNode("bn_variance")
-                     ->assert_is_op_input("batch_norm", "Variance")
-                     ->AsIntermediate();
-  auto* bn =
-      OpNode("bn", "batch_norm")->assert_is_op("batch_norm")->AsIntermediate();
-
-  auto* bn_out =
-      VarNode("bn_out")->assert_is_op_output("batch_norm", "Y")->AsOutput();
-  auto* bn_mean_out = VarNode("bn_mean_out")
-                          ->assert_is_op_output("batch_norm", "MeanOut")
-                          ->AsIntermediate();
-  auto* bn_var_out = VarNode("bn_var_out")
-                         ->assert_is_op_output("batch_norm", "VarianceOut")
-                         ->AsIntermediate();
-  auto* bn_saved_mean = VarNode("bn_saved_mean")
-                            ->assert_is_op_output("batch_norm", "SavedMean")
-                            ->AsIntermediate();
-  auto* bn_saved_var = VarNode("bn_saved_var")
-                           ->assert_is_op_output("batch_norm", "SavedVariance")
-                           ->AsIntermediate();
-
-  conv->LinksFrom({conv_input, conv_weight}).LinksTo({conv_out});
-
-  bn->LinksFrom({conv_out, bn_scale, bn_bias, bn_mean, bn_var})
-      .LinksTo({bn_out, bn_mean_out, bn_saved_mean, bn_saved_var, bn_var_out});
-}
-
-void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
-  auto op_desc = GenOpDesc(matched);
-  auto eltwise_op = LiteOpRegistry::Global().Create("elementwise_add");
-
-  auto conv_instruct = matched.at("conv2d")->stmt();
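  // Editorial note (not in the original file), summarizing the folding math
  // that ComputeAlphaAndBeta (declared in conv_bn_fuser.h) implements:
  //   BN(conv(x))_i = alpha_i * conv(x)_i + beta_i, where
  //   alpha_i = scale_i / sqrt(variance_i + epsilon) and
  //   beta_i  = -mean_i * alpha_i.
  // The code below scales row i of the conv filter (or its weight_scale in
  // the INT8 case) by alpha_i, folds beta_i into the BN bias, and replaces
  // the batch_norm op with a single elementwise_add of that bias.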
-  auto conv = conv_instruct->op();
-  auto* scope = conv->scope();
-  auto& valid_places = conv->valid_places();
-
-  auto conv_weight_t = scope->FindVar(matched.at("conv_weight")->arg()->name)
-                           ->GetMutable<lite::Tensor>();
-  auto conv_weight_dims = conv_weight_t->dims();
-  size_t weight_num = conv_weight_t->data_size();
-
-  auto bn_scale_t = scope->FindVar(matched.at("bn_scale")->arg()->name)
-                        ->GetMutable<lite::Tensor>();
-  size_t bias_size = bn_scale_t->data_size();
-  auto bn_scale_d = bn_scale_t->mutable_data<float>();
-  CHECK_EQ(bias_size, static_cast<size_t>(conv_weight_dims[0]))
-      << "The BN bias's size should be equal to the first dimension of the "
-      << "conv weights";
-
-  auto bn_mean_t = scope->FindVar(matched.at("bn_mean")->arg()->name)
-                       ->GetMutable<lite::Tensor>();
-  auto bn_mean_d = bn_mean_t->mutable_data<float>();
-
-  auto bn_var_t = scope->FindVar(matched.at("bn_variance")->arg()->name)
-                      ->GetMutable<lite::Tensor>();
-  auto bn_var_d = bn_var_t->mutable_data<float>();
-
-  auto bn_bias_t = scope->FindVar(matched.at("bn_bias")->arg()->name)
-                       ->GetMutable<lite::Tensor>();
-  auto bn_bias_d = bn_bias_t->mutable_data<float>();
-  auto eps = matched.at("bn")->stmt()->op_info()->GetAttr<float>("epsilon");
-
-  auto conv_op_desc = conv_instruct->mutable_op_info();
-
-  bool enable_int8 = conv_op_desc->HasAttr("enable_int8");
-  Tensor alpha_tensor, beta_tensor;
-  alpha_tensor.CopyDataFrom(*bn_bias_t);
-  beta_tensor.CopyDataFrom(*bn_bias_t);
-  auto alpha_data = alpha_tensor.mutable_data<float>();
-  auto beta_data = beta_tensor.mutable_data<float>();
-
-  int h = bias_size;
-  int w = weight_num / bias_size;
-  ComputeAlphaAndBeta(
-      bn_scale_d, bn_mean_d, bn_var_d, alpha_data, beta_data, eps, h, w);
-
-  if (enable_int8) {
-    PADDLE_ENFORCE(conv_op_desc->HasAttr("weight_scale"),
-                   "INT8 mode: the conv op should have a weight_scale attr");
-    auto weight_scale =
-        conv_op_desc->GetAttr<std::vector<float>>("weight_scale");
-    for (int i = 0; i < h; i++) {
-      weight_scale[i] *= alpha_data[i];
-    }
-    // An interface like this should be abandoned.
-    conv_op_desc->SetAttr("weight_scale", weight_scale);
-    auto update_conv_desc = *conv_instruct->mutable_op_info();
-    conv_instruct->ResetOp(update_conv_desc, graph->valid_places());
-  } else {
-    auto conv_weight_d = conv_weight_t->mutable_data<float>();
-    for (int i = 0; i < h; i++) {
-      for (int j = 0; j < w; j++) {
-        conv_weight_d[i * w + j] *= alpha_data[i];
-      }
-    }
-  }
-  for (int i = 0; i < bias_size; i++) {
-    bn_bias_d[i] += beta_data[i];
-  }
-  eltwise_op->Attach(op_desc, scope);
-  auto* new_op_node = graph->GraphCreateInstructNode(eltwise_op, valid_places);
-
-  IR_NODE_LINK_TO(matched.at("conv_out"), new_op_node);
-  IR_NODE_LINK_TO(matched.at("bn_bias"), new_op_node);
-  IR_NODE_LINK_TO(new_op_node, matched.at("bn_out"));
-}
-
-cpp::OpDesc ConvBNFuser::GenOpDesc(const key2nodes_t& matched) {
-  cpp::OpDesc op_desc;
-  op_desc.SetType("elementwise_add");
-  op_desc.SetInput("X", {matched.at("conv_out")->arg()->name});
-  op_desc.SetInput("Y", {matched.at("bn_bias")->arg()->name});
-  op_desc.SetOutput("Out", {matched.at("bn_out")->arg()->name});
-  op_desc.SetAttr("axis", 1);
-  return op_desc;
-}
-
-}  // namespace fusion
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/conv_bn_fuser.h b/lite/core/mir/fusion/conv_bn_fuser.h
deleted file mode 100644
index 9acf65f9e2..0000000000
--- a/lite/core/mir/fusion/conv_bn_fuser.h
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "lite/core/mir/pattern_matcher_high_api.h"
-#include "lite/utils/paddle_enforce.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-namespace fusion {
-
-class ConvBNFuser : public FuseBase {
- public:
-  explicit ConvBNFuser(const std::string& conv_type) : conv_type_(conv_type) {}
-  void BuildPattern() override;
-  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
-
- private:
-  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
-  void ComputeAlphaAndBeta(float* scale_d,
-                           float* mean_d,
-                           float* var_d,
-                           float* alpha,
-                           float* beta,
-                           float eps,
-                           int h,
-                           int w) {
-    for (int i = 0; i < h; i++) {
-      alpha[i] = scale_d[i] / std::sqrt(var_d[i] + eps);
-    }
-    for (int i = 0; i < h; i++) {
-      beta[i] = (-mean_d[i]) * alpha[i];
-    }
-  }
-
- private:
-  std::string conv_type_{"conv2d"};
-};
-
-}  // namespace fusion
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass_test.cc b/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass_test.cc
deleted file mode 100644
index 59bf7035e7..0000000000
--- a/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass_test.cc
+++ /dev/null
@@ -1,157 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/core/mir/fusion/conv_activation_fuse_pass.h" -#include "lite/core/mir/fusion/conv_elementwise_fuse_pass.h" -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/op_registry.h" -#include "lite/core/program.h" -#include "lite/core/tensor.h" -#include "paddle/fluid/framework/program_desc.h" - -DEFINE_string(model_dir, "", ""); -DEFINE_string(optimized_model, "", ""); - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -std::unique_ptr BuildGraph(framework::ProgramDesc* program_desc, - const std::shared_ptr& scope, - const std::vector& valid_places) { - auto* main_block = program_desc->MutableBlock(0); - - auto* conv2d_1 = main_block->AppendOp(); - auto* conv2d_2 = main_block->AppendOp(); - auto* add_1 = main_block->AppendOp(); - auto* relu_1 = main_block->AppendOp(); - auto* add_2 = main_block->AppendOp(); - auto* relu_2 = main_block->AppendOp(); - - main_block->Var("input_1"); - main_block->Var("input_2"); - main_block->Var("filter_1"); - main_block->Var("filter_2"); - main_block->Var("conv2d_1_out"); - main_block->Var("conv2d_2_out"); - main_block->Var("bias_1"); - main_block->Var("add_1_out"); - main_block->Var("add_2_out"); - main_block->Var("relu_1_out"); - main_block->Var("out"); - - scope->Var("input_1")->GetMutable(); - scope->Var("input_2")->GetMutable(); - scope->Var("filter_1")->GetMutable(); - scope->Var("filter_2")->GetMutable(); - scope->Var("conv2d_1_out")->GetMutable(); - scope->Var("conv2d_2_out")->GetMutable(); - scope->Var("bias_1")->GetMutable(); - scope->Var("add_1_out")->GetMutable(); - scope->Var("add_2_out")->GetMutable(); - scope->Var("relu_1_out")->GetMutable(); - scope->Var("out")->GetMutable(); - - conv2d_1->SetType("conv2d"); - conv2d_1->SetInput("Input", {"input_1"}); - conv2d_1->SetInput("Filter", {"filter_1"}); - conv2d_1->SetOutput("Output", {"conv2d_1_out"}); - conv2d_1->SetAttr("strides", std::vector({1, 1})); - conv2d_1->SetAttr("paddings", std::vector({0, 0})); - conv2d_1->SetAttr("groups", 1); - conv2d_1->SetAttr("dilations", std::vector({1, 1})); - conv2d_1->SetAttr("fuse_relu", false); - - add_1->SetType("elementwise_add"); - add_1->SetInput("X", {"conv2d_1_out"}); - add_1->SetInput("Y", {"bias_1"}); - add_1->SetOutput("Out", {"add_1_out"}); - add_1->SetAttr("axis", 1); - - relu_1->SetType("relu"); - relu_1->SetInput("X", {"add_1_out"}); - relu_1->SetOutput("Out", {"relu_1_out"}); - - conv2d_2->SetType("conv2d"); - conv2d_2->SetInput("Input", {"input_2"}); - conv2d_2->SetInput("Filter", {"filter_2"}); - conv2d_2->SetOutput("Output", {"conv2d_2_out"}); - conv2d_2->SetAttr("strides", std::vector({1, 1})); - conv2d_2->SetAttr("paddings", std::vector({0, 0})); - conv2d_2->SetAttr("groups", 1); - conv2d_2->SetAttr("dilations", std::vector({1, 1})); - conv2d_2->SetAttr("fuse_relu", false); - - add_2->SetType("elementwise_add"); - add_2->SetInput("X", {"conv2d_2_out"}); - add_2->SetInput("Y", {"relu_1_out"}); - add_2->SetOutput("Out", {"add_2_out"}); - add_2->SetAttr("axis", 1); - - relu_2->SetType("relu"); - relu_2->SetInput("X", {"add_2_out"}); - relu_2->SetOutput("Out", {"out"}); - - program_desc->Flush(); - - lite::Program program(*program_desc->Proto(), scope, valid_places); - auto graph = std::unique_ptr(new SSAGraph()); - graph->Build(program, valid_places); - - return graph; -} - -TEST(conv_elementwise_add_relu_fuse_pass, graph_test) { - framework::ProgramDesc program_desc; - std::vector 
places{{TARGET(kHost), PRECISION(kFloat)}}; - auto scope = std::make_shared(); - auto graph = BuildGraph(&program_desc, scope, places); - - Visualize(graph.get()); - ASSERT_EQ(graph->nodes().size(), 11UL /*vars*/ + 6UL /*ops*/); - Visualize(graph.get()); -} - -TEST(conv_elementwise_add_relu_fuse_pass, fuse_test_op) { - framework::ProgramDesc program_desc; - std::vector places{{TARGET(kHost), PRECISION(kFloat)}}; - auto scope = std::make_shared(); - auto graph = BuildGraph(&program_desc, scope, places); - Visualize(graph.get()); - const int num_nodes = graph->nodes().size(); - auto* fuser_eltwise = new ConvElementwiseFusePass; - auto* fuser_act = new ConvActivationFusePass; - fuser_eltwise->Apply(graph); - fuser_act->Apply(graph); - - Visualize(graph.get()); - ASSERT_EQ(graph->nodes().size(), - num_nodes - 5UL * 2 /*nodes removed */ + 1UL * 2 /* fused nodes*/); -} - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle - -USE_LITE_OP(elementwise_add); -USE_LITE_OP(conv2d); -USE_LITE_OP(depthwise_conv2d); -USE_LITE_OP(relu); diff --git a/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc b/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc deleted file mode 100644 index 631c6b883e..0000000000 --- a/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/conv_elementwise_fuse_pass.h" -#include -#include -#include "lite/core/mir/fusion/conv_elementwise_fuser.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -void ConvElementwiseFusePass::Apply(const std::unique_ptr& graph) { - fusion::ConvElementwiseFuser fuser("conv2d"); - fuser(graph.get()); - - fusion::ConvElementwiseFuser depthwise_fuser("depthwise_conv2d"); - depthwise_fuser(graph.get()); - - fusion::ConvElementwiseFuser conv2d_transpose_fuser("conv2d_transpose"); - conv2d_transpose_fuser(graph.get()); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(lite_conv_elementwise_fuse_pass, - paddle::lite::mir::ConvElementwiseFusePass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/fusion/conv_elementwise_fuse_pass.h b/lite/core/mir/fusion/conv_elementwise_fuse_pass.h deleted file mode 100644 index 11953e9b10..0000000000 --- a/lite/core/mir/fusion/conv_elementwise_fuse_pass.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/core/mir/pass.h" - -namespace paddle { -namespace lite { -namespace mir { - -class ConvElementwiseFusePass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/conv_elementwise_fuser.cc b/lite/core/mir/fusion/conv_elementwise_fuser.cc deleted file mode 100644 index c3ab3e4c4c..0000000000 --- a/lite/core/mir/fusion/conv_elementwise_fuser.cc +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/conv_elementwise_fuser.h" -#include -#include - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -void ConvElementwiseFuser::BuildPattern() { - // create input nodes. - auto* input = - VarNode("input")->assert_is_op_input(conv_type_, "Input")->AsInput(); - auto* filter = - VarNode("filter")->assert_is_op_input(conv_type_, "Filter")->AsInput(); - auto* bias = - VarNode("bias")->assert_is_op_input("elementwise_add", "Y")->AsInput(); - - // create op nodes - auto* conv2d = - OpNode("conv2d", conv_type_)->assert_is_op(conv_type_)->AsIntermediate(); - auto* add = OpNode("add", "elementwise_add") - ->assert_is_op("elementwise_add") - ->AsIntermediate(); - - // create intermediate nodes - auto* conv2d_out = VarNode("conv2d_out") - ->assert_is_op_output(conv_type_, "Output") - ->assert_is_op_input("elementwise_add", "X") - ->AsIntermediate(); - // create output node - auto* add_out = VarNode("output") - ->assert_is_op_output("elementwise_add", "Out") - ->AsOutput(); - - // create topology. 
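// A note on the topology block that follows: the `>>` chaining is the
// pattern-matcher DSL from lite/core/mir/pattern_matcher_high_api.h.
// `*a >> *op >> *b` declares the edges a->op->b, and a std::vector<PMNode*>
// on the left-hand side links every element as an input of the op. A
// minimal sketch of the idiom (my_op/my_out are placeholder names, not
// part of this pass):
//
//   std::vector<PMNode*> ins{w, x};  // w and x both feed my_op
//   ins >> *my_op >> *my_out;        // edges: w->my_op, x->my_op, my_op->my_out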
- std::vector conv2d_inputs{filter, input}; - std::vector add_inputs{conv2d_out, bias}; - conv2d_inputs >> *conv2d >> *conv2d_out; - add_inputs >> *add >> *add_out; -} - -void ConvElementwiseFuser::InsertNewNode(SSAGraph* graph, - const key2nodes_t& matched) { - auto op_desc = GenOpDesc(matched); - auto conv_op = LiteOpRegistry::Global().Create(conv_type_); - auto conv_old = matched.at("conv2d")->stmt()->op(); - auto* scope = conv_old->scope(); - auto& valid_places = conv_old->valid_places(); - conv_op->Attach(op_desc, scope); - - auto* new_op_node = graph->GraphCreateInstructNode(conv_op, valid_places); - - IR_NODE_LINK_TO(matched.at("input"), new_op_node); - IR_NODE_LINK_TO(matched.at("filter"), new_op_node); - IR_NODE_LINK_TO(matched.at("bias"), new_op_node); - IR_NODE_LINK_TO(new_op_node, matched.at("output")); -} - -cpp::OpDesc ConvElementwiseFuser::GenOpDesc(const key2nodes_t& matched) { - auto* desc = matched.at("conv2d")->stmt()->op_info(); - - cpp::OpDesc op_desc = *desc; - op_desc.SetType(conv_type_); - op_desc.SetInput("Input", {matched.at("input")->arg()->name}); - op_desc.SetInput("Filter", {matched.at("filter")->arg()->name}); - op_desc.SetInput("Bias", {matched.at("bias")->arg()->name}); - op_desc.SetOutput("Output", {matched.at("output")->arg()->name}); - // Other inputs. See operators/conv_op.h - std::vector input_arg_names = desc->InputArgumentNames(); - - if (std::find(input_arg_names.begin(), - input_arg_names.end(), - "ResidualData") != input_arg_names.end()) { - op_desc.SetInput("ResidualData", desc->Input("ResidualData")); - } - // Only consider strides, padding, groups, dilations for now - op_desc.SetAttr("strides", desc->GetAttr>("strides")); - op_desc.SetAttr("paddings", desc->GetAttr>("paddings")); - op_desc.SetAttr("groups", desc->GetAttr("groups")); - op_desc.SetAttr("dilations", desc->GetAttr>("dilations")); - return op_desc; -} - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/conv_elementwise_fuser.h b/lite/core/mir/fusion/conv_elementwise_fuser.h deleted file mode 100644 index 4514fc5010..0000000000 --- a/lite/core/mir/fusion/conv_elementwise_fuser.h +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include "lite/core/mir/pattern_matcher_high_api.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -class ConvElementwiseFuser : public FuseBase { - public: - explicit ConvElementwiseFuser(const std::string& conv_type) { - conv_type_ = conv_type; - } - - void BuildPattern() override; - void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; - - private: - cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; - std::string conv_type_; -}; - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc deleted file mode 100644 index 71dc31d49a..0000000000 --- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h" -#include -#include -#include "lite/core/mir/fusion/elementwise_add_activation_fuser.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -void ElementwiseAddActivationFusePass::Apply( - const std::unique_ptr& graph) { - fusion::ElementwiseAddActivationFuser fuser("relu"); - fuser(graph.get()); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(lite_elementwise_add_activation_fuse_pass, - paddle::lite::mir::ElementwiseAddActivationFusePass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h deleted file mode 100644 index 299b6b89a0..0000000000 --- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include "lite/core/mir/pass.h" - -namespace paddle { -namespace lite { -namespace mir { - -class ElementwiseAddActivationFusePass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass_test.cc b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass_test.cc deleted file mode 100644 index ca5127db16..0000000000 --- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass_test.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h" -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/op_registry.h" -#include "lite/core/program.h" -#include "lite/core/tensor.h" -#include "paddle/fluid/framework/program_desc.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -std::unique_ptr BuildGraph(framework::ProgramDesc* program_desc, - const std::shared_ptr& scope, - const std::vector& valid_places) { - auto* main_block = program_desc->MutableBlock(0); - - auto* add_1 = main_block->AppendOp(); - auto* add_2 = main_block->AppendOp(); - auto* relu_1 = main_block->AppendOp(); - auto* relu_2 = main_block->AppendOp(); - - main_block->Var("x_1"); - main_block->Var("y_1"); - main_block->Var("add_out_1"); - main_block->Var("relu_out_1"); - main_block->Var("y_2"); - main_block->Var("add_out_2"); - main_block->Var("out"); - - scope->Var("x_1")->GetMutable(); - scope->Var("y_1")->GetMutable(); - scope->Var("add_out_1")->GetMutable(); - scope->Var("relu_out_1")->GetMutable(); - scope->Var("y_2")->GetMutable(); - scope->Var("add_out_2")->GetMutable(); - scope->Var("out")->GetMutable(); - - add_1->SetType("elementwise_add"); - add_1->SetInput("X", {"x_1"}); - add_1->SetInput("Y", {"y_1"}); - add_1->SetOutput("Out", {"add_out_1"}); - add_1->SetAttr("axis", 1); - - relu_1->SetType("relu"); - relu_1->SetInput("X", {"add_out_1"}); - relu_1->SetOutput("Out", {"relu_out_1"}); - - add_2->SetType("elementwise_add"); - add_2->SetInput("X", {"relu_out_1"}); - add_2->SetInput("Y", {"y_2"}); - add_2->SetOutput("Out", {"add_out_2"}); - add_2->SetAttr("axis", 1); - - relu_2->SetType("relu"); - relu_2->SetInput("X", {"add_out_2"}); - relu_2->SetOutput("Out", {"out"}); - - program_desc->Flush(); - - lite::Program program(*program_desc->Proto(), scope, valid_places); - auto graph = std::unique_ptr(new SSAGraph()); - graph->Build(program, valid_places); - - return graph; -} - -TEST(elementwise_add_activation_fuse_pass, graph_test) { - framework::ProgramDesc program_desc; - std::vector places{{TARGET(kHost), PRECISION(kFloat)}}; - auto scope = std::make_shared(); - auto graph = BuildGraph(&program_desc, 
scope, places); - ASSERT_EQ(graph->nodes().size(), - 7UL /*vars*/ + 4UL /*ops*/ + 1UL /* SSAGraph tmp node*/); -} - -TEST(elementwise_add_activation_fuse_pass, fuse_test_op) { - framework::ProgramDesc program_desc; - std::vector places{{TARGET(kHost), PRECISION(kFloat)}}; - auto scope = std::make_shared(); - auto graph = BuildGraph(&program_desc, scope, places); - Visualize(graph.get()); - const int num_nodes = graph->nodes().size(); - auto* fuser = new ElementwiseAddActivationFusePass; - fuser->Apply(graph); - Visualize(graph.get()); - ASSERT_EQ(graph->nodes().size(), - num_nodes - 3UL * 2 /*nodes removed */ + 1UL * 2 /* fused nodes*/); -} - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle - -USE_LITE_OP(elementwise_add); -USE_LITE_OP(fusion_elementwise_add_activation); -USE_LITE_OP(relu); diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuser.cc b/lite/core/mir/fusion/elementwise_add_activation_fuser.cc deleted file mode 100644 index 3c6bf4768b..0000000000 --- a/lite/core/mir/fusion/elementwise_add_activation_fuser.cc +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/elementwise_add_activation_fuser.h" -#include -#include - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -void ElementwiseAddActivationFuser::BuildPattern() { - // create input nodes. - auto* x = VarNode("x")->assert_is_op_input("elementwise_add", "X")->AsInput(); - auto* y = VarNode("y")->assert_is_op_input("elementwise_add", "Y")->AsInput(); - - // create op nodes - auto* add = OpNode("add", "elementwise_add") - ->assert_is_op("elementwise_add") - ->AsIntermediate(); - auto* act = - OpNode("act", act_type_)->assert_is_op(act_type_)->AsIntermediate(); - - // create intermediate nodes - auto* add_out = VarNode("add_out") - ->assert_is_op_output("elementwise_add", "Out") - ->assert_is_op_input(act_type_, "X") - ->AsIntermediate(); - - // create output node - auto* out = - VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput(); - - // create topology. 
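// The nodes marked AsIntermediate() above are consumed by the fusion:
// once InsertNewNode() has wired up the fused op, the matcher erases them
// from the graph. That is where the arithmetic in the fuse test above
// comes from: each match removes 3 nodes (the add op, its Out var, and
// the act op) and adds 1 fused op node, so with two matches the count the
// ASSERT_EQ checks is, schematically:
//
//   new_size = old_size - 3 * 2 + 1 * 2;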
- std::vector add_inputs{x, y}; - add_inputs >> *add >> *add_out; - *add_out >> *act >> *out; -} - -void ElementwiseAddActivationFuser::InsertNewNode(SSAGraph* graph, - const key2nodes_t& matched) { - auto op_desc = GenOpDesc(matched); - auto op = - LiteOpRegistry::Global().Create("fusion_elementwise_add_activation"); - auto old_op = matched.at("add")->stmt()->op(); - auto* scope = old_op->scope(); - auto& valid_places = old_op->valid_places(); - op->Attach(op_desc, scope); - - auto* new_op_node = graph->GraphCreateInstructNode(op, valid_places); - - IR_NODE_LINK_TO(matched.at("x"), new_op_node); - IR_NODE_LINK_TO(matched.at("y"), new_op_node); - IR_NODE_LINK_TO(new_op_node, matched.at("output")); -} - -cpp::OpDesc ElementwiseAddActivationFuser::GenOpDesc( - const key2nodes_t& matched) { - auto* desc = matched.at("add")->stmt()->op_info(); - - cpp::OpDesc op_desc; - op_desc.SetType("fusion_elementwise_add_activation"); - op_desc.SetInput("X", {matched.at("x")->arg()->name}); - op_desc.SetInput("Y", {matched.at("y")->arg()->name}); - op_desc.SetOutput("Out", {matched.at("output")->arg()->name}); - - op_desc.SetAttr("axis", desc->GetAttr("axis")); - op_desc.SetAttr("act_type", act_type_); - return op_desc; -} - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuser.h b/lite/core/mir/fusion/elementwise_add_activation_fuser.h deleted file mode 100644 index 47bb2fcf82..0000000000 --- a/lite/core/mir/fusion/elementwise_add_activation_fuser.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/core/mir/pattern_matcher_high_api.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -class ElementwiseAddActivationFuser : public FuseBase { - public: - explicit ElementwiseAddActivationFuser(const std::string& act_type) - : act_type_(act_type) {} - void BuildPattern() override; - void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; - - private: - cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; - std::string act_type_; -}; - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/fc_fuse_pass.cc b/lite/core/mir/fusion/fc_fuse_pass.cc deleted file mode 100644 index 3a68fd19bf..0000000000 --- a/lite/core/mir/fusion/fc_fuse_pass.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/fc_fuse_pass.h" -#include -#include -#include "lite/core/mir/fusion/fc_fuser.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -void FcFusePass::Apply(const std::unique_ptr& graph) { - fusion::FcFuser fuser; - fuser(graph.get()); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/fusion/fc_fuse_pass.h b/lite/core/mir/fusion/fc_fuse_pass.h deleted file mode 100644 index 44771345a7..0000000000 --- a/lite/core/mir/fusion/fc_fuse_pass.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/core/mir/pass.h" - -namespace paddle { -namespace lite { -namespace mir { - -class FcFusePass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/fc_fuse_pass_test.cc b/lite/core/mir/fusion/fc_fuse_pass_test.cc deleted file mode 100644 index cbf77084dd..0000000000 --- a/lite/core/mir/fusion/fc_fuse_pass_test.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/core/mir/fusion/fc_fuse_pass.h" -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/core/op_registry.h" - -DEFINE_string(model_dir, "", ""); -DEFINE_string(optimized_model, "", ""); - -namespace paddle { -namespace lite { -namespace mir { - -TEST(fc_fuse_pass, fuse_test) { - lite::Predictor predictor; -#ifndef LITE_WITH_CUDA - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}}); -#else - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)}, - Place{TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW)}, - Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kNCHW)}, - Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kNCHW)}, - Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kAny)}, - Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)}, - }); -#endif - - predictor.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kX86), PRECISION(kFloat)}, // origin cuda - valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({100, 100}))); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; - } - - predictor.Run(); - - auto* out = predictor.GetOutput(0); - LOG(INFO) << out << " memory size " << out->data_size(); - LOG(INFO) << "out " << out->data()[0]; - LOG(INFO) << "out " << out->data()[1]; - LOG(INFO) << "dims " << out->dims(); - EXPECT_NEAR(out->data()[0], 38.120617f, 1e-5); - EXPECT_NEAR(out->data()[1], 10.109812f, 1e-5); - CHECK_EQ(out->dims()[0], 100); - CHECK_EQ(out->dims()[1], 500); -} - -#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -TEST(fc_fuse_pass, save_model_test) { - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}}); - predictor.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kX86), PRECISION(kFloat)}, - valid_places); - - LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model; - predictor.SaveModel(FLAGS_optimized_model); -} -#endif // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - -} // namespace mir -} // namespace lite -} // namespace paddle - -USE_LITE_OP(mul); -USE_LITE_OP(elementwise_add); -USE_LITE_OP(elementwise_sub); -USE_LITE_OP(fc); -USE_LITE_OP(feed); -USE_LITE_OP(fetch); -USE_LITE_OP(io_copy); -USE_LITE_OP(softmax); -USE_LITE_OP(scale); -USE_LITE_KERNEL(feed, kHost, kAny, kAny, def); -USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def); - -// #ifdef LITE_WITH_X86 -// USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(elementwise_sub, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(softmax, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(scale, kX86, kFloat, kNCHW, def); -// #endif - -#ifdef LITE_WITH_CUDA -USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def); -USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device); -USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, device_to_host); -#endif diff --git a/lite/core/mir/fusion/fc_fuser.cc b/lite/core/mir/fusion/fc_fuser.cc deleted file mode 100644 index 72e1a4684d..0000000000 --- a/lite/core/mir/fusion/fc_fuser.cc +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/fc_fuser.h" -#include -#include - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -void FcFuser::BuildPattern() { - // create nodes. - auto* x = VarNode("x")->assert_is_op_input("mul", "X"); - auto* W = VarNode("W")->assert_is_op_input("mul", "Y"); - auto* b = VarNode("b"); - auto* mul = OpNode("mul", "mul"); - auto* mul_out = VarNode("mul_out"); - auto* add = OpNode("add", "elementwise_add"); - auto* Out = VarNode("Out"); - - // create topology. - std::vector mul_inputs{W, x}; - std::vector add_inputs{mul_out, b}; - mul_inputs >> *mul >> *mul_out; - add_inputs >> *add >> *Out; - - // Some op specialities. - mul_out->AsIntermediate(); - mul->AsIntermediate(); - add->AsIntermediate(); -} - -void FcFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { - auto op_desc = GenOpDesc(matched); - auto fc_op = LiteOpRegistry::Global().Create("fc"); - auto mul = matched.at("mul")->stmt()->op(); - auto* scope = mul->scope(); - auto& valid_places = mul->valid_places(); - fc_op->Attach(op_desc, scope); - - auto* new_op_node = graph->GraphCreateInstructNode(fc_op, valid_places); - - IR_NODE_LINK_TO(matched.at("W"), new_op_node); - IR_NODE_LINK_TO(matched.at("x"), new_op_node); - IR_NODE_LINK_TO(matched.at("b"), new_op_node); - IR_NODE_LINK_TO(new_op_node, matched.at("Out")); -} - -cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) { - cpp::OpDesc op_desc = *matched.at("mul")->stmt()->op_info(); - op_desc.SetType("fc"); - op_desc.SetInput("Input", {matched.at("x")->arg()->name}); - op_desc.SetInput("W", {matched.at("W")->arg()->name}); - op_desc.SetInput("Bias", {matched.at("b")->arg()->name}); - op_desc.SetOutput("Out", {matched.at("Out")->arg()->name}); - op_desc.SetAttr( - "in_num_col_dims", - matched.at("mul")->stmt()->op_info()->GetAttr("x_num_col_dims")); - return op_desc; -} - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/fc_fuser.h b/lite/core/mir/fusion/fc_fuser.h deleted file mode 100644 index 7ba0752789..0000000000 --- a/lite/core/mir/fusion/fc_fuser.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
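// FcFuser above rewrites mul(X, W) followed by elementwise_add(., b) into
// a single fc op computing roughly Out = X * W + b, mapping mul's
// "x_num_col_dims" attribute onto fc's "in_num_col_dims". A sketch of the
// rewrite on a matched subgraph (variable names are illustrative):
//
//   mul(X=x, Y=W) -> mul_out;  elementwise_add(X=mul_out, Y=b) -> Out
//     ==>  fc(Input=x, W=W, Bias=b) -> Out   // in_num_col_dims = x_num_col_dims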
- -#pragma once - -#include -#include -#include "lite/core/mir/pattern_matcher_high_api.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -class FcFuser : public FuseBase { - public: - void BuildPattern() override; - void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; - - private: - cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; -}; - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/interpolate_fuse_pass.cc b/lite/core/mir/fusion/interpolate_fuse_pass.cc deleted file mode 100644 index 5a0e1384a7..0000000000 --- a/lite/core/mir/fusion/interpolate_fuse_pass.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/interpolate_fuse_pass.h" -#include -#include -#include "lite/core/mir/fusion/interpolate_fuser.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -void InterpolateFusePass::Apply(const std::unique_ptr& graph) { - fusion::InterpolateFuser bilinear_interp_fuser("bilinear_interp"); - bilinear_interp_fuser(graph.get()); - - fusion::InterpolateFuser nearest_interp_fuser("nearest_interp"); - nearest_interp_fuser(graph.get()); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(lite_interpolate_fuse_pass, - paddle::lite::mir::InterpolateFusePass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/fusion/interpolate_fuse_pass.h b/lite/core/mir/fusion/interpolate_fuse_pass.h deleted file mode 100644 index 2beb4bb5b0..0000000000 --- a/lite/core/mir/fusion/interpolate_fuse_pass.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/core/mir/pass.h" - -namespace paddle { -namespace lite { -namespace mir { - -class InterpolateFusePass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/interpolate_fuser.cc b/lite/core/mir/fusion/interpolate_fuser.cc deleted file mode 100644 index 458ef76cb4..0000000000 --- a/lite/core/mir/fusion/interpolate_fuser.cc +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/interpolate_fuser.h" -#include -#include - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -void InterpolateFuser::BuildPattern() { - auto* x = VarNode("x"); - auto* shape = OpNode("shape", "shape")->AsIntermediate(); - auto* shape_out = VarNode("shape_out")->AsIntermediate(); - auto* slice = OpNode("slice", "slice") - ->assert_op_attr_satisfied>( - "axes", - [](const std::vector& attr) { - return attr.size() == 1 && attr[0] == 0; - }) - ->assert_op_attr_satisfied>( - "starts", - [](const std::vector& attr) { - return attr.size() == 1 && attr[0] == 2; - }) - ->assert_op_attr_satisfied>( - "ends", - [](const std::vector& attr) { - return attr.size() == 1 && attr[0] == 4; - }) - ->AsIntermediate(); - auto* slice_out = VarNode("slice_out")->AsIntermediate(); - auto* cast = OpNode("cast", "cast")->AsIntermediate(); - auto* cast_out = VarNode("cast_out")->AsIntermediate(); - auto* fill_constant = - OpNode("fill_constant", "fill_constant")->AsIntermediate(); - auto* fill_constant_out = VarNode("fill_constant_out")->AsIntermediate(); - auto* elementwise_mul = - OpNode("elementwise_mul", "elementwise_mul") - ->assert_op_attr_satisfied( - "axis", [](int attr) { return attr == -1 || attr == 0; }) - ->AsIntermediate(); - auto* elementwise_mul_out = VarNode("elementwise_mul_out")->AsIntermediate(); - auto* interpolate = OpNode("interpolate", interp_type_)->AsIntermediate(); - auto* interpolate_out = VarNode("interpolate_out"); - - // create topology. 
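// The chain matched below computes the interpolation output size at run
// time: shape(x) -> slice(axes=[0], starts=[2], ends=[4]) extracts H and W,
// cast converts them, and elementwise_mul scales them by a fill_constant
// value. GenOpDesc() further down folds that constant into a static
// "scale" attribute and clears OutSize, e.g. (the value 2.0 is
// illustrative, not from the source):
//
//   fill_constant(value=2.0)  ==>  op_desc.SetAttr("scale", 2.0f);
//                                  op_desc.SetInput("OutSize", {});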
- *x >> *shape >> *shape_out >> *slice >> *slice_out >> *cast >> *cast_out >> - *elementwise_mul >> *elementwise_mul_out >> *interpolate >> - *interpolate_out; - *fill_constant >> *fill_constant_out >> *elementwise_mul; - *x >> *interpolate; -} - -void InterpolateFuser::InsertNewNode(SSAGraph* graph, - const key2nodes_t& matched) { - auto op_desc = GenOpDesc(matched); - auto interp_op = LiteOpRegistry::Global().Create(interp_type_); - auto interp_old = matched.at("interpolate")->stmt()->op(); - auto* scope = interp_old->scope(); - auto& valid_places = interp_old->valid_places(); - interp_op->Attach(op_desc, scope); - - auto* new_op_node = graph->GraphCreateInstructNode(interp_op, valid_places); - - IR_NODE_LINK_TO(matched.at("x"), new_op_node); - IR_NODE_LINK_TO(new_op_node, matched.at("interpolate_out")); -} - -cpp::OpDesc InterpolateFuser::GenOpDesc(const key2nodes_t& matched) { - auto op_desc = *matched.at("interpolate")->stmt()->op_info(); - op_desc.SetInput("OutSize", {}); - op_desc.SetAttr( - "scale", - matched.at("fill_constant")->stmt()->op_info()->GetAttr("value")); - return op_desc; -} - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/interpolate_fuser.h b/lite/core/mir/fusion/interpolate_fuser.h deleted file mode 100644 index 51f5655e76..0000000000 --- a/lite/core/mir/fusion/interpolate_fuser.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/core/mir/pattern_matcher_high_api.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -class InterpolateFuser : public FuseBase { - public: - explicit InterpolateFuser(const std::string& interp_type) - : interp_type_(interp_type) {} - - void BuildPattern() override; - void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; - - private: - cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; - std::string interp_type_; -}; - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc deleted file mode 100644 index 9773caa3c1..0000000000 --- a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/quant_dequant_fuse_pass.h" -#include -#include -#include "lite/api/paddle_place.h" -#include "lite/core/mir/fusion/quant_dequant_op_fuser.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -void QuantDequantFusePass::Apply(const std::unique_ptr& graph) { - std::unordered_set quant_types = { - "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"}; - std::unordered_set quantized_op_types = { - "conv2d", "mul", "depthwise_conv2d"}; - for (auto& quant_type : quant_types) { - for (auto& op_type : quantized_op_types) { - for (int i = 6; i >= 1; i--) { - fusion::QuantDequantOpFuser fuser(op_type, quant_type, i); - fuser(graph.get()); - } - } - } -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(lite_quant_dequant_fuse_pass, - paddle::lite::mir::QuantDequantFusePass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/fusion/quant_dequant_fuse_pass.h b/lite/core/mir/fusion/quant_dequant_fuse_pass.h deleted file mode 100644 index 243241bfb7..0000000000 --- a/lite/core/mir/fusion/quant_dequant_fuse_pass.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include "lite/core/mir/pass.h" - -namespace paddle { -namespace lite { -namespace mir { - -class QuantDequantFusePass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc deleted file mode 100644 index 1c7cf866b9..0000000000 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
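// Context for the `times_` parameter used throughout this file:
// QuantDequantFusePass::Apply() above runs this fuser with `times`
// counting down from 6 to 1, and BuildPattern() below builds `times_`
// parallel quantized_op + fake_dequant branches hanging off a single
// fake_quant output. Presumably the descending order lets a quant op that
// feeds, say, three convs be matched once as a three-branch pattern
// rather than piecemeal (that rationale is inferred, not stated in the
// source).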
- -#include "lite/core/mir/fusion/quant_dequant_op_fuser.h" -#include -#include -#include "lite/utils/string.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -void QuantDequantOpFuser::BuildPattern() { - const int kNumFields = 5; - const int kQuantizedWeightOffset = 0; - const int kQuantizedOpOffset = 1; - const int kQuantizedOpOutOffset = 2; - const int kDequantOpOffset = 3; - const int kDequantOpOutOffset = 4; - - std::string weight_name = ""; - if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") { - weight_name = "Filter"; - } else { - weight_name = "Y"; - } - auto* quant_op_input = VarNode("quant_op_input") - ->assert_is_op_input(quant_type_, "X") - ->AsInput(); - auto* quant_op_in_scale = VarNode("quant_op_in_scale") - ->assert_is_op_input(quant_type_, "InScale") - ->AsIntermediate(); - auto* quant_op = OpNode("quant_op", quant_type_) - ->assert_is_op(quant_type_) - ->AsIntermediate(); - - auto* quant_op_out_scale = - VarNode("quant_op_out_scale") - ->assert_is_op_output(quant_type_, "OutScale") - ->assert_is_op_input("fake_dequantize_max_abs", "Scale") - ->AsIntermediate(); - - auto* quant_op_out = VarNode("quant_op_out") - ->assert_is_op_output(quant_type_, "Out") - ->assert_is_op_input(op_type_) - ->AsIntermediate(); - std::vector nodes; - for (int i = 0; i < times_; i++) { - nodes.push_back(VarNode(string_format("quantized_op_weight%d", i)) - ->assert_is_op_input(op_type_, weight_name) - ->AsInput()); - - nodes.push_back(OpNode(string_format("quantized_op%d", i), op_type_) - ->assert_is_op(op_type_) - ->AsIntermediate()); - - nodes.push_back(VarNode(string_format("quantized_op_out%d", i)) - ->assert_is_op_output(op_type_) - ->assert_is_op_input("fake_dequantize_max_abs", "X") - ->AsIntermediate()); - - nodes.push_back( - OpNode(string_format("dequant_op%d", i), "fake_dequantize_max_abs") - ->assert_is_op("fake_dequantize_max_abs") - ->AsIntermediate()); - nodes.push_back(VarNode(string_format("dequant_op_out%d", i)) - ->assert_is_op_output("fake_dequantize_max_abs", "Out") - ->AsOutput()); - } - - quant_op->LinksFrom({quant_op_input, quant_op_in_scale}); - quant_op_out->LinksFrom({quant_op}); - quant_op_out_scale->LinksFrom({quant_op}); - for (int i = 0; i < times_; i++) { - nodes[i * kNumFields + kQuantizedOpOffset]->LinksFrom( - {quant_op_out, nodes[i * kNumFields + kQuantizedWeightOffset]}); - nodes[i * kNumFields + kQuantizedOpOutOffset]->LinksFrom( - {nodes[i * kNumFields + kQuantizedOpOffset]}); - nodes[i * kNumFields + kDequantOpOffset]->LinksFrom( - {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale}); - nodes[i * kNumFields + kDequantOpOutOffset]->LinksFrom( - {nodes[i * kNumFields + kDequantOpOffset]}); - } -} - -void QuantDequantOpFuser::InsertNewNode(SSAGraph* graph, - const key2nodes_t& matched) { - const int kNumFields = 5; - const int kQuantizedWeightOffset = 0; - const int kQuantizedOpOffset = 1; - const int kDequantOpOffset = 3; - const int kDequantOpOutOffset = 4; - - auto* quant_op_input = matched.at("quant_op_input"); - auto* quant_op_in_scale = matched.at("quant_op_in_scale"); - auto* quant_op = matched.at("quant_op"); - - std::vector nodes; - for (int i = 0; i < times_; i++) { - nodes.push_back(matched.at(string_format("quantized_op_weight%d", i))); - nodes.push_back(matched.at(string_format("quantized_op%d", i))); - nodes.push_back(matched.at(string_format("quantized_op_out%d", i))); - nodes.push_back(matched.at(string_format("dequant_op%d", i))); - 
nodes.push_back(matched.at(string_format("dequant_op_out%d", i)));
-  }
-  int bit_length = quant_op->stmt()->op_info()->GetAttr<int>("bit_length");
-  auto* scope = quant_op->stmt()->op()->scope();
-  auto& valid_places = quant_op->stmt()->op()->valid_places();
-  int range = ((1 << (bit_length - 1)) - 1);
-  auto input_scale_t = scope->FindVar(quant_op_in_scale->arg()->name)
-                           ->GetMutable<lite::Tensor>();
-  float input_scale = input_scale_t->data<float>()[0] / range;
-
-  VLOG(4) << "range: " << range << " input_scale: " << input_scale;
-  for (int i = 0; i < times_; i++) {
-    float max_range = nodes[i * kNumFields + kDequantOpOffset]
-                          ->stmt()
-                          ->op_info()
-                          ->GetAttr<float>("max_range");
-    // weight_scale = max(abs(weight))
-    float whole_weight_scale =
-        static_cast<float>(range * range) / max_range / range;
-
-    cpp::OpDesc op_desc =
-        *nodes[i * kNumFields + kQuantizedOpOffset]->stmt()->op_info();
-
-    auto quantized_weight_var_name =
-        nodes[i * kNumFields + kQuantizedWeightOffset]->arg()->name;
-    auto quantized_weight_t =
-        scope->FindVar(quantized_weight_var_name)->GetMutable<lite::Tensor>();
-    std::vector<float> weight_scale;
-    int weight_scale_size;
-
-    if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") {
-      op_desc.SetInput("Input", {matched.at("quant_op_input")->arg()->name});
-      op_desc.SetOutput(
-          "Output", {nodes[i * kNumFields + kDequantOpOutOffset]->arg()->name});
-      // Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should
-      // be Cout.
-      weight_scale_size = quantized_weight_t->dims()[0];
-    } else if (op_type_ == "mul") {
-      op_desc.SetInput("X", {matched.at("quant_op_input")->arg()->name});
-      op_desc.SetOutput(
-          "Out", {nodes[i * kNumFields + kDequantOpOutOffset]->arg()->name});
-      // Fc weight: Cin * Cout, the weight_scale_size should be Cout.
-      weight_scale_size = quantized_weight_t->dims()[1];
-    }
-    for (int i = 0; i < weight_scale_size; i++) {
-      weight_scale.push_back(whole_weight_scale);
-    }
-    op_desc.SetAttr("enable_int8", true);
-    op_desc.SetAttr("input_scale", input_scale);
-    op_desc.SetAttr("weight_scale", weight_scale);
-
-    Tensor temp_tensor;
-    temp_tensor.CopyDataFrom(*quantized_weight_t);
-    float* temp_data = temp_tensor.mutable_data<float>();
-
-    size_t weight_num = quantized_weight_t->data_size();
-    int8_t* quantized_weight_data = quantized_weight_t->mutable_data<int8_t>();
-
-    // change the weight from the float type to int8 type.
-    for (size_t i = 0; i < weight_num; i++) {
-      quantized_weight_data[i] = static_cast<int8_t>(temp_data[i]);
-    }
-    quantized_weight_t->set_persistable(true);
-    quantized_weight_t->set_precision(PRECISION(kInt8));
-    auto quantized_op = LiteOpRegistry::Global().Create(op_type_);
-
-    quantized_op->Attach(op_desc, scope);
-    auto* new_op_node =
-        graph->GraphCreateInstructNode(quantized_op, valid_places);
-    IR_NODE_LINK_TO(quant_op_input, new_op_node);
-    IR_NODE_LINK_TO(nodes[i * kNumFields + kQuantizedWeightOffset],
-                    new_op_node);
-    IR_NODE_LINK_TO(new_op_node, nodes[i * kNumFields + kDequantOpOutOffset]);
-  }
-}
-
-cpp::OpDesc QuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
-  cpp::OpDesc op_desc;
-  return op_desc;
-}
-
-}  // namespace fusion
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.h b/lite/core/mir/fusion/quant_dequant_op_fuser.h
deleted file mode 100644
index 15833ad258..0000000000
--- a/lite/core/mir/fusion/quant_dequant_op_fuser.h
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "lite/core/mir/pattern_matcher_high_api.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-namespace fusion {
-
-/* A model trained with fluid quantization simulates real int8 inference:
- * each quantized op (conv2d, mul, depthwise_conv2d, etc.) is preceded by a
- * fake_quant op and followed by a fake_dequant op.
- *
- * In int8 mode, this fuser detects the "fake_quant + quantized_op +
- * fake_dequant" pattern, extracts the input_scale and weight_scale from the
- * fake_quant and fake_dequant ops, and fuses them into the quantized op.
- * Finally, it deletes the fake_quant and fake_dequant ops from the graph.
- */
-class QuantDequantOpFuser : public FuseBase {
- public:
-  explicit QuantDequantOpFuser(const std::string& op_type,
-                               const std::string& quant_type,
-                               int times)
-      : op_type_(op_type), quant_type_(quant_type), times_(times) {}
-  void BuildPattern() override;
-  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
-
- private:
-  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
-
- private:
-  std::string op_type_{"conv2d"};
-  std::string quant_type_;
-  int times_;
-};
-
-}  // namespace fusion
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/shuffle_channel_fuse_pass.cc b/lite/core/mir/fusion/shuffle_channel_fuse_pass.cc
deleted file mode 100644
index 049be721e9..0000000000
--- a/lite/core/mir/fusion/shuffle_channel_fuse_pass.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "lite/core/mir/fusion/shuffle_channel_fuse_pass.h" -#include -#include -#include "lite/core/mir/fusion/shuffle_channel_fuser.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -void ShuffleChannelFusePass::Apply(const std::unique_ptr& graph) { - fusion::ShuffleChannelFuser fuser("reshape", "transpose"); - fuser(graph.get()); - - fusion::ShuffleChannelFuser fuser2("reshape2", "transpose2"); - fuser2(graph.get()); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(lite_shuffle_channel_fuse_pass, - paddle::lite::mir::ShuffleChannelFusePass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/fusion/shuffle_channel_fuse_pass.h b/lite/core/mir/fusion/shuffle_channel_fuse_pass.h deleted file mode 100644 index 0524aff395..0000000000 --- a/lite/core/mir/fusion/shuffle_channel_fuse_pass.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/core/mir/pass.h" - -namespace paddle { -namespace lite { -namespace mir { - -class ShuffleChannelFusePass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/shuffle_channel_fuser.cc b/lite/core/mir/fusion/shuffle_channel_fuser.cc deleted file mode 100644 index f0087f8991..0000000000 --- a/lite/core/mir/fusion/shuffle_channel_fuser.cc +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/shuffle_channel_fuser.h" -#include -#include - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -void ShuffleChannelFuser::BuildPattern() { - // create nodes. 
- auto* x1 = VarNode("x1")->assert_is_op_input(reshape_type_, "X"); - auto* y1 = VarNode("y1")->assert_is_op_output(reshape_type_, "Out"); - auto* y2 = VarNode("y2")->assert_is_op_output(transpose_type_, "Out"); - auto* out = VarNode("out")->assert_is_op_output(reshape_type_, "Out"); - - PMNode* xshape1 = nullptr; - PMNode* xshape2 = nullptr; - PMNode* xshape3 = nullptr; - if (reshape_type_ == "reshape2") { - xshape1 = VarNode("xshape1")->assert_is_op_output(reshape_type_, "XShape"); - xshape3 = VarNode("xshape3")->assert_is_op_output(reshape_type_, "XShape"); - } - if (transpose_type_ == "transpose2") { - xshape2 = - VarNode("xshape2")->assert_is_op_output(transpose_type_, "XShape"); - } - - auto* reshape1 = OpNode("reshape1", reshape_type_) - ->assert_op_attr_satisfied>( - "shape", [](const std::vector& attr) { - return attr.size() >= 5 && attr[1] > 0; - }); - auto* transpose = - OpNode("transpose_op", transpose_type_) - ->assert_op_attr_satisfied>( - "axis", [](const std::vector& attr) { - return attr.size() >= 5 && attr[1] == 2 && attr[2] == 1; - }); - auto* reshape2 = OpNode("reshape2", reshape_type_) - ->assert_op_attr_satisfied>( - "shape", [](const std::vector& attr) { - return attr.size() >= 4; - }); - - // create topology. - *x1 >> *reshape1 >> *y1 >> *transpose >> *y2 >> *reshape2 >> *out; - if (xshape1) *reshape1 >> *xshape1; - if (xshape2) *transpose >> *xshape2; - if (xshape3) *reshape2 >> *xshape3; - - // Some op specialities. - y1->AsIntermediate(); - y2->AsIntermediate(); - if (xshape1) xshape1->AsIntermediate(); - if (xshape2) xshape2->AsIntermediate(); - if (xshape3) xshape3->AsIntermediate(); - reshape1->AsIntermediate(); - transpose->AsIntermediate(); - reshape2->AsIntermediate(); -} - -void ShuffleChannelFuser::InsertNewNode(SSAGraph* graph, - const key2nodes_t& matched) { - auto op_desc = GenOpDesc(matched); - auto shuffle_channel_op = LiteOpRegistry::Global().Create("shuffle_channel"); - auto transpose = matched.at("transpose_op")->stmt()->op(); - auto* scope = transpose->scope(); - auto& valid_places = transpose->valid_places(); - shuffle_channel_op->Attach(op_desc, scope); - - auto* new_op_node = - graph->GraphCreateInstructNode(shuffle_channel_op, valid_places); - - IR_NODE_LINK_TO(matched.at("x1"), new_op_node); - IR_NODE_LINK_TO(new_op_node, matched.at("out")); -} - -cpp::OpDesc ShuffleChannelFuser::GenOpDesc(const key2nodes_t& matched) { - cpp::OpDesc op_desc; - op_desc.SetType("shuffle_channel"); - op_desc.SetInput("X", {matched.at("x1")->arg()->name}); - op_desc.SetOutput("Out", {matched.at("out")->arg()->name}); - op_desc.SetAttr("group", - matched.at("reshape1") - ->stmt() - ->op_info() - ->GetAttr>("shape")[1]); - return op_desc; -} - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/shuffle_channel_fuser.h b/lite/core/mir/fusion/shuffle_channel_fuser.h deleted file mode 100644 index 4fb99ab5c8..0000000000 --- a/lite/core/mir/fusion/shuffle_channel_fuser.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/core/mir/pattern_matcher_high_api.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -class ShuffleChannelFuser : public FuseBase { - public: - explicit ShuffleChannelFuser(const std::string& reshape_type, - const std::string& transpose_type) - : reshape_type_(reshape_type), transpose_type_(transpose_type) {} - - void BuildPattern() override; - void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; - - private: - cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; - std::string reshape_type_; - std::string transpose_type_; -}; - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.cc b/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.cc deleted file mode 100644 index 47c866d87a..0000000000 --- a/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.cc +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.h" -#include -#include -#include "lite/core/mir/fusion/transpose_softmax_transpose_fuser.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -void TransposeSoftmaxTransposeFusePass::Apply( - const std::unique_ptr& graph) { - fusion::TransposeSoftmaxTransposeFuser fuser("transpose", "softmax"); - fuser(graph.get()); - - fusion::TransposeSoftmaxTransposeFuser fuser2("transpose2", "softmax"); - fuser2(graph.get()); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass, - paddle::lite::mir::TransposeSoftmaxTransposeFusePass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.h b/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.h deleted file mode 100644 index 4ae6ce83c4..0000000000 --- a/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
diff --git a/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.cc b/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.cc
deleted file mode 100644
index 47c866d87a..0000000000
--- a/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.h"
-#include <memory>
-#include <vector>
-#include "lite/core/mir/fusion/transpose_softmax_transpose_fuser.h"
-#include "lite/core/mir/pass_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-void TransposeSoftmaxTransposeFusePass::Apply(
-    const std::unique_ptr<SSAGraph>& graph) {
-  fusion::TransposeSoftmaxTransposeFuser fuser("transpose", "softmax");
-  fuser(graph.get());
-
-  fusion::TransposeSoftmaxTransposeFuser fuser2("transpose2", "softmax");
-  fuser2(graph.get());
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass,
-                  paddle::lite::mir::TransposeSoftmaxTransposeFusePass)
-    .SetTargets({TARGET(kAny)});
diff --git a/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.h b/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.h
deleted file mode 100644
index 4ae6ce83c4..0000000000
--- a/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "lite/core/mir/pass.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-class TransposeSoftmaxTransposeFusePass : public ProgramPass {
- public:
-  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
-};
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc b/lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc
deleted file mode 100644
index d578b725ec..0000000000
--- a/lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc
+++ /dev/null
@@ -1,99 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/fusion/transpose_softmax_transpose_fuser.h"
-#include <memory>
-#include <vector>
-
-namespace paddle {
-namespace lite {
-namespace mir {
-namespace fusion {
-
-void TransposeSoftmaxTransposeFuser::BuildPattern() {
-  // create nodes.
-  auto* x1 = VarNode("x1")->assert_is_op_input(transpose_type_, "X");
-  auto* y1 = VarNode("y1")->assert_is_op_output(transpose_type_, "Out");
-  auto* y2 = VarNode("y2")->assert_is_op_output(softmax_type_, "Out");
-  auto* out = VarNode("out")->assert_is_op_output(transpose_type_, "Out");
-
-  PMNode* xshape1 = nullptr;
-  PMNode* xshape2 = nullptr;
-  if (transpose_type_ == "transpose2") {
-    xshape1 =
-        VarNode("xshape1")->assert_is_op_output(transpose_type_, "XShape");
-    xshape2 =
-        VarNode("xshape2")->assert_is_op_output(transpose_type_, "XShape");
-  }
-
-  auto* transpose1 =
-      OpNode("transpose1", transpose_type_)->assert_is_op(transpose_type_);
-
-  auto* softmax = OpNode("softmax", softmax_type_)
-                      ->assert_op_attr_satisfied<int>(
-                          "axis", [](int attr) { return attr == -1; });
-
-  auto* transpose2 =
-      OpNode("transpose2", transpose_type_)->assert_is_op(transpose_type_);
-
-  // create topology.
-  *x1 >> *transpose1 >> *y1 >> *softmax >> *y2 >> *transpose2 >> *out;
-  if (xshape1) *transpose1 >> *xshape1;
-  if (xshape2) *transpose2 >> *xshape2;
-
-  // nodes to remove
-  y1->AsIntermediate();
-  y2->AsIntermediate();
-  if (xshape1) xshape1->AsIntermediate();
-  if (xshape2) xshape2->AsIntermediate();
-  transpose1->AsIntermediate();
-  softmax->AsIntermediate();
-  transpose2->AsIntermediate();
-}
-
-void TransposeSoftmaxTransposeFuser::InsertNewNode(SSAGraph* graph,
-                                                   const key2nodes_t& matched) {
-  auto op_desc = GenOpDesc(matched);
-  auto softmax_op = LiteOpRegistry::Global().Create(softmax_type_);
-  auto softmax_old = matched.at("softmax")->stmt()->op();
-  auto* scope = softmax_old->scope();
-  auto& valid_places = softmax_old->valid_places();
-  softmax_op->Attach(op_desc, scope);
-
-  auto* new_op_node = graph->GraphCreateInstructNode(softmax_op, valid_places);
-
-  IR_NODE_LINK_TO(matched.at("x1"), new_op_node);
-  IR_NODE_LINK_TO(new_op_node, matched.at("out"));
-}
-
-cpp::OpDesc TransposeSoftmaxTransposeFuser::GenOpDesc(
-    const key2nodes_t& matched) {
-  cpp::OpDesc op_desc;
-  op_desc.SetType("softmax");
-  op_desc.SetInput("X", {matched.at("x1")->arg()->name});
-  op_desc.SetOutput("Out", {matched.at("out")->arg()->name});
-  op_desc.SetAttr("axis",
-                  matched.at("transpose1")
-                      ->stmt()
-                      ->op_info()
-                      ->GetAttr<std::vector<int>>("axis")
-                      .back());
-
-  return op_desc;
-}
-
-}  // namespace fusion
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/transpose_softmax_transpose_fuser.h b/lite/core/mir/fusion/transpose_softmax_transpose_fuser.h
deleted file mode 100644
index fbccfd2c6a..0000000000
--- a/lite/core/mir/fusion/transpose_softmax_transpose_fuser.h
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "lite/core/mir/pattern_matcher_high_api.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-namespace fusion {
-
-class TransposeSoftmaxTransposeFuser : public FuseBase {
- public:
-  explicit TransposeSoftmaxTransposeFuser(const std::string& transpose_type,
-                                          const std::string& softmax_type)
-      : transpose_type_(transpose_type), softmax_type_(softmax_type) {}
-
-  void BuildPattern() override;
-  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
-
- private:
-  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
-  std::string transpose_type_;
-  std::string softmax_type_;
-};
-
-}  // namespace fusion
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/generate_program_pass.cc b/lite/core/mir/generate_program_pass.cc
deleted file mode 100644
index 23f2de564e..0000000000
--- a/lite/core/mir/generate_program_pass.cc
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/generate_program_pass.h"
-#include <memory>
-#include <string>
-#include <vector>
-#include "lite/core/mir/graph_visualize_pass.h"
-#include "lite/core/mir/pass_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-void GenerateProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
-  VLOG(4) << "final program \n" << Visualize(graph.get());
-  for (auto& item : graph->StmtTopologicalOrder()) {
-    if (item->IsStmt()) {
-      auto& stmt = item->AsStmt();
-      VLOG(4) << stmt;
-      insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front()));
-    }
-  }
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_MIR_PASS(generate_program_pass, paddle::lite::mir::GenerateProgramPass)
-    .SetTargets({TARGET(kAny)});
diff --git a/lite/core/mir/generate_program_pass.h b/lite/core/mir/generate_program_pass.h
deleted file mode 100644
index b126b4aba4..0000000000
--- a/lite/core/mir/generate_program_pass.h
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-#include "lite/core/kernel.h"
-#include "lite/core/mir/pass.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-/*
- * GenerateProgramPass builds the execution program for the executor from a
- * MIR graph.
- */
-class GenerateProgramPass : public ProgramPass {
- public:
-  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
-
-  std::unique_ptr<RuntimeProgram> GenProgram() {
-    LOG(INFO) << "insts.size " << insts_.size();
-    std::unique_ptr<RuntimeProgram> program(
-        new RuntimeProgram(std::move(insts_)));
-
-    return program;
-  }
-
- private:
-  std::vector<Instruction> insts_;
-};
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
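GenerateProgramPass is the tail of the optimization pipeline: Apply collects one (op, kernel) Instruction per statement, and GenProgram then hands the result to the executor. A minimal sketch of that hand-off, assuming the caller owns a std::unique_ptr<SSAGraph> named graph and that the pass was registered under "generate_program_pass" as above:

// Sketch: turning an optimized SSAGraph into a runnable RuntimeProgram.
auto* pass = paddle::lite::mir::PassManager::Global()
                 .LookUp<paddle::lite::mir::GenerateProgramPass>(
                     "generate_program_pass");
CHECK(pass);
pass->Apply(graph);                 // fills insts_, one entry per statement
auto program = pass->GenProgram();  // std::unique_ptr<RuntimeProgram>
// program->Run();  // presumably executed later by the predictor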
diff --git a/lite/core/mir/graph_visualize_pass.cc b/lite/core/mir/graph_visualize_pass.cc
deleted file mode 100644
index f97dbfc7cd..0000000000
--- a/lite/core/mir/graph_visualize_pass.cc
+++ /dev/null
@@ -1,102 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/graph_visualize_pass.h"
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-#include <utility>
-#include "lite/core/mir/pass_registry.h"
-#include "lite/utils/string.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-using inference::analysis::Dot;
-
-void GraphVisualizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
-  Visualize(graph.get());
-}
-
-std::string Visualize(mir::SSAGraph* graph) {
-  inference::analysis::Dot dot;
-
-  int id = 0;
-  std::set<std::string> exists_args;
-  std::map<int, std::string> graph_col;  // Different colors of subgraphs
-  graph_col.insert({{1, "red"},
-                    {2, "green"},
-                    {3, "cyan"},
-                    {4, "bisque3"},
-                    {5, "coral"},
-                    {6, "darkseagreen1"},
-                    {7, "goldenrod1"},
-                    {8, "darkorchid"}});
-  for (auto& node : graph->mutable_nodes()) {
-    std::string key;
-    if (node.IsArg()) {
-      key = node.AsArg().name;
-    } else {
-      key = string_format("%s%d", node.AsStmt().op_type().c_str(), id++);
-    }
-
-    if (node.IsStmt()) {
-      auto& stmt = node.AsStmt();
-      auto sub_id = stmt.subgraph_id();
-      auto it = graph_col.find(sub_id);
-      if (sub_id > 0 && it != graph_col.end()) {
-        dot.AddNode(key,
-                    {Dot::Attr("shape", "box"),
-                     Dot::Attr("style", "filled"),
-                     Dot::Attr("color", "black"),
-                     Dot::Attr("fillcolor", it->second)});
-      } else {
-        dot.AddNode(key,
-                    {Dot::Attr("shape", "box"),
-                     Dot::Attr("style", "filled"),
-                     Dot::Attr("color", "black"),
-                     Dot::Attr("fillcolor", "yellow")});
-      }
-      for (auto& x : node.inlinks) {
-        auto name = x->AsArg().name;
-        if (!exists_args.count(name)) {
-          dot.AddNode(name, {});
-        }
-        dot.AddEdge(name, key, {});
-        exists_args.insert(name);
-      }
-      for (auto& x : node.outlinks) {
-        auto name = x->AsArg().name;
-        if (!exists_args.count(name)) {
-          dot.AddNode(name, {});
-        }
-        dot.AddEdge(key, name, {});
-        exists_args.insert(name);
-      }
-    }
-  }
-
-  auto res = dot.Build();
-  LOG(INFO) << "dot:\n" << res;
-  return res;
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_MIR_PASS(graph_visualze, paddle::lite::mir::GraphVisualizePass)
    .SetTargets({TARGET(kAny)});
diff --git a/lite/core/mir/graph_visualize_pass.h b/lite/core/mir/graph_visualize_pass.h
deleted file mode 100644
index bde58a63b3..0000000000
--- a/lite/core/mir/graph_visualize_pass.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "lite/core/mir/dot.h"
-#include "lite/core/mir/pass.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-/*
- * GraphVisualizePass helps to visualize a MIR graph by exporting a DOT
- * language file.
- */
-class GraphVisualizePass : public DebugPass {
- public:
-  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
-};
-
-std::string Visualize(mir::SSAGraph* graph);
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/io_copy_kernel_pick_pass.cc b/lite/core/mir/io_copy_kernel_pick_pass.cc
deleted file mode 100644
index b2ea823e0b..0000000000
--- a/lite/core/mir/io_copy_kernel_pick_pass.cc
+++ /dev/null
@@ -1,75 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/pass.h"
-#include "lite/core/mir/pass_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-class IoCopyKernelPickPass : public StmtPass {
- public:
-  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
-    for (auto& node : graph->mutable_nodes()) {
-      if (!node.IsStmt()) continue;
-      auto& inst = node.AsStmt();
-      if (inst.op_type() != "io_copy") continue;
-
-      LOG(INFO) << "....> picking an IO copy kernel";
-
-      auto& kernels = node.AsStmt().kernels();
-      CHECK(!kernels.empty()) << "No valid kernels found for IoCopy Op";
-      const auto* inty = node.inlinks.front()->AsArg().type;
-      const auto* outy = node.outlinks.front()->AsArg().type;
-      LOG(INFO) << "input type " << *inty;
-      LOG(INFO) << "output type " << *outy;
-
-      bool is_found = false;
-      LOG(INFO) << "kernels size " << kernels.size();
-      for (auto& kernel : kernels) {
-        CHECK_EQ(node.inlinks.size(), 1UL);
-        CHECK_EQ(node.outlinks.size(), 1UL);
-
-        const Type* in_arg_ty = kernel->GetInputDeclType("Input");
-        const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
-        LOG(INFO) << "checking kernel candidate " << *in_arg_ty << "->"
-                  << *out_arg_ty;
-        if (TargetCompatibleTo(*inty, *in_arg_ty)) {
-          // Both the input and output types match; remove the other kernels
-          // directly.
-          if (TargetCompatibleTo(*outy, *out_arg_ty)) {
-            LOG(INFO) << "got an IoCopy kernel";
-            auto x = std::move(kernel);
-            kernels.clear();
-            kernels.emplace_back(std::move(x));
-            is_found = true;
-            break;
-          }
-        }
-      }
-
-      CHECK(is_found) << "Can't find an IoCopy kernel for IO: " << *inty
-                      << "->" << *outy;
-    }
-  }
-};
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_MIR_PASS(io_copy_kernel_pick_pass,
-                  paddle::lite::mir::IoCopyKernelPickPass)
-    .SetTargets({TARGET(kAny)});
diff --git a/lite/core/mir/node.cc b/lite/core/mir/node.cc
deleted file mode 100644
index 61d3d317e7..0000000000
--- a/lite/core/mir/node.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/node.h"
-#include "lite/core/op_registry.h"
-
-namespace paddle {
-namespace lite {
-
-const OpInfo *mir::Node::Stmt::op_info() const {
-  CHECK(op_);
-  return op_->op_info();
-}
-
-Place mir::Node::Stmt::place() const {
-  CHECK(!valid_kernels_.empty());
-  return valid_kernels_.front()->place();
-}
-
-KernelBase &mir::Node::Stmt::picked_kernel() {
-  CHECK(!valid_kernels_.empty()) << "no kernel for " << op_type();
-  return *valid_kernels_.front();
-}
-
-OpInfo *mir::Node::Stmt::mutable_op_info() {
-  CHECK(op_);
-  return op_->mutable_op_info();
-}
-
-void mir::Node::Stmt::ResetOp(const cpp::OpDesc &op_desc,
-                              const std::vector<Place> &valid_places,
-                              lite::Scope *scope) {
-  CHECK((op_ && op_->scope()) || scope) << "Either scope should be set";
-  lite::Scope *the_scope = scope ? scope : op_->scope();
-  op_->Attach(op_desc, the_scope);
-  // Recreate the kernels with the latest OpInfo.
-  valid_kernels_.clear();
-
-  if (!op_ || op_->op_info()->Type() != op_desc.Type()) {
-    op_ = LiteOpRegistry::Global().Create(op_desc.Type());
-    CHECK(op_) << "No op found for " << op_desc.Type();
-  }
-  valid_kernels_ = op_->CreateKernels(valid_places);
-}
-
-std::ostream &mir::operator<<(std::ostream &os, const mir::Node::Stmt &other) {
-  os << "Statement " << other.op_type() << " " << other.place().DebugString();
-  return os;
-}
-
-mir::Node::Arg &mir::Node::AsArg(const std::string &name, int id) {
-  auto &x = AsArg();
-  x.name = name;
-  x.id = id;
-  return x;
-}
-mir::Node::Arg &mir::Node::AsArg(const std::string &name) {
-  auto &x = AsArg();
-  x.name = name;
-  return x;
-}
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/node.h b/lite/core/mir/node.h
deleted file mode 100644
index 9c7d441ca3..0000000000
--- a/lite/core/mir/node.h
+++ /dev/null
@@ -1,173 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <list>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-#include "lite/core/kernel.h"
-#include "lite/core/op_lite.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-// Node in a MIR graph.
-class Node {
- public:
-  std::list<Node*> inlinks;
-  std::list<Node*> outlinks;
-
-  Node() = default;
-
-  enum class Role {
-    kArg = 0,
-    kStmt,
-    kNumRoles, /*should be last*/
-    kUnk,
-  };
-
-  class Stmt {
-    // The kernel instances this Statement contains.
-    std::vector<std::unique_ptr<KernelBase>> valid_kernels_;
-    // TODO(Superjomn) make this a shared_ptr for resource safety.
-    std::shared_ptr<OpLite> op_;  // we hold op to run InferShape
-
-   public:
-    // Refresh the operator and kernels with the latest OpInfo.
-    void ResetOp(const cpp::OpDesc& op_desc,
-                 const std::vector<Place>& valid_places,
-                 lite::Scope* scope = nullptr);
-
-    std::string op_type() const { return op_info()->Type(); }
-    const OpInfo* op_info() const;
-    OpInfo* mutable_op_info();
-
-    void SetKernels(std::vector<std::unique_ptr<KernelBase>>&& kernels) {
-      valid_kernels_ = std::move(kernels);
-    }
-    std::vector<std::unique_ptr<KernelBase>>& kernels() {
-      return valid_kernels_;
-    }
-
-    void ClearSubgraphID() { subgraph_id_ = -1 /* note: not 0 */; }
-    void SetSubgraphID(int id) { subgraph_id_ = id; }
-    int subgraph_id() const { return subgraph_id_; }
-    void SetOp(const std::shared_ptr<OpLite>& op) { op_ = op; }
-    const std::shared_ptr<OpLite> op() const { return op_; }
-
-    Place place() const;
-
-    KernelBase& picked_kernel();
-
-    friend std::ostream& operator<<(std::ostream& os, const Stmt& other);
-
-    // Description.
-    std::string desc;
-
-   protected:
-    // -1 means not in a subgraph, 0 means supported but no id assigned yet;
-    // real ids start from 1.
-    int subgraph_id_{-1};
-  };
-
-  struct Arg {
-    std::string name;
-    int id{0};
-    const Type* type{};
-    // Weight is a special kind of argument; it is marked as a weight
-    // explicitly so that weight-related optimizations can take place.
-    bool is_weight{false};
-    // is_persist indicates whether the argument is translated from a weight.
-    // If more than one utility operator is needed (e.g. io_copy, layout,
-    // calib), the argument between them should be persisted to make sure the
-    // translation only runs once.
-    bool is_persist{false};
-  };
-
-  Arg& AsArg(const std::string& name, int id);
-
-  Arg& AsArg(const std::string& name);
-
-  Stmt& AsStmt(const std::string& op_type,
-               std::vector<std::unique_ptr<KernelBase>>&& kernels,
-               const std::shared_ptr<OpLite>& op) {
-    auto& x = AsStmt();
-    x.SetOp(op);
-    x.SetKernels(std::move(kernels));
-    return x;
-  }
-
-  Stmt* stmt() const {
-    CHECK(IsStmt());
-    return stmt_.get();
-  }
-
-  Arg* arg() const {
-    CHECK(IsArg());
-    return arg_.get();
-  }
-
-  // Set roles.
-  Arg& AsArg() {
-    if (role_ != Role::kUnk) {
-      CHECK(role_ == Role::kArg);
-      return *arg_;
-    }
-    role_ = Role::kArg;
-    arg_.reset(new Arg);
-    return *arg_;
-  }
-  Stmt& AsStmt() {
-    if (role_ != Role::kUnk) {
-      CHECK(role_ == Role::kStmt);
-      return *stmt_;
-    }
-    role_ = Role::kStmt;
-    stmt_.reset(new Stmt);
-    return *stmt_;
-  }
-
-  friend std::ostream& operator<<(std::ostream& os, Node& other) {
-    os << static_cast<int>(other.role_) << " ";
-    if (!other.IsRoleSet()) {
-      os << "Unk role node";
-    }
-    if (other.IsArg()) {
-      auto& arg = other.AsArg();
-      os << "Argument " << arg.name;
-    }
-    if (other.IsStmt()) {
-      auto& arg = other.AsStmt();
-      os << "Statement " << arg.op_type();
-    }
-    return os;
-  }
-
-  // Check roles.
-  bool IsRoleSet() const { return role_ != Role::kUnk; }
-  bool IsStmt() const { return role_ == Role::kStmt; }
-  bool IsArg() const { return role_ == Role::kArg; }
-
- private:
-  // Either stmt_ or argument_ is used.
-  std::unique_ptr<Stmt> stmt_;
-  std::unique_ptr<Arg> arg_;
-  Role role_{Role::kUnk};
-};
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
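A Node starts in Role::kUnk and is specialized exactly once, so graph builders call AsArg or AsStmt immediately after creating it. A small illustrative sketch of that usage follows; the op, kernels, and valid_places are assumptions standing in for values an SSAGraph under construction would supply.

// Sketch: specializing freshly created nodes (names are illustrative).
paddle::lite::mir::Node weight_node;
auto& arg = weight_node.AsArg("conv_weight", /*id=*/0);
arg.is_weight = true;  // lets weight-specific optimizations find it

paddle::lite::mir::Node op_node;
auto conv_op = paddle::lite::LiteOpRegistry::Global().Create("conv2d");
auto kernels = conv_op->CreateKernels(valid_places);  // valid_places assumed
op_node.AsStmt("conv2d", std::move(kernels), conv_op);

// Wire the argument into the statement; the macro pushes onto both link
// lists, as its definition later in this diff shows.
IR_NODE_LINK_TO(&weight_node, &op_node);
CHECK(weight_node.IsArg() && op_node.IsStmt());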
diff --git a/lite/core/mir/pass.cc b/lite/core/mir/pass.cc
deleted file mode 100644
index 2aaa5a4a17..0000000000
--- a/lite/core/mir/pass.cc
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/pass.h"
diff --git a/lite/core/mir/pass.h b/lite/core/mir/pass.h
deleted file mode 100644
index cd7684ae32..0000000000
--- a/lite/core/mir/pass.h
+++ /dev/null
@@ -1,88 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-#include <set>
-#include <string>
-
-#include "lite/core/mir/node.h"
-#include "lite/core/mir/ssa_graph.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-class Pass {
- public:
-  // Note: one pass should be exactly one of the following kinds.
-  enum class Kind {
-    // Will modify the program/graph topology.
-    kProgramWise = 0,
-    // Will modify the statement, with the graph topology fixed.
-    kStmtWise,
-    // Will not modify the IR, just collect information or visualization.
-    kDebug,
-  };
-
-  explicit Pass(Kind kind) : kind_(kind) {}
-
-  virtual void Apply(const std::unique_ptr<SSAGraph>& graph) = 0;
-
-  void set_name(const std::string& name) { name_ = name; }
-  const std::string& name() const { return name_; }
-
-  void set_doc(const std::string& doc) { doc_ = doc; }
-  const std::string& doc() const { return doc_; }
-
-  void set_targets(const std::set<TargetType>& targets) { targets_ = targets; }
-  const std::set<TargetType>& targets() const { return targets_; }
-  bool is_supported_target(TargetType target) const {
-    if (targets_.find(TARGET(kAny)) != targets_.end()) return true;
-    return (targets_.find(target) != targets_.end());
-  }
-
-  Kind kind() const { return kind_; }
-  bool is_debug_pass() const { return kind_ == Kind::kDebug; }
-  bool is_program_pass() const { return kind_ == Kind::kProgramWise; }
-  bool is_stmt_pass() const { return kind_ == Kind::kStmtWise; }
-
-  virtual ~Pass() = default;
-
- private:
-  const Kind kind_;
-  std::string name_;
-  std::string doc_;
-  std::set<TargetType> targets_;
-};
-
-// Different kinds.
-class ProgramPass : public Pass {
- public:
-  ProgramPass() : Pass(Kind::kProgramWise) {}
-};
-
-class StmtPass : public Pass {
- public:
-  StmtPass() : Pass(Kind::kStmtWise) {}
-};
-
-class DebugPass : public Pass {
- public:
-  DebugPass() : Pass(Kind::kDebug) {}
-};
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
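Concrete passes derive from one of the three kind-specific bases rather than from Pass directly, so the kind is fixed by construction. A minimal sketch of a statement-level pass follows; the class is hypothetical, and the shape mirrors the IoCopyKernelPickPass deleted earlier in this diff.

// Sketch: a StmtPass that only inspects statements, never the topology.
class CountConvPass : public paddle::lite::mir::StmtPass {
 public:
  void Apply(
      const std::unique_ptr<paddle::lite::mir::SSAGraph>& graph) override {
    int n = 0;
    for (auto& node : graph->mutable_nodes()) {
      if (node.IsStmt() && node.AsStmt().op_type() == "conv2d") ++n;
    }
    LOG(INFO) << "conv2d statements: " << n;
  }
};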
diff --git a/lite/core/mir/pass_manager.cc b/lite/core/mir/pass_manager.cc
deleted file mode 100644
index 17f81b3bdd..0000000000
--- a/lite/core/mir/pass_manager.cc
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/pass_manager.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/pass_manager.h b/lite/core/mir/pass_manager.h
deleted file mode 100644
index ca40f2deca..0000000000
--- a/lite/core/mir/pass_manager.h
+++ /dev/null
@@ -1,87 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <list>
-#include <map>
-#include <memory>
-#include <string>
-#include "lite/core/mir/pass.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-class PassManager {
- public:
-  static PassManager& Global() {
-    static PassManager x;
-    return x;
-  }
-
-  PassManager() {}
-
-  void Run(const std::unique_ptr<SSAGraph>& graph) {
-    for (auto& pass : passes_) {
-      LOG(INFO) << "Running MIR pass " << pass->name();
-      pass->Apply(graph);
-    }
-  }
-
-  bool AddNewPass(const std::string& name, Pass* pass) {
-    passes_.emplace_back(pass);
-    pass_map_.emplace(name, passes_.back().get());
-    passes_.back()->set_name(name);
-    return true;
-  }
-
-  // Clear all the passes.
-  void Clear() { passes_.clear(); }
-
-  std::list<std::unique_ptr<mir::Pass>>::iterator passes_begin() {
-    return passes_.begin();
-  }
-  std::list<std::unique_ptr<mir::Pass>>::iterator passes_end() {
-    return passes_.end();
-  }
-  std::list<std::unique_ptr<mir::Pass>>::const_iterator passes_const_begin()
-      const {
-    return passes_.begin();
-  }
-  std::list<std::unique_ptr<mir::Pass>>::const_iterator passes_const_end()
-      const {
-    return passes_.end();
-  }
-
-  Pass* LookUp(const std::string& key) {
-    auto it = pass_map_.find(key);
-    if (it != pass_map_.end()) return it->second;
-    return nullptr;
-  }
-
-  template <typename PassTy>
-  PassTy* LookUp(const std::string& key) {
-    auto it = pass_map_.find(key);
-    if (it != pass_map_.end()) return dynamic_cast<PassTy*>(it->second);
-    return nullptr;
-  }
-
- private:
-  std::list<std::unique_ptr<mir::Pass>> passes_;
-  std::map<std::string, mir::Pass*> pass_map_;
-};
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
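The singleton is typically consulted in two ways: running every registered pass over a graph, or fetching one pass by its registration name, optionally downcast to its concrete type. Both are sketched below against the API above; the pass name used is only an example, and graph is assumed to be a std::unique_ptr<SSAGraph>.

// Run the whole registered pipeline over a graph.
paddle::lite::mir::PassManager::Global().Run(graph);

// Or fetch one pass by name, with an optional checked downcast.
auto* base =
    paddle::lite::mir::PassManager::Global().LookUp("io_copy_kernel_pick_pass");
auto* typed = paddle::lite::mir::PassManager::Global()
                  .LookUp<paddle::lite::mir::StmtPass>("io_copy_kernel_pick_pass");
if (typed != nullptr) typed->Apply(graph);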
diff --git a/lite/core/mir/pass_manager_test.cc b/lite/core/mir/pass_manager_test.cc
deleted file mode 100644
index 05e11ed5d1..0000000000
--- a/lite/core/mir/pass_manager_test.cc
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/pass_manager.h"
-#include <gtest/gtest.h>
-#include "lite/core/mir/pass_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-TEST(PassManager, test) {
-  auto* pass = PassManager::Global().LookUp("demo");
-  LOG(INFO) << "pass: " << pass;
-  ASSERT_TRUE(pass != nullptr);
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-USE_MIR_PASS(demo);
diff --git a/lite/core/mir/pass_registry.cc b/lite/core/mir/pass_registry.cc
deleted file mode 100644
index e80db5d4ca..0000000000
--- a/lite/core/mir/pass_registry.cc
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/pass_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/pass_registry.h b/lite/core/mir/pass_registry.h
deleted file mode 100644
index cc5c119ecb..0000000000
--- a/lite/core/mir/pass_registry.h
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <set>
-#include <string>
-#include "lite/api/paddle_lite_factory_helper.h"
-#include "lite/api/paddle_place.h"
-#include "lite/core/mir/pass_manager.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-class PassRegistry {
- public:
-  PassRegistry(const std::string& name, mir::Pass* pass)
-      : name_(name), pass_(pass) {
-    PassManager::Global().AddNewPass(name_, pass_);
-  }
-  PassRegistry& SetTargets(const std::set<TargetType>& targets) {
-    pass_->set_targets(targets);
-    return *this;
-  }
-  bool Touch() const { return true; }
-
- private:
-  std::string name_;
-  mir::Pass* pass_;
-};
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-#define REGISTER_MIR_PASS(name__, class__)                                \
-  paddle::lite::mir::PassRegistry mir_pass_registry##name__(#name__,      \
-                                                            new class__); \
-  bool mir_pass_registry##name__##_fake() {                               \
-    return mir_pass_registry##name__.Touch();                             \
-  }                                                                       \
-  static paddle::lite::mir::PassRegistry mir_pass_registry_func_##name__  \
-      __attribute__((unused)) = mir_pass_registry##name__
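The macro expands to a file-local PassRegistry whose constructor registers the pass with the global PassManager; the Touch/fake-function pair exists so a binary can force the otherwise unreferenced registration object to be linked in. Usage on both sides of the link boundary looks roughly like this (MyDemoPass is hypothetical; USE_MIR_PASS is the companion macro from paddle_lite_factory_helper.h, used as USE_MIR_PASS(demo) in the test above):

// In the pass's .cc file: define and register it.
REGISTER_MIR_PASS(my_demo_pass, paddle::lite::mir::MyDemoPass)
    .SetTargets({TARGET(kAny)});

// In a binary that links the pass statically: pull the symbol in,
// otherwise the linker may drop the unreferenced registry object.
USE_MIR_PASS(my_demo_pass);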
diff --git a/lite/core/mir/pattern_matcher.cc b/lite/core/mir/pattern_matcher.cc
deleted file mode 100644
index 8ec85a4ef1..0000000000
--- a/lite/core/mir/pattern_matcher.cc
+++ /dev/null
@@ -1,528 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <array>
-#include <string>
-#include <vector>
-
-#include "lite/core/mir/dot.h"
-#include "lite/core/mir/pattern_matcher.h"
-#include "lite/core/op_lite.h"
-#include "lite/utils/string.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-size_t PMPattern::id_ = 0UL;
-
-PMNode &PMNode::operator>>(PMNode &right) {
-  pattern_->AddEdge(this, &right);
-  // automatically add out op link relation.
-  if (right.IsOp()) {
-    CHECK(!right.op_type_.empty());
-    this->assert_is_op_input(right.op_type_);
-  }
-
-  return right;
-}
-
-PMNode &PMNode::operator>>(std::vector<PMNode *> &nodes) {
-  for (auto *node : nodes) {
-    *this >> *node;
-  }
-  return *this;
-}
-
-PMNode &operator>>(std::vector<PMNode *> &others, PMNode &me) {
-  for (auto *o : others) {
-    *o >> me;
-  }
-  return me;
-}
-
-PMNode *PMPattern::NewNode(const std::string &name) {
-  if (!name.empty()) {
-    CHECK_EQ(node_map_.count(name), 0UL)
-        << "PMNode's name should be unique, got duplicate " << name;
-  }
-
-  nodes_.emplace_back(new PMNode(this, name));
-  auto *cur = nodes_.back().get();
-  node_map_[name] = cur;
-  return cur;
-}
-
-PMNode *PMPattern::NewNode(PMNode::teller_t &&teller, const std::string &name) {
-  if (!name.empty()) {
-    CHECK_EQ(node_map_.count(name), 0UL)
-        << "PMNode's name should be unique, got duplicate " << name;
-  }
-
-  nodes_.emplace_back(new PMNode(std::move(teller), this, name));
-  auto *cur = nodes_.back().get();
-  node_map_[name] = cur;
-  return cur;
-}
-
-PMNode *PMPattern::RetrieveNode(const std::string &id) const {
-  auto it = node_map_.find(id);
-  if (it == node_map_.end()) {
-    return nullptr;
-  }
-
-  return it->second;
-}
-
-void PMPattern::AddEdge(PMNode *a, PMNode *b) {
-  CHECK(a);
-  CHECK(b);
-  CHECK_NE(a, b) << "Can't connect to the same nodes.";
-  edges_.emplace_back(a, b);
-}
-
-void PatternMatcher::operator()(SSAGraph *graph,
-                                PatternMatcher::handle_t handler) {
-  if (!MarkPMNodesInGraph(graph)) {
-    return;
-  }
-
-  auto subgraphs = DetectPatterns();
-  UniquePatterns(&subgraphs);
-  RemoveOverlappedMatch(&subgraphs);
-  ValidateByNodeRole(&subgraphs);
-
-  if (subgraphs.empty()) return;
-  LOG(INFO) << "detected " << subgraphs.size() << " subgraphs";
-  int id = 0;
-  for (auto &g : subgraphs) {
-    VLOG(3) << "optimizing #" << id++ << " subgraph";
-    handler(g, graph);
-  }
-}
-
-bool PatternMatcher::MarkPMNodesInGraph(SSAGraph *graph) {
-  VLOG(3) << "mark pmnodes in graph";
-  if (graph->nodes().empty()) return false;
-  for (auto &node : graph->mutable_nodes()) {
-    for (const auto &pmnode : pattern_.nodes()) {
-      if (pmnode->Tell(&node)) {
-        pmnodes2nodes_[pmnode.get()].insert(&node);
-      }
-    }
-  }
-  // Check to early stop if some PMNode can't find a matched Node.
-  for (auto &pmnode : pattern_.nodes()) {
-    if (!pmnodes2nodes_.count(pmnode.get())) {
-      VLOG(4) << pmnode->name() << " can't find matched Node, early stop";
-      // return false;
-    }
-  }
-  VLOG(3) << pmnodes2nodes_.size() << " nodes marked";
-
-  return !pmnodes2nodes_.empty();
-}
-
-// The intermediate Nodes can only link to the nodes inside the pattern, or
-// this subgraph will be dropped.
-void PatternMatcher::ValidateByNodeRole(
-    std::vector<PatternMatcher::subgraph_t> *subgraphs) {
-  std::vector<PatternMatcher::subgraph_t> result;
-
-  subgraphs->erase(
-      std::remove_if(subgraphs->begin(),
-                     subgraphs->end(),
-                     [](const PatternMatcher::subgraph_t &subgraph) -> bool {
-                       // Collect the inlinks and outlinks.
-                       std::unordered_set<Node *> ios;
-                       for (auto &item : subgraph) {
-                         ios.insert(item.second);
-                       }
-                       for (auto &item : subgraph) {
-                         if (item.first->IsIntermediate()) {
-                           for (auto *x : item.second->inlinks) {
-                             if (!ios.count(x)) {
-                               return true;
-                             }
-                           }
-                           for (auto *x : item.second->outlinks) {
-                             if (!ios.count(x)) {
-                               return true;
-                             }
-                           }
-                         }
-                       }
-                       return false;
-                     }),
-      subgraphs->end());
-}
-
-struct HitGroup {
-  std::unordered_map<PMNode *, Node *> roles;
-
-  bool Match(Node *node, PMNode *pat) {
-    if (nodes_.count(node)) {
-      if (roles.count(pat) && roles[pat] == node) return true;
-      return false;
-    } else {
-      if (roles.count(pat) && roles[pat] != node) return false;
-      return true;
-    }
-  }
-
-  void Register(Node *node, PMNode *pat) {
-    roles[pat] = node;
-    nodes_.insert(node);
-  }
-
- private:
-  std::unordered_set<Node *> nodes_;
-};
-
-// Tell whether Node a links to b.
-bool IsNodesLink(Node *a, Node *b) {
-  for (auto *node : a->outlinks) {
-    if (b == node) {
-      return true;
-    }
-  }
-  return false;
-}
-
-std::vector<PatternMatcher::subgraph_t> PatternMatcher::DetectPatterns() {
-  // Init empty subgraphs.
-  std::vector<PatternMatcher::subgraph_t> result;
-  std::vector<HitGroup> init_groups;
-  std::array<std::vector<HitGroup>, 2> bi_records;
-  auto *first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get()
-                                               : pattern_.edges().front().first;
-  if (!pmnodes2nodes_.count(first_pnode)) return result;
-  for (auto *node : pmnodes2nodes_[first_pnode]) {
-    HitGroup group;
-    group.roles[first_pnode] = node;
-    init_groups.emplace_back(group);
-  }
-
-  int step = 0;
-  bi_records[0] = std::move(init_groups);
-
-  // Extend a PMNode to subgraphs by deducing the connection relations defined
-  // in edges of PMNodes.
-  for (const auto &edge : pattern_.edges()) {
-    VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name();
-    // TODO(Superjomn) Fix bug here, the groups might be duplicate here.
-    // Each role has two PMNodes, which indicates two roles.
-    // Detect two Nodes that can match these two roles and they are connected.
-    auto &pre_groups = bi_records[step % 2];
-    auto &cur_groups = bi_records[1 - (step++ % 2)];
-    cur_groups.clear();
-    if (pre_groups.empty()) break;
-    // source -> target
-    for (Node *source : pmnodes2nodes_[edge.first]) {
-      for (Node *target : pmnodes2nodes_[edge.second]) {
-        // TODO(Superjomn) add some prune strategies.
-        for (const auto &group : pre_groups) {
-          if (IsNodesLink(source, target)) {
-            HitGroup new_group = group;
-            bool flag = new_group.Match(source, edge.first) &&
-                        new_group.Match(target, edge.second);
-            if (flag) {
-              new_group.Register(source, edge.first);
-              new_group.Register(target, edge.second);
-              cur_groups.push_back(new_group);
-              // TODO(Superjomn) need to unique
-            }
-          }
-        }
-      }
-    }
-    VLOG(3) << "step " << step << " get records: " << cur_groups.size();
-  }
-
-  for (auto &group : bi_records[step % 2]) {
-    PatternMatcher::subgraph_t subgraph;
-    for (auto &role : group.roles) {
-      subgraph.emplace(role.first, role.second);
-    }
-    result.emplace_back(subgraph);
-  }
-  return result;
-}
-
-struct GraphItemLessThan {
-  bool operator()(const std::pair<PMNode *, Node *> &a,
-                  const std::pair<PMNode *, Node *> &b) {
-    if (a.first != b.first) {
-      return a.first < b.first;
-    } else {
-      return a.second < b.second;
-    }
-  }
-};
-
-// TODO(Superjomn) enhance the function as it marks unique subgraphs as
-// duplicates, see https://github.com/PaddlePaddle/Paddle/issues/13550
-void PatternMatcher::UniquePatterns(
-    std::vector<PatternMatcher::subgraph_t> *subgraphs) {
-  if (subgraphs->empty()) return;
-  std::vector<PatternMatcher::subgraph_t> result;
-
-  std::unordered_set<size_t> set;
-  std::hash<std::string> hasher;
-  for (auto &g : *subgraphs) {
-    // Sort the items in the sub-graph, and transform to a string key.
-    std::vector<std::pair<PMNode *, Node *>> sorted_keys(g.begin(), g.end());
-    std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan());
-    STL::stringstream ss;
-    for (auto &item : sorted_keys) {
-      ss << reinterpret_cast<size_t>(item.first) << ":"
-         << reinterpret_cast<size_t>(item.second);
-    }
-    auto key = hasher(ss.str());
-    if (!set.count(key)) {
-      result.emplace_back(g);
-      set.insert(key);
-    }
-  }
-  *subgraphs = result;
-}
-
-void PatternMatcher::RemoveOverlappedMatch(
-    std::vector<subgraph_t> *subgraphs) {
-  std::vector<subgraph_t> result;
-  std::unordered_set<Node *> node_set;
-
-  for (const auto &subgraph : *subgraphs) {
-    bool valid = true;
-    for (auto &item : subgraph) {
-      if (item.first->IsIntermediate() && node_set.count(item.second)) {
-        valid = false;
-        break;
-      }
-    }
-    if (valid) {
-      for (auto &item : subgraph) {
-        node_set.insert(item.second);
-      }
-      result.push_back(subgraph);
-    }
-  }
-  *subgraphs = result;
-}
-
-std::string PMPattern::DotString() const {
-  using inference::analysis::Dot;
-  Dot dot;
-  int id = 0;
-  // Create Nodes
-  std::unordered_map<PMNode *, std::string> node2dot;
-  for (const auto &node : nodes()) {
-    std::string node_id = string_format("Node%d", id++);
-    dot.AddNode(node_id, {}, node->name());
-    node2dot[node.get()] = node_id;
-  }
-  // Create Edges
-  for (const auto &edge : edges()) {
-    if (!node2dot.count(edge.first) || !node2dot.count(edge.second)) {
-      continue;
-    }
-    auto &src = node2dot.at(edge.first);
-    auto &trg = node2dot.at(edge.second);
-    dot.AddEdge(src, trg, {});
-  }
-  return dot.Build();
-}
-
-PMNode &PMNode::LinksTo(const std::vector<PMNode *> &others) {
-  // extend outlinks.
-  for (PMNode *x : others) {
-    pattern_->AddEdge(this, x);
-  }
-  return *this;
-}
-
-PMNode &PMNode::LinksFrom(const std::vector<PMNode *> &others) {
-  // extend inlinks.
-  for (PMNode *x : others) {
-    pattern_->AddEdge(x, this);
-  }
-  return *this;
-}
-
-PMNode *PMNode::assert_is_op() {
-  asserts_.emplace_back([](const Node *x) { return x && x->IsStmt(); });
-  return this;
-}
-
-PMNode *PMNode::assert_is_op(const std::string &op_type) {
-  asserts_.emplace_back([op_type](const Node *x) {
-    if (x && x->IsStmt()) {
-      auto *op_info = x->stmt()->op_info();
-      return op_info->Type() == op_type;
-    } else {
-      return false;
-    }
-  });
-  return this;
-}
-
-PMNode *PMNode::assert_is_var() {
-  asserts_.emplace_back([](const Node *x) { return x && x->IsArg(); });
-  return this;
-}
-
-PMNode *PMNode::assert_var_not_persistable() {
-  assert_is_var();
-  asserts_.emplace_back([](const Node *x) { return !x->arg()->is_weight; });
-  return this;
-}
-
-PMNode *PMNode::assert_is_persistable_var() {
-  assert_is_var();
-  asserts_.emplace_back([=](const Node *x) { return x->arg()->is_weight; });
-  return this;
-}
-
-PMNode *PMNode::assert_is_op_output(const std::string &op_type) {
-  assert_is_var();
-  asserts_.emplace_back([=](const Node *x) {
-    for (auto *op : x->inlinks) {
-      if (op && op->IsStmt()) {
-        auto *op_info = op->stmt()->op_info();
-        if (op_info->Type() == op_type) return true;
-      }
-    }
-    return false;
-  });
-  return this;
-}
-
-bool IsNthOutput(const Node *var,
-                 const Node *op,
-                 const std::string &argument,
-                 size_t nth) {
-  CHECK(var->IsArg());
-  CHECK(op->IsStmt());
-  auto op_info = op->stmt()->op_info();
-  if (op_info->Output(argument).size() <= nth) return false;
-  return var->arg()->name == op_info->Output(argument)[nth];
-}
-
-bool IsNthInput(const Node *var,
-                const Node *op,
-                const std::string &argument,
-                size_t nth) {
-  CHECK(var->IsArg());
-  CHECK(op->IsStmt());
-  auto op_info = op->stmt()->op_info();
-  if (op_info->Input(argument).size() <= nth) return false;
-  return var->arg()->name == op_info->Input(argument)[nth];
-}
-
-PMNode *PMNode::assert_is_op_input(const std::string &op_type,
-                                   const std::string &argument) {
-  assert_is_var();
-  assert_is_op_nth_input(op_type, argument, 0);
-  return this;
-}
-
-PMNode *PMNode::assert_is_op_nth_input(const std::string &op_type,
-                                       const std::string &argument,
-                                       int nth) {
-  assert_is_var();
-  assert_is_op_input(op_type);
-  asserts_.emplace_back([=](const Node *x) {
-    for (auto *op : x->outlinks) {
-      if (op && op->IsStmt() && op->stmt()->op_info()->Type() == op_type &&
-          IsNthInput(x, op, argument, nth))
-        return true;
-    }
-    return false;
-  });
-  return this;
-}
-
-PMNode *PMNode::assert_is_op_output(const std::string &op_type,
-                                    const std::string &argument) {
-  assert_is_var();
-  assert_is_op_nth_output(op_type, argument, 0);
-  return this;
-}
-
-PMNode *PMNode::assert_is_op_nth_output(const std::string &op_type,
-                                        const std::string &argument,
-                                        int nth) {
-  assert_is_var();
-  asserts_.emplace_back([=](const Node *x) {
-    for (auto *op : x->inlinks) {
-      if (op && op->IsStmt() && op->stmt()->op_info()->Type() == op_type &&
-          IsNthOutput(x, op, argument, nth))
-        return true;
-    }
-    return false;
-  });
-  return this;
-}
-
-PMNode *PMNode::assert_is_op_input(const std::string &op_type) {
-  assert_is_var();
-  asserts_.emplace_back([=](const Node *x) {
-    for (auto *op : x->outlinks) {
-      if (op && op->IsStmt()) {
-        auto *op_info = op->stmt()->op_info();
-        if (op_info->Type() == op_type) {
-          return true;
-        }
-      }
-    }
-    return false;
-  });
-  return this;
-}
-
-bool HasInput(const Node &op, const std::string &argument) {
-  CHECK(op.IsStmt());
-  auto const &names = op.stmt()->op_info()->input_argnames();
-  if (std::find(names.begin(), names.end(), argument) == names.end())
-    return false;
-  return true;
-}
-
-void GraphSafeRemoveNodes(SSAGraph *graph,
-                          const std::unordered_set<const Node *> &nodes) {
-  for (auto *node : nodes) {
-    graph->RemoveNode(node);
-  }
-
-  for (auto &node : graph->mutable_nodes()) {
-    for (auto it = node.inlinks.begin(); it != node.inlinks.end();) {
-      if (nodes.count(*it)) {
-        it = node.inlinks.erase(it);
-      } else {
-        it++;
-      }
-    }
-    for (auto it = node.outlinks.begin(); it != node.outlinks.end();) {
-      if (nodes.count(*it)) {
-        it = node.outlinks.erase(it);
-      } else {
-        it++;
-      }
-    }
-  }
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
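The header that follows documents the PMNode/PMPattern vocabulary. As a concrete illustration of the matcher implemented above, the mul + elementwise_add example from that header's own comment block could be spelled out against the public API roughly as below; this is a sketch, not code from this patch, and graph is assumed to be a std::unique_ptr<SSAGraph>.

// Sketch: detecting mul -> elementwise_add with the matcher API.
paddle::lite::mir::PatternMatcher matcher;
auto* pattern = matcher.mutable_pattern();

auto* mul = pattern->NewNode("mul")->assert_is_op("mul");
auto* mul_out = pattern->NewNode("mul_out")
                    ->assert_is_op_output("mul")
                    ->assert_is_op_input("elementwise_add")
                    ->AsIntermediate();
auto* add = pattern->NewNode("add")->assert_is_op("elementwise_add");

mul->LinksTo({mul_out});
mul_out->LinksTo({add});

matcher(graph.get(),
        [](const paddle::lite::mir::PatternMatcher::subgraph_t& subgraph,
           paddle::lite::mir::SSAGraph* g) {
          // Called once per matched subgraph; rewrite or inspect it here.
        });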
- PMNode* AsInput() { - role_ = Role::kInput; - return this; - } - // Mark this node is an Output of a subgraph and will be retained. - PMNode* AsOutput() { - role_ = Role::kOutput; - return this; - } - // Mark this node will be removed, so all the links should be inside a matched - // sub-graph. - PMNode* AsIntermediate() { - role_ = Role::kIntermediate; - return this; - } - - PMNode* AsVar() { - type_ = Type::kVar; - assert_is_var(); - return this; - } - - PMNode* AsOp(const std::string& op_type) { - type_ = Type::kOp; - assert_is_op(op_type); - return this; - } - - void set_op_type(const std::string& op_type) { op_type_ = op_type; } - - bool IsIntermediate() const { return role_ == Role::kIntermediate; } - bool IsInput() const { return role_ == Role::kInput; } - bool IsOutput() const { return role_ == Role::kOutput; } - - // Assertions, helper functions to simplify the pattern definition. - PMNode* assert_is_op(); - PMNode* assert_is_op(const std::string& op_type); - PMNode* assert_is_var(); - PMNode* assert_var_not_persistable(); - PMNode* assert_is_persistable_var(); - PMNode* assert_is_op_output(const std::string& op_type); - PMNode* assert_is_op_input(const std::string& op_type); - PMNode* assert_is_op_input(const std::string& op_type, - const std::string& argument); - PMNode* assert_is_op_output(const std::string& op_type, - const std::string& argument); - - PMNode* assert_is_op_nth_input(const std::string& op_type, - const std::string& argument, - int nth); - PMNode* assert_is_op_nth_output(const std::string& op_type, - const std::string& argument, - int nth); - - template - PMNode* assert_op_attr_satisfied( - const std::string& attr_name, - const std::function& condition) { - asserts_.push_back([=](const Node* x) { - if (x && x->IsStmt()) { - auto* op_info = x->stmt()->op_info(); - return op_info->HasAttr(attr_name) && - condition(op_info->GetAttr(attr_name)); - } - return false; - }); - return this; - } - - template - PMNode* assert_op_attr(const std::string& attr_name, const T& attr) { - return assert_op_attr_satisfied( - attr_name, [=](const T& src) { return src == attr; }); - } - - private: - PMNode(PMPattern* pattern, - const std::string& name = "", - Type type = Type::kVar) - : pattern_(pattern), name_(name), type_(type) {} - PMNode(teller_t&& teller, - PMPattern* pattern, - const std::string& name = "", - Type type = Type::kVar) - : teller_(std::move(teller)), - pattern_(pattern), - name_(name), - type_(type) { - CHECK(teller_ != nullptr) << "invalid teller functer is set."; - } - - PMNode(PMNode&& other) = default; - - friend class PMPattern; - - // Will removed latter. - teller_t teller_; - std::vector asserts_; - PMPattern* pattern_; - std::string name_; - std::string op_type_; - Type type_; - Role role_{Role::kUnknown}; -}; - -/* - * A pattern in a graph, which defined with PMNode and edges. Most graph - * patterns can be divided into PMNodes and link relations between them. - * - * For example, the FC fusion need to filter the MUL and ELEMENTWISE_ADD - * operators from the computation graph, the MUL's output should have only one - * consumer which is the ELEMENTWISE_ADD. - * This pattern can be defined as with the following pseudo codes - * - * // Create two operator PMNodes. - * MUL = PMPattern.NewNode().assert_is_op("mul"); - * ELE = PMPattern.NewNode().assert_is_op("elementwise_add"); - * // Create the variable PMNodes. 
- * MUL_out = PMPattern.NewNode().assert_is_op_output("mul") \ - * .assert_is_op_input("elementwise_add") \ - * .AsIntermediate(); - * // Add relations. - * MUL->LinksTo({MUL_out}); - * MUL_out->LinksTo({ELE}); - * - * One can add more specific asserts for PMNodes or edges, both the Operator - * and Variable Nodes can be ruled in PMNode.assert_more(...). - * - * PMPattern can record the general patterns, such as the pattern represents - * - Op in CPU -> Op in GPU -> Op in CPU, to findout the IO abnormal place. - * - Ops whose inputs and outputs share the same variables - */ -class PMPattern { - public: - using edge_t = std::pair; - - void AddEdge(PMNode* a, PMNode* b); - - PMNode* NewNode(PMNode::teller_t&& teller, const std::string& name = NewID()); - PMNode* NewNode(const std::string& name = NewID()); - PMNode* NewNode(const std::string& prefix, const std::string& name) { - return NewNode(prefix + "/" + name); - } - PMNode* RetrieveNode(const std::string& id) const; - - const std::vector>& nodes() const { return nodes_; } - const std::vector& edges() const { return edges_; } - - std::string DotString() const; - - private: -#ifdef PADDLE_WITH_TESTING - FRIEND_TEST(PMPattern, AddEdge); - FRIEND_TEST(PMPattern, NewNode); -#endif - - static std::string NewID() { return string_format("pmnode-%d", id_++); } - - std::vector> nodes_; - std::vector edges_; - std::unordered_map node_map_; - static size_t id_; -}; - -/* - * PatternMatcher helps to detect the specific patterns in the graph. - * Input a pattern, output a list of the matched subgraphs/nodes. - * This helper can be used to support fuse(conv+batchnorm => batchnorm e.g.). - * - * The algorithm has three phases: - * 1. Mark the nodes that match the defined PMNodes in a PMPattern, - * 2. Extend a PMNode to subgraphs by deducing the connection relation defined - * in PAPattern(the edges), - * 3. Get the filtered subgraphs and treat them with a pre-defined handler. - * - * Usage: - * // Create a matcher - * PatternMatcher matcher; - * // Define the matcher's pattern, by adding PMNode and define the edges. - * auto* node0 = matcher.mutable_pattern().AddNode(...) - * auto* node1 = matcher.mutable_pattern().AddNode(...) - * node0->teller = some lambda. - * node1->teller = some lambda. - * matcher.mutable_pattern().AddEdge(node0, node1); - * // Create an handler, to define the behavior of treating the filtered - * // subgraphs that comply with the patterns. - * PatternMatcher::handle_t handler = some labmda - * // Execute the matcher. - * matcher(&graph, handler); - */ -class PatternMatcher { - public: - using subgraph_t = std::unordered_map; - - // Operate on the detected pattern. - using handle_t = - std::function; - - void operator()(SSAGraph* graph, handle_t handler); - - const PMPattern& pattern() const { return pattern_; } - PMPattern* mutable_pattern() { return &pattern_; } - - private: - // Mark the nodes that fits the pattern. - bool MarkPMNodesInGraph(SSAGraph* graph); - - // Detect all the pattern and output the hit records. - std::vector DetectPatterns(); - - // Remove duplicate patterns. - void UniquePatterns(std::vector* subgraphs); - - // Remove overlapped match subgraphs, when overlapped, keep the previous one. - // The intermediate PMNodes will be removed, so can't shared by multiple - // patterns. - void RemoveOverlappedMatch(std::vector* subgraphs); - - // Validate whether the intermediate nodes are linked by external nodes. 
-
-// Check whether a var node is an op node's nth input.
-bool IsNthInput(const Node& var,
-                const Node& op,
-                const std::string& argument,
-                int nth);
-
-// Check whether the op node has an input with the given name.
-bool HasInput(const Node& op, const std::string& argument);
-
-// Safely remove some nodes from the graph; the edges are cleaned up
-// automatically.
-void GraphSafeRemoveNodes(SSAGraph* graph,
-                          const std::unordered_set<const Node*>& nodes);
-
-// Some pre-defined patterns that can be reused in multiple passes.
-// The related Fluid Layer or Op should be one pattern here for better reuse
-// across different fusions.
-namespace patterns {
-
-struct KeyCounter {
-  static KeyCounter& Instance() {
-    static KeyCounter x;
-    return x;
-  }
-
-  int IncCounter(const std::string& key) { return dic_[key]++; }
-
- private:
-  std::unordered_map<std::string, int> dic_;
-};
-
-// Generate a unique PMNode name with name_scope and id.
-// The format is {name_scope}/{repr}/{id}/{name}.
-static std::string PMNodeName(const std::string& name_scope,
-                              const std::string& repr,
-                              size_t id,
-                              const std::string& name) {
-  STL::stringstream ss;
-  ss << name_scope << "/" << repr << "/" << id << "/" << name;
-  return ss.str();
-}
-// Generate a unique PMNode name.
-// The format is {name_scope}/{repr}/{id}.
-static std::string PMNodeName(const std::string& name_scope,
-                              const std::string& repr) {
-  STL::stringstream ss;
-  ss << name_scope << "/" << repr << "/"
-     << KeyCounter::Instance().IncCounter(repr);
-  return ss.str();
-}
-// Generate a unique key. It can be used as a universally unique temporary
-// name.
-// The format is {repr}/{id}.
-static std::string UniqueKey(const std::string& repr) {
-  STL::stringstream ss;
-  ss << repr << "/" << KeyCounter::Instance().IncCounter(repr);
-  return ss.str();
-}
-
-// Declare a PMNode in a pattern; this creates two methods:
-//   std::string xxx_repr();  returns this PMNode's string id.
-//   PMNode* xxx_n();         returns the corresponding PMNode.
-#define PATTERN_DECL_NODE(name__)                                             \
-  std::string name__##_repr() const {                                        \
-    return PMNodeName(name_scope_, repr_, id_, #name__);                     \
-  }                                                                          \
-  PMNode* name__##_n() const { return pattern->RetrieveNode(name__##_repr()); }
-
-// Get a mir::Node* from the matched subgraph.
-// var: variable.
-// arg: the argument declared by PATTERN_DECL_NODE in a pattern definition.
-// pat: the pattern object.
-#define GET_IR_NODE_FROM_SUBGRAPH(var, arg, pat)                              \
-  CHECK(subgraph.count(pat.arg##_n()))                                       \
-      << "Node not found for PMNode " << pat.arg##_repr();                   \
-  Node* var = subgraph.at(pat.arg##_n());                                    \
-  CHECK(var) << "node " << #arg << " does not exist in the sub-graph"
-
-// The base class of all the patterns.
-struct PatternBase {
-  PatternBase(PMPattern* pattern,
-              const std::string& name_scope,
-              const std::string& repr)
-      : pattern(pattern),
-        name_scope_(name_scope),
-        repr_(repr),
-        id_(KeyCounter::Instance().IncCounter(repr)) {}
-
-  PMPattern* pattern;
-
- protected:
-  std::string name_scope_;
-  std::string repr_;
-  size_t id_;
-};
-
-}  // namespace patterns
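[Editor's note] To make the macro pair concrete, a hedged sketch of a reusable pattern struct; FcPattern and its members are hypothetical, only PatternBase and the macros above are assumed:

    struct FcPattern : public PatternBase {
      FcPattern(PMPattern* pattern, const std::string& name_scope)
          : PatternBase(pattern, name_scope, "fc") {}

      // Each line expands to mul_repr()/mul_n() etc., so the PMNode can be
      // retrieved later by its generated unique name.
      PATTERN_DECL_NODE(mul);
      PATTERN_DECL_NODE(mul_out);
    };

    // Inside a matcher handler with a `subgraph` in scope,
    //   GET_IR_NODE_FROM_SUBGRAPH(mul_node, mul, fc_pattern);
    // binds mul_node to the mir::Node* matched for fc_pattern.mul_n().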
-
-// Link two mir::Nodes to each other.
-#define IR_NODE_LINK_TO(a, b) \
-  a->outlinks.push_back(b);   \
-  b->inlinks.push_back(a);
-
-// Set the out_var as the output of the op.
-#define IR_OP_VAR_LINK(op, out_var) \
-  op->outlinks.push_back(out_var);  \
-  out_var->inlinks.clear();         \
-  out_var->inlinks.push_back(op);
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/pattern_matcher_high_api.cc b/lite/core/mir/pattern_matcher_high_api.cc
deleted file mode 100644
index 620f4ebbea..0000000000
--- a/lite/core/mir/pattern_matcher_high_api.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/pattern_matcher_high_api.h"
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-void FuseBase::PerformPatternMatcher(SSAGraph *graph) {
-  VLOG(4) << "\n" << matcher_.pattern().DotString();
-  // Get the subgraphs and record the mir::Node pointers for each PMNode.
-  auto handler = [&](const PatternMatcher::subgraph_t &subgraph, SSAGraph *g) {
-    // Get all the registered nodes.
-    key2nodes_.emplace_back();
-    for (auto &item : nodes_) {
-      key2nodes_.back()[item.first] = subgraph.at(item.second);
-    }
-  };
-
-  matcher_(graph, handler);
-}
-
-void FuseBase::DeleteInterNodes(SSAGraph *graph) {
-  std::set<std::string> keys;
-  for (auto &node : nodes_) {
-    if (node.second->IsIntermediate()) {
-      keys.insert(node.first);
-    }
-  }
-
-  VLOG(4) << "keys: " << key2nodes_.size();
-  std::unordered_set<const Node *> nodes2rm;
-  for (auto &matched : key2nodes_) {
-    for (const auto &key : keys) {
-      nodes2rm.insert(matched.at(key));
-    }
-  }
-
-  VLOG(3) << "clean nodes " << nodes2rm.size();
-  GraphSafeRemoveNodes(graph, nodes2rm);
-}
-
-PMNode *FuseBase::GetOrCreateNode(const std::string &key) {
-  auto it = nodes_.find(key);
-  if (it != nodes_.end()) {
-    return it->second;
-  }
-  nodes_.emplace(key,
-                 matcher_.mutable_pattern()->NewNode(patterns::UniqueKey(key)));
-  it = nodes_.find(key);
-  return it->second;
-}
-
-PMNode *FuseBase::OpNode(const std::string &key, const std::string &op_type) {
-  GetOrCreateNode(key)->set_op_type(op_type);
-  GetOrCreateNode(key)->AsOp(op_type);
-  return GetOrCreateNode(key);
-}
-
-PMNode *FuseBase::VarNode(const std::string &key) {
-  GetOrCreateNode(key)->AsVar();
-  return GetOrCreateNode(key);
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
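[Editor's note] The implementation above fixes the FuseBase workflow: BuildPattern declares the nodes, PerformPatternMatcher records one key2nodes_t per match, InsertNewNode rewrites the graph, and DeleteInterNodes drops the intermediates. A minimal hedged skeleton of a subclass; the class and op names are placeholders, and LinksFrom/LinksTo are the PMNode link helpers used in the tests further below:

    class ReluFuserSketch : public FuseBase {
     public:
      void BuildPattern() override {
        auto* x = VarNode("x")->assert_is_op_input("relu", "X");
        auto* relu = OpNode("relu", "relu")->AsIntermediate();
        auto* out = VarNode("out")->assert_is_op_output("relu", "Out");
        relu->LinksFrom({x}).LinksTo({out});
      }

     protected:
      void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
        // Create the replacement op from `matched` and relink it, as the
        // FcFuser demo below does with IR_NODE_LINK_TO.
      }
    };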
diff --git a/lite/core/mir/pattern_matcher_high_api.h b/lite/core/mir/pattern_matcher_high_api.h
deleted file mode 100644
index e62a4fc749..0000000000
--- a/lite/core/mir/pattern_matcher_high_api.h
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-#include "lite/core/mir/node.h"
-#include "lite/core/mir/pattern_matcher.h"
-#include "lite/core/mir/ssa_graph.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-class FuseBase {
- public:
-  using key2nodes_t = std::map<std::string, Node*>;
-
-  virtual ~FuseBase() = default;
-
-  void operator()(SSAGraph* graph) {
-    BuildPattern();
-    PerformPatternMatcher(graph);
-
-    for (const auto& matched : key2nodes_) {
-      InsertNewNode(graph, matched);
-    }
-
-    DeleteInterNodes(graph);
-  }
-
-  // Build a PMPattern using PMNodes.
-  virtual void BuildPattern() = 0;
-
-  // Generate an operator desc from a matched subgraph.
-  virtual cpp::OpDesc GenOpDesc(const key2nodes_t& matched) {
-    return cpp::OpDesc();
-  }
-
-  PMNode* OpNode(const std::string& key) {
-    return GetOrCreateNode(key)->assert_is_op();
-  }
-
-  PMNode* OpNode(const std::string& key, const std::string& op_type);
-
-  PMNode* VarNode(const std::string& key);
-
- protected:
-  virtual void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) = 0;
-
- private:
-  void PerformPatternMatcher(SSAGraph* graph);
-
-  // Delete the nodes that are marked as Intermediate.
-  void DeleteInterNodes(SSAGraph* graph);
-
-  PMNode* GetOrCreateNode(const std::string& key);
-
- protected:
-  PatternMatcher matcher_;
-  std::map<std::string, PMNode*> nodes_;
-  std::vector<key2nodes_t> key2nodes_;
-};
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/pattern_matcher_high_api_test.cc b/lite/core/mir/pattern_matcher_high_api_test.cc
deleted file mode 100644
index 61914c5a0b..0000000000
--- a/lite/core/mir/pattern_matcher_high_api_test.cc
+++ /dev/null
@@ -1,150 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/pattern_matcher_high_api.h"
-#include <gtest/gtest.h>
-#include <memory>
-#include "lite/core/mir/graph_visualize_pass.h"
-#include "lite/core/program.h"
-#include "lite/core/tensor.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-// A demo.
-class FcFuser : public FuseBase {
- public:
-  void BuildPattern() override {
-    // create nodes.
-    auto* x = VarNode("x")->assert_is_op_input("mul", "X");
-    auto* W = VarNode("W")->assert_is_op_input("mul", "Y");
-    auto* b = VarNode("b");
-    auto* mul = OpNode("mul", "mul");
-    auto* mul_out = VarNode("mul_out");
-    auto* add = OpNode("add", "elementwise_add");
-    auto* Out = VarNode("Out");
-
-    // create topology.
- std::vector mul_inputs{W, x}; - std::vector add_inputs{mul_out, b}; - mul_inputs >> *mul >> *mul_out; - add_inputs >> *add >> *Out; - - // Some op specialities. - mul_out->AsIntermediate(); - mul->AsIntermediate(); - add->AsIntermediate(); - } - - void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { - auto op_desc = GenOpDesc(matched); - auto fc_op = LiteOpRegistry::Global().Create("fc"); - auto mul = matched.at("mul")->stmt()->op(); - auto* scope = mul->scope(); - auto& valid_places = mul->valid_places(); - fc_op->Attach(op_desc, scope); - - auto* new_op_node = graph->GraphCreateInstructNode(fc_op, valid_places); - - IR_NODE_LINK_TO(matched.at("W"), new_op_node); - IR_NODE_LINK_TO(matched.at("x"), new_op_node); - IR_NODE_LINK_TO(matched.at("b"), new_op_node); - IR_NODE_LINK_TO(new_op_node, matched.at("Out")); - } - - private: - cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override { - cpp::OpDesc op_desc; - op_desc.SetType("fc"); - op_desc.SetInput("Input", {matched.at("x")->arg()->name}); - op_desc.SetInput("W", {matched.at("W")->arg()->name}); - op_desc.SetInput("Bias", {matched.at("b")->arg()->name}); - op_desc.SetOutput("Out", {matched.at("Out")->arg()->name}); - op_desc.SetAttr("in_num_col_dims", 1); - return op_desc; - } -}; - -std::unique_ptr BuildGraph(framework::ProgramDesc* program_desc, - const std::shared_ptr& scope, - const std::vector& valid_places) { - auto* main_block = program_desc->MutableBlock(0); - auto* mul = main_block->AppendOp(); - auto* add = main_block->AppendOp(); - main_block->Var("x"); - main_block->Var("b"); - main_block->Var("mul_out"); - main_block->Var("w"); - main_block->Var("out"); - - scope->Var("x")->GetMutable(); - scope->Var("b")->GetMutable(); - scope->Var("mul_out")->GetMutable(); - scope->Var("w")->GetMutable(); - scope->Var("out")->GetMutable(); - - mul->SetInput("X", {"x"}); - mul->SetInput("Y", {"w"}); - mul->SetOutput("Out", {"mul_out"}); - mul->SetType("mul"); - mul->SetAttr("x_num_col_dims", 1); - mul->SetAttr("y_num_col_dims", 1); - - add->SetInput("X", {"mul_out"}); - add->SetInput("Y", {"b"}); - add->SetOutput("Out", {"out"}); - add->SetType("elementwise_add"); - add->SetAttr("axis", 1); - - program_desc->Flush(); - - lite::Program program(*program_desc->Proto(), scope, valid_places); - auto graph = std::unique_ptr(new SSAGraph()); - graph->Build(program, valid_places); - - return graph; -} - -TEST(pattern_matcher_high_api, graph_test) { - framework::ProgramDesc program_desc; - std::vector places{{TARGET(kHost), PRECISION(kFloat)}}; - auto scope = std::make_shared(); - auto graph = BuildGraph(&program_desc, scope, places); - - ASSERT_EQ(graph->nodes().size(), 7UL /*real nodes*/); - Visualize(graph.get()); -} - -TEST(pattern_matcher_high_api, fuse_test) { - framework::ProgramDesc program_desc; - std::vector places{{TARGET(kHost), PRECISION(kFloat)}}; - auto scope = std::make_shared(); - auto graph = BuildGraph(&program_desc, scope, places); - const int num_nodes = graph->nodes().size(); - FcFuser fuser; - fuser(graph.get()); - ASSERT_EQ(graph->nodes().size(), - num_nodes - 3UL /*nodes removed */ + 1UL /* fused fc node*/); - Visualize(graph.get()); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -USE_LITE_OP(fc); -USE_LITE_OP(mul); -USE_LITE_OP(elementwise_add); diff --git a/lite/core/mir/pattern_matcher_test.cc b/lite/core/mir/pattern_matcher_test.cc deleted file mode 100644 index 728681a459..0000000000 --- a/lite/core/mir/pattern_matcher_test.cc +++ /dev/null @@ -1,233 +0,0 @@ -// 
Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/pattern_matcher.h" - -#include - -namespace paddle { -namespace lite { -namespace mir { - -void BuildGraph(SSAGraph* g) { - g->mutable_nodes().emplace_back(); - Node& o1 = g->mutable_nodes().back(); - o1.AsStmt().desc = "op1"; - g->mutable_nodes().emplace_back(); - Node& o2 = g->mutable_nodes().back(); - o2.AsStmt().desc = "op2"; - g->mutable_nodes().emplace_back(); - Node& o3 = g->mutable_nodes().back(); - o3.AsStmt().desc = "op3"; - g->mutable_nodes().emplace_back(); - Node& o4 = g->mutable_nodes().back(); - o4.AsStmt().desc = "op4"; - g->mutable_nodes().emplace_back(); - Node& o5 = g->mutable_nodes().back(); - o5.AsStmt().desc = "op5"; - g->mutable_nodes().emplace_back(); - Node& v1 = g->mutable_nodes().back(); - v1.AsArg("var1"); - g->mutable_nodes().emplace_back(); - Node& v2 = g->mutable_nodes().back(); - v2.AsArg("var2"); - g->mutable_nodes().emplace_back(); - Node& v3 = g->mutable_nodes().back(); - v3.AsArg("var3"); - g->mutable_nodes().emplace_back(); - Node& v4 = g->mutable_nodes().back(); - v4.AsArg("var4"); - - // o1->v1->o2 - o1.outlinks.push_back(&v1); - o2.inlinks.push_back(&v1); - v1.inlinks.push_back(&o1); - v1.outlinks.push_back(&o2); - // o2->v2->o3 - // o2->v2->o4 - o2.outlinks.push_back(&v2); - o3.inlinks.push_back(&v2); - o4.inlinks.push_back(&v2); - v2.inlinks.push_back(&o2); - v2.outlinks.push_back(&o3); - v2.outlinks.push_back(&o4); - // o2->v3->o5 - o2.outlinks.push_back(&v3); - o5.inlinks.push_back(&v3); - v3.inlinks.push_back(&o2); - v3.outlinks.push_back(&o5); - // o3-v4->o5 - o3.outlinks.push_back(&v4); - o5.inlinks.push_back(&v4); - v4.inlinks.push_back(&o3); - v4.outlinks.push_back(&o5); -} - -TEST(PMPattern, NewNode) { - PMPattern x; - auto* n = x.NewNode([](const Node* x) { return true; }); - ASSERT_TRUE(n); - ASSERT_EQ(x.nodes_.size(), 1UL); -} - -TEST(PMPattern, AddEdge) { - PMPattern x; - auto* a = x.NewNode([](const Node* x) { return true; }); - auto* b = x.NewNode([](const Node* x) { return true; }); - ASSERT_TRUE(a); - ASSERT_TRUE(b); - x.AddEdge(a, b); - ASSERT_EQ(x.nodes_.size(), 2UL); - ASSERT_EQ(x.edges_.size(), 1UL); - ASSERT_EQ(x.edges_.front().first, a); - ASSERT_EQ(x.edges_.front().second, b); - - ASSERT_EQ(x.nodes().size(), 2UL); - ASSERT_EQ(x.edges().size(), 1UL); - ASSERT_EQ(x.edges().front().first, a); - ASSERT_EQ(x.edges().front().second, b); -} - -TEST(PatternMatcher, MarkPMNodesInGraph) { - PatternMatcher x; - // mark o2, o3, v2 - - // The pattern is a graph: - // o2(a node named o2) -> v2(a node named v2) - // v2 -> o3(a node named o3) - auto* o2 = x.pattern_.NewNode([](const Node* node) { - // The teller can be any condition, such as op type, or variable's shape. - return node && node->IsStmt() && node->stmt()->desc == "op2"; - }); - auto* o3 = x.pattern_.NewNode([](const Node* node) { - // The teller can be any condition, such as op type, or variable's shape. 
- return node && node->IsStmt() && node->stmt()->desc == "op3"; - }); - auto* v2 = x.pattern_.NewNode([](const Node* node) { - // The teller can be any condition, such as op type, or variable's shape. - return node && node->IsArg() && node->arg()->name == "var2"; - }); - - ASSERT_FALSE(o2->Tell(nullptr)); - ASSERT_FALSE(o3->Tell(nullptr)); - ASSERT_FALSE(v2->Tell(nullptr)); - - x.pattern_.AddEdge(o2, v2); - x.pattern_.AddEdge(v2, o3); - - ASSERT_EQ(x.pattern_.edges().size(), 2UL); - ASSERT_EQ(x.pattern_.edges()[0].first, o2); - ASSERT_EQ(x.pattern_.edges()[0].second, v2); - ASSERT_EQ(x.pattern_.edges()[1].first, v2); - ASSERT_EQ(x.pattern_.edges()[1].second, o3); - - SSAGraph graph; - BuildGraph(&graph); - - x.MarkPMNodesInGraph(&graph); - - ASSERT_EQ(x.pmnodes2nodes_.size(), 3UL); - - auto subgraphs = x.DetectPatterns(); - ASSERT_EQ(subgraphs.size(), 1UL); -} - -TEST(PatternMatcher, MultiSubgraph) { - SSAGraph graph; - BuildGraph(&graph); - - PatternMatcher x; - - // The pattern is a graph: - // op -> var - auto* any_op = x.mutable_pattern()->NewNode( - [](const Node* node) { - return node->IsStmt() && - (node->stmt()->desc == "op2" || node->stmt()->desc == "op3"); - }, - "OP0"); - auto* any_var = - x.mutable_pattern() - ->NewNode([](const Node* node) { return node->IsArg(); }, "VAR") - ->AsIntermediate(); - auto* any_op1 = x.mutable_pattern()->NewNode( - [](const Node* node) { return node->IsStmt(); }, "OP1"); - - x.mutable_pattern()->AddEdge(any_op, any_var); - x.mutable_pattern()->AddEdge(any_var, any_op1); - - int count = 0; - PatternMatcher::handle_t handle = [&](const PatternMatcher::subgraph_t& s, - SSAGraph* g) { - LOG(INFO) << "Detect " << s.at(any_op)->stmt()->desc << " -> " - << s.at(any_var)->arg()->name << " -> " - << s.at(any_op1)->stmt()->desc; - count++; - }; - - x(&graph, handle); - - // 1. Detect op3 -> var4 -> op5 - // 2. Detect op2 -> var2 -> op3 - // 3. Detect op2 -> var2 -> op4 - // 4. Detect op2 -> var3 -> op5 - // But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2 - ASSERT_GE(count, 1); - ASSERT_LE(count, 2); -} - -TEST(PatternMatcher, IntermediateCheck) { - SSAGraph graph; - BuildGraph(&graph); - - // o2->v2->o3 - // o2->v2->o4 - // check o2+o3 fuse, should fail because v2 also link to o4. - PatternMatcher matcher; - auto* op2 = matcher.mutable_pattern()->NewNode( - [](const Node* x) { - return x && x->IsStmt() && x->stmt()->desc == "op2"; - }, - "op2"); - auto* op3 = matcher.mutable_pattern()->NewNode( - [](const Node* x) { - return x && x->IsStmt() && x->stmt()->desc == "op3"; - }, - "op3"); - auto* v2 = matcher.mutable_pattern() - ->NewNode( - [](const Node* x) { - return x && x->IsArg() && x->arg()->name == "var2"; - }, - "var2") - ->AsIntermediate(); - v2->LinksFrom({op2}).LinksTo({op3}); - - int count = 0; - matcher(&graph, [&](const PatternMatcher::subgraph_t& g, SSAGraph* graph) { - ++count; - }); - EXPECT_EQ(count, 0); - - count = 0; - v2->AsInput(); - matcher(&graph, [&](const PatternMatcher::subgraph_t& g, SSAGraph* graph) { - ++count; - }); - ASSERT_EQ(count, 1); -} - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/pattern_matcher_tester.cc b/lite/core/mir/pattern_matcher_tester.cc deleted file mode 100644 index a62c3af62f..0000000000 --- a/lite/core/mir/pattern_matcher_tester.cc +++ /dev/null @@ -1,233 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/pattern_matcher.h" - -#include - -namespace paddle { -namespace lite { -namespace mir { - -void BuildGraph(SSAGraph* g) { - g->mutable_nodes().emplace_back(); - Node& o1 = g->mutable_nodes().back(); - o1.AsStmt().op_type = "op1"; - g->mutable_nodes().emplace_back(); - Node& o2 = g->mutable_nodes().back(); - o2.AsStmt().op_type = "op2"; - g->mutable_nodes().emplace_back(); - Node& o3 = g->mutable_nodes().back(); - o3.AsStmt().op_type = "op3"; - g->mutable_nodes().emplace_back(); - Node& o4 = g->mutable_nodes().back(); - o4.AsStmt().op_type = "op4"; - g->mutable_nodes().emplace_back(); - Node& o5 = g->mutable_nodes().back(); - o5.AsStmt().op_type = "op5"; - g->mutable_nodes().emplace_back(); - Node& v1 = g->mutable_nodes().back(); - v1.AsArg("var1"); - g->mutable_nodes().emplace_back(); - Node& v2 = g->mutable_nodes().back(); - v2.AsArg("var2"); - g->mutable_nodes().emplace_back(); - Node& v3 = g->mutable_nodes().back(); - v3.AsArg("var3"); - g->mutable_nodes().emplace_back(); - Node& v4 = g->mutable_nodes().back(); - v4.AsArg("var4"); - - // o1->v1->o2 - o1.outlinks.push_back(&v1); - o2.inlinks.push_back(&v1); - v1.inlinks.push_back(&o1); - v1.outlinks.push_back(&o2); - // o2->v2->o3 - // o2->v2->o4 - o2.outlinks.push_back(&v2); - o3.inlinks.push_back(&v2); - o4.inlinks.push_back(&v2); - v2.inlinks.push_back(&o2); - v2.outlinks.push_back(&o3); - v2.outlinks.push_back(&o4); - // o2->v3->o5 - o2.outlinks.push_back(&v3); - o5.inlinks.push_back(&v3); - v3.inlinks.push_back(&o2); - v3.outlinks.push_back(&o5); - // o3-v4->o5 - o3.outlinks.push_back(&v4); - o5.inlinks.push_back(&v4); - v4.inlinks.push_back(&o3); - v4.outlinks.push_back(&o5); -} - -TEST(PMPattern, NewNode) { - PMPattern x; - auto* n = x.NewNode([](const Node* x) { return true; }); - ASSERT_TRUE(n); - ASSERT_EQ(x.nodes_.size(), 1UL); -} - -TEST(PMPattern, AddEdge) { - PMPattern x; - auto* a = x.NewNode([](const Node* x) { return true; }); - auto* b = x.NewNode([](const Node* x) { return true; }); - ASSERT_TRUE(a); - ASSERT_TRUE(b); - x.AddEdge(a, b); - ASSERT_EQ(x.nodes_.size(), 2UL); - ASSERT_EQ(x.edges_.size(), 1UL); - ASSERT_EQ(x.edges_.front().first, a); - ASSERT_EQ(x.edges_.front().second, b); - - ASSERT_EQ(x.nodes().size(), 2UL); - ASSERT_EQ(x.edges().size(), 1UL); - ASSERT_EQ(x.edges().front().first, a); - ASSERT_EQ(x.edges().front().second, b); -} - -TEST(PatternMatcher, MarkPMNodesInGraph) { - PatternMatcher x; - // mark o2, o3, v2 - - // The pattern is a graph: - // o2(a node named o2) -> v2(a node named v2) - // v2 -> o3(a node named o3) - auto* o2 = x.pattern_.NewNode([](const Node* node) { - // The teller can be any condition, such as op type, or variable's shape. - return node && node->IsStmt() && node->stmt()->op_type == "op2"; - }); - auto* o3 = x.pattern_.NewNode([](const Node* node) { - // The teller can be any condition, such as op type, or variable's shape. 
- return node && node->IsStmt() && node->stmt()->op_type == "op3"; - }); - auto* v2 = x.pattern_.NewNode([](const Node* node) { - // The teller can be any condition, such as op type, or variable's shape. - return node && node->IsArg() && node->arg()->name == "var2"; - }); - - ASSERT_FALSE(o2->Tell(nullptr)); - ASSERT_FALSE(o3->Tell(nullptr)); - ASSERT_FALSE(v2->Tell(nullptr)); - - x.pattern_.AddEdge(o2, v2); - x.pattern_.AddEdge(v2, o3); - - ASSERT_EQ(x.pattern_.edges().size(), 2UL); - ASSERT_EQ(x.pattern_.edges()[0].first, o2); - ASSERT_EQ(x.pattern_.edges()[0].second, v2); - ASSERT_EQ(x.pattern_.edges()[1].first, v2); - ASSERT_EQ(x.pattern_.edges()[1].second, o3); - - SSAGraph graph; - BuildGraph(&graph); - - x.MarkPMNodesInGraph(&graph); - - ASSERT_EQ(x.pmnodes2nodes_.size(), 3UL); - - auto subgraphs = x.DetectPatterns(); - ASSERT_EQ(subgraphs.size(), 1UL); -} - -TEST(PatternMatcher, MultiSubgraph) { - SSAGraph graph; - BuildGraph(&graph); - - PatternMatcher x; - - // The pattern is a graph: - // op -> var - auto* any_op = x.mutable_pattern()->NewNode( - [](const Node* node) { - return node->IsStmt() && (node->stmt()->op_type == "op2" || - node->stmt()->op_type == "op3"); - }, - "OP0"); - auto* any_var = - x.mutable_pattern() - ->NewNode([](const Node* node) { return node->IsArg(); }, "VAR") - ->AsIntermediate(); - auto* any_op1 = x.mutable_pattern()->NewNode( - [](const Node* node) { return node->IsStmt(); }, "OP1"); - - x.mutable_pattern()->AddEdge(any_op, any_var); - x.mutable_pattern()->AddEdge(any_var, any_op1); - - int count = 0; - PatternMatcher::handle_t handle = [&](const PatternMatcher::subgraph_t& s, - SSAGraph* g) { - LOG(INFO) << "Detect " << s.at(any_op)->stmt()->op_type << " -> " - << s.at(any_var)->arg()->name << " -> " - << s.at(any_op1)->stmt()->op_type; - count++; - }; - - x(&graph, handle); - - // 1. Detect op3 -> var4 -> op5 - // 2. Detect op2 -> var2 -> op3 - // 3. Detect op2 -> var2 -> op4 - // 4. Detect op2 -> var3 -> op5 - // But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2 - ASSERT_GE(count, 1); - ASSERT_LE(count, 2); -} - -TEST(PatternMatcher, IntermediateCheck) { - SSAGraph graph; - BuildGraph(&graph); - - // o2->v2->o3 - // o2->v2->o4 - // check o2+o3 fuse, should fail because v2 also link to o4. - PatternMatcher matcher; - auto* op2 = matcher.mutable_pattern()->NewNode( - [](const Node* x) { - return x && x->IsStmt() && x->stmt()->op_type == "op2"; - }, - "op2"); - auto* op3 = matcher.mutable_pattern()->NewNode( - [](const Node* x) { - return x && x->IsStmt() && x->stmt()->op_type == "op3"; - }, - "op3"); - auto* v2 = matcher.mutable_pattern() - ->NewNode( - [](const Node* x) { - return x && x->IsArg() && x->arg()->name == "var2"; - }, - "var2") - ->AsIntermediate(); - v2->LinksFrom({op2}).LinksTo({op3}); - - int count = 0; - matcher(&graph, [&](const PatternMatcher::subgraph_t& g, SSAGraph* graph) { - ++count; - }); - EXPECT_EQ(count, 0); - - count = 0; - v2->AsInput(); - matcher(&graph, [&](const PatternMatcher::subgraph_t& g, SSAGraph* graph) { - ++count; - }); - ASSERT_EQ(count, 1); -} - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/runtime_context_assign_pass.cc b/lite/core/mir/runtime_context_assign_pass.cc deleted file mode 100644 index 652932c149..0000000000 --- a/lite/core/mir/runtime_context_assign_pass.cc +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/pass.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -class RuntimeContextAssignPass : public StmtPass { - public: - RuntimeContextAssignPass() {} - - void Apply(const std::unique_ptr& graph) override { - for (auto& node : graph->mutable_nodes()) { - if (!node.IsStmt()) continue; - auto& inst = node.AsStmt(); - inst.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(inst.picked_kernel().target())); - } - } -}; - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(runtime_context_assign_pass, - paddle::lite::mir::RuntimeContextAssignPass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/ssa_graph.cc b/lite/core/mir/ssa_graph.cc deleted file mode 100644 index 5193d9c899..0000000000 --- a/lite/core/mir/ssa_graph.cc +++ /dev/null @@ -1,240 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
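[Editor's note] RuntimeContextAssignPass above is the smallest useful example of the pass interface: iterate the mutable nodes, act on the statement nodes, then register the class. A hedged sketch of another pass in the same shape; the name and behavior are hypothetical:

    class StmtCountPassSketch : public StmtPass {
     public:
      void Apply(const std::unique_ptr<SSAGraph>& graph) override {
        int stmt_count = 0;
        for (auto& node : graph->mutable_nodes()) {
          if (node.IsStmt()) ++stmt_count;
        }
        VLOG(4) << "graph contains " << stmt_count << " statement nodes";
      }
    };

    REGISTER_MIR_PASS(stmt_count_pass_sketch,
                      paddle::lite::mir::StmtCountPassSketch)
        .SetTargets({TARGET(kAny)});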
- -#include "lite/core/mir/ssa_graph.h" -#include -#include -#include -#include -#include - -namespace paddle { -namespace lite { -namespace mir { - -bool SSAGraph::CheckBidirectionalConnection() { - VLOG(4) << "node count " << node_storage_.size(); - for (auto &node : node_storage_) { - if (node.IsStmt()) VLOG(4) << node.AsStmt().op_info()->Type(); - if (node.IsArg()) VLOG(4) << node.AsArg().name << " " << node.AsArg().id; - for (auto *in : node.inlinks) { - CHECK(in->outlinks.end() != - std::find(in->outlinks.begin(), in->outlinks.end(), &node)); - } - for (auto *out : node.outlinks) { - CHECK(out->inlinks.end() != - std::find(out->inlinks.begin(), out->inlinks.end(), &node)); - } - } - return true; -} - -std::map> SSAGraph::BuildOperationAdjList() { - std::map> adj_list; - - for (auto &n : mutable_nodes()) { - if (!n.IsStmt()) continue; - if (adj_list.find(&n) == adj_list.end()) { - adj_list[&n] = std::set(); - } - std::vector nodes; - for (auto &var : n.inlinks) { - for (auto &adj_n : var->inlinks) { - CHECK(adj_n->IsStmt()); - nodes.push_back(adj_n); - } - } - std::sort(nodes.begin(), - nodes.end(), - [](mir::Node *node1, mir::Node *node2) { return node1 > node2; }); - adj_list[&n].insert(std::make_move_iterator(nodes.begin()), - std::make_move_iterator(nodes.end())); - } - return adj_list; -} - -void SSAGraph::SortHelper( - const std::map> &adj_list, - mir::Node *node, - std::set *visited, - std::vector *ret) { - visited->insert(node); - - for (auto adj : adj_list.at(node)) { - if (visited->find(adj) == visited->end()) { - SortHelper(adj_list, adj, visited, ret); - } - } - - ret->push_back(node); -} - -std::vector SSAGraph::StmtTopologicalOrder() { - CheckBidirectionalConnection(); - - std::stack stack; - std::set visited; - std::vector res; - - auto adj_list = BuildOperationAdjList(); - - for (auto adj : adj_list) { - if (visited.find(adj.first) == visited.end()) { - SortHelper(adj_list, adj.first, &visited, &res); - } - } - - return res; -} - -Node *SSAGraph::GraphCreateInstructNode( - const std::shared_ptr &op, const std::vector &valid_places) { - node_storage_.emplace_back(); - // TODO(Superjomn) remove one valid_places here. 
- op->SetValidPlaces(valid_places); - auto &new_node = node_storage_.back(); - auto kernels = op->CreateKernels(valid_places); - node_storage_.back().AsStmt(op->op_type_, std::move(kernels), op); - - CHECK(new_node.inlinks.empty()) << "duplicate Build found"; - CHECK(new_node.outlinks.empty()) << "duplicate Build found"; - return &node_storage_.back(); -} - -void SSAGraph::Build(const Program &program, - const std::vector &valid_places) { - CHECK(node_storage_.empty()); - - auto weights_name = program.weights(); - auto is_weights = [&](const std::string &name) -> bool { - auto it = std::find(weights_name.begin(), weights_name.end(), name); - if (it == weights_name.end()) return false; - return true; - }; - - std::unordered_map arg_update_node_map_; - for (auto &op : program.ops()) { - VLOG(3) << op->op_info()->Type(); - auto *op_node = GraphCreateInstructNode(op, valid_places); - for (const std::string &name : op->op_info()->input_names()) { - mir::Node *arg_node = nullptr; - if (arg_update_node_map_.count(name)) { - arg_node = arg_update_node_map_.at(name); - } else { - node_storage_.emplace_back(); - arg_node = &node_storage_.back(); - arg_node->AsArg(name, node_storage_.size() - 1); - arg_update_node_map_[name] = arg_node; - } - if (is_weights(name)) arg_node->AsArg().is_weight = true; - CHECK(arg_node->IsRoleSet()); - DirectedLink(arg_node, op_node); - } - for (const std::string &name : op->op_info()->output_names()) { - node_storage_.emplace_back(); - auto *arg_node = &node_storage_.back(); - arg_node->AsArg(name, node_storage_.size() - 1); - arg_update_node_map_[name] = arg_node; - - if (is_weights(name)) arg_node->AsArg().is_weight = true; - CHECK(arg_node->IsRoleSet()); - DirectedLink(op_node, arg_node); - } - CHECK(CheckLinksRoleSet()); - } - - CHECK(CheckNodesRoleSet()); - CheckValid(); -} - -void SSAGraph::RemoveNode(const mir::Node *node) { - auto pos = std::find_if(node_storage_.begin(), - node_storage_.end(), - [&node](mir::Node &n) { return &n == node; }); - CHECK(pos != node_storage_.end()); - node_storage_.erase(pos); -} - -mir::Node *SSAGraph::Argument(const std::string &name) { - auto it = arguments_.find(name); - CHECK(it != arguments_.end()) << "no argument called " << name; - return it->second; -} - -std::vector SSAGraph::inputs() { - std::vector res; - for (auto &node : node_storage_) { - if (node.inlinks.empty()) { - res.push_back(&node); - } - } - return res; -} - -std::vector SSAGraph::outputs() { - std::vector res; - for (auto &node : node_storage_) { - if (node.outlinks.empty()) { - res.push_back(&node); - } - } - return res; -} - -mir::Node *SSAGraph::RetrieveArgument(const std::string &arg) { - auto it = arguments_.find(arg); - if (it != arguments_.end()) { - return it->second; - } - return nullptr; -} - -bool SSAGraph::CheckNodesRoleSet() { - for (auto &node : mutable_nodes()) { - CHECK_OR_FALSE(node.IsRoleSet()); - } - return true; -} - -bool SSAGraph::CheckLinksRoleSet() { - for (auto &node : mutable_nodes()) { - CHECK_OR_FALSE(node.IsRoleSet()); - if (!node.IsStmt()) continue; - for (auto *x : node.inlinks) { - CHECK_OR_FALSE(x->IsRoleSet()); - CHECK_OR_FALSE(x->IsArg()); - } - for (auto *x : node.outlinks) { - CHECK_OR_FALSE(x->IsRoleSet()); - CHECK_OR_FALSE(x->IsArg()); - } - } - return true; -} - -Node *SSAGraph::NewArgumentNode(const std::string &name) { - node_storage_.emplace_back(); - auto &arg_node = node_storage_.back(); - arg_node.AsArg(name, node_storage_.size() - 1); - return &arg_node; -} - -Node *SSAGraph::NewInstructNode() { - 
node_storage_.emplace_back();
-  return &node_storage_.back();
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/ssa_graph.h b/lite/core/mir/ssa_graph.h
deleted file mode 100644
index b5b9fb1cb2..0000000000
--- a/lite/core/mir/ssa_graph.h
+++ /dev/null
@@ -1,144 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <list>
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-#include "lite/core/kernel.h"
-#include "lite/core/mir/node.h"
-#include "lite/core/op_lite.h"
-#include "lite/core/program.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-// A graph for the MIR. It is built from a list of Ops and a scope.
-class GraphBase {};
-
-class SSAGraph : GraphBase {
- public:
-  // @param program: the op program
-  // @param valid_places: the valid places user set for the system.
-  void Build(const Program &program, const std::vector<Place> &valid_places);
-  void RemoveNode(const mir::Node *node);
-
-  std::vector<mir::Node *> StmtTopologicalOrder();
-
-  // The inputs of the graph.
-  std::vector<mir::Node *> inputs();
-
-  // The outputs of the graph.
-  std::vector<mir::Node *> outputs();
-
-  const std::list<mir::Node> &nodes() const { return node_storage_; }
-  std::list<mir::Node> &mutable_nodes() { return node_storage_; }
-
-  mir::Node *RetrieveArgument(const std::string &arg);
-
-  Node *NewArgumentNode(const std::string &name);
-  Node *NewInstructNode();
-
-  void CheckValid() {
-    CHECK(CheckBidirectionalConnection());
-    CHECK(CheckNodesRoleSet());
-    CHECK(CheckLinksRoleSet());
-  }
-
-  Node *GraphCreateInstructNode(const std::shared_ptr<OpLite> &op,
-                                const std::vector<Place> &valid_places);
-
-  // Device-related attributes.
-  const std::vector<Place> &valid_places() const { return valid_places_; }
-  void SetValidPlaces(const std::vector<Place> &x) { valid_places_ = x; }
-
- private:
-  mir::Node *Argument(const std::string &name);
-  // Check the bidirectional connection.
-  bool CheckBidirectionalConnection();
-  bool CheckNodesRoleSet();
-  // Check that the role of every item in inlinks and outlinks is set.
-  bool CheckLinksRoleSet();
-
-  void MarkArgumentWeights(const Program &program) {
-    for (const auto &name : program.weights()) {
-      arguments_[name]->AsArg().is_weight = true;
-    }
-  }
-
-  // Build the operator inlink edge table.
-  std::map<mir::Node *, std::set<mir::Node *>> BuildOperationAdjList();
-
-  void SortHelper(const std::map<mir::Node *, std::set<mir::Node *>> &adj_list,
-                  mir::Node *node,
-                  std::set<mir::Node *> *visited,
-                  std::vector<mir::Node *> *ret);
-
- private:
-  std::list<mir::Node> node_storage_;
-  std::map<std::string, mir::Node *> arguments_;
-  std::vector<Place> valid_places_;
-};
-
-// Remove the link a -> b.
-static void RemoveDirectedLink(Node *a, Node *b) {
-  auto it = std::find(b->inlinks.begin(), b->inlinks.end(), a);
-  if (it != b->inlinks.end()) {
-    b->inlinks.erase(it);
-  }
-
-  auto it1 = std::find(a->outlinks.begin(), a->outlinks.end(), b);
-  if (it1 != a->outlinks.end()) {
-    a->outlinks.erase(it1);
-  }
-}
-
-// Link a -> b.
-static void DirectedLink(Node *a, Node *b) {
-  // Eagerly remove first, to avoid a duplicate link.
- RemoveDirectedLink(a, b); - a->outlinks.push_back(b); - b->inlinks.push_back(a); -} - -static void LocalInferenceType(Node *a, Node *b, const std::string &arg_name) { - // instr -> output argument - if (a->IsStmt() && b->IsArg()) { - auto &inst = a->AsStmt(); - auto &output = b->AsArg(); - - if (!output.type) { - output.type = inst.picked_kernel().GetOutputDeclType(arg_name); - } - } - - // input argument -> instr - if (a->IsArg() && b->IsStmt()) { - auto &input = a->AsArg(); - auto &inst = b->AsStmt(); - if (!input.type) { - input.type = inst.picked_kernel().GetInputDeclType(arg_name); - } - } -} - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/ssa_graph_test.cc b/lite/core/mir/ssa_graph_test.cc deleted file mode 100644 index ef49001ba2..0000000000 --- a/lite/core/mir/ssa_graph_test.cc +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/ssa_graph.h" -#include -#include -#include "lite/api/paddle_use_passes.h" -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/op_registry.h" -#include "lite/core/program_fake_utils.h" -#include "paddle/fluid/framework/program_desc.h" - -namespace paddle { -namespace lite { -namespace mir { - -void BuildFc(framework::ProgramDesc* desc, - const std::string& x, - const std::string& w, - const std::string& b, - const std::string& out) { - auto* fc = desc->MutableBlock(0)->AppendOp(); - fc->SetInput("Input", {x}); - fc->SetInput("W", {w}); - fc->SetInput("Bias", {b}); - fc->SetOutput("Out", {out}); -} - -TEST(SSAGraph, test) { - auto program_faker = ProgramFaker(); - SSAGraph graph; - std::vector places{{TARGET(kHost), PRECISION(kFloat)}}; - auto scope = std::make_shared(); - - lite::Program program(*program_faker.program()->Proto(), scope, places); - graph.Build(program, places); - - Visualize(&graph); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -USE_LITE_OP(fc); -#ifdef LITE_WITH_X86 -// USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def); -#endif diff --git a/lite/core/mir/static_kernel_pick_pass.cc b/lite/core/mir/static_kernel_pick_pass.cc deleted file mode 100644 index 37bcb1e317..0000000000 --- a/lite/core/mir/static_kernel_pick_pass.cc +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
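[Editor's note] Before the kernel-pick pass, one note on the SSAGraph API above: the usual flow is build, validate, then traverse statements in dependency order. A hedged sketch, with `program` and `valid_places` as in the test above:

    SSAGraph graph;
    graph.Build(program, valid_places);  // wires op and argument nodes
    graph.CheckValid();                  // bidirectional links and roles

    // Statements come back in topological order, producers before consumers.
    for (auto* node : graph.StmtTopologicalOrder()) {
      VLOG(4) << node->AsStmt().op_info()->Type();
    }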
- -#include "lite/core/mir/static_kernel_pick_pass.h" -#include -#include -#include -#include -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -bool KernelScoreCmp(const std::pair>& a, - const std::pair>& b) { - return a.first > b.first; -} - -void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { - CHECK(kernel_pick_factors_.any_factor_considered()) - << "kernel_pick_factors should be specified first"; - CHECK(graph) << "graph not valid"; - // sort kernels by the factors. - - for (auto& node : graph->mutable_nodes()) { - if (!node.IsStmt()) continue; - auto& instruct = node.AsStmt(); - - // Get candidate kernels - std::vector>> scored; - CHECK(!instruct.kernels().empty()) << "No kernels found for " - << instruct.op_type(); - for (auto&& kernel : instruct.kernels()) { - size_t score = KernelGrade(*kernel); - scored.emplace_back(score, std::move(kernel)); - } - std::sort(scored.begin(), scored.end(), KernelScoreCmp); - instruct.kernels().clear(); - - if (!instruct.op_info()->HasAttr("enable_int8")) { - // Move kernel back - // Just keep a single best kernel. - // TODO(Superjomn) reconsider this. - instruct.kernels().emplace_back(std::move(scored.front().second)); - VLOG(2) << "pick " << instruct.kernels().front()->name(); - - } else { - bool out_type_int8 = true; - // Only if all ops linked to this op output has enable_int8 attr, - // then the op output type is int8, or fp32. - for (auto* out_n : node.outlinks) { - CHECK(out_n->IsArg()); - for (auto* tmp_op : out_n->outlinks) { - CHECK(tmp_op->IsStmt()); - if (!tmp_op->AsStmt().op_info()->HasAttr("enable_int8")) { - out_type_int8 = false; - break; - } - } - if (!out_type_int8) break; - } - // If the out_type_int8 is true, it turns out that the output type of this - // op can be int8. - // So we need to specify output scale for this op. - if (out_type_int8) { - auto out_node = node.outlinks.front(); - CHECK(out_node->IsArg()); - auto one_adj_op_node = out_node->outlinks.front(); - CHECK(one_adj_op_node->IsStmt()); - auto& one_adj_instruct = one_adj_op_node->AsStmt(); - CHECK(one_adj_instruct.op_info()->HasAttr("enable_int8")); - CHECK(one_adj_instruct.op_info()->HasAttr("input_scale")); - - instruct.mutable_op_info()->SetAttr( - "output_scale", - one_adj_instruct.op_info()->GetAttr("input_scale")); - - auto update_desc = *instruct.mutable_op_info(); - instruct.ResetOp(update_desc, graph->valid_places()); - scored.clear(); - for (auto&& kernel : instruct.kernels()) { - size_t score = KernelGrade(*kernel); - scored.emplace_back(score, std::move(kernel)); - } - std::sort(scored.begin(), scored.end(), KernelScoreCmp); - instruct.kernels().clear(); - } - // If the out_type_int8 is true, we should pick the kernel with the - // int8 input and int8 output. - // If the out_type_int8 is false, we should pick the kernel with the - // int8 input and fp32 output. - auto output_arguments = instruct.op_info()->OutputArgumentNames(); - for (auto& candidate : scored) { - bool all_output_type_match = true; - auto expect_output_type = - out_type_int8 ? 
PRECISION(kInt8) : PRECISION(kFloat); - - for (auto& arg_name : output_arguments) { - const Type* out_arg_ty = - candidate.second->GetOutputDeclType(arg_name); - if (out_arg_ty->precision() != expect_output_type) { - all_output_type_match = false; - } - } - - if (all_output_type_match) { - instruct.kernels().emplace_back(std::move(candidate.second)); - VLOG(2) << "pick " << instruct.kernels().front()->name(); - break; - } - } - CHECK(!instruct.kernels().empty()) << "No kernels found for " - << instruct.op_type(); - } - } -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(static_kernel_pick_pass, - paddle::lite::mir::StaticKernelPickPass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/static_kernel_pick_pass.h b/lite/core/mir/static_kernel_pick_pass.h deleted file mode 100644 index 3412278229..0000000000 --- a/lite/core/mir/static_kernel_pick_pass.h +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/core/mir/pass.h" -#include "lite/core/types.h" - -namespace paddle { -namespace lite { -namespace mir { - -/* - * StaticKernelPickPass is a simple strategy for picking the kernel for each - * Operator using operator developer defined rule, there are many other tactics - * such as considering IO or kernel execution latency and we will implement them - * latter. - * - * There are two argument for this pass: - * - place, the target place. - * - kernel_pick_factors, the factors to consider in picking kernels. - * Set them first before execute the pass. - */ -class StaticKernelPickPass : public mir::StmtPass { - public: - void Apply(const std::unique_ptr& graph) override; - - void SetPreferPlace(const Place& place) { place_ = place; } - const Place& place() const { return place_; } - const core::KernelPickFactor& kernel_pick_factors() const { - return kernel_pick_factors_; - } - core::KernelPickFactor* mutable_kernel_pick_factors() { - return &kernel_pick_factors_; - } - - private: - // Score the kernel. 
- size_t KernelGrade(const lite::KernelBase& kernel) { - size_t score{}; - const int kMax = - std::numeric_limits::max(); - - // The more important factor comes first - if (kernel_pick_factors_.IsTargetConsidered() && - (place().target == kernel.target() || kernel.target() == TARGET(kAny) || - place().target == TARGET(kAny))) { - score += - kMax / static_cast(core::KernelPickFactor::Factor::TargetFirst); - } - if (kernel_pick_factors_.IsPrecisionConsidered() && - (place().precision == kernel.precision() || - kernel.precision() == PRECISION(kAny) || - place().precision == PRECISION(kAny))) { - score += kMax / - static_cast(core::KernelPickFactor::Factor::PrecisionFirst); - } - if (kernel_pick_factors_.IsDataLayoutConsidered() && - (place().layout == kernel.layout() || - kernel.layout() == DATALAYOUT(kAny) || - place().layout == DATALAYOUT(kAny))) { - score += kMax / static_cast( - core::KernelPickFactor::Factor::DataLayoutFirst); - } - VLOG(4) << "picker tactic " << kernel_pick_factors_; - VLOG(4) << "kernel place " << kernel.place().DebugString(); - VLOG(4) << "picker place " << place().DebugString(); - VLOG(4) << "score " << score; - - // The data layout is not considered, for the input and output arguments - // might have different data layout. - // TODO(Superjomn) reconsider the idea of taking the data layout as a kernel - // specification. - return score; - } - - private: - core::KernelPickFactor kernel_pick_factors_; - Place place_; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/subgraph/CMakeLists.txt b/lite/core/mir/subgraph/CMakeLists.txt deleted file mode 100644 index 9984e202db..0000000000 --- a/lite/core/mir/subgraph/CMakeLists.txt +++ /dev/null @@ -1,34 +0,0 @@ - -lite_cc_library(subgraph_pass - SRCS subgraph_program_pass.cc - DEPS mir_pass types ${mir_fusers}) -lite_cc_test(test_subgraph_pass SRCS subgraph_program_pass_test.cc - DEPS subgraph_pass mir_passes gflags model_parser cxx_api - ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 SERIAL) -if (WITH_TESTING) - add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v1_tar_gz) - add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v2_relu_tar_gz) - set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set_target_properties(test_subgraph_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}") -endif() - -set(subgraph_passes subgraph_pass) - -if(LITE_WITH_NPU) - lite_cc_library(npu_pass SRCS generate_npu_program_pass.cc - DEPS mir_pass types context ${mir_fusers} ${npu_bridges} npu_helper ${npu_ddk_libs} graph_op subgraph_pass) - list(APPEND subgraph_passes npu_pass) - lite_cc_test(test_npu_pass SRCS generate_npu_program_pass_test.cc - DEPS npu_pass cxx_api mir_passes gflags - ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 - --optimized_model=${LITE_MODEL_DIR}/lite_npu_model_opt SERIAL) - if (WITH_TESTING) - add_dependencies(test_npu_pass extern_lite_download_mobilenet_v1_tar_gz) - add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v2_relu_tar_gz) - set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set_target_properties(test_npu_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}") - endif() -endif() - -set(subgraph_passes ${subgraph_passes} CACHE INTERNAL "subgraph_passes") -message(STATUS "----> subgraph_passes: ${subgraph_passes}") diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.cc b/lite/core/mir/subgraph/generate_npu_program_pass.cc deleted file mode 100644 index 
76e295c7af..0000000000 --- a/lite/core/mir/subgraph/generate_npu_program_pass.cc +++ /dev/null @@ -1,218 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/subgraph/generate_npu_program_pass.h" -#include -#include -#include -#include -#include -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/mir/pass_registry.h" -#include "lite/core/mir/pattern_matcher.h" - -#include "ai_ddk_lib/include/HiAiModelManagerService.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" // for ge::op::Data -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/paddle_use_npu_bridges.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" -#include "lite/backends/npu/npu_helper.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace subgraph { - -std::shared_ptr GenerateNPUProgramPass::CvtVarNode( - lite::mir::Node* var_node, const Scope* scope) { - CHECK(var_node->IsArg()); - const auto& arg = var_node->AsArg(); - VLOG(4) << "Convert var node " << arg.name; - - auto* var = scope->FindVar(arg.name); - CHECK(var); - auto* tensor = var->GetMutable(); - CHECK(tensor); - auto dims = tensor->dims(); - if (arg.is_weight) { - auto wgt = std::make_shared(arg.name); - LOG(INFO) << "in convert const:" << arg.name; - VLOG(4) << dims; - wgt->set_attr_value(lite::npu::bridge::CvtFromLiteTensor(tensor)); - return wgt; - } else { - CHECK_EQ(dims.size(), 4); - LOG(INFO) << "in convert data:" << arg.name; - LOG(INFO) << dims; - // TODO(xxx): support more types and dims size - ge::TensorDesc desc(ge::Shape(dims.Vectorize()), - ge::Format::FORMAT_NCHW, - ge::DataType::DT_FLOAT); - - // auto size = desc.GetShape().GetShapeSize(); - // ge::TensorUtils::SetSize(desc, size*sizeof(float)); - // ge::TensorUtils::SetRealDimCnt(desc, 4); - auto data = std::make_shared(arg.name); - data->update_input_desc_x(desc); - return data; - } - return nullptr; -} - -void GenerateNPUProgramPass::CvtAllOpNodes( - const std::vector& nodes2cvt, - lite::npu::bridge::node_map_type* converted_vars) { - const auto& bridges = lite::npu::bridge::Factory::Instance(); - const auto& cvtfunc_map = bridges.AllFunctions(); - // return record all converted vars - // op node's inputs must be found in converted_vars - for (auto& node : nodes2cvt) { - lite::npu::bridge::node_map_type node_inputs; - auto& stmt = node->AsStmt(); - for (auto& var_node : node->inlinks) { - auto& arg = var_node->AsArg(); - // weight should be handled in the converter, so skip here - if (arg.is_weight) { - continue; - } - auto var_name = arg.name; - if (!converted_vars->count(var_name)) { - converted_vars->insert( - std::make_pair(var_name, CvtVarNode(var_node, stmt.op()->scope()))); - } - node_inputs.insert(*converted_vars->find(var_name)); - } - auto node_outputs = 
cvtfunc_map.at(stmt.op_type())(stmt.op(), node_inputs); - converted_vars->insert(node_outputs.begin(), node_outputs.end()); - } -} - -std::string GenerateNPUProgramPass::BuildNPUGraph( - const std::unordered_set& op_nodes, - const std::unordered_set& in_data_vars, - const std::unordered_set& out_data_vars, - int sub_id) { - auto ordered_nodes = GetTopologicalOrder(op_nodes); - lite::npu::bridge::node_map_type converted_vars; - CvtAllOpNodes(ordered_nodes, &converted_vars); - - std::vector in_var_names; - std::vector out_var_names; - std::vector inputs; - std::vector outputs; - for (auto i : in_data_vars) { - auto argname = i->AsArg().name; - in_var_names.push_back(argname); - inputs.push_back(*converted_vars.at(argname)); - } - for (auto i : out_data_vars) { - auto argname = i->AsArg().name; - out_var_names.push_back(argname); - outputs.push_back(*converted_vars.at(argname)); - } - - std::string model_name("hiai_npu_client_" + std::to_string(sub_id) + ".om"); - if (!npu::BuildNPUClient(inputs, outputs, model_name)) { - LOG(WARNING) << "Build NPU failed subgraph " << sub_id; - throw std::runtime_error("Build NPU failed subgraph."); - } - LOG(INFO) << "[NPU] Build NPU Client success subgraph " << sub_id; - return model_name; -} - -void GenerateNPUProgramPass::GenNPUSubgraph( - const std::unique_ptr& graph, - const std::unordered_set& op_nodes, - int sub_id) { - std::unordered_set in_data_vars; - std::unordered_set in_wgt_vars; - std::unordered_set out_data_vars; - std::unordered_set out_unused_vars; - FindInputOutputVars( - op_nodes, &in_data_vars, &in_wgt_vars, &out_data_vars, &out_unused_vars); - - auto model_name = - BuildNPUGraph(op_nodes, in_data_vars, out_data_vars, sub_id); - - auto any_op = (*op_nodes.begin())->AsStmt().op(); - InsertNewNode(graph, - model_name, - any_op->scope(), - any_op->valid_places(), - in_data_vars, - in_wgt_vars, - out_data_vars, - out_unused_vars); - - auto nodes2rm = GetNode2rm( - op_nodes, {in_data_vars, in_wgt_vars, out_data_vars, out_unused_vars}); - - GraphSafeRemoveNodes(graph.get(), nodes2rm); -} - -void GenerateNPUProgramPass::Apply(const std::unique_ptr& graph) { - LOG(INFO) << "Before NPU Pass \n" << Visualize(graph.get()); - const auto& bridges = lite::npu::bridge::Factory::Instance(); - const auto& op_map = bridges.AllFunctions(); - std::vector supported_op_types; - for (auto& i : op_map) { - LOG(INFO) << "Supported type: " << i.first; - supported_op_types.push_back(i.first); - } - - try { - int num_subgraph = FuseSubgraph(graph, supported_op_types); - InferOnce(graph); - auto op_nodes_all = ClassifySubgraph(graph); - CHECK_EQ(op_nodes_all.size(), num_subgraph); - int id = 1; - for (auto& op_nodes : op_nodes_all) { - LOG(INFO) << "Converting subgraph_id:" << id; - GenNPUSubgraph(graph, op_nodes.second, id); - LOG(INFO) << "After NPU Pass Subgraph " << id << "\n" - << Visualize(graph.get()); - id++; - } - } catch (...) 
{ - LOG(WARNING) << "Build NPU graph failed"; - throw std::runtime_error("Build NPU graph failed"); - } - - for (auto& item : graph->StmtTopologicalOrder()) { - if (item->IsStmt()) { - auto& stmt = item->AsStmt(); - LOG(INFO) << stmt; - insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); - } - } -} - -std::unique_ptr GenerateNPUProgramPass::GenProgram() { - LOG(INFO) << "insts.size " << insts_.size(); - std::unique_ptr program( - new RuntimeProgram(std::move(insts_))); - return program; -} - -} // namespace subgraph -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(generate_npu_program_pass, - paddle::lite::mir::subgraph::GenerateNPUProgramPass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.h b/lite/core/mir/subgraph/generate_npu_program_pass.h deleted file mode 100644 index 9e030287cb..0000000000 --- a/lite/core/mir/subgraph/generate_npu_program_pass.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/npu_helper.h" -#include "lite/core/mir/pass.h" -#include "lite/core/mir/subgraph/subgraph_program_pass.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace subgraph { - -class GenerateNPUProgramPass : public SubgraphProgramPass { - public: - using key2nodes_t = std::map; - - void Apply(const std::unique_ptr& graph) override; - std::unique_ptr GenProgram(); - - protected: - // nodes2cvt: op nodes to convert - // return cvted_vars: converted var nodes - void CvtAllOpNodes(const std::vector& nodes2cvt, - lite::npu::bridge::node_map_type* cvted_vars); - - std::shared_ptr CvtVarNode(lite::mir::Node* var_node, - const Scope* scope); - - std::string BuildNPUGraph(const std::unordered_set& op_nodes, - const std::unordered_set& in_data_vars, - const std::unordered_set& out_data_vars, - int sub_id); - - void GenNPUSubgraph(const std::unique_ptr& graph, - const std::unordered_set& op_nodes, - int sub_id); - - private: - std::vector insts_; -}; - -} // namespace subgraph -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc deleted file mode 100644 index a1f39441cb..0000000000 --- a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/mir/subgraph/subgraph_program_pass.h" -#include "lite/core/op_registry.h" -#include "lite/core/program.h" -#include "lite/core/tensor.h" - -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" - -#include "lite/model_parser/pb/program_desc.h" - -DEFINE_string(optimized_model, "", "optimized_model"); -DEFINE_int32(batch_size, 1, "batch size"); -DEFINE_int32(im_channel, 3, "im_channel"); - -namespace paddle { -namespace lite { - -void TestModel(lite::Predictor* predictor, - const std::vector& valid_places, - const std::string& model_dir) { - predictor->Build(model_dir, - model_dir + "/model", - model_dir + "/params", - Place{TARGET(kARM), PRECISION(kFloat)}, - valid_places); - - auto* input_tensor = predictor->GetInput(0); - input_tensor->Resize(DDim(std::vector( - {FLAGS_batch_size, FLAGS_im_channel, FLAGS_im_height, FLAGS_im_width}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - predictor->Run(); - if (model_dir != FLAGS_optimized_model && - std::find(valid_places.begin(), - valid_places.end(), - Place{TARGET(kNPU), PRECISION(kFloat)}) != valid_places.end()) { - predictor->SaveModel(FLAGS_optimized_model); - } -} - -void CompareOutData(const lite::Predictor& tgt, const lite::Predictor& ref) { - auto* tgt_otensor = tgt.GetOutput(0); - auto* ref_otensor = ref.GetOutput(0); - const auto* tgt_pdata = tgt_otensor->data(); - const auto* ref_pdata = ref_otensor->data(); - EXPECT_EQ(tgt_otensor->dims().production(), ref_otensor->dims().production()); - for (size_t i = 0; i < tgt_otensor->dims().production(); ++i) { - auto diff = std::fabs(tgt_pdata[i] - ref_pdata[i]) / - (std::fabs(ref_pdata[i]) + 1e-6); - VLOG(3) << diff; - EXPECT_LT(diff, 0.1); - } -} - -TEST(NPUSubgraph, compare) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, 1); - - lite::Predictor predictor_arm, predictor_npu, predictor_npu_savedmodel; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}}); - - TestModel(&predictor_arm, valid_places, FLAGS_model_dir); - - valid_places.push_back(Place{TARGET(kNPU), PRECISION(kFloat)}); - TestModel(&predictor_npu, valid_places, FLAGS_model_dir); - - CompareOutData(predictor_npu, predictor_arm); - LOG(INFO) << " ================ NPU speed ================== "; - for (int i = 0; i < FLAGS_repeats; ++i) { - auto start = GetCurrentUS(); - predictor_npu.Run(); - LOG(INFO) << i << ", " << GetCurrentUS() - start << "us"; - } - - LOG(INFO) << " =================== ARM CPU speed =================== "; - for (int i = 0; i < FLAGS_repeats; ++i) { - auto start = GetCurrentUS(); - predictor_arm.Run(); - LOG(INFO) << i << ", " << GetCurrentUS() - start << "us"; - } - - TestModel(&predictor_npu_savedmodel, valid_places, FLAGS_optimized_model); - - 
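-  // Reload the model saved by the NPU predictor above and verify that it still matches the pure ARM CPU reference output.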
CompareOutData(predictor_npu_savedmodel, predictor_arm); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/subgraph/subgraph_program_pass.cc b/lite/core/mir/subgraph/subgraph_program_pass.cc deleted file mode 100644 index 2b6206f891..0000000000 --- a/lite/core/mir/subgraph/subgraph_program_pass.cc +++ /dev/null @@ -1,314 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/subgraph/subgraph_program_pass.h" -#include -#include -#include -#include -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/mir/pass_registry.h" -#include "lite/core/mir/pattern_matcher.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace subgraph { - -std::unordered_map> -SubgraphProgramPass::ClassifySubgraph(const std::unique_ptr& graph) { - std::unordered_map> op_nodes; - for (auto& item : graph->StmtTopologicalOrder()) { - if (!item->IsStmt()) continue; - auto& stmt = item->AsStmt(); - int sub_id = stmt.subgraph_id(); - if (sub_id < 1) continue; - if (!op_nodes.count(sub_id)) { - op_nodes[sub_id] = std::unordered_set(); - } - op_nodes.at(sub_id).insert(item); - } - return op_nodes; -} - -cpp::OpDesc SubgraphProgramPass::GenGraphOpDesc( - const std::string& model_name, - const std::vector& in_var_names, - const std::vector& out_var_names) { - cpp::OpDesc op_desc; - op_desc.SetType("graph_op"); - op_desc.SetInput("Inputs", in_var_names); - op_desc.SetOutput("Outputs", out_var_names); - op_desc.SetAttr("model_name", model_name); - return op_desc; -} - -void SubgraphProgramPass::InsertNewNode( - const std::unique_ptr& graph, - const std::string& model_name, - Scope* scope, - const std::vector& valid_places, - std::unordered_set in_data_vars, - std::unordered_set in_wgt_vars, - std::unordered_set out_data_vars, - std::unordered_set out_unused_vars) { - std::vector in_var_names; - std::vector out_var_names; - for (auto i : in_data_vars) { - in_var_names.push_back(i->AsArg().name); - } - for (auto i : out_data_vars) { - out_var_names.push_back(i->AsArg().name); - } - - auto op_desc = GenGraphOpDesc(model_name, in_var_names, out_var_names); - - auto graph_op = LiteOpRegistry::Global().Create("graph_op"); - graph_op->Attach(op_desc, scope); - auto* new_op_node = graph->GraphCreateInstructNode(graph_op, valid_places); - - for (auto& in_var : in_data_vars) { - IR_NODE_LINK_TO(in_var, new_op_node); - } - for (auto& in_var : in_wgt_vars) { - IR_NODE_LINK_TO(in_var, new_op_node); - } - for (auto& out_var : out_data_vars) { - IR_OP_VAR_LINK(new_op_node, out_var); - } - for (auto& out_var : out_unused_vars) { - IR_OP_VAR_LINK(new_op_node, out_var); - } - - // assign context - auto& inst = new_op_node->AsStmt(); - inst.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(inst.picked_kernel().target())); -} - -void SubgraphProgramPass::SortHelper( - Node* node, - const std::unordered_set& nodes_all, - std::unordered_set* visited_nodes, - std::vector* ret) 
{
-  for (auto& var_node : node->inlinks) {
-    if (var_node->inlinks.empty()) continue;
-    auto* op_node = var_node->inlinks.front();
-    if (nodes_all.count(op_node) && !visited_nodes->count(op_node)) {
-      SortHelper(op_node, nodes_all, visited_nodes, ret);
-    }
-  }
-  ret->push_back(node);
-  visited_nodes->insert(node);
-}
-
-std::vector<Node*> SubgraphProgramPass::GetTopologicalOrder(
-    const std::unordered_set<Node*>& nodes) {
-  std::unordered_set<Node*> visited;
-  std::vector<Node*> ret;
-  for (auto& node : nodes) {
-    if (!node->IsStmt()) continue;
-    if (visited.count(node)) continue;
-    SortHelper(node, nodes, &visited, &ret);
-  }
-  return ret;
-}
-
-void SubgraphProgramPass::FindInputOutputVars(
-    const std::unordered_set<Node*>& op_nodes,
-    std::unordered_set<Node*>* in_data_vars,
-    std::unordered_set<Node*>* in_wgt_vars,
-    std::unordered_set<Node*>* out_data_vars,
-    std::unordered_set<Node*>* out_unused_vars) {
-  for (auto& op_node : op_nodes) {
-    for (auto& in_var : op_node->inlinks) {
-      if (in_var->AsArg().is_weight) {
-        in_wgt_vars->insert(in_var);
-        continue;
-      }
-      if (!in_var->inlinks.empty()) {
-        // a var can only come from one op node, so use front()
-        auto* pre_op_node = in_var->inlinks.front();
-        if (op_nodes.count(pre_op_node)) {
-          continue;
-        }
-      }
-      in_data_vars->insert(in_var);
-    }
-    for (auto& out_var : op_node->outlinks) {
-      if (out_var->outlinks.empty()) {
-        // there is no next op, so this var is actually unused
-        out_unused_vars->insert(out_var);
-        continue;
-      }
-      // a var can have more than one next op node,
-      // so if any of them is in op_nodes, continue
-      bool next_op_in_nodes = false;
-      for (auto& next_op_node : out_var->outlinks) {
-        if (op_nodes.count(next_op_node)) {
-          next_op_in_nodes = true;
-        }
-      }
-      if (next_op_in_nodes) {
-        continue;
-      }
-
-      out_data_vars->insert(out_var);
-    }
-  }
-}
-
-std::unordered_set<const Node*> SubgraphProgramPass::GetNode2rm(
-    const std::unordered_set<Node*>& op_nodes,
-    const std::vector<std::unordered_set<Node*>>& excluded_nodes) {
-  std::unordered_set<const Node*> nodes2rm(op_nodes.begin(), op_nodes.end());
-  for (auto& op_node : op_nodes) {
-    for (auto& in_var : op_node->inlinks) {
-      if (!nodes2rm.count(in_var)) {
-        nodes2rm.insert(in_var);
-      }
-    }
-    for (auto& out_var : op_node->outlinks) {
-      if (!nodes2rm.count(out_var)) {
-        nodes2rm.insert(out_var);
-      }
-    }
-  }
-  // some nodes should not be removed
-  for (auto& e : excluded_nodes) {
-    for (auto& i : e) {
-      if (nodes2rm.count(i)) {
-        nodes2rm.erase(i);
-      }
-    }
-  }
-  return nodes2rm;
-}
-
-void SubgraphProgramPass::InferOnce(const std::unique_ptr<SSAGraph>& graph) {
-  for (auto& item : graph->StmtTopologicalOrder()) {
-    if (!item->IsStmt()) continue;
-    auto& stmt = item->AsStmt();
-    auto& op = stmt.op();
-    op->CheckShape();
-    op->InferShape();
-    // TODO(xxx): remove Launch() eventually
-    auto& kernels = stmt.kernels();
-    if (!kernels.empty()) {
-      auto& kernel = stmt.kernels().front();
-      if (kernel) {
-        kernel->Launch();
-      }
-    }
-  }
-}
-
-void SubgraphProgramPass::InitSubgraphID(
-    const std::unique_ptr<SSAGraph>& graph,
-    const std::vector<std::string>& supported_op_types) {
-  for (auto& item : graph->StmtTopologicalOrder()) {
-    if (!item->IsStmt()) continue;
-    auto& stmt = item->AsStmt();
-    stmt.ClearSubgraphID();
-    if (std::find(supported_op_types.begin(),
-                  supported_op_types.end(),
-                  stmt.op_type()) != supported_op_types.end()) {
-      stmt.SetSubgraphID(0);
-      LOG(INFO) << "supported " << stmt.op_type();
-    } else {
-      LOG(INFO) << "======= not supported " << stmt.op_type();
-    }
-  }
-}
-
-// mark the current node and all supported nodes connected to its outputs
-void SubgraphProgramPass::ChangeAllOutConnectedID(Node* node,
-                                                  int to_id,
-                                                  int from_id) {
-  if (node->IsStmt()) {
-    auto& stmt = node->AsStmt();
-    if (stmt.subgraph_id() == from_id) {
-      stmt.SetSubgraphID(to_id);
-      for (auto& i : node->outlinks) {
-        ChangeAllOutConnectedID(i, to_id, from_id);
-      }
-    } else {
-      LOG(INFO) << "failed op type:" << stmt.op_type();
-      return;
-    }
-  } else {
-    // this is an arg node
-    bool all_out_op_supported = true;
-    for (auto& i : node->outlinks) {
-      if (!i->IsStmt()) return;
-      auto& stmt = i->AsStmt();
-      if (stmt.subgraph_id() < from_id) {
-        all_out_op_supported = false;
-      }
-    }
-    if (!all_out_op_supported) {
-      return;
-    }
-    for (auto& i : node->outlinks) {
-      CHECK(i->IsStmt());
-      auto& stmt = i->AsStmt();
-      if (stmt.subgraph_id() == from_id) {
-        stmt.SetSubgraphID(to_id);
-        for (auto& o : i->outlinks) {
-          ChangeAllOutConnectedID(o, to_id, from_id);
-        }
-      }
-    }
-  }
-}
-
-int SubgraphProgramPass::FuseSubgraphID(
-    const std::unique_ptr<SSAGraph>& graph) {
-  int sub_id = 1;  // ids start from 1, not 0
-  for (auto& item : graph->StmtTopologicalOrder()) {
-    bool inputvar = 0;  // note: computed below but never used
-    if (!item->IsStmt()) continue;
-    auto& stmt = item->AsStmt();
-    if (stmt.subgraph_id() == -1) {
-      for (auto& i : item->outlinks) {
-        for (auto& j : i->outlinks) {
-          if (j->IsStmt()) {
-            auto& jstmt = j->AsStmt();
-            if (jstmt.subgraph_id() == 0) inputvar = 1;
-          }
-        }
-      }
-    }
-    if (stmt.subgraph_id() != 0) continue;
-    ChangeAllOutConnectedID(item, sub_id);
-    sub_id++;
-  }
-  return sub_id - 1;
-}
-
-int SubgraphProgramPass::FuseSubgraph(
-    const std::unique_ptr<SSAGraph>& graph,
-    const std::vector<std::string>& supported_op_types) {
-  InitSubgraphID(graph, supported_op_types);
-  return FuseSubgraphID(graph);
-}
-}  // namespace subgraph
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_MIR_PASS(subgraph_program_pass,
-                  paddle::lite::mir::subgraph::SubgraphProgramPass)
-    .SetTargets({TARGET(kAny)});
diff --git a/lite/core/mir/subgraph/subgraph_program_pass.h b/lite/core/mir/subgraph/subgraph_program_pass.h
deleted file mode 100644
index 51e9367539..0000000000
--- a/lite/core/mir/subgraph/subgraph_program_pass.h
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "lite/core/mir/pass.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-namespace subgraph {
-
-class SubgraphProgramPass : public ProgramPass {
- public:
-  using key2nodes_t = std::map<std::string, Node*>;
-
-  // mark all linked ops in a subgraph with the same subgraph_id;
-  // returns the number of fused subgraphs
-  int FuseSubgraph(const std::unique_ptr<SSAGraph>& graph,
-                   const std::vector<std::string>& supported_op_types);
-
-  void Apply(const std::unique_ptr<SSAGraph>& graph) override {}
-
- protected:
-  void InferOnce(const std::unique_ptr<SSAGraph>& graph);
-
-  // clear all subgraph ids and mark all ops that could be fused with id zero
-  void InitSubgraphID(const std::unique_ptr<SSAGraph>& graph,
-                      const std::vector<std::string>& supported_op_types);
-
-  // mark all linked ops in a subgraph with the same subgraph_id;
-  // returns the number of fused subgraphs
-  int FuseSubgraphID(const std::unique_ptr<SSAGraph>& graph);
-
-  // // GenerateFusedGraph:
-  // std::unique_ptr<SSAGraph> GenerateFusedGraph(const
-  // std::unique_ptr<SSAGraph>& graph, int sub_num);
-  void ChangeAllOutConnectedID(Node* node, int to_id, int from_id = 0);
-
-  // The functions below could be useful in child classes.
-  // classify nodes by subgraph id
-  std::unordered_map<int, std::unordered_set<Node*>> ClassifySubgraph(
-      const std::unique_ptr<SSAGraph>& graph);
-
-  // generate the graph op desc
-  cpp::OpDesc GenGraphOpDesc(const std::string& model_name,
-                             const std::vector<std::string>& in_var_names,
-                             const std::vector<std::string>& out_var_names);
-
-  // insert a new graph op node
-  void InsertNewNode(const std::unique_ptr<SSAGraph>& graph,
-                     const std::string& model_name,
-                     Scope* scope,
-                     const std::vector<Place>& valid_places,
-                     std::unordered_set<Node*> in_data_vars,
-                     std::unordered_set<Node*> in_wgt_vars,
-                     std::unordered_set<Node*> out_data_vars,
-                     std::unordered_set<Node*> out_unused_vars);
-
-  // sort and return the topological order of a node set
-  std::vector<Node*> GetTopologicalOrder(
-      const std::unordered_set<Node*>& nodes);
-
-  // find all input data vars, input weight vars,
-  // output data vars and unused output vars from the nodes
-  void FindInputOutputVars(const std::unordered_set<Node*>& op_nodes,
-                           std::unordered_set<Node*>* in_data_vars,
-                           std::unordered_set<Node*>* in_wgt_vars,
-                           std::unordered_set<Node*>* out_data_vars,
-                           std::unordered_set<Node*>* out_unused_vars);
-
-  // return the nodes to remove from the subgraph
-  std::unordered_set<const Node*> GetNode2rm(
-      const std::unordered_set<Node*>& op_nodes,
-      const std::vector<std::unordered_set<Node*>>& excluded_nodes);
-
- private:
-  // sort nodes into execution order
-  void SortHelper(Node* node,
-                  const std::unordered_set<Node*>& nodes_all,
-                  std::unordered_set<Node*>* visited_nodes,
-                  std::vector<Node*>* ret);
-};
-
-}  // namespace subgraph
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/subgraph/subgraph_program_pass_test.cc b/lite/core/mir/subgraph/subgraph_program_pass_test.cc
deleted file mode 100644
index de4acec91d..0000000000
--- a/lite/core/mir/subgraph/subgraph_program_pass_test.cc
+++ /dev/null
@@ -1,223 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "lite/core/mir/subgraph/subgraph_program_pass.h" -#include -#include -#include -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/mir/ssa_graph.h" -#include "lite/core/program.h" -#include "lite/model_parser/cpp/program_desc.h" -#include "lite/model_parser/model_parser.h" - -DEFINE_string(model_dir, "", "model_dir"); - -namespace paddle { -namespace lite { - -TEST(SubgraphTest, models) { - cpp::ProgramDesc program_desc; - auto scope = std::make_shared(); - // LoadModelPb(FLAGS_model_dir, - // FLAGS_model_dir + "/model", - // FLAGS_model_dir + "/params", - // scope.get(), - // &program_desc, - // true); - LoadModelPb(FLAGS_model_dir, "", "", scope.get(), &program_desc); - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, -#ifdef LITE_WITH_ARM - Place{TARGET(kARM), PRECISION(kFloat)}, -#endif -#ifdef LITE_WITH_NPU - Place{TARGET(kNPU), PRECISION(kFloat)}, -#endif - }); - lite::Program program(program_desc, scope, valid_places); - auto graph = std::unique_ptr(new mir::SSAGraph()); - graph->Build(program, valid_places); - - std::vector supported_op_types{"concat", - "conv2d", - "depthwise_conv2d", - "batch_norm", - "scale", - "pool2d", - "mul", - "elementwise_add", - "softmax", - "split", - "relu", - "reshape2", - "transpose2"}; - auto* pass = new mir::subgraph::SubgraphProgramPass; - ASSERT_EQ(pass->FuseSubgraph(graph, supported_op_types), 1); - LOG(INFO) << "After NPU Pass \n" << Visualize(graph.get()); -} - -// return output_var_names -std::vector AddFCDesc( - cpp::BlockDesc* block_desc, - const std::shared_ptr& scope, - const std::vector& input_var_names, - const std::vector& wshape) { - CHECK_EQ(input_var_names.size(), 1); - CHECK_EQ(wshape.size(), 2); - static int id = 0; - std::string prefix = "fc_" + std::to_string(id); - auto* op_desc = block_desc->AddOp(); - auto* wgt = block_desc->AddVar(); - auto* bias = block_desc->AddVar(); - auto* out = block_desc->AddVar(); - - wgt->SetName(prefix + "_W"); - bias->SetName(prefix + "_Bias"); - out->SetName(prefix + "_Out"); - std::vector out_var_names{prefix + "_Out"}; - - auto* wtensor = scope->Var(prefix + "_W")->GetMutable(); - wtensor->Resize(wshape); - wtensor->mutable_data(); - - auto* btensor = scope->Var(prefix + "_Bias")->GetMutable(); - btensor->Resize({wshape[1]}); - btensor->mutable_data(); - - scope->Var(prefix + "_Out")->GetMutable(); - - op_desc->SetType("fc"); - op_desc->SetInput("Input", input_var_names); - op_desc->SetInput("W", {prefix + "_W"}); - op_desc->SetInput("Bias", {prefix + "_Bias"}); - op_desc->SetAttr("in_num_col_dims", 1); - op_desc->SetOutput("Out", out_var_names); - id++; - return out_var_names; -} - -std::vector AddElementwiseAddDesc( - cpp::BlockDesc* block_desc, - const std::shared_ptr& scope, - const std::vector& input_X_names, - const std::vector& input_Y_names) { - // CHECK_EQ(input_var_names.size(), 2); - static int id = 0; - std::string prefix = "elementwise_add_" + std::to_string(id); - auto* op_desc = block_desc->AddOp(); - auto* out = block_desc->AddVar(); - - out->SetName(prefix + "_Out"); - std::vector out_var_names{prefix + "_Out"}; - - scope->Var(prefix + "_Out")->GetMutable(); - - op_desc->SetType("elementwise_add"); - op_desc->SetInput("X", input_X_names); - op_desc->SetInput("Y", input_Y_names); - op_desc->SetOutput("Out", out_var_names); - op_desc->SetAttr("axis", -1); - id++; - return out_var_names; -} - -std::vector AddFeedDesc( - cpp::BlockDesc* block_desc, - 
const std::shared_ptr& scope, - const std::vector& input_X_names) { - // CHECK_EQ(input_var_names.size(), 1); - static int id = 0; - std::string prefix = "feed_" + std::to_string(id); - auto* op_desc = block_desc->AddOp(); - auto* out = block_desc->AddVar(); - - out->SetName(prefix + "_Out"); - std::vector out_var_names{prefix + "_Out"}; - - scope->Var(prefix + "_Out")->GetMutable(); - - op_desc->SetType("feed"); - op_desc->SetInput("X", input_X_names); - op_desc->SetOutput("Out", out_var_names); - op_desc->SetAttr("col", 1); - id++; - return out_var_names; -} - -std::vector AddFetchDesc( - cpp::BlockDesc* block_desc, - const std::shared_ptr& scope, - const std::vector& input_X_names) { - // CHECK_EQ(input_var_names.size(), 1); - static int id = 0; - std::string prefix = "fetch_" + std::to_string(id); - auto* op_desc = block_desc->AddOp(); - auto* out = block_desc->AddVar(); - - out->SetName(prefix + "_Out"); - std::vector out_var_names{prefix + "_Out"}; - - scope->Var(prefix + "_Out")->GetMutable(); - - op_desc->SetType("fetch"); - op_desc->SetInput("X", input_X_names); - op_desc->SetOutput("Out", out_var_names); - op_desc->SetAttr("col", 1); - id++; - return out_var_names; -} - -std::unique_ptr BuildSimpleNet( - cpp::ProgramDesc* program_desc, - const std::shared_ptr& scope, - const std::vector& valid_places) { - program_desc->ClearBlocks(); - auto* block_desc = program_desc->AddBlock(); - block_desc->ClearOps(); - block_desc->ClearVars(); - - auto* var_desc = block_desc->AddVar(); - var_desc->SetName("feed_var"); - auto* feed_var = scope->Var("feed_var")->GetMutable(); - feed_var->Resize({1, 4}); - auto fc1_out = AddFCDesc(block_desc, scope, {"feed_var"}, {4, 5}); - auto fc2_out = AddFCDesc(block_desc, scope, fc1_out, {5, 2}); - - lite::Program program(*program_desc, scope, valid_places); - auto graph = std::unique_ptr(new mir::SSAGraph()); - graph->Build(program, valid_places); - - return graph; -} - -TEST(SubGraphTest, SimpleNet) { - cpp::ProgramDesc program_desc; - std::vector places{{TARGET(kHost), PRECISION(kFloat)}}; - auto scope = std::make_shared(); - auto graph = BuildSimpleNet(&program_desc, scope, places); - - std::vector supported_op_types{"fc"}; - auto* pass = new mir::subgraph::SubgraphProgramPass; - ASSERT_EQ(pass->FuseSubgraph(graph, supported_op_types), 1); - - const int num_nodes = graph->nodes().size(); - ASSERT_EQ(graph->nodes().size(), 9); - // LOG(INFO) << "After NPU Pass \n" << Visualize(graph.get()); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc deleted file mode 100644 index fbd3f9e1d2..0000000000 --- a/lite/core/mir/type_layout_cast_pass.cc +++ /dev/null @@ -1,177 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/core/mir/type_layout_cast_pass.h" -#include -#include -#include -#include -#include -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/mir/pass_registry.h" -#include "lite/utils/string.h" - -namespace paddle { -namespace lite { -namespace mir { - -void TypeLayoutTransformPass::Apply(const std::unique_ptr& graph) { - // Start from inputs of the graph, those should have place set. - std::list nodes; - for (auto& node : graph->mutable_nodes()) { - nodes.push_back(&node); - } - - for (auto& node : nodes) { - if (!node->IsStmt()) continue; - auto inlinks = node->inlinks; - for (auto* in : inlinks) { - ComplementInputs(graph.get(), node, in); - } - } - VLOG(3) << "\n" << Visualize(graph.get()); -} - -void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph, - Node* inst_node, - Node* in) { - // If this input is out of date. - if (inst_node->inlinks.end() == - std::find(inst_node->inlinks.begin(), inst_node->inlinks.end(), in)) - return; - - CHECK(inst_node->IsStmt()); - auto& inst = inst_node->AsStmt(); - CHECK(in->IsRoleSet()); - CHECK(in->IsArg()); - auto in_arg_name = in->AsArg().name; - std::string tmp; - CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp)); - auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp); - CHECK(in->AsArg().type); - if (!DataLayoutCompatible(*in->AsArg().type, *decl_arg_type)) { - VLOG(4) << "found Layout unmatched tensor: " << in->AsArg().name - << " for kernel " << inst.op()->DebugString() << " " - << *in->AsArg().type << " -> " << *decl_arg_type; - AddLayoutInst(*in->AsArg().type, - *decl_arg_type, - in, - graph, - inst_node, - graph->valid_places()); - } -} - -void TypeLayoutTransformPass::AddLayoutInst( - const Type& from, - const Type& to, - Node* in, - SSAGraph* graph, - Node* inst_node, - const std::vector& valid_places) { - CHECK(!valid_places.empty()) << "valid_place should be set"; - - CHECK(in->IsArg()); - auto node_id = [&] { return graph->nodes().size(); }; - auto layout_output_name = - string_format("%s/trans/%d", in->AsArg().name.c_str(), node_id()); - auto* layout_output_arg = graph->NewArgumentNode(layout_output_name); - auto* layout_inst = graph->NewInstructNode(); - - bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; - std::string layout_type = in_persist ? "layout_once" : "layout"; - // create Op and kernels. - auto layout_op = LiteOpRegistry::Global().Create(layout_type); - CHECK(layout_op) << "create op [" << layout_op << "] failed"; - layout_output_arg->AsArg().is_persist = in_persist; - // Create the new var manually. - inst_node->AsStmt().op()->scope()->Var(layout_output_name); - - // Create IoCopy Instruction. 
- cpp::OpDesc op_desc; - op_desc.SetType(layout_type); - op_desc.SetInput("Input", {in->AsArg().name}); - op_desc.SetOutput("Out", {layout_output_name}); - - layout_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); - auto kernels = layout_op->CreateKernels(valid_places); - std::vector> selected_kernels; - bool is_found = false; - for (auto& kernel : kernels) { - const Type* in_arg_ty = kernel->GetInputDeclType("Input"); - const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); - if (TypeCompatible(*in_arg_ty, from)) { - is_found = true; - selected_kernels.emplace_back(std::move(kernel)); - // we pick the kernel - layout_inst->AsStmt(layout_type, std::move(kernels), layout_op); - break; - } - } - CHECK(is_found) << "Can't find a layout kernel for layout op: " << from - << ":" << in->AsArg().name << "->" << to << ":" - << inst_node->AsStmt().op_info()->Type(); - - // Remove the old link - RemoveDirectedLink(in, inst_node); - - // Update the original instruction OpDesc. - // Update its input to the layout_output_name - // Add new link, var -> new_inst, new_inst->newarg, newarg->inst - DirectedLink(in, layout_inst); - DirectedLink(layout_inst, layout_output_arg); - DirectedLink(layout_output_arg, inst_node); - - // reset opdesc and update kernel information - UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), - in->AsArg().name, - layout_output_name); - auto original_selected_kernel = - std::move(inst_node->AsStmt().kernels().front()); - auto update_op_info = *inst_node->AsStmt().op_info(); - // ResetOp() will change the Stmt op_info_ value, - // after that the old op_info_ value will be nullified. - // So, we can't pass `*inst_node->AsStmt().op_info()` into ResetOp. - // `update_op_info` is the copy of `*inst_node->AsStmt().op_info(). - // Whenever update the op_info of a stmt, we should call its ResetOp(). - inst_node->AsStmt().ResetOp(update_op_info, graph->valid_places()); - inst_node->AsStmt().kernels().clear(); - inst_node->AsStmt().kernels().emplace_back( - std::move(original_selected_kernel)); - - std::string tmp; - if (inst_node->AsStmt().op_info()->GetInputArgname("a", &tmp)) { - CHECK(false) << "get old a " << tmp; - } - - for (auto& kernel : inst_node->AsStmt().kernels()) { - inst_node->AsStmt().op()->AttachKernel(kernel.get()); - } - - graph->CheckValid(); -} - -void TypeLayoutTransformPass::SetValidPlaces( - const std::vector& valid_places) { - CHECK(!valid_places.empty()); - valid_places_ = valid_places; -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(type_layout_cast_pass, - paddle::lite::mir::TypeLayoutTransformPass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/type_layout_cast_pass.h b/lite/core/mir/type_layout_cast_pass.h deleted file mode 100644 index bf36214e1d..0000000000 --- a/lite/core/mir/type_layout_cast_pass.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include "lite/core/mir/pass.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -static void UpdateInputTo(cpp::OpDesc* desc, - const std::string& from, - const std::string& to) { - for (auto& item : *desc->mutable_inputs()) { - for (auto& input : item.second) { - if (input == from) { - input = to; - } - } - } -} - -class TypeLayoutTransformPass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override; - - void ComplementInputs(SSAGraph* graph, Node* inst_node, Node* in); - - void AddLayoutInst(const Type& from, - const Type& to, - Node* in, - SSAGraph* graph, - Node* inst_node, - const std::vector& valid_places); - - void SetValidPlaces(const std::vector& valid_places); - - const std::vector& valid_places() const { return valid_places_; } - - private: - std::vector valid_places_; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc deleted file mode 100644 index 7cd22e25ac..0000000000 --- a/lite/core/mir/type_precision_cast_pass.cc +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/type_precision_cast_pass.h" -#include -#include -#include -#include -#include -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -void PrecisionCastPass::Apply(const std::unique_ptr& graph) { - // Start from inputs of the graph, those should have place set. - std::list nodes; - for (auto& node : graph->mutable_nodes()) { - nodes.push_back(&node); - } - - for (auto& node : nodes) { - if (!node->IsStmt()) continue; - auto inlinks = node->inlinks; - for (auto* in : inlinks) { - ComplementInputs(graph.get(), node, in); - } - } -} - -void PrecisionCastPass::ComplementInputs(SSAGraph* graph, - Node* inst_node, - Node* in) { - // If this input is out of date. - if (inst_node->inlinks.end() == - std::find(inst_node->inlinks.begin(), inst_node->inlinks.end(), in)) - return; - - CHECK(inst_node->IsStmt()); - auto& inst = inst_node->AsStmt(); - CHECK(in->IsRoleSet()); - CHECK(in->IsArg()); - auto in_arg_name = in->AsArg().name; - std::string tmp; - CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp)); - auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp); - CHECK(in->AsArg().type); - VLOG(4) << inst.picked_kernel().name(); - // if (!in->AsArg().is_weight && !PrecisionCompatibleTo(*in->AsArg().type, - // *decl_arg_type)) { - if (!PrecisionCompatibleTo(*in->AsArg().type, *decl_arg_type)) { - VLOG(4) << "found Target unmatched tensor: " << in->AsArg().name - << " for kernel " << inst.op()->DebugString() << " " - << *in->AsArg().type << " -> " << *decl_arg_type; - // Add an Cast instruction to make the input compatible with other dist. 
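-    // i.e. rewrite the edge (var -> inst) into (var -> calib op -> var/trans/<n> -> inst).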
- AddCastInst(*in->AsArg().type, - *decl_arg_type, - in, - graph, - inst_node, - graph->valid_places()); - } -} - -void PrecisionCastPass::AddCastInst(const Type& from, - const Type& to, - Node* in, - SSAGraph* graph, - Node* inst_node, - const std::vector& valid_places) { - CHECK(!valid_places.empty()) << "valid_place should be set"; - - // var -> new_transform_op -> new_var -> inst - // So there will be a new Argument node and a new Cast Statement Node. - CHECK(in->IsArg()); - auto node_id = [&] { return graph->nodes().size(); }; - auto cast_op_output_name = - in->AsArg().name + "/trans/" + std::to_string(node_id()); - auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); - auto* cast_inst = graph->NewInstructNode(); - - // create Op and kernels. - bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; - std::string cast_type = in_persist ? "calib_once" : "calib"; - cast_op_output_arg->AsArg().is_persist = in_persist; - auto cast_op = LiteOpRegistry::Global().Create(cast_type); - CHECK(cast_op) << "create op [" << cast_op << "] failed"; - - // Create the new var manually. - inst_node->AsStmt().op()->scope()->Var(cast_op_output_name); - - // Create Calib Instruction. - cpp::OpDesc op_desc; - op_desc.SetType(cast_type); - op_desc.SetInput("Input", {in->AsArg().name}); - op_desc.SetOutput("Out", {cast_op_output_name}); - if (inst_node->AsStmt().op_info()->HasAttr("input_scale")) { - op_desc.SetAttr( - "scale", inst_node->AsStmt().op_info()->GetAttr("input_scale")); - } - cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); - auto kernels = cast_op->CreateKernels(valid_places); - std::vector> selected_kernels; - bool is_found = false; - for (auto& kernel : kernels) { - const Type* in_arg_ty = kernel->GetInputDeclType("Input"); - const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); -// TODO(xg): to optimize this -#ifndef LITE_WITH_FPGA - if (in_arg_ty->precision() == from.precision() && - out_arg_ty->precision() == to.precision()) { -#else - if (TypeCompatible(*in_arg_ty, from)) { -#endif - is_found = true; - selected_kernels.emplace_back(std::move(kernel)); - // we pick the kernel - cast_inst->AsStmt(cast_type, std::move(selected_kernels), cast_op); - break; - } - } - - CHECK(is_found) << "Can't find a Cast kernel for Cast op: " << from << ":" - << in->AsArg().name << "->" << to << ":" - << inst_node->AsStmt().op_info()->Type(); - - // Remove the old link - RemoveDirectedLink(in, inst_node); - - // Update the original instruction OpDesc. 
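-  // (In this pass the new input is cast_op_output_name; the io_copy naming in the next comment is a leftover from the target-cast pass.)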
- // Update its input to the io_copy_output_name - - // Add new link, var -> new_inst, new_inst->newarg, newarg->inst - DirectedLink(in, cast_inst); - DirectedLink(cast_inst, cast_op_output_arg); - DirectedLink(cast_op_output_arg, inst_node); - - // reset opdesc and update kernel information - UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), - in->AsArg().name, - cast_op_output_name); - - // recreate the op - auto original_selected_kernel = - std::move(inst_node->AsStmt().kernels().front()); - auto updated_op_info = *inst_node->AsStmt().mutable_op_info(); - - inst_node->AsStmt().ResetOp(updated_op_info, graph->valid_places()); - inst_node->AsStmt().kernels().clear(); - inst_node->AsStmt().kernels().emplace_back( - std::move(original_selected_kernel)); - for (auto& kernel : inst_node->AsStmt().kernels()) { - VLOG(4) << "kernel info: " << kernel->name(); - inst_node->AsStmt().op()->AttachKernel(kernel.get()); - } - graph->CheckValid(); -} - -void PrecisionCastPass::SetValidPlaces(const std::vector& valid_places) { - CHECK(!valid_places.empty()); - valid_places_ = valid_places; -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(type_precision_cast_pass, - paddle::lite::mir::PrecisionCastPass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/type_precision_cast_pass.h b/lite/core/mir/type_precision_cast_pass.h deleted file mode 100644 index 3f55e52ef9..0000000000 --- a/lite/core/mir/type_precision_cast_pass.h +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include "lite/core/mir/pass.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -static void UpdateInputTo(cpp::OpDesc* desc, - const std::string& from, - const std::string& to) { - for (auto& item : *desc->mutable_inputs()) { - for (auto& input : item.second) { - if (input == from) { - input = to; - } - } - } -} - -/* - * The pass complement the necessary instruction to make data - * transferring or transformation between different places. - */ -class PrecisionCastPass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override; - - void ComplementInputs(SSAGraph* graph, Node* inst_node, Node* in); - - void AddCastInst(const Type& from, - const Type& to, - Node* in, - SSAGraph* graph, - Node* inst_node, - const std::vector& valid_places); - - void SetValidPlaces(const std::vector& valid_places); - - const std::vector& valid_places() const { return valid_places_; } - - private: - std::vector valid_places_; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc deleted file mode 100644 index 5a07fdd9d9..0000000000 --- a/lite/core/mir/type_target_cast_pass.cc +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/type_target_cast_pass.h" -#include -#include -#include -#include -#include -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/mir/pass_registry.h" -#include "lite/utils/string.h" - -namespace paddle { -namespace lite { -namespace mir { - -void TypeTargetTransformPass::Apply(const std::unique_ptr& graph) { - // Start from inputs of the graph, those should have place set. - std::list nodes; - for (auto& node : graph->mutable_nodes()) { - nodes.push_back(&node); - } - - CHECK(!valid_places_.empty()); - - for (auto& node : nodes) { - if (!node->IsStmt()) continue; - auto inlinks = node->inlinks; - for (auto* in : inlinks) { - ComplementInputs(graph.get(), node, in); - } - } -} - -void TypeTargetTransformPass::ComplementInputs(SSAGraph* graph, - Node* inst_node, - Node* in) { - // If this input is out of date. - if (inst_node->inlinks.end() == - std::find(inst_node->inlinks.begin(), inst_node->inlinks.end(), in)) - return; - - CHECK(inst_node->IsStmt()); - auto& inst = inst_node->AsStmt(); - LOG(INFO) << "found Target tensor: " << in->AsArg().name; - CHECK(in->IsRoleSet()); - CHECK(in->IsArg()); - auto in_arg_name = in->AsArg().name; - std::string tmp; - CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp)); - auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp); - CHECK(in->AsArg().type); - if (!TargetCompatibleTo(*in->AsArg().type, *decl_arg_type)) { - LOG(INFO) << "found Target unmatched tensor: " << in->AsArg().name - << " for kernel " << inst.op()->DebugString() << " " - << *in->AsArg().type << " -> " << *decl_arg_type; - // Add an IoCopy instruction to make the input compatible with other dist. - AddIoCopyInst( - *in->AsArg().type, *decl_arg_type, in, graph, inst_node, valid_places_); - } -} - -void TypeTargetTransformPass::AddIoCopyInst( - const Type& from, - const Type& to, - Node* in, - SSAGraph* graph, - Node* inst_node, - const std::vector& valid_places) { - CHECK(!valid_places.empty()) << "valid_place should be set"; - // var -> new_transform_op -> new_var -> inst - // So there will be a new Argument node and a new IoCopy Statement Node. - - CHECK(in->IsArg()); - auto node_id = [&] { return graph->nodes().size(); }; - auto io_copy_output_name = - string_format("%s/trans/%d", in->AsArg().name.c_str(), node_id()); - // TODO(MyPandaShaoxiang) should set same place with input? - auto* io_copy_output_arg = graph->NewArgumentNode(io_copy_output_name); - auto* io_copy_inst = graph->NewInstructNode(); - - bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; - std::string io_copy_type = in_persist ? "io_copy_once" : "io_copy"; - io_copy_output_arg->AsArg().is_persist = in_persist; - // create Op and kernels. - auto io_copy_op = LiteOpRegistry::Global().Create(io_copy_type); - CHECK(io_copy_op) << "create op [" << io_copy_op << "] failed"; - // CHECK(io_copy_op); - // Create the new var manually. 
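-  // The io_copy op's output variable must exist in the scope before Attach() binds the op to it.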
- inst_node->AsStmt().op()->scope()->Var(io_copy_output_name); - - // Create IoCopy Instruction. - cpp::OpDesc op_desc; - op_desc.SetType(io_copy_type); - op_desc.SetInput("Input", {in->AsArg().name}); - op_desc.SetOutput("Out", {io_copy_output_name}); - - io_copy_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); - auto kernels = io_copy_op->CreateKernels(valid_places); - // fix(MyPandaShaoxiang): select kernel that input_dcl_type same as in.type - bool is_found = false; - std::vector> selected_kernels; - for (auto& kernel : kernels) { - const Type* in_arg_ty = kernel->GetInputDeclType("Input"); - const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); - if (TypeCompatible(*in_arg_ty, from)) { - is_found = true; - selected_kernels.emplace_back(std::move(kernel)); - // we pick the kernel - io_copy_inst->AsStmt( - io_copy_type, std::move(selected_kernels), io_copy_op); - break; - } - } - CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from - << ":" << in->AsArg().name << "->" << to << ":" - << inst_node->AsStmt().op_info()->Type(); - - // Remove the old link - RemoveDirectedLink(in, inst_node); - - // Update the original instruction OpDesc. - // Update its input to the io_copy_output_name - // Add new link, var -> new_inst, new_inst->newarg, newarg->inst - DirectedLink(in, io_copy_inst); - DirectedLink(io_copy_inst, io_copy_output_arg); - DirectedLink(io_copy_output_arg, inst_node); - - // reset opdesc and update kernel information - UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), - in->AsArg().name, - io_copy_output_name); - auto original_selected_kernel = - std::move(inst_node->AsStmt().kernels().front()); - auto update_op_info = *inst_node->AsStmt().op_info(); - // ResetOp() will change the Stmt op_info_ value, - // after that the old op_info_ value will be nullified. - // So, we can't pass `*inst_node->AsStmt().op_info()` into ResetOp. - // `update_op_info` is the copy of `*inst_node->AsStmt().op_info(). - // Whenever update the op_info of a stmt, we should call its ResetOp(). - inst_node->AsStmt().ResetOp(update_op_info, graph->valid_places()); - inst_node->AsStmt().kernels().clear(); - inst_node->AsStmt().kernels().emplace_back( - std::move(original_selected_kernel)); - - std::string tmp; - if (inst_node->AsStmt().op_info()->GetInputArgname("a", &tmp)) { - CHECK(false) << "get old a " << tmp; - } - - for (auto& kernel : inst_node->AsStmt().kernels()) { - VLOG(4) << "kernel info: " << kernel->name(); - inst_node->AsStmt().op()->AttachKernel(kernel.get()); - } - - graph->CheckValid(); -} - -void TypeTargetTransformPass::SetValidPlaces( - const std::vector& valid_places) { - CHECK(!valid_places.empty()); - valid_places_ = valid_places; -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(type_target_cast_pass, - paddle::lite::mir::TypeTargetTransformPass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/type_target_cast_pass.h b/lite/core/mir/type_target_cast_pass.h deleted file mode 100644 index 8a8cfaf9f9..0000000000 --- a/lite/core/mir/type_target_cast_pass.h +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include "lite/core/mir/pass.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -static void UpdateInputTo(cpp::OpDesc* desc, - const std::string& from, - const std::string& to) { - for (auto& item : *desc->mutable_inputs()) { - for (auto& input : item.second) { - if (input == from) { - input = to; - } - } - } -} - -/* - * IoComplementPass complement the necessary instruction to make data - * transferring or transformation between different places. - */ -class TypeTargetTransformPass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override; - - void ComplementInputs(SSAGraph* graph, Node* inst_node, Node* in); - - void AddIoCopyInst(const Type& from, - const Type& to, - Node* in, - SSAGraph* graph, - Node* inst_node, - const std::vector& valid_places); - - void SetValidPlaces(const std::vector& valid_places); - - const std::vector& valid_places() const { return valid_places_; } - - private: - std::vector valid_places_; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/variable_place_inference_pass.cc b/lite/core/mir/variable_place_inference_pass.cc deleted file mode 100644 index 1f8aea8172..0000000000 --- a/lite/core/mir/variable_place_inference_pass.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/variable_place_inference_pass.h" -#include -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -void VariablePlaceInferencePass::Apply(const std::unique_ptr &graph) { - MarkInputPlace(graph.get()); - InferenceArgumentPlace(graph.get()); - CheckAllArgumentTypeDetermined(graph.get()); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(variable_place_inference_pass, - paddle::lite::mir::VariablePlaceInferencePass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/variable_place_inference_pass.h b/lite/core/mir/variable_place_inference_pass.h deleted file mode 100644 index d5b0bb8aa6..0000000000 --- a/lite/core/mir/variable_place_inference_pass.h +++ /dev/null @@ -1,157 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-#include "lite/core/mir/pass.h"
-#include "lite/core/target_wrapper.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-/*
- * Mark the places of the variables in the SSAGraph; a variable's place is
- * inferred from the kernels that output it.
- */
-class VariablePlaceInferencePass : public DebugPass {
- public:
-  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
-
- private:
-  // Mark the place of input arguments.
-  void MarkInputPlace(SSAGraph* graph) {
-    CHECK(!graph->inputs().empty()) << "graph's inputs should be set";
-    for (const auto& v : graph->inputs()) {
-      // the feed op might be among the inputs
-      if (v->IsStmt()) {
-        VLOG(4) << "found kernel in inputs " << v->AsStmt().op_type();
-        continue;
-      }
-    }
-  }
-
-  void CheckAllArgumentTypeDetermined(SSAGraph* graph) {
-    for (auto& node : graph->mutable_nodes()) {
-      if (node.IsArg()) {
-        CHECK(node.AsArg().type) << "node " << node.AsArg().name
-                                 << " type not determined, " << &node;
-      }
-    }
-  }
-
-  // Set the type of the weight
-  void SetWeightType(Node* w, const LiteType& type) {
-// TODO(xg): optimize this
-#ifndef LITE_WITH_FPGA
-    w->AsArg().type =
-        LiteType::GetTensorTy(TARGET(kHost), type.precision(), type.layout());
-#else
-    w->AsArg().type = LiteType::GetTensorTy(
-        TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
-#endif
-  }
-
-  void InferenceArgumentPlace(SSAGraph* graph) {
-    VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
-    for (auto& x : graph->StmtTopologicalOrder()) {
-      auto& inst = x->AsStmt();
-// The IoCopyOp is a tool operator and normally takes no part in type
-// inference; on FPGA, however, io_copy+calib+layout tool ops are inserted,
-// so type inference is needed for tool operators there.
-#ifndef LITE_WITH_FPGA
-      if (inst.op_type() == "io_copy") continue;
-#endif
-      // deal with inputs
-      VLOG(4) << "Inferring op " << inst.op_info()->Repr();
-      // TODO(zhaolong): Add a check that the node's name is among the op's
-      // arguments.
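-      // get_argname: map a variable node's name back to the op argument slot (e.g. "X" or "W") it is bound to.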
- - auto get_argname = [&]( - const std::string& node_name, - const std::map>& argname_map) - -> std::string { - for (auto& ele : argname_map) { - auto it = - std::find(ele.second.begin(), ele.second.end(), node_name); - if (it != ele.second.end()) return ele.first; - } - return ""; - }; - - for (auto* x_in : x->inlinks) { - std::string node_name = x_in->AsArg().name; - std::string arg_name = get_argname(node_name, inst.op_info()->inputs()); - CHECK(arg_name.size() > 0) << "can not found op arguments for node " - << node_name; - VLOG(4) << "-- input arg_name " << arg_name - << "-- node name :" << node_name; - auto type = inst.picked_kernel().GetInputDeclType(arg_name); - if (!x_in->AsArg().type) { - VLOG(4) << "set type " << *type << " " << x_in->AsArg().name; - if (x_in->AsArg().is_weight) { - SetWeightType(x_in, *type); - } else { - x_in->AsArg().type = type; - } - } - } - - VLOG(4) << "inst " << inst.op_info()->Repr(); - for (auto* x_out : x->outlinks) { - std::string node_name = x_out->AsArg().name; - std::string arg_name = - get_argname(node_name, inst.op_info()->outputs()); - CHECK(arg_name.size() > 0) << "can not found op arguments for node " - << node_name << " in Inst " - << inst.op_type(); - VLOG(4) << "-- output arg_name " << arg_name; - auto type = inst.picked_kernel().GetOutputDeclType(arg_name); - if (!x_out->AsArg().type) { - VLOG(4) << "set type " << *type << " " << x_out->AsArg().name; - if (x_out->AsArg().is_weight) { - SetWeightType(x_out, *type); - } else { - x_out->AsArg().type = type; - } - } - } - } - } - - // Update me's kUnk fields by other's fields. - void UpdatePlace(Place* me, const Place& other) { - CHECK(other.is_valid()); - if (me->target == TARGET(kUnk)) { - me->target = other.target; - } - if (me->precision == PRECISION(kUnk)) { - me->precision = other.precision; - } - if (me->layout == DATALAYOUT(kUnk)) { - me->layout = other.layout; - } - } - - private: - // The default target for arguments, e.g. load weights to CPU memory for CUDA - // computation by default. - TargetType argument_default_target_{TARGET(kHost)}; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/variable_place_inference_pass_test.cc b/lite/core/mir/variable_place_inference_pass_test.cc deleted file mode 100644 index cf86afd590..0000000000 --- a/lite/core/mir/variable_place_inference_pass_test.cc +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "lite/api/paddle_use_passes.h" -#include "lite/core/optimizer.h" -#include "lite/core/program_fake_utils.h" -#include "lite/kernels/cuda/use_kernels.h" -#include "lite/kernels/host/use_kernels.h" - -namespace paddle { -namespace lite { -namespace mir { - -TEST(variable_place_inference_pass, test) { - std::shared_ptr scope(new lite::Scope); - ProgramFaker program_faker; - program_faker.AddFeed("a", 0); - program_faker.AddMul("a", "W", "a1"); - program_faker.AddMul("a1", "W1", "a2"); - program_faker.AddFetch("a2", 0); - program_faker.CreateVars(scope.get()); - - auto* desc = program_faker.program(); - - Optimizer optimizer; - std::vector places({ - Place{ - TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW), - }, - Place{ - TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW), - }, - Place{ - TARGET(kX86), PRECISION(kFloat), DATALAYOUT(kNCHW), - }, - Place{ - TARGET(kX86), PRECISION(kAny), DATALAYOUT(kAny), - }, - }); - - Program program(*desc->Proto(), scope, places); - - core::KernelPickFactor factor; - factor.ConsiderTarget(); - - std::vector passes({ - "static_kernel_pick_pass", // - "argument_type_display_pass", // - "variable_place_inference_pass", // - "argument_type_display_pass", // - "type_target_cast_pass", // - }); - - Place prefered_place{ -#ifdef PADDLE_WITH_CUDA - TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW), -#else -#ifdef PADDLE_WITH_ARM - TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW), -#else // X86 - TARGET(kX86), PRECISION(kFloat), DATALAYOUT(kNCHW), -#endif // ARM -#endif - }; - optimizer.KernelPickPreferPlace(prefered_place); - optimizer.Run(std::move(program), places, factor, passes); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -USE_LITE_OP(mul); -USE_LITE_OP(feed); -USE_LITE_OP(fetch); -USE_LITE_OP(io_copy); - -#ifdef LITE_WITH_X86 -USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def); -#endif - -#ifdef LITE_WITH_ARM -USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); -#endif - -#ifdef LITE_WITH_CUDA -USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device); -USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, device_to_host); -#endif diff --git a/lite/core/naive_test_model.py b/lite/core/naive_test_model.py deleted file mode 100644 index f89a5e115f..0000000000 --- a/lite/core/naive_test_model.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
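A note on the pass order in the test above: `variable_place_inference_pass` has to run before `type_target_cast_pass`, because the cast pass compares the inferred producer and consumer places on every edge to decide where an `io_copy` is needed. Below is a toy sketch of that comparison; the `Edge` struct and the target names are made up for illustration and do not correspond to lite's real graph nodes.

#include <iostream>
#include <string>
#include <vector>

// Toy edge of the graph: a variable produced on one target and consumed on
// another. Illustrative only.
struct Edge {
  std::string var;
  std::string producer_target;
  std::string consumer_target;
};

int main() {
  // Once variable_place_inference_pass has filled the places in, the cast
  // pass only has to compare them edge by edge.
  std::vector<Edge> edges = {{"a1", "kHost", "kCUDA"},
                             {"a2", "kCUDA", "kCUDA"}};
  for (const auto& e : edges) {
    if (e.producer_target != e.consumer_target) {
      std::cout << "insert io_copy for " << e.var << ": "
                << e.producer_target << " -> " << e.consumer_target << "\n";
    }
  }
  return 0;
}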
- -import numpy -import sys, os -import numpy as np -import paddle.fluid as fluid -from paddle.fluid.backward import append_backward - -a = fluid.layers.data(name="a", shape=[2], dtype='float32') -label = fluid.layers.data(name="label", shape=[10], dtype='float32') - -a1 = fluid.layers.fc(input=a, size=3, act=None, bias_attr=False) - -cost = fluid.layers.square_error_cost(a1, label) -avg_cost = fluid.layers.mean(cost) - -optimizer = fluid.optimizer.SGD(learning_rate=0.001) -optimizer.minimize(cost) - -cpu = fluid.core.CPUPlace() -loss = exe = fluid.Executor(cpu) - -exe.run(fluid.default_startup_program()) -with open('startup_program.pb', 'wb') as f: - f.write(fluid.default_startup_program().desc.serialize_to_string()) - -#data_1 = np.array(numpy.random.random([100, 100]), dtype='float32') - -#fluid.default_main_program().desc. - -#prog = fluid.compiler.CompiledProgram(fluid.default_main_program()) -prog = fluid.default_main_program() - -#append_backward(loss) - -with open('main_program.pb', 'wb') as f: - f.write(prog.desc.serialize_to_string()) - -#outs = exe.run(program=prog, feed={'a':data_1, }, fetch_list=[cost]) - -#sys.exit(0) -fluid.io.save_inference_model("./model2", [a.name], [a1], exe) - -#print(numpy.array(outs)) diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc deleted file mode 100644 index 412b299339..0000000000 --- a/lite/core/op_lite.cc +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-
-#include "lite/core/op_lite.h"
-#include <memory>
-#include <set>
-#include <string>
-#include <vector>
-#include "lite/core/op_registry.h"
-
-namespace paddle {
-namespace lite {
-
-std::vector<std::unique_ptr<KernelBase>> OpLite::CreateKernels(
-    const std::vector<Place> &places, const std::string &kernel_type) {
-  std::vector<std::unique_ptr<KernelBase>> kernels;
-  CHECK(!op_type_.empty()) << "op_type_ should be set first";
-
-  auto pick_kernel = [&](const Place &place) {
-    auto ks = KernelRegistry::Global().Create(
-        op_type_, place.target, place.precision, place.layout);
-    VLOG(5) << "pick kernel for " << op_info()->Type() << " "
-            << place.DebugString() << " get " << ks.size() << " kernels";
-    for (auto &&it : ks) {
-      AttachKernel(it.get());
-      kernels.emplace_back(std::move(it));
-    }
-  };
-
-  if (!kernel_type.empty()) {
-    Place place;
-    std::string op_type, alias;
-    KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
-    pick_kernel(place);
-    CHECK(!kernels.empty()) << "no kernel for kernel type " << kernel_type;
-    return kernels;
-  }
-
-  std::set<Place> place_set;
-  for (auto place : places) {
-    place_set.insert(place);
-    // Also pick kernels that support any Precision and any DataLayout.
-    place.precision = PRECISION(kAny);
-    place_set.insert(place);
-    place.layout = DATALAYOUT(kAny);
-    place_set.insert(place);
-  }
-
-  std::set<TargetType> targets;
-  for (auto place : place_set) {
-    pick_kernel(place);
-    targets.insert(place.target);
-  }
-
-  VLOG(4) << "op " << op_type_ << " get " << kernels.size() << " kernels";
-  return kernels;
-}
-
-bool OpLite::Run() {
-  CHECK(kernel_);
-  SyncInputEvents();
-
-  kernel_->Launch();
-
-  RecordOutputEvents();
-  return true;
-}
-
-bool OpLite::Attach(const cpp::OpDesc &opdesc, lite::Scope *scope) {
-  // valid_places_.clear();
-  CHECK(scope != nullptr);
-  // CHECK(!op_info_.get());
-  scope_ = scope;
-  op_info_.reset(
-      new OpInfo(opdesc));  // Force clean the out-of-date information.
-  return AttachImpl(*op_info(), scope);
-}
-
-const Tensor *OpLite::GetTensor(lite::Scope *scope,
-                                const std::string &name) const {
-  auto *var = scope->FindVar(name);
-  CHECK(var) << "no variable called " << name << " found";
-  return &var->Get<Tensor>();
-}
-
-Tensor *OpLite::GetMutableTensor(lite::Scope *scope,
-                                 const std::string &name) const {
-  auto *var = scope->FindVar(name);
-  CHECK(var) << "no variable called " << name << " found";
-  return var->GetMutable<Tensor>();
-}
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h
deleted file mode 100644
index f843ef6f2b..0000000000
--- a/lite/core/op_lite.h
+++ /dev/null
@@ -1,231 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-#include "lite/core/context.h"
-#include "lite/core/kernel.h"
-#include "lite/core/scope.h"
-#include "lite/model_parser/cpp/op_desc.h"
-
-namespace paddle {
-namespace lite {
-
-// For registry factory.
-struct Registry {
-  void Touch() {}
-};
-
-namespace mir {
-class Node;
-class SSAGraph;
-}
-
-class OpInfo;
-
-/*
- * The base class of light-weight operators. It is currently only used for
- * inference, to eliminate the overhead of some operations in the full
- * framework.
- *
- * The Operator is designed as follows:
- * - it can have some members to hold the arguments and some other computation
- *   resources,
- * - it should act like a function call, with no more logic included.
- */
-class OpLite : public Registry {
- public:
-  OpLite() = default;
-  explicit OpLite(const std::string &type) : op_type_(type) {}
-  explicit OpLite(const std::vector<Place> &valid_places)
-      : valid_places_(valid_places) {}
-
-  void SetValidPlaces(const std::vector<Place> &places) {
-    VLOG(3) << "valid places " << valid_places_.size();
-    valid_places_ = places;
-  }
-  const std::vector<Place> &valid_places() const { return valid_places_; }
-  // Check the shape.
-  virtual bool CheckShape() const { return true; }
-  // Infer the outputs' shape.
-  virtual bool InferShape() const { return true; }
-  // Run this operator.
-  virtual bool Run();
-  // Indicate whether the Op runs only once or not
-  virtual bool run_once() const { return false; }
-  std::string Type() { return op_type_; }
-
-  // Link the external execution environment to the internal context.
-  bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope);
-
-  const OpInfo *op_info() const { return op_info_.get(); }
-  OpInfo *mutable_op_info() { return op_info_.get(); }
-
-  // Human-readable information.
-  virtual std::string DebugString() const = 0;
-
-  const Place &kernel_place() const { return kernel_place_; }
-
-  // Create all the kernels for the valid targets.
-  std::vector<std::unique_ptr<KernelBase>> CreateKernels(
-      const std::vector<Place> &places, const std::string &kernel_type = "");
-
-  lite::Scope *scope() { return scope_; }
-
-  // Assign op param to kernel.
-  virtual void AttachKernel(KernelBase *kernel) = 0;
-  void SetKernel(std::vector<std::unique_ptr<KernelBase>> &kernels) {  // NOLINT
-    kernel_ = std::move(kernels.front());
-    kernel_->SetContext(
-        ContextScheduler::Global().NewContext(kernel_->target()));
-  }
-
-  KernelBase *GetKernel() {  // NOLINT
-    return kernel_.get();
-  }
-
-  virtual ~OpLite() = default;
-
- protected:
-  // Attach it with the runtime environment.
-  virtual bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) = 0;
-
-  // Specify the kernel to run by default. This will specify the value of
-  // `kernel_place_`.
-  virtual void StaticPickKernel(const std::vector<Place> &valid_targets) {
-    auto kernels = CreateKernels(valid_targets);
-    kernel_ = std::move(kernels.front());
-  }
-
-  // Wait until all the inputs' events are ready.
-  void SyncInputEvents() {}
-
-  // Record the output events, which will tell all the dependent operators
-  // that some inputs are ready.
-  void RecordOutputEvents() {}
-
-  const Tensor *GetTensor(lite::Scope *scope, const std::string &name) const;
-  Tensor *GetMutableTensor(lite::Scope *scope, const std::string &name) const;
-
-  friend class mir::Node;
-  friend class mir::SSAGraph;
-
- protected:
-  // some helper functions.
-  template <typename T>
-  const T *GetVar(Scope *scope, const std::string &name) {
-    auto *var = scope->FindVar(name);
-    CHECK(var) << "No var found for " << name;
-    return &var->Get<T>();
-  }
-  template <typename T>
-  T *GetMutableVar(Scope *scope, const std::string &name) {
-    auto *var = scope->FindVar(name);
-    CHECK(var) << "No var found for " << name;
-    return var->GetMutable<T>();
-  }
-
- protected:
-  lite::Scope *scope_{nullptr};
-  std::unique_ptr<KernelBase> kernel_;
-  std::string op_type_;
-  std::vector<Place> valid_places_;
-  Place kernel_place_{TARGET(kHost), PRECISION(kFloat)};
-  std::unique_ptr<OpInfo> op_info_;
-};
-
-/*
- * Operator information, such as some description. It will be shared by all
- * the kernels of the same operator.
- */
-class OpInfo : public cpp::OpDesc {
- public:
-  OpInfo(const OpInfo &) = default;
-  explicit OpInfo(const cpp::OpDesc &other) : cpp::OpDesc(other) {}
-
-  // Collect all the input variables' names.
-  std::vector<std::string> input_names() const {
-    std::vector<std::string> res;
-    for (auto &param : InputArgumentNames()) {
-      for (auto &x : Input(param)) {
-        res.push_back(x);
-      }
-    }
-    return res;
-  }
-
-  // Collect all the output variables' names.
-  std::vector<std::string> output_names() const {
-    std::vector<std::string> res;
-    for (auto &param : OutputArgumentNames()) {
-      for (auto &x : Output(param)) {
-        res.push_back(x);
-      }
-    }
-    return res;
-  }
-
-  std::vector<std::string> input_argnames() const {
-    return InputArgumentNames();
-  }
-
-  std::vector<std::string> output_argnames() const {
-    return OutputArgumentNames();
-  }
-
-  bool GetInputArgname(const std::string &value_name, std::string *out) const {
-    for (auto &item : inputs_) {
-      auto it = std::find(item.second.begin(), item.second.end(), value_name);
-      if (it != item.second.end()) {
-        *out = item.first;
-        return true;
-      }
-    }
-    return false;
-  }
-  bool GetOutputArgname(const std::string &value_name, std::string *out) const {
-    for (auto &item : outputs_) {
-      auto it = std::find(item.second.begin(), item.second.end(), value_name);
-      if (it != item.second.end()) {
-        *out = item.first;
-        return true;
-      }
-    }
-    return false;
-  }
-
-  void UpdateAllInputs(const std::string &from, const std::string &to) {
-    for (auto &item : inputs_) {
-      for (auto &var : item.second) {
-        if (var == from) var = to;
-      }
-    }
-  }
-
-  void UpdateAllOutputs(const std::string &from, const std::string &to) {
-    for (auto &item : outputs_) {
-      for (auto &var : item.second) {
-        if (var == from) var = to;
-      }
-    }
-  }
-};
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/op_lite_test.cc b/lite/core/op_lite_test.cc
deleted file mode 100644
index a18607834a..0000000000
--- a/lite/core/op_lite_test.cc
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
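An aside on `GetInputArgname`/`GetOutputArgname` above: they do a reverse lookup from a variable's name to the argument slot ("X", "W", ...) it is bound to. The following standalone sketch reproduces that walk over a plain `std::map`; the slot and variable names are illustrative only.

#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Standalone rendition of the reverse lookup: map an operand's variable name
// back to the argument slot that holds it. The map layout matches what an
// op desc stores: slot name -> list of variable names.
bool GetArgname(const std::map<std::string, std::vector<std::string>>& args,
                const std::string& value_name, std::string* out) {
  for (const auto& item : args) {
    auto it = std::find(item.second.begin(), item.second.end(), value_name);
    if (it != item.second.end()) {
      *out = item.first;
      return true;
    }
  }
  return false;
}

int main() {
  std::map<std::string, std::vector<std::string>> inputs = {
      {"Input", {"x"}}, {"W", {"fc_w"}}, {"Bias", {"fc_b"}}};
  std::string slot;
  if (GetArgname(inputs, "fc_w", &slot)) std::cout << slot << "\n";  // prints W
  return 0;
}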
-
-#include "lite/core/op_lite.h"
-#include <gtest/gtest.h>
-
-namespace paddle {
-namespace lite {
-
-TEST(OpLite, test) {}
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc
deleted file mode 100644
index 53d4afa9ff..0000000000
--- a/lite/core/op_registry.cc
+++ /dev/null
@@ -1,154 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/op_registry.h"
-#include <list>
-#include <memory>
-
-namespace paddle {
-namespace lite {
-
-std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
-    const std::string &op_type,
-    TargetType target,
-    PrecisionType precision,
-    DataLayoutType layout) {
-  Place place{target, precision, layout};
-  VLOG(5) << "creating " << op_type << " kernel for " << place.DebugString();
-#define CREATE_KERNEL1(target__, precision__)                                \
-  switch (layout) {                                                          \
-    case DATALAYOUT(kNCHW):                                                  \
-      return Create<TARGET(target__),                                        \
-                    PRECISION(precision__),                                  \
-                    DATALAYOUT(kNCHW)>(op_type);                             \
-    case DATALAYOUT(kAny):                                                   \
-      return Create<TARGET(target__),                                        \
-                    PRECISION(precision__),                                  \
-                    DATALAYOUT(kAny)>(op_type);                              \
-    case DATALAYOUT(kNHWC):                                                  \
-      return Create<TARGET(target__),                                        \
-                    PRECISION(precision__),                                  \
-                    DATALAYOUT(kNHWC)>(op_type);                             \
-    default:                                                                 \
-      LOG(FATAL) << "unsupported kernel layout " << DataLayoutToStr(layout); \
-  }
-
-#define CREATE_KERNEL(target__)                         \
-  switch (precision) {                                  \
-    case PRECISION(kFloat):                             \
-      CREATE_KERNEL1(target__, kFloat);                 \
-    case PRECISION(kInt8):                              \
-      CREATE_KERNEL1(target__, kInt8);                  \
-    case PRECISION(kFP16):                              \
-      CREATE_KERNEL1(target__, kFP16);                  \
-    case PRECISION(kAny):                               \
-      CREATE_KERNEL1(target__, kAny);                   \
-    default:                                            \
-      CHECK(false) << "not supported kernel precision " \
-                   << PrecisionToStr(precision);        \
-  }
-
-  switch (target) {
-    case TARGET(kHost): {
-      CREATE_KERNEL(kHost);
-    } break;
-    case TARGET(kX86): {
-      CREATE_KERNEL(kX86);
-    } break;
-    case TARGET(kCUDA): {
-      CREATE_KERNEL(kCUDA);
-    } break;
-    case TARGET(kARM): {
-      CREATE_KERNEL(kARM);
-    } break;
-    case TARGET(kOpenCL): {
-      CREATE_KERNEL(kOpenCL);
-    } break;
-    case TARGET(kNPU): {
-      CREATE_KERNEL(kNPU);
-    } break;
-    case TARGET(kFPGA): {
-      CREATE_KERNEL(kFPGA);
-    } break;
-    default:
-      CHECK(false) << "not supported kernel target " << TargetToStr(target);
-  }
-
-#undef CREATE_KERNEL
-  return std::list<std::unique_ptr<KernelBase>>();
-}
-
-KernelRegistry::KernelRegistry()
-    : registries_(static_cast<int>(TARGET(NUM)) *
-                  static_cast<int>(PRECISION(NUM)) *
-                  static_cast<int>(DATALAYOUT(NUM))) {
-#define INIT_FOR(target__, precision__, layout__)                      \
-  registries_[KernelRegistry::GetKernelOffset<TARGET(target__),        \
-                                              PRECISION(precision__),  \
-                                              DATALAYOUT(layout__)>()] \
-      .set<KernelRegistryForTarget<TARGET(target__),                   \
-                                   PRECISION(precision__),             \
-                                   DATALAYOUT(layout__)> *>(           \
-          &KernelRegistryForTarget<TARGET(target__),                   \
-                                   PRECISION(precision__),             \
-                                   DATALAYOUT(layout__)>::Global());
-  // Currently, just register 2 kernel targets.
- INIT_FOR(kCUDA, kFloat, kNCHW); - INIT_FOR(kCUDA, kInt8, kNCHW); - INIT_FOR(kCUDA, kAny, kNCHW); - INIT_FOR(kCUDA, kAny, kAny); - INIT_FOR(kCUDA, kInt8, kNHWC); - - INIT_FOR(kHost, kFloat, kNCHW); - INIT_FOR(kHost, kAny, kNCHW); - INIT_FOR(kHost, kFloat, kNHWC); - INIT_FOR(kHost, kFloat, kAny); - INIT_FOR(kHost, kAny, kNHWC); - INIT_FOR(kHost, kAny, kAny); - INIT_FOR(kHost, kAny, kNHWC); - INIT_FOR(kHost, kAny, kAny); - - INIT_FOR(kX86, kFloat, kNCHW); - INIT_FOR(kX86, kAny, kNCHW); - INIT_FOR(kX86, kAny, kAny); - - INIT_FOR(kARM, kFloat, kNCHW); - INIT_FOR(kARM, kInt8, kNCHW); - INIT_FOR(kARM, kAny, kNCHW); - INIT_FOR(kARM, kAny, kAny); - - INIT_FOR(kOpenCL, kFloat, kNCHW); - INIT_FOR(kOpenCL, kAny, kNCHW); - INIT_FOR(kOpenCL, kAny, kAny); - - INIT_FOR(kNPU, kFloat, kNCHW); - INIT_FOR(kNPU, kInt8, kNCHW); - INIT_FOR(kNPU, kAny, kNCHW); - INIT_FOR(kNPU, kAny, kAny); - - INIT_FOR(kFPGA, kFP16, kNHWC); - INIT_FOR(kFPGA, kFP16, kAny); - INIT_FOR(kFPGA, kFloat, kNHWC); - INIT_FOR(kFPGA, kAny, kNHWC); - INIT_FOR(kFPGA, kAny, kAny); -#undef INIT_FOR -} - -KernelRegistry &KernelRegistry::Global() { - static auto *x = new KernelRegistry; - return *x; -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h deleted file mode 100644 index 5b48c251c8..0000000000 --- a/lite/core/op_registry.h +++ /dev/null @@ -1,306 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
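An aside on the storage scheme behind these registrations: `registries_` is a flat vector sized `TARGET(NUM) * PRECISION(NUM) * DATALAYOUT(NUM)`, and `GetKernelOffset` (defined further below in this header) linearizes a `(target, precision, layout)` triple into it row-major. The following is a small self-check of that indexing scheme; the `kNum*` constants are placeholders, not lite's real enum counts.

#include <cassert>

// Placeholder enum sizes standing in for TARGET(NUM) etc.
constexpr int kNumTargets = 7, kNumPrecisions = 4, kNumLayouts = 3;

// Row-major linearization, matching the GetKernelOffset formula:
// offset = t * P * L + p * L + l.
constexpr int KernelOffset(int target, int precision, int layout) {
  return target * kNumPrecisions * kNumLayouts + precision * kNumLayouts +
         layout;
}

int main() {
  // Every (target, precision, layout) triple maps to a distinct slot in the
  // flat registry vector, with no gaps and no collisions.
  bool seen[kNumTargets * kNumPrecisions * kNumLayouts] = {};
  for (int t = 0; t < kNumTargets; ++t)
    for (int p = 0; p < kNumPrecisions; ++p)
      for (int l = 0; l < kNumLayouts; ++l) {
        const int off = KernelOffset(t, p, l);
        assert(!seen[off]);
        seen[off] = true;
      }
  return 0;
}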
-
-#pragma once
-
-#include <functional>
-#include <list>
-#include <map>
-#include <memory>
-#include <string>
-#include <tuple>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include "lite/api/paddle_lite_factory_helper.h"
-#include "lite/core/kernel.h"
-#include "lite/core/op_lite.h"
-#include "lite/core/target_wrapper.h"
-#include "lite/utils/all.h"
-#include "lite/utils/macros.h"
-
-using LiteType = paddle::lite::Type;
-
-namespace paddle {
-namespace lite {
-
-using KernelFunc = std::function<void()>;
-using KernelFuncCreator = std::function<std::unique_ptr<KernelFunc>()>;
-class LiteOpRegistry final : public Factory<OpLite, std::shared_ptr<OpLite>> {
- public:
-  static LiteOpRegistry &Global() {
-    static auto *x = new LiteOpRegistry;
-    return *x;
-  }
-
- private:
-  LiteOpRegistry() = default;
-};
-
-template <typename OpClass>
-class OpLiteRegistor : public Registor<OpClass> {
- public:
-  explicit OpLiteRegistor(const std::string &op_type)
-      : Registor<OpClass>([&] {
-          LiteOpRegistry::Global().Register(
-              op_type, [op_type]() -> std::unique_ptr<OpLite> {
-                return std::unique_ptr<OpLite>(new OpClass(op_type));
-              });
-        }) {}
-};
-
-template <TargetType Target, PrecisionType Precision, DataLayoutType Layout>
-using KernelRegistryForTarget =
-    Factory<KernelBase, std::unique_ptr<KernelBase>>;
-
-class KernelRegistry final {
- public:
-  using any_kernel_registor_t =
-      variant<KernelRegistryForTarget<TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW)> *,  //
-              KernelRegistryForTarget<TARGET(kCUDA), PRECISION(kInt8), DATALAYOUT(kNCHW)> *,   //
-              KernelRegistryForTarget<TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kNCHW)> *,    //
-              KernelRegistryForTarget<TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kAny)> *,     //
-              KernelRegistryForTarget<TARGET(kCUDA), PRECISION(kInt8), DATALAYOUT(kNHWC)> *,   //
-              KernelRegistryForTarget<TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)> *,  //
-              KernelRegistryForTarget<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kNCHW)> *,    //
-              KernelRegistryForTarget<TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNHWC)> *,  //
-              KernelRegistryForTarget<TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kAny)> *,   //
-              KernelRegistryForTarget<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kNHWC)> *,    //
-              KernelRegistryForTarget<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)> *,     //
-              KernelRegistryForTarget<TARGET(kX86), PRECISION(kFloat), DATALAYOUT(kNCHW)> *,   //
-              KernelRegistryForTarget<TARGET(kX86), PRECISION(kAny), DATALAYOUT(kNCHW)> *,     //
-              KernelRegistryForTarget<TARGET(kX86), PRECISION(kAny), DATALAYOUT(kAny)> *,      //
-              KernelRegistryForTarget<TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW)> *,   //
-              KernelRegistryForTarget<TARGET(kARM), PRECISION(kInt8), DATALAYOUT(kNCHW)> *,    //
-              KernelRegistryForTarget<TARGET(kARM), PRECISION(kAny), DATALAYOUT(kNCHW)> *,     //
-              KernelRegistryForTarget<TARGET(kARM), PRECISION(kAny), DATALAYOUT(kAny)> *,      //
-              KernelRegistryForTarget<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> *,  //
-              KernelRegistryForTarget<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)> *,    //
-              KernelRegistryForTarget<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kAny)> *,     //
-              KernelRegistryForTarget<TARGET(kNPU), PRECISION(kFloat), DATALAYOUT(kNCHW)> *,   //
-              KernelRegistryForTarget<TARGET(kNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)> *,    //
-              KernelRegistryForTarget<TARGET(kNPU), PRECISION(kAny), DATALAYOUT(kNCHW)> *,     //
-              KernelRegistryForTarget<TARGET(kNPU), PRECISION(kAny), DATALAYOUT(kAny)> *,      //
-              KernelRegistryForTarget<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> *,   //
-              KernelRegistryForTarget<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kAny)> *,    //
-              KernelRegistryForTarget<TARGET(kFPGA), PRECISION(kFloat), DATALAYOUT(kNHWC)> *,  //
-              KernelRegistryForTarget<TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kNHWC)> *,    //
-              KernelRegistryForTarget<TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny)> *      //
-              >;
-
-  KernelRegistry();
-
-  static KernelRegistry &Global();
-
-  template <TargetType Target, PrecisionType Precision, DataLayoutType Layout>
-  void Register(
-      const std::string &name,
-      typename KernelRegistryForTarget<Target, Precision, Layout>::creator_t
-          &&creator) {
-    using kernel_registor_t =
-        KernelRegistryForTarget<Target, Precision, Layout>;
-    auto &varient = registries_[GetKernelOffset<Target, Precision, Layout>()];
-    auto *reg = varient.template get<kernel_registor_t *>();
-    CHECK(reg) << "Can not be empty of " << name;
-    reg->Register(name, std::move(creator));
-#ifdef LITE_ON_MODEL_OPTIMIZE_TOOL
-    kernel_info_map_[name].push_back(
-        std::make_tuple(Target, Precision, Layout));
-#endif  // LITE_ON_MODEL_OPTIMIZE_TOOL
-  }
-
-  template <TargetType Target, PrecisionType Precision, DataLayoutType Layout>
-  std::list<std::unique_ptr<KernelBase>> Create(const std::string &op_type) {
-    using kernel_registor_t =
-        KernelRegistryForTarget<Target, Precision, Layout>;
-    return registries_[GetKernelOffset<Target, Precision, Layout>()]
-        .template get<kernel_registor_t *>()
-        ->Creates(op_type);
-  }
-
-  std::list<std::unique_ptr<KernelBase>> Create(const std::string &op_type,
-                                                TargetType target,
-                                                PrecisionType precision,
-                                                DataLayoutType layout);
-
-  // Get a kernel registry offset in all the registries.
- template - static int GetKernelOffset() { - CHECK_LT(static_cast(Target), static_cast(TARGET(NUM))); - CHECK_LT(static_cast(Precision), static_cast(PRECISION(NUM))); - CHECK_LT(static_cast(Layout), static_cast(DATALAYOUT(NUM))); - return static_cast(Target) * static_cast(PRECISION(NUM)) * - static_cast(DATALAYOUT(NUM)) + // - static_cast(Precision) * static_cast(DATALAYOUT(NUM)) + // - static_cast(Layout); - } - - std::string DebugString() const { -#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL - return "No more debug info"; -#else // LITE_ON_MODEL_OPTIMIZE_TOOL - STL::stringstream ss; - ss << "\n"; - ss << "Count of kernel kinds: "; - int count = 0; - for (auto &item : kernel_info_map_) { - for (auto &kernel : item.second) ++count; - } - ss << count << "\n"; - - ss << "Count of registered kernels: " << kernel_info_map_.size() << "\n"; - for (auto &item : kernel_info_map_) { - ss << "op: " << item.first << "\n"; - for (auto &kernel : item.second) { - ss << " - (" << TargetToStr(std::get<0>(kernel)) << ","; - ss << PrecisionToStr(std::get<1>(kernel)) << ","; - ss << DataLayoutToStr(std::get<2>(kernel)); - ss << ")"; - ss << "\n"; - } - } - - return ss.str(); -#endif // LITE_ON_MODEL_OPTIMIZE_TOOL - } - - private: - mutable std::vector registries_; -#ifndef LITE_ON_TINY_PUBLISH - mutable std::map< - std::string, - std::vector>> - kernel_info_map_; -#endif -}; - -template -class KernelRegistor : public lite::Registor { - public: - KernelRegistor(const std::string &op_type, const std::string &alias) - : Registor([=] { - KernelRegistry::Global().Register( - op_type, [=]() -> std::unique_ptr { - std::unique_ptr x(new KernelType); - x->set_op_type(op_type); - x->set_alias(alias); - return x; - }); - }) {} -}; - -} // namespace lite -} // namespace paddle - -// Operator registry -#define LITE_OP_REGISTER_INSTANCE(op_type__) op_type__##__registry__instance__ -#define REGISTER_LITE_OP(op_type__, OpClass) \ - static paddle::lite::OpLiteRegistor LITE_OP_REGISTER_INSTANCE( \ - op_type__)(#op_type__); \ - int touch_op_##op_type__() { \ - return LITE_OP_REGISTER_INSTANCE(op_type__).Touch(); \ - } - -// Kernel registry -#define LITE_KERNEL_REGISTER(op_type__, target__, precision__) \ - op_type__##__##target__##__##precision__##__registor__ -#define LITE_KERNEL_REGISTER_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - op_type__##__##target__##__##precision__##__registor__instance__##alias__ -#define LITE_KERNEL_REGISTER_FAKE(op_type__, target__, precision__, alias__) \ - LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__, alias__) - -#define REGISTER_LITE_KERNEL( \ - op_type__, target__, precision__, layout__, KernelClass, alias__) \ - static paddle::lite::KernelRegistor \ - LITE_KERNEL_REGISTER_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__)(#op_type__, \ - #alias__); \ - static KernelClass LITE_KERNEL_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__); \ - int touch_##op_type__##target__##precision__##layout__##alias__() { \ - LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \ - .Touch(); \ - return 0; \ - } \ - static bool LITE_KERNEL_PARAM_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - __attribute__((unused)) = \ - paddle::lite::ParamTypeRegistry::NewInstance( \ - #op_type__ "/" #alias__) - -#define LITE_KERNEL_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - op_type__##target__##precision__##layout__##alias__ -#define LITE_KERNEL_PARAM_INSTANCE( \ - op_type__, target__, 
precision__, layout__, alias__) \ - op_type__##target__##precision__##layout__##alias__##param_register diff --git a/lite/core/optimizer.cc b/lite/core/optimizer.cc deleted file mode 100644 index 38a64a589f..0000000000 --- a/lite/core/optimizer.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/optimizer.h" -#include -#include "lite/core/mir/static_kernel_pick_pass.h" -#include "lite/core/mir/type_target_cast_pass.h" -#include "lite/model_parser/model_parser.h" -#include "lite/utils/all.h" - -namespace paddle { -namespace lite { - -void Optimizer::SpecifyKernelPickTactic(core::KernelPickFactor factor) { - auto* pass = mir::PassManager::Global().LookUp( - "static_kernel_pick_pass"); - CHECK(pass); - - *pass->mutable_kernel_pick_factors() = factor; -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h deleted file mode 100644 index 031ffded45..0000000000 --- a/lite/core/optimizer.h +++ /dev/null @@ -1,213 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include "lite/core/mir/generate_program_pass.h" -#include "lite/core/mir/pass_manager.h" -#include "lite/core/mir/ssa_graph.h" -#include "lite/core/mir/static_kernel_pick_pass.h" -#include "lite/core/mir/type_target_cast_pass.h" -#include "lite/core/program.h" -#include "lite/core/types.h" -#include "lite/model_parser/model_parser.h" -#ifdef LITE_WITH_NPU -#include "lite/core/mir/subgraph/generate_npu_program_pass.h" -#endif - -namespace paddle { -namespace lite { - -/* - * lite::Optimizer optimize a program. It utilize the mir passes to analysis the - * program and export an optimized program. 
- */ -class Optimizer { - public: - void Run(Program&& program, - const std::vector& valid_places, - core::KernelPickFactor kernel_pick_factor, - const std::vector& passes = {}) { - program_ = &program; - valid_places_ = valid_places; - CHECK(!valid_places.empty()) << "At least one valid_place should be set"; - CHECK(!graph_) << "duplicate optimize found"; - graph_.reset(new mir::SSAGraph); - graph_->Build(program, valid_places); - graph_->SetValidPlaces(valid_places); - - SpecifyKernelPickTactic(kernel_pick_factor); - InitTargetTypeTransformPass(); - - if (passes.empty()) { - RunPasses(std::vector{ - {"lite_quant_dequant_fuse_pass", // - "lite_conv_elementwise_fuse_pass", // conv-elemwise-bn - "lite_conv_bn_fuse_pass", // - "lite_conv_elementwise_fuse_pass", // conv-bn-elemwise - // This pass is disabled to force some opencl kernels selected for - // final running, otherwise, they will be fused to ARM fusion - // kernels, and the OpenCL devices will be discarded. - // TODO(Superjomn) Refine the fusion related design to select fusion - // kernels for devices automatically. - "lite_conv_activation_fuse_pass", // - "lite_fc_fuse_pass", // - "lite_shuffle_channel_fuse_pass", // - "lite_transpose_softmax_transpose_fuse_pass", // - "lite_interpolate_fuse_pass", // - "identity_scale_eliminate_pass", // -#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - "lite_elementwise_add_activation_fuse_pass", // -#endif - "static_kernel_pick_pass", // - "variable_place_inference_pass", // - "argument_type_display_pass", // - - "type_target_cast_pass", // - "variable_place_inference_pass", // - "argument_type_display_pass", // - - "io_copy_kernel_pick_pass", // - "variable_place_inference_pass", // - "argument_type_display_pass", // - - "type_precision_cast_pass", // - "variable_place_inference_pass", // - "argument_type_display_pass", // - - "type_layout_cast_pass", // - "variable_place_inference_pass", // - "argument_type_display_pass", // - - "runtime_context_assign_pass", - "graph_visualze"}}); - } else { - RunPasses(passes); - } - exec_scope_ = program.exec_scope(); - } - - void KernelPickPreferPlace(const Place& place) { - auto* pass = mir::PassManager::Global().LookUp( - "static_kernel_pick_pass"); - CHECK(pass); - pass->SetPreferPlace(place); - } - - const lite::Scope* exec_scope() const { return exec_scope_; } - - // Generate a new program based on the mir graph. - std::unique_ptr GenRuntimeProgram() { -#ifdef LITE_WITH_NPU - if (std::find(valid_places_.begin(), - valid_places_.end(), - Place{TARGET(kNPU), PRECISION(kFloat)}) != - valid_places_.end()) { - CheckInputDimsNotEmpty(exec_scope_); - auto pass = mir::PassManager::Global() - .LookUp( - "generate_npu_program_pass"); - try { - pass->Apply(graph_); - auto program = pass->GenProgram(); - CHECK(exec_scope_); - program->set_exec_scope(exec_scope_); - return program; - } catch (...) 
{ - LOG(WARNING) << "Build NPU graph failed"; - } - } -#endif - auto pass = mir::PassManager::Global().LookUp( - "generate_program_pass"); - pass->Apply(graph_); - auto program = pass->GenProgram(); - CHECK(exec_scope_); - program->set_exec_scope(exec_scope_); - return program; - } - - // check the input dims in the scope, must not be empty - void CheckInputDimsNotEmpty(const lite::Scope* scope) { - CHECK(scope); - auto* feed_var = scope->FindVar("feed"); - CHECK(feed_var) << "no feed variable in exec_scope: " << scope; - auto* feed_tensor_list = feed_var->GetMutable>(); - CHECK_GE(feed_tensor_list->size(), 1); - for (size_t i = 0; i < feed_tensor_list->size(); ++i) { - CHECK(!feed_tensor_list->at(i).dims().empty()) - << "Input " << i << " dims can not be empty."; - } - } - - void InitTargetTypeTransformPass() { - auto* pass = - mir::PassManager::Global().LookUp( - "type_target_cast_pass"); - CHECK(pass); - CHECK(!valid_places_.empty()); - pass->SetValidPlaces(valid_places_); - } - - // Generate C++ code which combines the inference program, model and weights. - void GenCode(const std::string& code_dir); - - const mir::SSAGraph& ssa_graph() const { - CHECK(graph_); - return *graph_; - } - - mir::SSAGraph* mutable_ssa_graph() { - CHECK(graph_); - return graph_.get(); - } - - lite::Scope* exec_scope() { return exec_scope_; } - - protected: - void SpecifyKernelPickTactic(core::KernelPickFactor factor); - - // Specify the passes and run them. - void RunPasses(const std::vector& passes) { - for (auto& x : passes) { - LOG(INFO) << "== Running pass: " << x; - mir::Pass* pass = mir::PassManager::Global().LookUp(x); - CHECK(pass) << "Can not find pass: " << x; - bool supported = false; - for (const auto& place : valid_places_) { - if (pass->is_supported_target(place.target)) { - supported = true; - } - } - if (!supported) { - LOG(WARNING) << "Skip " << x - << " pass because the target does not match."; - } else { - pass->Apply(graph_); - LOG(INFO) << "== Finished running: " << x; - } - } - } - - private: - std::unique_ptr graph_; - std::vector valid_places_; - lite::Scope* exec_scope_{}; - Program* program_{}; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/core/optimizer_test.cc b/lite/core/optimizer_test.cc deleted file mode 100644 index ba5bc01b58..0000000000 --- a/lite/core/optimizer_test.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
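An aside on `RunPasses` above: a pass is skipped when none of the valid places' targets is supported by it, which is how target-specific passes (e.g. the NPU program generator) stay inert on other builds. The sketch below is a toy rendition of that filtering; the `Pass` struct and the empty-set-means-any rule are simplifications, not lite's real `mir::Pass` interface.

#include <iostream>
#include <set>
#include <string>
#include <vector>

// Toy pass with a supported-target whitelist; an empty set loosely stands in
// for "supported on any target".
struct Pass {
  std::string name;
  std::set<std::string> supported_targets;
  bool Supports(const std::string& target) const {
    return supported_targets.empty() || supported_targets.count(target) > 0;
  }
};

int main() {
  const std::vector<std::string> valid_targets = {"kARM"};
  const std::vector<Pass> passes = {{"static_kernel_pick_pass", {}},
                                    {"generate_npu_program_pass", {"kNPU"}}};
  for (const auto& pass : passes) {
    bool supported = false;
    for (const auto& t : valid_targets) {
      if (pass.Supports(t)) supported = true;
    }
    // Mirrors the warning-and-skip behavior in Optimizer::RunPasses.
    std::cout << pass.name << (supported ? ": run" : ": skip") << "\n";
  }
  return 0;
}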
- -#include "lite/core/optimizer.h" -#include -#include -#include -#include "lite/api/paddle_use_passes.h" -#include "lite/core/mir/generate_program_pass.h" -#include "lite/core/mir/pass_manager.h" -#include "lite/core/mir/static_kernel_pick_pass.h" -#include "lite/core/program_fake_utils.h" - -namespace paddle { -namespace lite { - -TEST(Optimizer, test) { - Optimizer optimizer; - auto program_faker = ProgramFaker(); - program_faker.AddFeed("X", 0); - program_faker.AddFetch("X", 0); - - std::vector places({Place{TARGET(kHost), PRECISION(kFloat)}}); - - core::KernelPickFactor factor; - factor.ConsiderTarget(); - - auto scope = std::make_shared(); - auto program_proto = *program_faker.program()->Proto(); - Program program(program_proto, scope, places); - optimizer.Run(std::move(program), places, factor); - auto runtime_program = optimizer.GenRuntimeProgram(); - LOG(INFO) << "num statements " << runtime_program->num_instructions(); -} - -} // namespace lite -} // namespace paddle - -USE_LITE_OP(fc); -USE_LITE_KERNEL(fc, kHost, kFloat, kNCHW, def); diff --git a/lite/core/profile/CMakeLists.txt b/lite/core/profile/CMakeLists.txt deleted file mode 100644 index de8a60bdc2..0000000000 --- a/lite/core/profile/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -if (NOT LITE_WITH_PROFILE) - return() -endif() - -lite_cc_library(basic_profiler SRCS basic_profiler.cc) -lite_cc_test(test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler) - - diff --git a/lite/core/profile/basic_profiler.cc b/lite/core/profile/basic_profiler.cc deleted file mode 100644 index 031b86beb6..0000000000 --- a/lite/core/profile/basic_profiler.cc +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/profile/basic_profiler.h" - -namespace paddle { -namespace lite { -namespace profile { - -const int BasicTimer::data_w = 10; -const int BasicTimer::name_w = 15; - -} // namespace profile -} // namespace lite -} // namespace paddle diff --git a/lite/core/profile/basic_profiler.h b/lite/core/profile/basic_profiler.h deleted file mode 100644 index f55a5764a0..0000000000 --- a/lite/core/profile/basic_profiler.h +++ /dev/null @@ -1,210 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* - * This file implements BasicProfile, a profiler that helps to profile the basic - * CPU execution. 
It can display the min, max, and average latency of the execution
- * of each kernel.
- */
-#pragma once
-#include <algorithm>
-#include <chrono>  // NOLINT
-#include <iomanip>
-#include <limits>
-#include <memory>
-#include <string>
-#include <vector>
-#include "lite/utils/cp_logging.h"
-#include "lite/utils/replace_stl/stream.h"
-#include "lite/utils/string.h"
-
-namespace paddle {
-namespace lite {
-namespace profile {
-
-/* Base class of all the profile records */
-template <typename ChildT>
-class TimerBase {
- public:
-  void Start() { self()->Start(); }
-  void Stop() { self()->Stop(); }
-  void Log(uint32_t x) { return self()->Log(x); }
-  std::string basic_repr() const { return const_self()->basic_repr(); }
-
-  void SetId(int id) { self()->SetId(id); }
-  void SetKey(const std::string &key) { self()->SetKey(key); }
-
-  int id() const { return const_self()->id(); }
-
- protected:
-  ChildT *self() { return reinterpret_cast<ChildT *>(this); }
-  const ChildT *const_self() const {
-    return reinterpret_cast<const ChildT *>(this);
-  }
-};
-
-class BasicTimer : TimerBase<BasicTimer> {
-  uint64_t total_{};
-  uint64_t count_{};
-  uint32_t max_{std::numeric_limits<uint32_t>::min()};
-  uint32_t min_{std::numeric_limits<uint32_t>::max()};
-  int id_{-1};
-  std::string key_;
-  uint64_t timer_{};
-
-  // TODO(Superjomn) make static
-  static const int name_w;
-  static const int data_w;
-
- public:
-  BasicTimer() = default;
-  BasicTimer(int id, const std::string &key) : id_(id), key_(key) {}
-
-  void SetId(int id) { id_ = id; }
-  void SetKey(const std::string &key) { key_ = key; }
-  void Start() {
-    timer_ = static_cast<uint64_t>(
-        std::chrono::duration_cast<std::chrono::microseconds>(
-            std::chrono::system_clock::now().time_since_epoch())
-            .count());
-  }
-  void Stop() {
-    auto duration = static_cast<
-        uint64_t>(  // timer unit: microsecond, 1 second = 1e6 microseconds
-        std::chrono::duration_cast<std::chrono::microseconds>(
-            std::chrono::system_clock::now().time_since_epoch())
-            .count() -
-        timer_);
-    Log(duration);
-  }
-
-  int count() const { return count_; }
-
-  void Log(uint32_t timespan) {
-    total_ += timespan;
-    max_ = std::max(max_, timespan);
-    min_ = std::min(min_, timespan);
-    count_++;
-  }
-
-  static std::string basic_repr_header() {
-    STL::stringstream ss;
-    ss << std::setw(name_w) << "kernel"   //
-       << std::setw(data_w) << "average"  //
-       << std::setw(data_w) << "min"      //
-       << std::setw(data_w) << "max"      //
-       << std::setw(data_w) << "count";
-    return ss.str();
-  }
-
-  std::string basic_repr() const {
-    STL::stringstream ss;
-    ss << std::setw(name_w) << key()  //
-       << std::setw(data_w) << ave()  //
-       << std::setw(data_w) << min()  //
-       << std::setw(data_w) << max()  //
-       << std::setw(data_w) << count_;
-    return ss.str();
-  }
-
-  const std::string &key() const { return key_; }
-
-  int id() const {
-    CHECK_GE(id_, 0) << "id is not inited";
-    return id_;
-  }
-
-  double ave() const { return total_ * 1. / count_; }
-  double max() const { return max_; }
-  double min() const { return min_; }
-
-  // BasicRecord(const BasicRecord &) = delete;
-  void operator=(const BasicTimer &) = delete;
-};
-
-/*
- * A basic profiler, where each record logs the total latency.
- */ -template -class BasicProfiler { - public: - explicit BasicProfiler(const std::string &name) : name_(name) {} - using record_t = TimerT; - - static BasicProfiler &Global() { - static std::unique_ptr x(new BasicProfiler("[global]")); - return *x; - } - - record_t &NewRcd(const std::string &key) { - records_.emplace_back(); - records_.back().SetId(records_.size() - 1); - records_.back().SetKey(key); - return records_.back(); - } - - const record_t &record(int id) { - CHECK_LT(id, records_.size()); - CHECK_GE(id, 0); - return records_[id]; - } - - record_t *mutable_record(int id) { - CHECK_GE(id, 0); - CHECK_LT(static_cast(id), records_.size()); - return &records_[id]; - } - - std::string basic_repr() const { - STL::stringstream ss; - for (const auto &rcd : records_) { - ss << rcd.basic_repr() << "\n"; - } - return ss.str(); - } - - ~BasicProfiler() { - LOG(INFO) << "Profile dumps:"; - LOG(INFO) << "\n" + BasicTimer::basic_repr_header() + "\n" + basic_repr(); - } - - private: - std::string name_; - std::vector records_; -}; - -struct ProfileBlock { - explicit ProfileBlock(int id) : id_(id) { - BasicProfiler::Global().mutable_record(id_)->Start(); - } - - ~ProfileBlock() { - BasicProfiler::Global().mutable_record(id_)->Stop(); - } - - private: - int id_{}; -}; - -#define LITE_PROFILE_ONE(key__) \ - static int key__##__profiler_id = \ - ::paddle::lite::profile::BasicProfiler< \ - ::paddle::lite::profile::BasicTimer>::Global() \ - .NewRcd(#key__) \ - .id(); \ - ::paddle::lite::profile::ProfileBlock key__##profiler__(key__##__profiler_id); - -} // namespace profile -} // namespace lite -} // namespace paddle diff --git a/lite/core/profile/basic_profiler_test.cc b/lite/core/profile/basic_profiler_test.cc deleted file mode 100644 index 928fdd61cb..0000000000 --- a/lite/core/profile/basic_profiler_test.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/profile/basic_profiler.h" -#include -#include // NOLINT -#include // NOLINT -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace profile { - -TEST(basic_record, init) { - BasicTimer timer; - timer.SetKey("hello"); -} - -TEST(basic_profile, init) { - auto& rcd = BasicProfiler::Global().NewRcd("fc"); - for (int i = 11; i < 100; i++) { - rcd.Log(i); - } - - LOG(INFO) << BasicProfiler::Global().basic_repr(); -} - -TEST(basic_profile, real_latency) { - LITE_PROFILE_ONE(test0); - std::this_thread::sleep_for(std::chrono::milliseconds(1200)); -} - -} // namespace profile -} // namespace lite -} // namespace paddle diff --git a/lite/core/profile/precision_profiler.h b/lite/core/profile/precision_profiler.h deleted file mode 100644 index d9111e5c46..0000000000 --- a/lite/core/profile/precision_profiler.h +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* - * This file implements BasicProfile, a profiler that helps to profile the basic - * CPU execution. It can display the min, max, average lantency of the execution - * of each kernel. - */ -#pragma once -#include -#include -#include "lite/core/program.h" - -namespace paddle { -namespace lite { -namespace profile { - -template -static void write_tensorfile(const Tensor* tensor, const std::string& locate) { - if (locate.find('/') != std::string::npos) { - return; - } - FILE* fp = fopen(locate.c_str(), "w"); - if (fp == nullptr) { - LOG(ERROR) << "file open field " << locate; - } else { - const dtype* data = tensor->data(); - for (int i = 0; i < tensor->numel(); ++i) { - fprintf(fp, "[%d] %f \n", i, static_cast(data[i])); - } - } - fclose(fp); -} - -class PrecisionProfiler { - public: - explicit PrecisionProfiler(const Instruction* inst) : inst_(inst) {} - ~PrecisionProfiler() { - LOG(INFO) << ">> Running kernel: " << inst_->op()->op_info()->Repr() - << " on Target " << TargetToStr(inst_->kernel()->target()) << " " - << PrecisionToStr(inst_->kernel()->precision()); - auto tensor_mean = [](const Tensor* in, - PrecisionType ptype, - std::string name = "inst") -> double { - if (!in->data()) { - return -99999; - } - double sum = 0.; - switch (ptype) { - case PRECISION(kFloat): { - auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); - } - case PRECISION(kAny): { - auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); - } - case PRECISION(kInt8): { - auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); - } - case PRECISION(kInt32): { - auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); - } - default: - LOG(INFO) << "unsupport data type: " << PrecisionToStr(ptype); - return 0.; - } - }; - if (inst_->op()->op_info()->Type() != "fetch") { - auto op = const_cast(inst_->op()); - auto kernel = inst_->kernel(); - auto op_scope = op->scope(); - auto out_names = op->op_info()->output_names(); - for (auto& out_name : out_names) { - std::string out_arg_name; - op->op_info()->GetOutputArgname(out_name, &out_arg_name); - auto type = kernel->GetOutputDeclType(out_arg_name); - - if (type->IsTensor()) { - auto tout = op_scope->FindVar(out_name)->GetMutable(); - double mean = tensor_mean(tout, type->precision(), out_name); - LOG(INFO) << "output name: " << out_name << ", dims: " << tout->dims() - << ", precision: " << PrecisionToStr(type->precision()) - << ", mean value: " << mean << " shape:" << tout->dims(); - } else if (type->IsTensorList()) { - auto tout = - op_scope->FindVar(out_name)->GetMutable>(); - for (auto& t : *tout) { - double mean = tensor_mean(&t, type->precision(), 
out_name); - LOG(INFO) << "output name: " << out_name << ", dims: " << t.dims() - << ", precision: " << PrecisionToStr(type->precision()) - << ", mean value: " << mean; - } - } - } - } - } - - private: - const Instruction* inst_{nullptr}; -}; - -} // namespace profile -} // namespace lite -} // namespace paddle - -#define LITE_PRECISION_PROFILE(inst) \ - { auto a = paddle::lite::profile::PrecisionProfiler(&inst); } diff --git a/lite/core/program.cc b/lite/core/program.cc deleted file mode 100644 index 179cdf909a..0000000000 --- a/lite/core/program.cc +++ /dev/null @@ -1,208 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/program.h" -#include -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/var_desc.h" -#include "lite/operators/while_op.h" -#ifdef LITE_WITH_PROFILE -#include "lite/core/profile/precision_profiler.h" -#endif - -namespace paddle { -namespace lite { - -void RuntimeProgram::SaveOpInfosToProgram(cpp::ProgramDesc* desc) { - CHECK(desc); - // NOTE: RuntimeProgram do not has all meta info, so save model just update - // upon origin model - CHECK(desc->BlocksSize()); - auto& main_block = *desc->GetBlock(0); - main_block.ClearOps(); - for (auto& node : instructions_) { - auto* op = main_block.AddOp(); - *op = *node.op()->op_info(); - op->SetAttr(kKernelTypeAttr, node.kernel()->SerializedKernelType()); - } -} - -// `UpdateVarsOfProgram` will remove unused var_descs and add new created -// vars' descs in the block 0. Now, the type of a new created var can only -// be LOD_TENSOR. 
-void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { - CHECK(desc); - CHECK(desc->BlocksSize()); - std::unordered_map origin_var_maps; - auto& main_block = *desc->GetBlock(0); - auto var_size = main_block.VarsSize(); - for (int i = 0; i < var_size; i++) { - auto v = main_block.GetVar(i); - auto name = v->Name(); - origin_var_maps.emplace(name, *v); - } - - main_block.ClearVars(); - for (auto& node : instructions_) { - auto* op = const_cast(node.op()); - auto* kernel = node.kernel(); - auto* scope = op->scope(); - auto in_names = op->op_info()->input_names(); - auto out_names = op->op_info()->output_names(); - for (auto& in_name : in_names) { - auto it = origin_var_maps.find(in_name); - if (it != origin_var_maps.end()) { - auto* v = main_block.AddVar(); - v->SetName((it->second).Name()); - v->SetType((it->second).GetType()); - v->SetPersistable((it->second).Persistable()); - } else { - // New created vars must be LOD_TENSOR - auto* v = main_block.AddVar(); - v->SetName(in_name); - v->SetType(cpp::VarDesc::Type::LOD_TENSOR); - std::string in_arg_name; - op->op_info()->GetInputArgname(in_name, &in_arg_name); - auto type = kernel->GetInputDeclType(in_arg_name); - if (type->IsTensor()) { - auto tensor = scope->FindVar(in_name)->GetMutable(); - v->SetPersistable(tensor->persistable()); - } else { - CHECK(false) << "unsupported var type"; - } - } - } - - for (auto& out_name : out_names) { - auto it = origin_var_maps.find(out_name); - if (it != origin_var_maps.end()) { - auto* v = main_block.AddVar(); - v->SetName((it->second).Name()); - v->SetType((it->second).GetType()); - v->SetPersistable((it->second).Persistable()); - } else { - // New created vars must be LOD_TENSOR - auto* v = main_block.AddVar(); - v->SetName(out_name); - v->SetType(cpp::VarDesc::Type::LOD_TENSOR); - std::string out_arg_name; - op->op_info()->GetOutputArgname(out_name, &out_arg_name); - auto type = kernel->GetOutputDeclType(out_arg_name); - if (type->IsTensor()) { - auto tensor = scope->FindVar(out_name)->GetMutable(); - v->SetPersistable(tensor->persistable()); - } else { - CHECK(false) << "unsupported var type"; - } - } - } - } -} - -void RuntimeProgram::Run() { - for (auto& inst : instructions_) { - VLOG(4) << ">> Running kernel: " << inst.op()->op_info()->Repr() - << " on Target " << TargetToStr(inst.kernel()->target()); - - inst.Run(); -#ifdef LITE_WITH_PROFILE -#ifdef LITE_WITH_PRECISION_PROFILE - LITE_PRECISION_PROFILE(inst) -#endif // LITE_WITH_PRECISION_PROFILE -#endif // LITE_WITH_PROFILE - } -} - -void Program::Build(const cpp::ProgramDesc& prog) { - CHECK(ops_.empty()) << "Executor duplicate Build found"; - - // Create operators. 
- auto program = prog; - CHECK(program.BlocksSize()); - auto& main_block = *program.GetBlock(0); - for (size_t i = 0; i < main_block.OpsSize(); ++i) { - auto& op_desc = *main_block.GetOp(i); - auto op_type = op_desc.Type(); - // if (op_type == "feed" || op_type == "fetch") continue; - VLOG(4) << "create Op [" << op_type << "]"; - auto op = LiteOpRegistry::Global().Create(op_type); - CHECK(op) << "no Op found for " << op_type; - if (op_type == "while") { - auto sub_block_idx = op_desc.GetAttr("sub_block"); - auto sub_block = - const_cast(prog).GetBlock( - sub_block_idx); - static_cast(op.get())->SetSubBlock(sub_block); - } - ops_.emplace_back(std::move(op)); - ops_.back()->Attach(op_desc, exec_scope_); - } -} - -void Program::PrepareWorkspace(const cpp::ProgramDesc& prog) { - CHECK(!exec_scope_) << "Duplicate PrepareWorkspace found"; - exec_scope_ = &scope_->NewScope(); - // Create Feed and Fetch var. - scope_->Var("feed")->GetMutable>(); - scope_->Var("fetch")->GetMutable>(); - tmp_vars_.push_back("feed"); - tmp_vars_.push_back("fetch"); - - auto program = prog; - CHECK(program.BlocksSize()); - for (size_t b = 0; b < program.BlocksSize(); ++b) { - auto& main_block = *program.GetBlock(b); - for (size_t i = 0; i < main_block.VarsSize(); ++i) { - auto& var_desc = *main_block.GetVar(i); - if (!var_desc.Persistable()) { - tmp_vars_.push_back(var_desc.Name()); - exec_scope_->Var(var_desc.Name()); - if (b > 0) { - VLOG(4) << "var: " << var_desc.Name(); - } - } else { - if (var_desc.Name() == "feed" || var_desc.Name() == "fetch") continue; - weights_.push_back(var_desc.Name()); - if (var_desc.Persistable()) scope_->Var(var_desc.Name()); - } - } - } -} - -void Instruction::Run() { -#ifdef LITE_WITH_PROFILE - profile::ProfileBlock x(profile_id_); -#endif // LITE_WITH_PROFILE - CHECK(op_) << "op null"; - CHECK(kernel_) << "kernel null"; - if (first_epoch_) { - first_epoch_ = false; - CHECK(op_->CheckShape()); - } - - if (op_->run_once() && has_run_) return; - VLOG(4) << "kernel launch"; - op_->InferShape(); - kernel_->Launch(); - has_run_ = true; -} - -STL::ostream& operator<<(STL::ostream& os, const Instruction& other) { - os << other.kernel_->summary() << "\t(" << other.kernel_->doc() << ")"; - return os; -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/program.h b/lite/core/program.h deleted file mode 100644 index 1b3c036db5..0000000000 --- a/lite/core/program.h +++ /dev/null @@ -1,156 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
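An aside on `Instruction::Run` above: it gates work on two flags. `CheckShape` fires only on the first epoch, and ops marked `run_once` (feed/fetch style) execute a single time and are then skipped. The following is a minimal standalone rendition of that gating; `ToyOp` and `ToyInstruction` are stubs, not lite's real `OpLite`/`Instruction` types.

#include <iostream>

// Stub op exposing the three hooks the gating logic cares about.
struct ToyOp {
  bool run_once = false;
  void CheckShape() { std::cout << "check shape\n"; }
  void InferShape() { std::cout << "infer shape\n"; }
  void Launch() { std::cout << "launch kernel\n"; }
};

struct ToyInstruction {
  ToyOp op;
  bool first_epoch = true;
  bool has_run = false;
  void Run() {
    if (first_epoch) {
      first_epoch = false;
      op.CheckShape();  // shape check happens once, on the first call
    }
    if (op.run_once && has_run) return;  // run-once ops short-circuit
    op.InferShape();
    op.Launch();
    has_run = true;
  }
};

int main() {
  ToyInstruction feed;
  feed.op.run_once = true;
  feed.Run();  // checks shape, infers, launches
  feed.Run();  // skipped: run-once op has already executed
  return 0;
}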
- -#pragma once -#include -#include -#include -#include -#include -#include "lite/core/kernel.h" -#include "lite/core/op_lite.h" -#include "lite/core/op_registry.h" -#include "lite/model_parser/cpp/program_desc.h" -#ifdef LITE_WITH_PROFILE -#include "lite/core/profile/basic_profiler.h" -#endif // LITE_WITH_PROFILE - -namespace paddle { -namespace lite { - -static const char kKernelTypeAttr[] = "__@kernel_type_attr@__"; - -// A program is used to represent a code program, in Paddle, a code program -// contains: -// - main block, which is a list of OpLite -// - scope: which contains all the weights -struct Program { - public: - explicit Program(const std::shared_ptr& root) { scope_ = root; } - Program(const cpp::ProgramDesc& desc, - const std::shared_ptr& root, - const std::vector& valid_places) - : scope_(root), valid_places_(valid_places), desc_(desc) { - CHECK(scope_) << "scope should be init first"; - VLOG(4) << "prepare work"; - PrepareWorkspace(desc); - VLOG(4) << "build desc"; - Build(desc); - VLOG(4) << "build desc finished"; - } - - std::unique_ptr Clone() const { - std::unique_ptr res(new Program(desc_, scope_, valid_places_)); - return res; - } - - const std::list& weights() const { return weights_; } - const std::list& tmp_vars() const { return tmp_vars_; } - std::list* mutable_weights() { return &weights_; } - std::list* mutable_tmp_vars() { return &tmp_vars_; } - - const std::list>& ops() const { return ops_; } - std::list>* mutable_ops() { return &ops_; } - - lite::Scope* exec_scope() { return exec_scope_; } - lite::Scope* scope() { return scope_.get(); } - - private: - // Build from a program and scope. - void Build(const cpp::ProgramDesc& program); - // Create temporary variables. - void PrepareWorkspace(const cpp::ProgramDesc& program); - - private: - std::list tmp_vars_; - std::list weights_; - std::list> ops_; - // the scope to run the kernels, NOTE this is the execution scope. - std::shared_ptr scope_; - std::vector valid_places_; - // Runtime scope. - lite::Scope* exec_scope_{}; - cpp::ProgramDesc desc_; -}; - -struct Instruction { - Instruction(const std::shared_ptr& op, - std::unique_ptr&& kernel) - : op_(op), kernel_(std::move(kernel)) { -#ifdef LITE_WITH_PROFILE - profile_id_ = profile::BasicProfiler::Global() - .NewRcd(kernel_->SerializedKernelType()) - .id(); -#endif // LITE_WITH_PROFILE - } - - // Run the instruction. - void Run(); - - friend STL::ostream& operator<<(STL::ostream& os, const Instruction& other); - - const OpLite* op() const { return op_.get(); } - const KernelBase* kernel() const { return kernel_.get(); } - KernelBase* mutable_kernel() { return kernel_.get(); } - - private: - std::shared_ptr op_; - std::unique_ptr kernel_; - bool first_epoch_{true}; - bool has_run_{false}; - -#ifdef LITE_WITH_PROFILE - // for profiler - int profile_id_{-1}; -#endif // LITE_WITH_PROFILE -}; - -/* - * A program contains kernels for runtime. - */ -class LITE_API RuntimeProgram { - public: - explicit RuntimeProgram(std::vector&& insts) - : instructions_(std::move(insts)) { - if (instructions_.empty()) { - LOG(FATAL) << "no instructions"; - } - } - - void Run(); - - void set_exec_scope(lite::Scope* x) { exec_scope_ = x; } - lite::Scope* exec_scope() { return exec_scope_; } - - size_t num_instructions() const { return instructions_.size(); } - - const std::vector& instructions() const { return instructions_; } - - // `SaveOpInfosToProgram` will update the op list(ops_) of the block 0 - // in ProgramDesc. 
-  void SaveOpInfosToProgram(cpp::ProgramDesc* desc);
-
-  // `UpdateVarsOfProgram` will update the var list(vars_) of the block 0 in
-  // ProgramDesc. Namely, if a new var is created in some pass, its var_desc
-  // will be added to vars_.
-  void UpdateVarsOfProgram(cpp::ProgramDesc* desc);
-
- private:
-  RuntimeProgram(const RuntimeProgram&) = delete;
-  std::vector<Instruction> instructions_;
-  lite::Scope* exec_scope_{};
-};
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/program_fake_utils.cc b/lite/core/program_fake_utils.cc
deleted file mode 100644
index b4d7a00dfa..0000000000
--- a/lite/core/program_fake_utils.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/program_fake_utils.h"
-#include "lite/core/op_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/program_fake_utils.h b/lite/core/program_fake_utils.h
deleted file mode 100644
index edcbb101aa..0000000000
--- a/lite/core/program_fake_utils.h
+++ /dev/null
@@ -1,142 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
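The `Program`/`RuntimeProgram` split above leans on the variable classification done in `Program::PrepareWorkspace()` shown earlier: persistable vars (except the magic `feed`/`fetch` vars) become weights owned by the root scope, everything else is a temporary created in the execution scope. A dependency-free sketch of that rule, with purely illustrative variable names:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Restates the weight/temporary split from Program::PrepareWorkspace().
struct VarDesc {
  std::string name;
  bool persistable;
};

int main() {
  std::vector<VarDesc> block = {
      {"conv1_w", true}, {"conv1_out", false}, {"feed", true}, {"fetch", true}};
  std::vector<std::string> weights;
  std::vector<std::string> tmp_vars = {"feed", "fetch"};
  for (const auto& v : block) {
    if (!v.persistable) {
      tmp_vars.push_back(v.name);  // lives in exec_scope_
    } else if (v.name != "feed" && v.name != "fetch") {
      weights.push_back(v.name);   // lives in the root scope_
    }
  }
  for (const auto& w : weights) std::cout << "weight:  " << w << "\n";
  for (const auto& t : tmp_vars) std::cout << "tmp var: " << t << "\n";
}
```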
-
-#pragma once
-#include <memory>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-#include "lite/core/mir/ssa_graph.h"
-#include "lite/core/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace lite {
-
-Program FakeProgram() {
-  Program program(std::make_shared<lite::Scope>());
-
-  auto add_fc = [&](int id, std::string x) {
-    // create variables
-    std::string w1 = "w" + std::to_string(id);
-    std::string b1 = "b" + std::to_string(id);
-    std::string out1 = "out" + std::to_string(id);
-    auto w1v = program.scope()->Var(w1)->GetMutable<lite::Tensor>();
-    auto b1v = program.scope()->Var(b1)->GetMutable<lite::Tensor>();
-    auto out1v = program.scope()->Var(out1)->GetMutable<lite::Tensor>();
-
-    cpp::OpDesc desc;
-    desc.SetInput("Input", {x});
-    desc.SetInput("W", {w1});
-    desc.SetInput("Bias", {b1});
-    desc.SetOutput("Out", {out1});
-    desc.SetType("fc");
-    desc.SetAttr("in_num_col_dims", 1);
-
-    // add to input
-    program.mutable_tmp_vars()->push_back(w1);
-    program.mutable_tmp_vars()->push_back(b1);
-
-    auto fc_op = LiteOpRegistry::Global().Create("fc");
-    fc_op->Attach(desc, program.scope());
-    program.mutable_ops()->emplace_back(std::move(fc_op));
-
-    w1v->Resize(DDimHvy(std::vector<int64_t>({100, 100})));
-    b1v->Resize(DDimHvy(std::vector<int64_t>({100, 1})));
-    out1v->Resize(DDimHvy(std::vector<int64_t>({100, 100})));
-
-    return out1;
-  };
-
-  // x1, w1, b1 -fc-> out1
-  // out1, w2, b2 -fc-> out2
-
-  std::string x = "x";
-  program.mutable_tmp_vars()->push_back(x);
-  auto* xv = program.scope()->Var(x)->GetMutable<lite::Tensor>();
-  xv->Resize(DDimHvy(std::vector<int64_t>({100, 100})));
-
-  for (int i = 0; i < 3; i++) {
-    x = add_fc(i, x);
-  }
-  return program;
-}
-
-class ProgramFaker {
- public:
-  ProgramFaker() {}
-
-  framework::ProgramDesc* program() {
-    desc_.Flush();
-    return &desc_;
-  }
-
-  void CreateVars(lite::Scope* scope) {
-    for (auto& var : tmp_vars_) {
-      auto* x = scope->Var(var);
-      x->GetMutable<lite::Tensor>();
-    }
-
-    for (auto& x : tmp_vars_) {
-      desc_.MutableBlock(0)->Var(x);
-    }
-  }
-
-  void AddMul(const std::string& X,
-              const std::string& Y,
-              const std::string& out) {
-    tmp_vars_.insert(X);
-    tmp_vars_.insert(Y);
-    tmp_vars_.insert(out);
-
-    auto* block = desc_.MutableBlock(0);
-    auto* op = block->AppendOp();
-    op->SetType("mul");
-    op->SetInput("X", {X});
-    op->SetInput("Y", {Y});
-    op->SetOutput("Out", {out});
-    op->SetAttr("x_num_col_dims", 1);
-    op->SetAttr("y_num_col_dims", 1);
-  }
-
-  void AddFeed(const std::string& Out, int col) {
-    tmp_vars_.insert(Out);
-
-    auto* block = desc_.MutableBlock(0);
-    auto* op = block->AppendOp();
-    op->SetType("feed");
-    op->SetInput("X", {"feed"});
-    op->SetOutput("Out", {Out});
-    op->SetAttr("col", col);
-  }
-
-  void AddFetch(const std::string& Input, int col) {
-    tmp_vars_.insert(Input);
-    auto* block = desc_.MutableBlock(0);
-    auto* op = block->AppendOp();
-    op->SetType("fetch");
-    op->SetInput("X", {Input});
-    op->SetOutput("Out", {"fetch"});
-    op->SetAttr("col", col);
-  }
-
- private:
-  std::set<std::string> tmp_vars_;
-  std::vector<std::string> weight_vars_;
-  framework::ProgramDesc desc_;
-};
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/scope.cc b/lite/core/scope.cc
deleted file mode 100644
index 775652e2a0..0000000000
--- a/lite/core/scope.cc
+++ /dev/null
@@ -1,72 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
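The chaining in `FakeProgram()` above is easy to miss in the lambda: each `add_fc(i, x)` consumes the previous output name and returns the new one, producing the `x -> out0 -> out1 -> out2` chain the comment describes. A dependency-free restatement of just that wiring:

```cpp
#include <iostream>
#include <string>

// Mirrors the name-threading in FakeProgram()'s add_fc lambda; no framework
// types involved, output is the dataflow the original comment sketches.
std::string add_fc(int id, const std::string& x) {
  std::string out = "out" + std::to_string(id);
  std::cout << x << ", w" << id << ", b" << id << " -fc-> " << out << "\n";
  return out;
}

int main() {
  std::string x = "x";
  for (int i = 0; i < 3; ++i) x = add_fc(i, x);  // x -> out0 -> out1 -> out2
}
```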
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/scope.h"
-
-namespace paddle {
-namespace lite {
-
-Scope::~Scope() {
-  for (auto *x : kids_) {
-    if (x) {
-      delete x;
-    }
-  }
-}
-
-Scope &Scope::NewScope() const {
-  kids_.push_back(new Scope);
-  kids_.back()->parent_ = this;
-  return *kids_.back();
-}
-
-Variable *Scope::Var(const std::string &name) {
-  auto *var = FindVar(name);
-  if (var) return var;
-
-  // create a new variable.
-  vars_.emplace(name, std::unique_ptr<Variable>(new Variable));
-  return vars_[name].get();
-}
-
-Variable *Scope::FindVar(const std::string &name) const {
-  Variable *var{nullptr};
-  var = FindLocalVar(name);
-  const Scope *cur_scope = this;
-  while (!var && cur_scope->parent()) {
-    cur_scope = cur_scope->parent();
-    var = cur_scope->FindLocalVar(name);
-  }
-
-  return var;
-}
-
-Variable *Scope::FindLocalVar(const std::string &name) const {
-  auto it = vars_.find(name);
-  if (it != vars_.end()) {
-    return it->second.get();
-  }
-  return nullptr;
-}
-
-std::vector<std::string> Scope::LocalVarNames() const {
-  std::vector<std::string> keys;
-  for (const auto &item : vars_) {
-    keys.push_back(item.first);
-  }
-  return keys;
-}
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/scope.h b/lite/core/scope.h
deleted file mode 100644
index 2593c36522..0000000000
--- a/lite/core/scope.h
+++ /dev/null
@@ -1,79 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <list>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include "lite/core/variable.h"
-
-namespace paddle {
-namespace lite {
-
-class Scope final {
- public:
-  Scope() {}
-  // delete below two functions to allow pybind to recognise it cannot make a
-  // copy
-  // link:
-  // https://stackoverflow.com/questions/53807248/pybind11-returning-a-pointer-to-a-container-of-unique-ptr
-  Scope(const Scope&) = delete;
-  Scope& operator=(const Scope&) = delete;
-  ~Scope();
-
-  Scope& NewScope() const;
-
-  Variable* Var(const std::string& name);
-
-  Variable* FindVar(const std::string& name) const;
-
-  Variable* FindLocalVar(const std::string& name) const;
-
-  const Scope* parent() const { return parent_; }
-
-  // Following the legacy scope interface.
-  std::vector<std::string> LocalVarNames() const;
-
-  /// ------------------------------------- helper functions for Tensor
-  /// ----------------------------------
-  // Create a Tensor variable. This will create a new Variable called `name`.
- Tensor* NewTensor(const std::string& name) { - auto* var = Var(name); - return var->GetMutable(); - } - - const Tensor* FindTensor(const std::string& name) { - auto* var = FindVar(name); - if (!var) return nullptr; - return &var->Get(); - } - - Tensor* FindMutableTensor(const std::string& name) { - auto* var = FindVar(name); - if (!var) return nullptr; - return var->GetMutable(); - } - - private: - // Scope in `kids_` are owned by this class. - mutable std::list kids_; - const Scope* parent_{nullptr}; - std::unordered_map> vars_; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/core/scope_test.cc b/lite/core/scope_test.cc deleted file mode 100644 index 8806e6b1c0..0000000000 --- a/lite/core/scope_test.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/scope.h" -#include - -namespace paddle { -namespace lite { - -TEST(Scope, Var) { - Scope scope; - auto* x = scope.Var("x"); - *x->GetMutable() = 100; - - ASSERT_EQ(x->Get(), 100); -} - -TEST(Scope, FindVar) { - Scope scope; - ASSERT_FALSE(scope.FindVar("x")); - scope.Var("x"); - ASSERT_TRUE(scope.FindVar("x")); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/target_wrapper.cc b/lite/core/target_wrapper.cc deleted file mode 100644 index 046336036b..0000000000 --- a/lite/core/target_wrapper.cc +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/target_wrapper.h" -#include -#include "lite/utils/all.h" - -namespace paddle { -namespace lite {} // namespace lite -} // namespace paddle diff --git a/lite/core/target_wrapper.h b/lite/core/target_wrapper.h deleted file mode 100644 index aa7dd6cc12..0000000000 --- a/lite/core/target_wrapper.h +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
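The deleted `Scope` is a straightforward parent-linked chain: `FindVar()` searches upward through parents, `FindLocalVar()` does not, and `NewScope()` hangs an owned child off `kids_`, which is why weights created in the root scope are visible from the execution scope but not vice versa. A toy restatement of the lookup rule, with stand-in types rather than `lite::Scope`:

```cpp
#include <cassert>
#include <map>
#include <memory>
#include <string>

struct Var {};

struct Scope {
  const Scope* parent{nullptr};
  std::map<std::string, std::unique_ptr<Var>> vars;

  Var* Local(const std::string& n) const {         // FindLocalVar()
    auto it = vars.find(n);
    return it == vars.end() ? nullptr : it->second.get();
  }
  Var* Find(const std::string& n) const {          // FindVar(): walks parents
    for (const Scope* s = this; s; s = s->parent)
      if (auto* v = s->Local(n)) return v;
    return nullptr;
  }
  Var* Create(const std::string& n) {              // Var(): create-or-reuse
    if (auto* v = Find(n)) return v;
    return (vars[n] = std::unique_ptr<Var>(new Var)).get();
  }
};

int main() {
  Scope root, child;
  child.parent = &root;
  root.Create("w");         // weight in the root scope
  assert(child.Find("w"));  // visible from the child (execution) scope
  child.Create("tmp");
  assert(!root.Find("tmp"));  // but lookups never descend into children
}
```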
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include "lite/api/paddle_place.h" -#include "lite/utils/cp_logging.h" - -#ifdef LITE_WITH_CUDA -#include -#include -#endif // LITE_WITH_CUDA - -namespace paddle { -namespace lite { - -using lite_api::TargetType; -using lite_api::PrecisionType; -using lite_api::DataLayoutType; -using lite_api::PrecisionTypeLength; -using lite_api::TargetToStr; -using lite_api::Place; -using lite_api::PrecisionToStr; -using lite_api::DataLayoutToStr; -using lite_api::TargetRepr; -using lite_api::PrecisionRepr; -using lite_api::DataLayoutRepr; - -// Memory copy directions. -enum class IoDirection { - HtoH = 0, // Host to host - HtoD, // Host to device - DtoH, // Device to host - DtoD, // Device to device -}; - -// This interface should be specified by each kind of target. -template -class TargetWrapper { - public: - using stream_t = StreamTy; - using event_t = EventTy; - - static size_t num_devices() { return 0; } - static size_t maximum_stream() { return 0; } - - static void CreateStream(stream_t* stream) {} - static void DestroyStream(const stream_t& stream) {} - - static void CreateEvent(event_t* event) {} - static void DestroyEvent(const event_t& event) {} - - static void RecordEvent(const event_t& event) {} - static void SyncEvent(const event_t& event) {} - - static void StreamSync(const stream_t& stream) {} - - static void* Malloc(size_t size) { - LOG(FATAL) << "Unimplemented malloc for " << TargetToStr(Target); - return nullptr; - } - static void Free(void* ptr) { LOG(FATAL) << "Unimplemented"; } - - static void MemcpySync(void* dst, - const void* src, - size_t size, - IoDirection dir) { - LOG(FATAL) << "Unimplemented"; - } - static void MemcpyAsync(void* dst, - const void* src, - size_t size, - IoDirection dir, - const stream_t& stream) { - MemcpySync(dst, src, size, dir); - } -}; - -// This interface should be specified by each kind of target. 
-using TargetWrapperHost = TargetWrapper; -using TargetWrapperX86 = TargetWrapperHost; -template <> -class TargetWrapper { - public: - using stream_t = int; - using event_t = int; - - static size_t num_devices() { return 0; } - static size_t maximum_stream() { return 0; } - - static void CreateStream(stream_t* stream) {} - static void DestroyStream(const stream_t& stream) {} - - static void CreateEvent(event_t* event) {} - static void DestroyEvent(const event_t& event) {} - - static void RecordEvent(const event_t& event) {} - static void SyncEvent(const event_t& event) {} - - static void StreamSync(const stream_t& stream) {} - - static void* Malloc(size_t size); - static void Free(void* ptr); - - static void MemcpySync(void* dst, - const void* src, - size_t size, - IoDirection dir); - static void MemcpyAsync(void* dst, - const void* src, - size_t size, - IoDirection dir, - const stream_t& stream) { - MemcpySync(dst, src, size, dir); - } -}; - -#ifdef LITE_WITH_FPGA -template <> -class TargetWrapper { - public: - using stream_t = int; - using event_t = int; - - static size_t num_devices() { return 0; } - static size_t maximum_stream() { return 0; } - - static void CreateStream(stream_t* stream) {} - static void DestroyStream(const stream_t& stream) {} - - static void CreateEvent(event_t* event) {} - static void DestroyEvent(const event_t& event) {} - - static void RecordEvent(const event_t& event) {} - static void SyncEvent(const event_t& event) {} - - static void StreamSync(const stream_t& stream) {} - - static void* Malloc(size_t size); - static void Free(void* ptr); - - static void MemcpySync(void* dst, - const void* src, - size_t size, - IoDirection dir); - static void MemcpyAsync(void* dst, - const void* src, - size_t size, - IoDirection dir, - const stream_t& stream) { - MemcpySync(dst, src, size, dir); - } -}; -#endif - -} // namespace lite -} // namespace paddle diff --git a/lite/core/tensor.cc b/lite/core/tensor.cc deleted file mode 100644 index 4dd4f5319d..0000000000 --- a/lite/core/tensor.cc +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
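The host specialization above declares `Malloc`/`Free`/`MemcpySync` and defines them elsewhere in the tree; note that `MemcpyAsync` simply falls back to `MemcpySync` on targets without real streams. On the host, every `IoDirection` reduces to a plain `memcpy`, while device backends give each direction a real meaning (for example, a CUDA backend would map `HtoD` to a host-to-device copy). A toy analogue, assuming nothing beyond the C++ standard library:

```cpp
#include <cstring>
#include <iostream>

// Host-side stand-in for TargetWrapper's MemcpySync: the direction argument
// exists for API uniformity but is irrelevant when both sides are host memory.
enum class IoDirection { HtoH, HtoD, DtoH, DtoD };

void MemcpySync(void* dst, const void* src, size_t size, IoDirection) {
  std::memcpy(dst, src, size);
}

int main() {
  int src[4] = {1, 2, 3, 4}, dst[4] = {};
  MemcpySync(dst, src, sizeof(src), IoDirection::HtoH);
  std::cout << dst[3] << "\n";  // 4
}
```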
- -#ifndef LITE_WITH_FPGA - -#include "lite/core/tensor.h" -#include -#include "lite/utils/string.h" - -namespace paddle { -namespace lite { - -using value_type = int64_t; - -value_type DDimLite::production() const { - value_type res = 1; - for (size_t i = 0; i < this->size(); i++) { - res *= (*this)[i]; - } - return res; -} - -value_type DDimLite::count(int start, int end) const { - if (start < 0) { - start = 0; - } - if (end > size()) { - end = size(); - } - if (end < start) { - end = start; - } - value_type sum = 1; - for (auto i = start; i < end; ++i) { - sum *= data_[i]; - } - return sum; -} - -DDimLite DDimLite::Slice(int start, int end) const { - std::vector vec; - for (int i = start; i < end; i++) { - vec.push_back((*this)[i]); - } - return DDimLite(vec); -} - -std::string DDimLite::repr() const { - STL::stringstream ss; - if (empty()) { - ss << "{}"; - return ss.str(); - } - ss << "{"; - for (size_t i = 0; i < this->size() - 1; i++) { - ss << (*this)[i] << ","; - } - if (!this->empty()) ss << (*this)[size() - 1]; - ss << "}"; - return ss.str(); -} - -void TensorLite::ShareDataWith(const TensorLite &other) { - buffer_ = other.buffer_; - dims_ = other.dims_; - target_ = other.target_; - lod_ = other.lod_; - memory_size_ = other.memory_size_; -} - -void *TensorLite::mutable_data(size_t memory_size) { - memory_size_ = memory_size; - buffer_->ResetLazy(target_, memory_size_); - return buffer_->data(); -} - -void *TensorLite::mutable_data(TargetType target, size_t memory_size) { - target_ = target; - return mutable_data(memory_size); -} - -void TensorLite::CopyDataFrom(const TensorLite &other) { - dims_ = other.dims_; - target_ = other.target_; - lod_ = other.lod_; - memory_size_ = other.memory_size_; - buffer_->CopyDataFrom(*other.buffer_, memory_size_); -} - -// static LoD TensorLite::ToAbsOffset(const LoD &lod) { -// if (lod.empty() || lod.size() == 1) return lod; -// LoD ret = lod; -// for (int level = static_cast(lod.size()) - 2; level >= 0; --level) { -// for (size_t i = 0; i < lod[level].size(); ++i) { -// size_t index = lod[level][i]; -// result[level][i] = result[level + 1][index]; -// } -// } -//} - -} // namespace lite -} // namespace paddle - -#endif diff --git a/lite/core/tensor.h b/lite/core/tensor.h deleted file mode 100644 index aa4cb1b3c5..0000000000 --- a/lite/core/tensor.h +++ /dev/null @@ -1,249 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
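The `DDimLite` arithmetic deleted above is worth restating on a concrete shape: `production()` multiplies every extent, and `count(start, end)` multiplies the dims in `[start, end)` after clamping both bounds, so out-of-range arguments are safe rather than undefined behavior. A standalone re-implementation with the same semantics:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int64_t production(const std::vector<int64_t>& d) {
  int64_t res = 1;
  for (auto v : d) res *= v;
  return res;
}

int64_t count(const std::vector<int64_t>& d, int start, int end) {
  start = std::max(start, 0);                       // clamp as in DDimLite
  end = std::min(end, static_cast<int>(d.size()));
  if (end < start) end = start;
  int64_t sum = 1;
  for (int i = start; i < end; ++i) sum *= d[i];
  return sum;
}

int main() {
  std::vector<int64_t> dims = {2, 3, 4};
  std::cout << production(dims) << "\n";    // 24
  std::cout << count(dims, 1, 3) << "\n";   // 12 (= 3 * 4)
  std::cout << count(dims, 1, 99) << "\n";  // 12 (end clamped to size())
}
```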
- -#pragma once - -#ifdef LITE_WITH_FPGA -#include "lite/backends/fpga/lite_tensor.h" -#endif - -#ifndef LITE_WITH_FPGA - -#include -#include // for multiplies -#include -#include -#include -#include -#include "lite/core/memory.h" -#include "lite/utils/replace_stl/stream.h" - -namespace paddle { -namespace lite { - -class DDimLite; -class TensorLite; - -using DDim = lite::DDimLite; -using Tensor = lite::TensorLite; - -class DDimLite { - public: - using value_type = int64_t; - - DDimLite() = default; - - explicit DDimLite(const std::vector &x) { ConstructFrom(x); } - // DDimLite(std::initializer_list init_list) : - // DDimLite(std::vector(init_list)) {} - - void ConstructFrom(const std::vector &x) { data_ = x; } - - value_type operator[](int offset) const { return data_[offset]; } - value_type &operator[](int offset) { return data_[offset]; } - std::vector Vectorize() const { return data_; } - - size_t size() const { return data_.size(); } - bool empty() const { return data_.empty(); } - - value_type production() const; - - const std::vector &data() const { return data_; } - value_type count(int start, int end) const; - - DDimLite Slice(int start, int end) const; - - DDimLite Flatten2D(int col) const { - return DDimLite(std::vector( - {Slice(0, col).production(), Slice(col, size()).production()})); - } - - std::string repr() const; - - friend STL::ostream &operator<<(STL::ostream &os, const DDimLite &dims) { - os << dims.repr(); - return os; - } - - friend bool operator==(const DDimLite &a, const DDimLite &b) { - if (a.size() != b.size()) return false; - for (size_t i = 0; i < a.size(); i++) { - if (a[i] != b[i]) return false; - } - return true; - } - - friend bool operator!=(const DDimLite &a, const DDimLite &b) { - return !(a == b); - } - - private: - std::vector data_; -}; - -using LoD = std::vector>; - -// A light-weight tensor implementation. -class TensorLite { - public: - TensorLite() : buffer_(std::make_shared()) {} - - template - void Assign(DType *data, const DimT &dim) { - Resize(dim); - auto *dst = mutable_data(Target); - CopySync( - dst, data, dim.production() * sizeof(DType), IoDirection::HtoD); - } - - // T is the data type and R is the return type - // For OpenCL, the return type can be cl::Buffer - // and the data type can be float/int8_t. - // For other devices, T and R may be the same type. - template - const R *data() const { - return reinterpret_cast(static_cast(buffer_->data()) + - offset_); - } - - void Resize(const DDimLite &ddim) { dims_ = ddim; } - void Resize(const std::vector &x) { dims_ = DDimLite(x); } - - const DDimLite &dims() const { return dims_; } - int64_t numel() const { return dims_.production(); } - - const LoD &lod() const { return lod_; } - LoD *mutable_lod() { return &lod_; } - void set_lod(const LoD &lod) { lod_ = lod; } - - PrecisionType precision() const { return precision_; } - void set_precision(PrecisionType precision) { precision_ = precision; } - - bool persistable() const { return persistable_; } - void set_persistable(bool persistable) { persistable_ = persistable; } - - // T is the data type and R is the return type - // For OpenCL, the return type can be cl::Buffer - // and the data type can be float/int8_t. - // For other devices, T and R may be the same type. - template - R *mutable_data(); - - // T is the data type and R is the return type - // For OpenCL, the return type can be cl::Buffer - // and the data type can be float/int8_t. - // For other devices, T and R may be the same type. 
- template - R *mutable_data(TargetType target); - void *mutable_data(size_t memory_size); - void *mutable_data(TargetType target, size_t memory_size); - - const void *raw_data() const { - return static_cast( - (static_cast(buffer_->data()) + offset_)); - } - - size_t data_size() const { return this->dims().production(); } - - size_t memory_size() const { return memory_size_; } - - size_t offset() const { return offset_; } - - bool IsInitialized() const { return buffer_->data(); } - - // Other share data to this. - void ShareDataWith(const TensorLite &other); - - void CopyDataFrom(const TensorLite &other); - - TargetType target() const { return target_; } - - template - TensorLite Slice(int64_t begin, int64_t end) const; - - friend STL::ostream &operator<<(STL::ostream &os, const TensorLite &tensor) { - os << "Tensor:" << '\n'; - os << "dim: " << tensor.dims() << '\n'; - for (int i = 0; i < tensor.dims().production(); i++) { - os << tensor.template data()[i] << " "; - } - os << "\n"; - return os; - } - - private: - TargetType target_{TargetType::kHost}; - // precision_ and persistable_ are only used for persistable vars. - // If your tensor wants to be saved and loaded correctly, you must - // set values of precision_ and persistable_ after updating it. - // If your tensor is just a temp tensor, such as activations, - // you can ignore these two attributes. - PrecisionType precision_{PrecisionType::kUnk}; - bool persistable_{false}; - - DDimLite dims_; - std::shared_ptr buffer_; - LoD lod_; - size_t memory_size_{}; - - /// @brief Buffer may be shared with other tensors - size_t offset_{0}; -}; - -template -R *TensorLite::mutable_data() { - memory_size_ = dims_.production() * sizeof(T); - buffer_->ResetLazy(target_, memory_size_); - return reinterpret_cast(static_cast(buffer_->data()) + offset_); -} - -template -R *TensorLite::mutable_data(TargetType target) { - target_ = target; - memory_size_ = dims_.production() * sizeof(T); - buffer_->ResetLazy(target, memory_size()); - return reinterpret_cast(static_cast(buffer_->data()) + offset_); -} - -template -TensorLite TensorLite::Slice(int64_t begin, int64_t end) const { - CHECK_GE(begin, 0); - CHECK_LE(end, dims_[0]); - CHECK_LT(begin, end); - if (dims_[0] == 1) { - return *this; - } else { - int64_t base = numel() / dims_[0]; - TensorLite dst; - dst.buffer_ = buffer_; - dst.target_ = target_; - auto dst_dims = dims_; - dst_dims[0] = end - begin; - dst.Resize(dst_dims); - dst.offset_ = offset_ + static_cast(begin * base) * sizeof(T); - return dst; - } -} - -template -bool TensorCompareWith(const TensorT &a, const TensorT &b) { - if (a.dims() != b.dims()) return false; - if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false; - return true; -} - -} // namespace lite -} // namespace paddle - -#endif diff --git a/lite/core/type_system.cc b/lite/core/type_system.cc deleted file mode 100644 index 276d0c4a34..0000000000 --- a/lite/core/type_system.cc +++ /dev/null @@ -1,157 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
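The `TensorLite::Slice<T>(begin, end)` defined above is zero-copy: the slice shares `buffer_` and only adjusts `dims_[0]` and the byte `offset_`. The offset arithmetic on a concrete example, assuming a float tensor shaped `{8, 3, 4}`:

```cpp
#include <cstdint>
#include <iostream>

// Slice(2, 5) on a float tensor {8, 3, 4}: new dims {3, 3, 4}, byte offset
// begin * (numel() / dims_[0]) * sizeof(T), no data copied.
int main() {
  int64_t dims0 = 8, rest = 3 * 4;        // shape {8, 3, 4}
  int64_t begin = 2, end = 5;
  int64_t base = (dims0 * rest) / dims0;  // numel() / dims_[0] == 12
  size_t offset = static_cast<size_t>(begin * base) * sizeof(float);
  std::cout << "slice dims {" << (end - begin) << ", 3, 4}, offset "
            << offset << " bytes\n";      // {3, 3, 4}, offset 96 bytes
}
```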
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/type_system.h" -#include "lite/utils/string.h" - -namespace paddle { -namespace lite { - -size_t ParamTypeRegistry::KernelIdTy::hash() const { - std::hash h; - size_t hash = h(kernel_type); - hash = hash_combine(hash, place.hash()); - hash = hash_combine(hash, std::hash()(static_cast(io))); - hash = hash_combine(hash, std::hash()(arg_name)); - return hash; -} - -STL::ostream &operator<<(STL::ostream &os, const Type &other) { - os << other.name(); - return os; -} - -// An map is used to maintain a global repo for types. We don't use -// MACROs with static variables for that the TypeSystem should only used in -// compile time, that is not performance sensitive, and a map-based way is -// easier to implement and maintain. -// -// The map is declared in each Type::GetXXX method other than in the Type class -// so that it will force to construct before any usage. - -const Type *Type::GetTensorTy(TargetType target, - PrecisionType precision, - DataLayoutType layout, - int device) { - static std::map type_repo; - // NOTE quite naive implementation here, but not performance sensitive. - DataType::ID type_id = DataType::ID::Tensor; - -#define HASH_ONE(x) v = hash_combine(v, hasher(static_cast(x))) - - std::hash hasher; - size_t v = hasher(static_cast(type_id)); - HASH_ONE(target); - HASH_ONE(precision); - HASH_ONE(layout); - HASH_ONE(device); -#undef HASH_ONE - - STL::stringstream name; - name << "Tensor<"; - name << TargetToStr(target) << ","; - name << PrecisionToStr(precision) << ","; - name << DataLayoutToStr(layout) << ","; - name << device; - name << ">"; - - if (!type_repo[v]) - // The Types should alive across the process life, no need to delete. - type_repo[v] = - new Type(type_id, name.str(), target, precision, layout, device); - return type_repo[v]; -} - -const Type *Type::GetTensorListTy(TargetType target, - PrecisionType precision, - DataLayoutType layout, - int device) { - static std::map type_repo; - DataType::ID type_id = DataType::ID::TensorList; - -#define HASH_ONE(x) v = hash_combine(v, hasher(static_cast(x))) - - std::hash hasher; - size_t v = hasher(static_cast(type_id)); - HASH_ONE(target); - HASH_ONE(precision); - HASH_ONE(layout); - HASH_ONE(device); -#undef HASH_ONE - - STL::stringstream name; - name << "TensorList<"; - name << TargetToStr(target) << ","; - name << PrecisionToStr(precision) << ","; - name << DataLayoutToStr(layout) << ","; - name << device; - name << ">"; - - if (!type_repo[v]) - // The Types should alive across the process life, no need to delete. 
- type_repo[v] = - new Type(type_id, name.str(), target, precision, layout, device); - return type_repo[v]; -} - -const Type *Type::GetUnsupportedTy() { - static std::map type_repo; - std::hash hasher; - size_t v = hasher(static_cast(DataType::ID::Unsupported)); - if (!type_repo[v]) - type_repo[v] = new Type(DataType::ID::Unsupported, - "Unsupported", - TARGET(kUnk), - PRECISION(kUnk), - DATALAYOUT(kUnk), - -1); - return type_repo[v]; -} - -const Type *Type::GetVoidTy() { - static std::map type_repo; - std::hash hasher; - size_t v = hasher(static_cast(DataType::ID::Void)); - if (!type_repo[v]) - type_repo[v] = new Type(DataType::ID::Void, - "Void", - TARGET(kAny), - PRECISION(kAny), - DATALAYOUT(kAny), - -1); - return type_repo[v]; -} - -const Type *Type::Get(DataType::ID type_id, - TargetType target, - PrecisionType precision, - DataLayoutType layout, - int device) { - switch (type_id) { - case DataType::ID::Void: - return GetVoidTy(); - case DataType::ID::Unsupported: - return GetUnsupportedTy(); - case DataType::ID::Tensor: - return GetTensorTy(target, precision, layout, device); - case DataType::ID::TensorList: - return GetTensorListTy(target, precision, layout, device); - default: - LOG(FATAL) << "Unknown Type found"; - return nullptr; - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/type_system.h b/lite/core/type_system.h deleted file mode 100644 index 722cdca0eb..0000000000 --- a/lite/core/type_system.h +++ /dev/null @@ -1,390 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -// This file contains the file system of the lite system. Every data type in -// Variable should be registered here, and the analysis phase will check the -// data type correction. -// This mechanism is made for keeping our system simpler and more stable, for -// the dubious typed Variables in the Operators' inputs and outputs are disaster -// for analysis and runtime. - -#include -#include -#include -#include -#include -#include -#include "lite/core/tensor.h" -#include "lite/utils/all.h" - -namespace paddle { -namespace lite { - -// Type is the definition of all the types that supported by the Variable that -// represents as the input and output of an operator or kernel. -// The DNN system is simple, just a list of operators, and the architecture -// can not process that many data types as a compiler, or that will turn out to -// a chaos. -// -// We should make sure that the supported data types be registered here, and -// keep the set small and avoid using some special data types as op's -// inputs or outputs, such as some runtime cache, those types can't be processed -// by the MIR. -// -// A tensor with different places(target, precision, data layout or device) -// should be treated as different types. Different types might be compatible -// with each other, for example, the `VoidTy` means any type, so any other types -// can be treated as a `VoidTy`. 
-// -// The Different Types can transform to others by adding some special -// transforming operators, for example, a DataLayoutTransformOp can convert a -// `TensorFp32NCHWTy` to a `TensorFp32NHWCTy`; a IoCopyOp can convert a -// `TensorFp32NCHWTy(kHost)` to `TensorFp32NCHWTy(kCUDA)`. There are many other -// convertions between different Types, but there are some unsupported type -// convertions, for example, there is noway to convert a `UnsupportedTy` to a -// `TensorAnyTy`. -// -// We use Types to declare the definition of a kernel, each inputs' and outputs' -// arguments have a specific Types. -// -// REGISTER_LITE_KERNEL(mul, kHost, kFloat, -// paddle::lite::kernels::host::MulCompute, def) -// .BindInput("X", {paddle::lite::Type::Get( -// TARGET(kHost))}) -// .BindInput("Y", {paddle::lite::Type::Get( -// TARGET(kHost))}) -// .BindOutput("Out", -// {paddle::lite::Type::Get(TARGET(kHost))}) -// .Finalize(); -// -// The above definition will be used in MIR by Type inference and uncompatible -// types check. -// -// TODO(Superjomn) Add operator/kernel-wise static checking to avoid unsupported -// type mixed in the system. -class DataType { - public: - // The Void type can cast to any other type. - // The Unsupported is the data type that developed include in the system, for - // example, some `std::set` is used as input of some operator. It wan't be - // analyzed or optimized by the system, that way results in many bugs in - // previous system, so it should be avoided. - enum class ID : int { - Void = 0, // unknown type that can be cast to any data type. - Unsupported, // Unsupported data type that will not be analyzed. - // Tensor_Any represents a Tensor with any place, data, layout. It is used - // in some IO kernels those doesn't care the data. - Tensor, - // A tensor list, but all the elements should have the same type. - TensorList, - // --------- - NumTypes, // Must remains as last defined ID. - }; - - ID id() const { return id_; } - - // type check. - bool IsVoid() const { return id_ == ID::Void; } - bool IsUnsupported() const { return id_ == ID::Unsupported; } - bool IsTensor() const { return id_ == ID::Tensor; } - bool IsTensorList() const { return id_ == ID::TensorList; } - // Get number of types. - int num_types() const { return static_cast(ID::NumTypes); } - - protected: - // Can only extended by subclass. - explicit DataType(ID id) : id_(id) {} - - ID id_{ID::Unsupported}; -}; - -/* - * Datatype with device info considered. - * NOTE A Type with different device is treated as different DeviceDataType. - */ -class Type : public DataType { - public: - // Can cast to another type. This is heavily used in MIR, by determine whether - // is is possible to add a statement to transform a type to another. - virtual bool TypeCastable(const Type& type) const { return id_ == type.id(); } - - /// Get a Tensor type. - static const Type* GetTensorTy(TargetType target, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW), - int device = 0); - /// Get a TensorList type. - static const Type* GetTensorListTy( - TargetType target, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW), - int device = 0); - /// Get an Unsupported type. - static const Type* GetUnsupportedTy(); - /// Get an Void type. 
- static const Type* GetVoidTy(); - - static const Type* Get(DataType::ID type_id, - TargetType target = TARGET(kUnk), - PrecisionType precision = PRECISION(kUnk), - DataLayoutType layout = DATALAYOUT(kUnk), - int device = 0); - - TargetType target() const { return place_.target; } - PrecisionType precision() const { return place_.precision; } - DataLayoutType layout() const { return place_.layout; } - int16_t device() const { return place().device; } - const Place& place() const { return place_; } - const std::string& name() const { return name_; } - - bool operator==(const Type& other) { - return id_ == other.id() && place_ == other.place(); - } - friend STL::ostream& operator<<(STL::ostream& os, const Type& other); - - virtual ~Type() = default; - - protected: - /// One should avoid using this construct. - Type(ID id, - const std::string& name, - TargetType target = TargetType::kHost, - PrecisionType precision = PrecisionType::kFloat, - DataLayoutType layout = DataLayoutType::kNCHW, - int16_t device = 0) - : DataType(id), place_{target, precision, layout, device}, name_(name) {} - - Place place_; - const std::string name_; -}; - -// -------------------------------- compatible check --------------------------- -static bool TargetCompatibleTo(const Type& a, const Type& b) { - auto is_host = [](TargetType x) -> bool { - return x == TARGET(kHost) || x == TARGET(kX86) || x == TARGET(kARM); - }; - if (a.IsVoid() || b.IsVoid()) return true; - if (a.IsTensor() || b.IsTensor()) { - if (a.IsTensor() && b.IsTensor()) { - return is_host(a.target()) ? is_host(b.target()) - : a.target() == b.target(); - } - return false; - } - return true; -} - -static bool DataLayoutCompatibleTo(const Type& a, const Type& b) { - return a.IsVoid() || // - ((a.layout() == b.layout() || // - b.layout() == DATALAYOUT(kAny))); -} -static bool DataLayoutCompatible(const Type& a, const Type& b) { - return a.IsVoid() || b.IsVoid() || // - ((a.layout() == b.layout() || // - b.layout() == DATALAYOUT(kAny) || - a.layout() == DATALAYOUT(kAny))); -} - -static bool PrecisionCompatibleTo(const Type& a, const Type& b) { - return a.IsVoid() || // - (((a.IsTensor() && b.IsTensor()) || - (a.IsTensorList() && b.IsTensorList())) && - (a.precision() == b.precision() || // - b.precision() == PRECISION(kAny) || - a.precision() == PRECISION(kAny))); -} -static bool PrecisionCompatible(const Type& a, const Type& b) { - return a.IsVoid() || b.IsVoid() || // - (a.IsTensor() && b.IsTensor() && (a.precision() == b.precision() || // - b.precision() == PRECISION(kAny) || - a.precision() == PRECISION(kAny))); -} - -static bool DeviceCompatibleTo(const Type& a, const Type& b) { - return a.IsVoid() || // - (a.IsTensor() && b.IsTensor() && (a.device() == b.device())); -} - -// Can type 'a' be passed to 'b' directly. -static bool TypeCompatibleTo(const Type& a, const Type& b) { - return TargetCompatibleTo(a, b) && DataLayoutCompatibleTo(a, b) && - PrecisionCompatibleTo(a, b) && DeviceCompatibleTo(a, b); -} -static bool TypeCompatible(const Type& a, const Type& b) { - return TargetCompatibleTo(a, b) && DataLayoutCompatible(a, b) && - PrecisionCompatible(a, b) && DeviceCompatibleTo(a, b); -} - -/* - * ParamType is used to represent a data type of a parameter for the kernel. It - * can represent any Variable data type. - * The element_type_hash is the hash code of the element, it should be - * registered in the `TypeSystem`. 
- */ -struct ParamType { - const Type* type; - - ParamType() = default; - ParamType(const Type* type) : type(type) {} // NOLINT - - std::string DebugString() const { return type->name(); } -}; - -/* - * The data types of kernel parameters. It is used to track the type of kernel's - * inputs and outputs. - */ -struct ParamTypeRecorder { - std::map inputs; - std::map outputs; - - void RegisterInputType(const std::string& arg_name, const ParamType& type) { - Register(&inputs, arg_name, type); - } - - void RegisterOutputType(const std::string& arg_name, const ParamType& type) { - Register(&outputs, arg_name, type); - } - - private: - void Register(std::map* ts, - const std::string& arg_name, - ParamType type) { - (*ts)[arg_name] = type; - } -}; - -/* - * The ParamTypeRegistry help register the input and output data types for all - * the kernels. It is made singleton so that all the objects of the same kernel - * can share the same information. - * - * Usage: - * for register a kernel for FC operator. - * ParamTypeRegistry::Global().Register( - * "fc", {TARGET(kCUDA), PRECISION(kFloat)}, 0, - * {typeid(Tensor), {TARGET(kCUDA)}}); - */ -class ParamTypeRegistry { - public: - enum class IO : int { kInput = 0, kOutput }; - - template - /* - * Helper class for registering a ParamType for a Kernel. - * Usage: - * - * NewInstance("fc") - * .BindInput(0, {typeid(Tensor).hash_code(), {TARGET(kHost)}) - * .BindInput(1, {typeid(Tensor).hash_code(), {TARGET(kHost), - * PRECISION(kFloat)}); - */ - struct NewInstance { - explicit NewInstance(const std::string& kernel_type) - : kernel_type_(kernel_type) {} - - NewInstance& BindInput(const std::string& arg_name, - const ParamType& ptype) { - ParamTypeRegistry::Global().Register( - kernel_type_, Place{target, precision, layout}, arg_name, ptype); - return *this; - } - NewInstance& BindOutput(const std::string& arg_name, - const ParamType& ptype) { - ParamTypeRegistry::Global().Register( - kernel_type_, Place{target, precision, layout}, arg_name, ptype); - return *this; - } - - bool Finalize() { return true; } - - private: - std::string kernel_type_; - }; - - template - void Register(const std::string& kernel_type, - const Place& place, - const std::string& arg_name, - ParamType data_type) { - KernelIdTy key{kernel_type, place, io, arg_name}; - types_[key] = data_type; - CHECK(types_.count(key)); - } - - const ParamType* RetrieveInArgument(const Place& place, - const std::string& op_type, - const std::string& arg_name) { - return Retrieve(place, op_type, arg_name); - } - const ParamType* RetrieveOutArgument(const Place& place, - const std::string& op_type, - const std::string& arg_name) { - return Retrieve(place, op_type, arg_name); - } - - static ParamTypeRegistry& Global() { - static ParamTypeRegistry x; - return x; - } - - friend STL::ostream& operator<<(STL::ostream& os, - const ParamTypeRegistry& other) { - for (auto& item : other.types_) { - os << item.first << " " << item.second.DebugString() << "\n"; - } - return os; - } - - protected: - template - const ParamType* Retrieve(const Place& place, - const std::string& op_type, - const std::string& arg_name) { - KernelIdTy key{op_type, place, io, arg_name}; - auto it = types_.find(key); - if (it == types_.end()) return nullptr; - return &it->second; - } - - private: - ParamTypeRegistry() = default; - - public: - // Identification for a Kernel. 
- struct KernelIdTy { - std::string kernel_type; - Place place; - IO io; - std::string arg_name; - - size_t hash() const; - friend STL::ostream& operator<<(STL::ostream& os, const KernelIdTy& other); - }; - - using key_t = KernelIdTy; - struct KeyCmp { - bool operator()(const key_t& a, const key_t& b) const; - }; - - private: - std::map types_; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/core/type_system_test.cc b/lite/core/type_system_test.cc deleted file mode 100644 index 224a779fcb..0000000000 --- a/lite/core/type_system_test.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/type_system.h" -#include - -namespace paddle { -namespace lite { - -TEST(TypeSystem, CheckDuplicateGet) { - auto* tensor_ty = - Type::GetTensorTy(TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); - auto* tensor_ty1 = - Type::GetTensorTy(TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); - - ASSERT_EQ(tensor_ty, tensor_ty1); - - ASSERT_EQ(tensor_ty->target(), TARGET(kHost)); - ASSERT_EQ(tensor_ty->precision(), PRECISION(kFloat)); - ASSERT_EQ(tensor_ty->layout(), DATALAYOUT(kNCHW)); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/types.cc b/lite/core/types.cc deleted file mode 100644 index ec89e83e58..0000000000 --- a/lite/core/types.cc +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
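`type_system_test.cc` above checks exactly the property `Type::GetTensorTy()` is built around: each descriptor is interned in a function-local static map, so equal descriptors yield the same pointer and types can be compared by identity. The scheme in miniature, with a toy string key rather than the real hash-combined descriptor:

```cpp
#include <cassert>
#include <map>
#include <string>

struct Ty {
  std::string name;
};

// One canonical Ty per key; entries deliberately live for the whole process,
// matching the "no need to delete" comment in the original.
const Ty* GetTy(const std::string& key) {
  static std::map<std::string, const Ty*> repo;
  auto& slot = repo[key];
  if (!slot) slot = new Ty{key};
  return slot;
}

int main() {
  assert(GetTy("Tensor<kHost,kFloat,kNCHW>") ==
         GetTy("Tensor<kHost,kFloat,kNCHW>"));  // same pointer, as in the test
  assert(GetTy("Tensor<kHost,kFloat,kNCHW>") !=
         GetTy("Tensor<kARM,kFloat,kNCHW>"));   // distinct place, distinct type
}
```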
- -#include "lite/core/types.h" - -namespace paddle { -namespace lite { -namespace core { - -KernelPickFactor& KernelPickFactor::ConsiderDataLayout() { - data_ |= static_cast(Factor::DataLayoutFirst); - return *this; -} -KernelPickFactor& KernelPickFactor::ConsiderPrecision() { - data_ |= static_cast(Factor::PrecisionFirst); - return *this; -} -KernelPickFactor& KernelPickFactor::ConsiderTarget() { - data_ |= static_cast(Factor::TargetFirst); - return *this; -} -KernelPickFactor& KernelPickFactor::ConsiderDevice() { - data_ |= static_cast(Factor::DeviceFirst); - return *this; -} -bool KernelPickFactor::IsPrecisionConsidered() const { - return data_ & static_cast(Factor::PrecisionFirst); -} -bool KernelPickFactor::IsTargetConsidered() const { - return data_ & static_cast(Factor::TargetFirst); -} -bool KernelPickFactor::IsDataLayoutConsidered() const { - return data_ & static_cast(Factor::DataLayoutFirst); -} -bool KernelPickFactor::IsDeviceConsidered() const { - return data_ & static_cast(Factor::DeviceFirst); -} - -STL::ostream& operator<<(STL::ostream& os, const KernelPickFactor& k) { - std::stack bits; - auto data = k.data_; - while (data) { - bits.push(data % 2); - data /= 2; - } - int nbits = bits.size(); - for (size_t i = 0; i < sizeof(data) * 8 - nbits; i++) { - os << 0; - } - while (!bits.empty()) { - os << bits.top(); - bits.pop(); - } - return os; -} - -template <> -Type StdTypeToRepr() { - return Type::_int32; -} -template <> -Type StdTypeToRepr() { - return Type::_int64; -} -template <> -Type StdTypeToRepr() { - return Type::_float32; -} -template <> -Type StdTypeToRepr() { - return Type::_float64; -} -template <> -Type StdTypeToRepr() { - return Type::_string; -} -template <> -Type StdTypeToRepr() { - return Type::_bool; -} - -} // namespace core -} // namespace lite -} // namespace paddle diff --git a/lite/core/types.h b/lite/core/types.h deleted file mode 100644 index efb8a096e5..0000000000 --- a/lite/core/types.h +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/api/paddle_place.h" -#include "lite/utils/all.h" - -namespace paddle { -namespace lite { -namespace core { - -/* - * Type representations used to represent standard types. - */ -// TODO(Superjomn) unify all the type representation across the lite framework. -enum class Type { - _unk = -1, - // primary types - _int32, - _int64, - _float32, - _float64, - _bool, - _string, - // primary list types - _list, - // enum type - _enum, - _float16, - // number of types - __num__, -}; - -enum class FluidType { - // Pod Types - BOOL = 0, - INT16 = 1, - INT32 = 2, - INT64 = 3, - FP16 = 4, - FP32 = 5, - FP64 = 6, - // Tensor is used in C++. 
- SIZE_T = 19, - UINT8 = 20, - INT8 = 21, - - // Other types that may need additional descriptions - LOD_TENSOR = 7, - SELECTED_ROWS = 8, - FEED_MINIBATCH = 9, - FETCH_LIST = 10, - STEP_SCOPES = 11, - LOD_RANK_TABLE = 12, - LOD_TENSOR_ARRAY = 13, - PLACE_LIST = 14, - READER = 15, - // Any runtime decided variable type is raw - // raw variables should manage their own allocations - // in operators like nccl_op - RAW = 17, - TUPLE = 18, -}; - -template -Type StdTypeToRepr() { - return Type::_unk; -} -template <> -Type StdTypeToRepr(); -template <> -Type StdTypeToRepr(); -template <> -Type StdTypeToRepr(); -template <> -Type StdTypeToRepr(); -template <> -Type StdTypeToRepr(); - -// Factors that impact the kernel picking strategy. Multiple factors can be -// considered together by using statement like 'factor1 | factor2' -class KernelPickFactor { - public: - using value_type = unsigned char; - enum class Factor : int { - // The following factors are sorted by priority. - TargetFirst = 1, - PrecisionFirst = 1 << 1, - DataLayoutFirst = 1 << 2, - DeviceFirst = 1 << 3, - }; - - // Has any factors considered. - bool any_factor_considered() const { return data_; } - - KernelPickFactor& ConsiderTarget(); - // Prefer a specific target, e.g. prefer CUDA kernels. - KernelPickFactor& ConsiderPrecision(); - KernelPickFactor& ConsiderDataLayout(); - KernelPickFactor& ConsiderDevice(); - - bool IsTargetConsidered() const; - bool IsPrecisionConsidered() const; - bool IsDataLayoutConsidered() const; - bool IsDeviceConsidered() const; - - friend STL::ostream& operator<<(STL::ostream& os, const KernelPickFactor& k); - - private: - unsigned char data_{}; - lite_api::TargetType target_{TARGET(kUnk)}; -}; - -struct dim2 { - int x{}; - int y{}; - - dim2(int x, int y) : x(x), y(y) {} -}; - -struct dim3 { - int x{}; - int y{}; - int z{}; - - dim3(int x, int y, int z) : x(x), y(y), z(z) {} -}; - -using byte_t = uint8_t; - -} // namespace core -} // namespace lite -} // namespace paddle diff --git a/lite/core/types_test.cc b/lite/core/types_test.cc deleted file mode 100644 index 9b7e5b6f05..0000000000 --- a/lite/core/types_test.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
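Before the tests that follow, it helps to restate how `KernelPickFactor` above packs its decisions: each `Consider*()` call ORs one priority bit into `data_`, and the matching `Is*Considered()` accessor tests that bit, so factors compose freely. A standalone sketch with the same bit layout (enum values copied from the deleted header):

```cpp
#include <iostream>

enum Factor : unsigned char {
  TargetFirst = 1,
  PrecisionFirst = 1 << 1,
  DataLayoutFirst = 1 << 2,
  DeviceFirst = 1 << 3,
};

int main() {
  unsigned char data = 0;
  data |= TargetFirst;     // KernelPickFactor().ConsiderTarget()
  data |= PrecisionFirst;  //                   .ConsiderPrecision()
  std::cout << ((data & TargetFirst) != 0) << "\n";      // 1: considered
  std::cout << ((data & DataLayoutFirst) != 0) << "\n";  // 0: not considered
}
```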
- -#include "lite/core/types.h" -#include - -namespace paddle { -namespace lite { -namespace core { - -TEST(KernelPickFactor, Default) { - KernelPickFactor factor; - ASSERT_FALSE(factor.IsTargetConsidered()); - ASSERT_FALSE(factor.IsPrecisionConsidered()); - ASSERT_FALSE(factor.IsDataLayoutConsidered()); -} - -TEST(KernelPickFactor, Set) { - KernelPickFactor factor; - factor.ConsiderTarget(); - ASSERT_TRUE(factor.IsTargetConsidered()); - factor.ConsiderPrecision(); - ASSERT_TRUE(factor.IsPrecisionConsidered()); - factor.ConsiderDataLayout(); - ASSERT_TRUE(factor.IsDataLayoutConsidered()); - - LOG(INFO) << "factor " << factor; -} - -} // namespace core -} // namespace lite -} // namespace paddle diff --git a/lite/core/variable.cc b/lite/core/variable.cc deleted file mode 100644 index a344da63f1..0000000000 --- a/lite/core/variable.cc +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/variable.h" - -namespace paddle { -namespace lite {} // namespace lite -} // namespace paddle diff --git a/lite/core/variable.h b/lite/core/variable.h deleted file mode 100644 index 2c1e737a93..0000000000 --- a/lite/core/variable.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include "lite/core/tensor.h" -#include "lite/utils/all.h" - -namespace paddle { -namespace lite { - -using FeedFetchList = std::vector; - -class Variable { - public: - template - const T& Get() const { - return blob_.get(); - } - - template - T* GetMutable() { - if (!blob_.is()) blob_.set(); - return blob_.get_mutable(); - } - - template - bool IsType() { - return blob_.type() == typeid(T).hash_code(); - } - - private: - // variant blob_; - variant> - blob_; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/core/workspace.cc b/lite/core/workspace.cc deleted file mode 100644 index 196536f955..0000000000 --- a/lite/core/workspace.cc +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/workspace.h" diff --git a/lite/core/workspace.h b/lite/core/workspace.h deleted file mode 100644 index 117b80aaa7..0000000000 --- a/lite/core/workspace.h +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "lite/core/memory.h" -#include "lite/core/types.h" -#include "lite/utils/macros.h" - -namespace paddle { -namespace lite { - -/* - * WorkSpace is a container that help to manage the temporary memory that are - * shared across kernels during the serial execution. - * - * Due to the mobile library size limit, a complex allocator or GC algorithm is - * not suitable here, one need to carefully manage the workspace inside a single - * kernel. - * - * NOTE - * - * For kernel developers, one need to call the workspace as follows: - * - * - call `WorkSpace::Global().Alloc()` if needed to allocate some temporary - * buffer. - */ -class WorkSpace { - public: - // Reset the workspace, and treat the workspace as empty. - void AllocReset() { cursor_ = 0; } - - // Allocate a memory buffer. 
-  core::byte_t* Alloc(size_t size) {
-    buffer_.ResetLazy(target_, cursor_ + size);
-    auto* data = static_cast<core::byte_t*>(buffer_.data()) + cursor_;
-    cursor_ += size;
-    return data;
-  }
-
-  static WorkSpace& Global_Host() {
-    thread_local std::unique_ptr<WorkSpace> x(new WorkSpace(TARGET(kHost)));
-    return *x;
-  }
-
-#if defined(LITE_WITH_X86)
-  static WorkSpace& Global_X86() { return Global_Host(); }
-#endif
-
-#if defined(LITE_WITH_ARM)
-  static WorkSpace& Global_ARM() { return Global_Host(); }
-#endif
-
-#if defined(LITE_WITH_CUDA)
-  static WorkSpace& Global_CUDA() {
-    thread_local std::unique_ptr<WorkSpace> x(new WorkSpace(TARGET(kCUDA)));
-    return *x;
-  }
-#endif
-
- private:
-  explicit WorkSpace(TargetType x) : target_(x) {}
-
-  TargetType target_;
-  Buffer buffer_;
-  size_t cursor_;
-
-  DISALLOW_COPY_AND_ASSIGN(WorkSpace);
-};
-
-}  // namespace lite
-}  // namespace paddle
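As the NOTE in the deleted header says, kernels carve temporary buffers out of a shared per-thread arena instead of allocating ad hoc. A minimal sketch of that pattern, using only names from the header above (`Global_Host`, `AllocReset`, `Alloc`); the two buffer sizes are placeholders:

```cpp
#include "lite/core/workspace.h"

// Sketch only: reset the cursor at the start of a kernel run, then take
// scratch buffers from the per-thread host workspace. ResetLazy grows the
// backing Buffer only when cursor_ + size exceeds the current capacity.
void KernelScratchSketch(size_t im2col_bytes, size_t acc_bytes) {
  auto& ws = paddle::lite::WorkSpace::Global_Host();
  ws.AllocReset();  // treat the workspace as empty for this run

  auto* im2col_buf = ws.Alloc(im2col_bytes);  // cursor advances by im2col_bytes
  auto* acc_buf = ws.Alloc(acc_bytes);        // second buffer follows the first
  (void)im2col_buf;
  (void)acc_buf;
}
```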
diff --git a/lite/demo/cxx/Makefile.def b/lite/demo/cxx/Makefile.def
deleted file mode 100644
index f0a0ec1dcb..0000000000
--- a/lite/demo/cxx/Makefile.def
+++ /dev/null
@@ -1,35 +0,0 @@
-CXX_DEFINES = -DARM_WITH_OMP -DHPPL_STUB_FUNC -DLITE_WITH_ARM -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK \
-              -DLITE_WITH_LINUX -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DPADDLE_WITH_TESTING
-LDFLAGS = -latomic -pthread -ldl
-
-SYSROOT_COMPILE = --sysroot=/opt/android-ndk-r17c/sysroot
-
-THIRD_PARTY_LIBS = ../../../third_party/gflags/lib/libgflags.a
-
-SYSTEM_INCLUDES = -I/opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/include \
-                  -I/opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++abi/include \
-                  -I/opt/android-ndk-r17c/sources/android/support/include \
-                  -I/opt/android-ndk-r17c/sysroot/usr/include
-
-THIRD_PARTY_INCLUDES = -I../../../third_party/gflags/include
-
-ifeq ($(ARM_ABI), arm8)
-    CC = /opt/android-ndk-r17c/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-g++
-    CXX_FLAGS = -funwind-tables -no-canonical-prefixes -D__ANDROID_API__=22 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE
-    CXXFLAGS_LINK = $(CXX_FLAGS) -pie -Wl,--gc-sections
-    SYSROOT_LINK = --sysroot=/opt/android-ndk-r17c/platforms/android-24/arch-arm64
-    SYSTEM_LIBS = /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++_static.a \
-                  /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++abi.a
-    INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/aarch64-linux-android $(THIRD_PARTY_INCLUDES)
-else
-    CC = /opt/android-ndk-r17c/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-g++
-    CXX_FLAGS = -march=armv7-a -mthumb -mfpu=neon -mfloat-abi=softfp -funwind-tables -no-canonical-prefixes \
-                -D__ANDROID_API__=22 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE
-    CXXFLAGS_LINK = $(CXX_FLAGS) -pie -Wl,--fix-cortex-a8 -Wl,--gc-sections -Wl,-z,nocopyreloc
-    SYSROOT_LINK = --sysroot=/opt/android-ndk-r17c/platforms/android-22/arch-arm
-    SYSTEM_LIBS = /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libc++_static.a \
-                  /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libc++abi.a \
-                  /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libandroid_support.a \
-                  /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libunwind.a
-    INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/arm-linux-androideabi $(THIRD_PARTY_INCLUDES)
-endif
diff --git a/lite/demo/cxx/README.md b/lite/demo/cxx/README.md
deleted file mode 100644
index ec72c044e3..0000000000
--- a/lite/demo/cxx/README.md
+++ /dev/null
@@ -1,42 +0,0 @@
-# C++ Demo
-1. Build the docker image from `lite/tools/Dockerfile.mobile`.
-2. Start and enter the docker container, then run `wget http://paddle-inference-dist.bj.bcebos.com/lite_release/r0.1/inference_lite_lib.android.armv8.tar.gz` to download the demo environment. (For the armv7 demo, download `wget http://paddle-inference-dist.bj.bcebos.com/lite_release/r0.1/inference_lite_lib.android.armv7.tar.gz` instead.)
-3. Unpack the downloaded archive: `tar zxvf inference_lite_lib.android.armv8.tar.gz`
-4. Prepare the emulator environment with the following commands:
-```shell
-# armv8
-adb kill-server
-adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
-echo n | avdmanager create avd -f -n paddle-armv8 -k "system-images;android-24;google_apis;arm64-v8a"
-echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -port 5554 &
-sleep 1m
-```
-```shell
-# armv7
-adb kill-server
-adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
-echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a"
-echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -port 5554 &
-sleep 1m
-```
-5. Prepare the model, then build and run the full-API demo:
-```shell
-cd inference_lite_lib.android.armv8/demo/cxx/mobile_full
-wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
-tar zxvf mobilenet_v1.tar.gz
-make
-adb -s emulator-5554 push mobilenet_v1 /data/local/tmp/
-adb -s emulator-5554 push mobilenetv1_full_api /data/local/tmp/
-adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_full_api
-adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt"
-```
-On success, the console prints the predicted probabilities for the top 10 classes.
-
-6. Build and run the light-API demo:
-```shell
-cd ../mobile_light
-make
-adb -s emulator-5554 push mobilenetv1_light_api /data/local/tmp/
-adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_light_api
-adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_light_api --model_dir=/data/local/tmp/mobilenet_v1.opt"
-```
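Step 5 above spells out only the armv8 package; the armv7 package from step 2 follows the same flow. A sketch, assuming the armv7 tarball unpacks to the matching `inference_lite_lib.android.armv7` directory and the armv7 emulator from step 4 is running:

```shell
cd inference_lite_lib.android.armv7/demo/cxx/mobile_full
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
tar zxvf mobilenet_v1.tar.gz
make
adb -s emulator-5554 push mobilenet_v1 /data/local/tmp/
adb -s emulator-5554 push mobilenetv1_full_api /data/local/tmp/
adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_full_api
adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt"
```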
diff --git a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7
deleted file mode 100644
index f795b41d46..0000000000
--- a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7
+++ /dev/null
@@ -1,22 +0,0 @@
-ARM_ABI = arm7
-export ARM_ABI
-
-include ../Makefile.def
-
-LITE_ROOT=../../../
-
-CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include
-
-CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS)
-
-mobilenetv1_full_api: mobilenetv1_full_api.o
-	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_full_api.o -o mobilenetv1_full_api $(CXX_LIBS) $(LDFLAGS)
-
-mobilenetv1_full_api.o: mobilenetv1_full_api.cc
-	$(CC) $(SYSROOT_COMPILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobilenetv1_full_api.o -c mobilenetv1_full_api.cc
-
-
-.PHONY: clean
-clean:
-	rm -f mobilenetv1_full_api.o
-	rm -f mobilenetv1_full_api
diff --git a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8
deleted file mode 100644
index d0767145b0..0000000000
--- a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8
+++ /dev/null
@@ -1,22 +0,0 @@
-ARM_ABI = arm8
-export ARM_ABI
-
-include ../Makefile.def
-
-LITE_ROOT=../../../
-
-CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include
-
-CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS)
-
-mobilenetv1_full_api: mobilenetv1_full_api.o
-	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_full_api.o -o mobilenetv1_full_api $(CXX_LIBS) $(LDFLAGS)
-
-mobilenetv1_full_api.o: mobilenetv1_full_api.cc
-	$(CC) $(SYSROOT_COMPILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobilenetv1_full_api.o -c mobilenetv1_full_api.cc
-
-
-.PHONY: clean
-clean:
-	rm -f mobilenetv1_full_api.o
-	rm -f mobilenetv1_full_api
diff --git a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7
deleted file mode 100644
index d235d6e25f..0000000000
--- a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7
+++ /dev/null
@@ -1,22 +0,0 @@
-ARM_ABI = arm7
-export ARM_ABI
-
-include ../Makefile.def
-
-LITE_ROOT=../../../
-
-CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include
-
-CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
-
-mobilenetv1_light_api: mobilenetv1_light_api.o
-	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_light_api.o -o mobilenetv1_light_api $(CXX_LIBS) $(LDFLAGS)
-
-mobilenetv1_light_api.o: mobilenetv1_light_api.cc
-	$(CC) $(SYSROOT_COMPILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobilenetv1_light_api.o -c mobilenetv1_light_api.cc
-
-
-.PHONY: clean
-clean:
-	rm -f mobilenetv1_light_api.o
-	rm -f mobilenetv1_light_api
diff --git a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8
deleted file mode 100644
index b91aadcef8..0000000000
--- a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8
+++ /dev/null
@@ -1,22 +0,0 @@
-ARM_ABI = arm8
-export ARM_ABI
-
-include ../Makefile.def
-
-LITE_ROOT=../../../
-
-CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include
-
-CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
-
-mobilenetv1_light_api: mobilenetv1_light_api.o
-	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_light_api.o -o mobilenetv1_light_api $(CXX_LIBS) $(LDFLAGS)
-
-mobilenetv1_light_api.o: mobilenetv1_light_api.cc
-	$(CC) $(SYSROOT_COMPILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobilenetv1_light_api.o -c mobilenetv1_light_api.cc
-
-
-.PHONY: clean
-clean:
-	rm -f mobilenetv1_light_api.o
-	rm -f mobilenetv1_light_api
diff --git a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
deleted file mode 100644
index 18167e3ca1..0000000000
--- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gflags/gflags.h>
-#include <stdio.h>
-#include <vector>
-#include "paddle_api.h"          // NOLINT
-#include "paddle_use_kernels.h"  // NOLINT
-#include "paddle_use_ops.h"      // NOLINT
-#include "paddle_use_passes.h"   // NOLINT
-
-using namespace paddle::lite_api;  // NOLINT
-
-DEFINE_string(model_dir, "", "Model dir path.");
-DEFINE_string(optimized_model_dir, "", "Optimized model dir.");
-DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels");
-
-int64_t ShapeProduction(const shape_t& shape) {
-  int64_t res = 1;
-  for (auto i : shape) res *= i;
-  return res;
-}
-
-void RunModel() {
-  // 1. Set CxxConfig
-  CxxConfig config;
-  config.set_model_dir(FLAGS_model_dir);
-  std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kFloat)}};
-  if (FLAGS_prefer_int8_kernel) {
-    valid_places.push_back(Place{TARGET(kARM), PRECISION(kInt8)});
-    config.set_preferred_place(Place{TARGET(kARM), PRECISION(kInt8)});
-  } else {
-    config.set_preferred_place(Place{TARGET(kARM), PRECISION(kFloat)});
-  }
-  config.set_valid_places(valid_places);
-
-  // 2. Create PaddlePredictor by CxxConfig
-  std::shared_ptr<PaddlePredictor> predictor =
-      CreatePaddlePredictor<CxxConfig>(config);
-
-  // 3. Save the optimized model
-  // WARN: The `predictor->SaveOptimizedModel` method must be executed
-  // before the `predictor->Run` method, because some kernels' `PrepareForRun`
-  // methods may change some parameters' values.
-  predictor->SaveOptimizedModel(FLAGS_optimized_model_dir,
-                                LiteModelType::kNaiveBuffer);
-
-  // 4. Prepare input data
-  std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
-  input_tensor->Resize(shape_t({1, 3, 224, 224}));
-  auto* data = input_tensor->mutable_data<float>();
-  for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
-    data[i] = 1;
-  }
-
-  // 5. Run predictor
-  predictor->Run();
-
-  // 6. Get output
-  std::unique_ptr<const Tensor> output_tensor(
-      std::move(predictor->GetOutput(0)));
-  printf("Output dim: %d\n", static_cast<int>(output_tensor->shape()[1]));
-  for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
-    printf("Output[%d]: %f\n", i, output_tensor->data<float>()[i]);
-  }
-}
-
-int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
-  RunModel();
-  return 0;
-}
diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
deleted file mode 100644
index e1833814ca..0000000000
--- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gflags/gflags.h>
-#include <stdio.h>
-#include <vector>
-#include "paddle_api.h"          // NOLINT
-#include "paddle_use_kernels.h"  // NOLINT
-#include "paddle_use_ops.h"      // NOLINT
-
-using namespace paddle::lite_api;  // NOLINT
-
-DEFINE_string(model_dir, "", "Model dir path.");
-
-int64_t ShapeProduction(const shape_t& shape) {
-  int64_t res = 1;
-  for (auto i : shape) res *= i;
-  return res;
-}
-
-void RunModel() {
-  // 1. Set MobileConfig
-  MobileConfig config;
-  config.set_model_dir(FLAGS_model_dir);
-
-  // 2. Create PaddlePredictor by MobileConfig
-  std::shared_ptr<PaddlePredictor> predictor =
-      CreatePaddlePredictor<MobileConfig>(config);
-
-  // 3. Prepare input data
-  std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
-  input_tensor->Resize({1, 3, 224, 224});
-  auto* data = input_tensor->mutable_data<float>();
-  for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
-    data[i] = 1;
-  }
-
-  // 4. Run predictor
-  predictor->Run();
-
-  // 5. Get output
-  std::unique_ptr<const Tensor> output_tensor(
-      std::move(predictor->GetOutput(0)));
-  printf("Output dim: %d\n", static_cast<int>(output_tensor->shape()[1]));
-  for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
-    printf("Output[%d]: %f\n", i, output_tensor->data<float>()[i]);
-  }
-}
-
-int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
-  RunModel();
-  return 0;
-}
diff --git a/lite/demo/java/README.md b/lite/demo/java/README.md
deleted file mode 100644
index 904726d744..0000000000
--- a/lite/demo/java/README.md
+++ /dev/null
@@ -1,118 +0,0 @@
-# Java Android Demo
-
-To build and run the Android demo app PaddlePredictor under the ./android folder, you need:
-
-1. An Android phone that can run Android apps
-2. A development machine with Android Studio
-
-## Build
-
-First, inside the PaddleLite development docker image, pull the latest PaddleLite code and build
-the inference library for your phone's architecture; below we use the arm8 architecture as the
-example. Enter the paddlelite directory and run the following cmake and make commands:
-
-```
-mkdir -p build.lite.android.arm8.gcc
-cd build.lite.android.arm8.gcc
-
-cmake .. \
--DWITH_GPU=OFF \
--DWITH_MKL=OFF \
--DWITH_LITE=ON \
--DLITE_WITH_JAVA=ON \
--DLITE_WITH_CUDA=OFF \
--DLITE_WITH_X86=OFF \
--DLITE_WITH_ARM=ON \
--DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
--DWITH_TESTING=OFF \
--DLITE_SHUTDOWN_LOG=ON \
--DLITE_ON_TINY_PUBLISH=ON \
--DARM_TARGET_OS=android -DARM_TARGET_ARCH_ABI=armv8 -DARM_TARGET_LANG=gcc
-
-make publish_inference -j4
-```
-
-After make finishes, check that the following two files exist:
-```
-build.lite.android.arm8.gcc/lite/api/android/jni/native/libpaddle_lite_jni.so
-build.lite.android.arm8.gcc/lite/api/android/jni/PaddlePredictor.jar
-```
-They are the PaddleLite C++ shared library and the Java jar package. The jar contains the
-PaddleLite Java API, which the Android Java code below will use.
-
-## Prepare the other files the demo needs
-
-Besides the code, the demo needs the JNI .so library (the `libpaddle_lite_jni.so` mentioned
-above), the Java .jar package (the `PaddlePredictor.jar` mentioned above), and the model files.
-We provide both an automated script and a manual-copy method; choose whichever fits your needs:
-
-### Script method
-
-Enter `build.lite.android.armv8/inference_lite_lib.android.armv8/demo/java/android/`, where we
-provide a script, `prepare_demo.bash`. The script takes one argument: the name of the
-architecture folder matching the .so you want to copy.
-
-For example, run
-```
-bash prepare_demo.bash armv8
-```
-The script downloads and unpacks the model files, copies the .jar into the demo, and copies the
-generated .so into `PaddlePredictor/app/src/main/jniLibs/<architecture folder>`; in this example,
-armv8 is the architecture folder. Note: a demo built this way runs correctly on armv8 phones.
-For the demo to also run on other phone architectures (such as armv7), you need to add those
-architectures as well.
-
-### Manual copy method
-
-Next we describe the manual copy; if you used the script, you can skip this part.
-
-### Copy the .so shared library and the .jar into the Android demo app:
-
-Load demo/PaddlePredictor from this folder into Android Studio. Copy the `libpaddle_lite_jni.so`
-from the step above into `PaddlePredictor/app/src/main/jniLibs/<architecture folder>`; for
-example, the arm8 folder must contain that .so file.
-Copy the `PaddlePredictor.jar` from the step above into `PaddlePredictor/app/libs`.
-
-### Copy the model files the demo uses into the app:
-
-Download our 5 model files and unpack them into the `PaddlePredictor/app/src/main/assets` folder.
-Model files to copy and their download URLs:
-
-    inception_v4_simple_opt.nb http://paddle-inference-dist.bj.bcebos.com/inception_v4_simple_opt.nb.tar.gz
-    lite_naive_model_opt.nb    http://paddle-inference-dist.bj.bcebos.com/lite_naive_model_opt.nb.tar.gz
-    mobilenet_v1_opt.nb        http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1_opt.nb.tar.gz
-    mobilenet_v2_relu_opt.nb   http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2_relu_opt.nb.tar.gz
-    resnet50_opt.nb            http://paddle-inference-dist.bj.bcebos.com/resnet50_opt.nb.tar.gz
-
-After downloading, the assets folder must contain the five unpacked model folders above; the
-original compressed .tar.gz archives do not need to be kept in the demo.
-
-## Running the Android app
-
-Once the preparation above is done, you can build, install, and run the Android demo app. When
-you run the PaddlePredictor app, it takes roughly 10 seconds, after which you should see output
-like:
-
-    lite_naive_model output: 50.213173, -28.872887
-    expected: 50.2132, -28.8729
-
-    inception_v4_simple test:true
-    time: xxx ms
-
-    resnet50 test:true
-    time: xxx ms
-
-    mobilenet_v1 test:true
-    time: xxx ms
-
-    mobilenet_v2 test:true
-    time: xxx ms
-
-The demo runs our 5 models. For the first model it prints the first two numbers of the actual
-output, with the expected correct values attached on the second line; the error between them
-should be below 0.001. For the remaining four models, the `test:true` line means the model
-output passed the checks built into the demo, and `time` is how long that test took.
-
-## Instrumented Test for the Android demo
-
-This section is for testers who want to drive the demo from the command line.
-
-To run the demo on a phone from the command line, enter the demo's `PaddlePredictor` folder and
-run
-```
-./gradlew init
-```
-This command only needs to be run once; it initializes the tasks the demo needs. Afterwards, run
-our tests with
-```
-./gradlew connectedAndroidTest
-```
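The jar built above exposes the Java API the demo app uses. Reduced to its core, the load-and-run flow looks like the sketch below; every class and method name is taken from the `MainActivity.java` in this patch, while the standalone class name and the model path are placeholders:

```java
import com.baidu.paddle.lite.MobileConfig;
import com.baidu.paddle.lite.PaddlePredictor;
import com.baidu.paddle.lite.PowerMode;
import com.baidu.paddle.lite.Tensor;

import java.util.Arrays;

// Sketch only: mirrors the runModel() helper in MainActivity below.
public class MinimalPredictorSketch {
    public static float[] run(String modelDir) {
        // Point the light-weight runtime at an optimized (.nb) model folder.
        MobileConfig config = new MobileConfig();
        config.setModelDir(modelDir);
        config.setPowerMode(PowerMode.LITE_POWER_HIGH);
        config.setThreads(1);
        PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config);

        // Fill a 1x3x224x224 input with ones, as the image demos do.
        Tensor input = predictor.getInput(0);
        input.resize(new long[]{1, 3, 224, 224});
        float[] buffer = new float[3 * 224 * 224];
        Arrays.fill(buffer, 1f);
        input.setData(buffer);

        predictor.run();
        return predictor.getOutput(0).getFloatData();
    }
}
```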
diff --git a/lite/demo/java/android/PaddlePredictor/.gitignore b/lite/demo/java/android/PaddlePredictor/.gitignore
deleted file mode 100644
index 2b75303ac5..0000000000
--- a/lite/demo/java/android/PaddlePredictor/.gitignore
+++ /dev/null
@@ -1,13 +0,0 @@
-*.iml
-.gradle
-/local.properties
-/.idea/caches
-/.idea/libraries
-/.idea/modules.xml
-/.idea/workspace.xml
-/.idea/navEditor.xml
-/.idea/assetWizardSettings.xml
-.DS_Store
-/build
-/captures
-.externalNativeBuild
diff --git a/lite/demo/java/android/PaddlePredictor/app/.gitignore b/lite/demo/java/android/PaddlePredictor/app/.gitignore
deleted file mode 100644
index 796b96d1c4..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-/build
diff --git a/lite/demo/java/android/PaddlePredictor/app/build.gradle b/lite/demo/java/android/PaddlePredictor/app/build.gradle
deleted file mode 100644
index b86d2f8e3d..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/build.gradle
+++ /dev/null
@@ -1,28 +0,0 @@
-apply plugin: 'com.android.application'
-
-android {
-    compileSdkVersion 28
-    defaultConfig {
-        applicationId "com.baidu.paddle.lite"
-        minSdkVersion 23
-        targetSdkVersion 28
-        versionCode 1
-        versionName "1.0"
-        testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner"
-    }
-    buildTypes {
-        release {
-            minifyEnabled false
-            proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro'
-        }
-    }
-}
-
-dependencies {
-    implementation fileTree(dir: 'libs', include: ['*.jar'])
-    implementation 'com.android.support:appcompat-v7:28.0.0'
-    implementation 'com.android.support.constraint:constraint-layout:1.1.3'
-    testImplementation 'junit:junit:4.12'
-    androidTestImplementation 'com.android.support.test:runner:1.0.2'
-    androidTestImplementation 'com.android.support.test.espresso:espresso-core:3.0.2'
-}
diff --git a/lite/demo/java/android/PaddlePredictor/app/proguard-rules.pro b/lite/demo/java/android/PaddlePredictor/app/proguard-rules.pro
deleted file mode 100644
index f1b424510d..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/proguard-rules.pro
+++ /dev/null
@@ -1,21 +0,0 @@
-# Add project specific ProGuard rules here.
-# You can control the set of applied configuration files using the
-# proguardFiles setting in build.gradle.
-#
-# For more details, see
-#   http://developer.android.com/guide/developing/tools/proguard.html
-
-# If your project uses WebView with JS, uncomment the following
-# and specify the fully qualified class name to the JavaScript interface
-# class:
-#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
-#   public *;
-#}
-
-# Uncomment this to preserve the line number information for
-# debugging stack traces.
-#-keepattributes SourceFile,LineNumberTable
-
-# If you keep the line number information, uncomment this to
-# hide the original source file name.
-#-renamesourcefileattribute SourceFile
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/androidTest/java/com/baidu/paddle/lite/ExampleInstrumentedTest.java b/lite/demo/java/android/PaddlePredictor/app/src/androidTest/java/com/baidu/paddle/lite/ExampleInstrumentedTest.java
deleted file mode 100644
index ca40855be7..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/src/androidTest/java/com/baidu/paddle/lite/ExampleInstrumentedTest.java
+++ /dev/null
@@ -1,114 +0,0 @@
-package com.baidu.paddle.lite;
-
-import android.content.Context;
-import android.support.test.InstrumentationRegistry;
-import android.support.test.runner.AndroidJUnit4;
-
-import org.junit.Test;
-import org.junit.runner.RunWith;
-
-import java.util.ArrayList;
-
-import static org.junit.Assert.*;
-
-/**
- * Lite example instrumented test.
- */
-@RunWith(AndroidJUnit4.class)
-public class ExampleInstrumentedTest {
-    @Test
-    public void naiveModel_isCorrect() {
-        Context appContext = InstrumentationRegistry.getTargetContext();
-        ArrayList<Tensor> result = MainActivity.setInputAndRunNaiveModel("lite_naive_model", appContext);
-        Tensor output = result.get(0);
-        long[] shape = output.shape();
-        assertEquals(2, shape.length);
-        assertEquals(100L, shape[0]);
-        assertEquals(500L, shape[1]);
-
-        float[] outputBuffer = output.getFloatData();
-        assertEquals(50000, outputBuffer.length);
-        assertEquals(50.2132f, outputBuffer[0], 1e-4f);
-        assertEquals(-28.8729f, outputBuffer[1], 1e-4f);
-    }
-
-    @Test
-    public void inceptionV4Simple_isCorrect() {
-        Context appContext = InstrumentationRegistry.getTargetContext();
-        ArrayList<Tensor> result = MainActivity.setInputAndRunImageModel("inception_v4_simple", appContext);
-        float[] expected = {0.0011684548f, 0.0010390386f, 0.0011301535f, 0.0010133048f,
-                0.0010259597f, 0.0010982729f, 0.00093195855f, 0.0009141837f,
-                0.00096620916f, 0.00089982944f, 0.0010064574f, 0.0010474789f,
-                0.0009782845f, 0.0009230255f, 0.0010548076f, 0.0010974824f,
-                0.0010612885f, 0.00089107914f, 0.0010112736f, 0.00097655767f};
-        assertImageResult(expected, result);
-    }
-
-    @Test
-    public void mobilenetV1_isCorrect() {
-        Context appContext = InstrumentationRegistry.getTargetContext();
-        ArrayList<Tensor> result = MainActivity.setInputAndRunImageModel("mobilenet_v1", appContext);
-        float[] expected = {0.00019130898f, 9.467885e-05f, 0.00015971427f, 0.0003650665f,
-                0.00026431272f, 0.00060884043f, 0.0002107942f, 0.0015819625f,
-                0.0010323516f, 0.00010079765f, 0.00011006987f, 0.0017364529f,
-                0.0048292773f, 0.0013995157f, 0.0018453331f, 0.0002428986f,
-                0.00020211363f, 0.00013668182f, 0.0005855956f, 0.00025901722f};
-        assertImageResult(expected, result);
-    }
-
-    @Test
-    public void mobilenetV2Relu_isCorrect() {
-        Context appContext = InstrumentationRegistry.getTargetContext();
-        ArrayList<Tensor> result = MainActivity.setInputAndRunImageModel("mobilenet_v2_relu", appContext);
-        float[] expected = {0.00017082224f, 5.699624e-05f, 0.000260885f, 0.00016412718f,
-                0.00034818667f,
-                0.00015230637f, 0.00032959113f, 0.0014772735f,
-                0.0009059976f, 9.5378724e-05f, 5.386537e-05f, 0.0006427285f,
-                0.0070957416f, 0.0016094646f, 0.0018807327f, 0.00010506048f,
-                6.823785e-05f, 0.00012269315f, 0.0007806194f, 0.00022354358f};
-        assertImageResult(expected, result);
-    }
-
-    @Test
-    public void resnet50_isCorrect() {
-        Context appContext = InstrumentationRegistry.getTargetContext();
-        ArrayList<Tensor> result = MainActivity.setInputAndRunImageModel("resnet50", appContext);
-        float[] expected = {0.00024139918f, 0.00020566184f, 0.00022418296f, 0.00041731037f,
-                0.0005366107f, 0.00016948722f, 0.00028638865f, 0.0009257241f,
-                0.00072681636f, 8.531815e-05f, 0.0002129998f, 0.0021168243f,
-                0.006387163f, 0.0037145028f, 0.0012812682f, 0.00045948103f,
-                0.00013535398f, 0.0002483765f, 0.00076759676f, 0.0002773295f};
-        assertImageResult(expected, result);
-    }
-
-    public void assertImageResult(float[] expected, ArrayList<Tensor> result) {
-        assertEquals(2, result.size());
-        assertEquals(20, expected.length);
-
-        Tensor tensor = result.get(0);
-        Tensor tensor1 = result.get(1);
-        long[] shape = tensor.shape();
-        long[] shape1 = tensor1.shape();
-
-        assertEquals(2, shape.length);
-        assertEquals(2, shape1.length);
-
-        assertEquals(1L, shape[0]);
-        assertEquals(1L, shape1[0]);
-        assertEquals(1000L, shape[1]);
-        assertEquals(1000L, shape1[1]);
-
-        float[] output = tensor.getFloatData();
-        float[] output1 = tensor1.getFloatData();
-
-        assertEquals(1000, output.length);
-        assertEquals(1000, output1.length);
-        for (int i = 0; i < output.length; ++i) {
-            assertEquals(output[i], output1[i], 1e-6f);
-        }
-        int step = 50;
-        for (int i = 0; i < expected.length; ++i) {
-            assertEquals(output[i * step], expected[i], 1e-6f);
-        }
-    }
-}
-
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/AndroidManifest.xml b/lite/demo/java/android/PaddlePredictor/app/src/main/AndroidManifest.xml
deleted file mode 100644
index 240078a587..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/src/main/AndroidManifest.xml
+++ /dev/null
@@ -1,21 +0,0 @@
-[21 lines of manifest XML; markup lost in extraction]
\ No newline at end of file
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/assets/README.txt b/lite/demo/java/android/PaddlePredictor/app/src/main/assets/README.txt
deleted file mode 100644
index 14aace8f9b..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/src/main/assets/README.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-After building PaddleLite in your build folder, download and decompress the
-following models into this directory:
-
-inception_v4_simple_opt.nb http://paddle-inference-dist.bj.bcebos.com/inception_v4_simple_opt.nb.tar.gz
-lite_naive_model_opt.nb http://paddle-inference-dist.bj.bcebos.com/lite_naive_model_opt.nb.tar.gz
-mobilenet_v1_opt.nb http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1_opt.nb.tar.gz
-mobilenet_v2_relu_opt.nb http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2_relu_opt.nb.tar.gz
-resnet50_opt.nb http://paddle-inference-dist.bj.bcebos.com/resnet50_opt.nb.tar.gz
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/java/com/baidu/paddle/lite/MainActivity.java b/lite/demo/java/android/PaddlePredictor/app/src/main/java/com/baidu/paddle/lite/MainActivity.java
deleted file mode 100644
index e8eb01bd55..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/src/main/java/com/baidu/paddle/lite/MainActivity.java
+++ /dev/null
@@ -1,206 +0,0 @@
-package com.baidu.paddle.lite;
-
-import android.content.Context;
-import android.support.v7.app.AppCompatActivity;
-import android.os.Bundle;
-import android.widget.TextView;
-
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Date;
-
-public class MainActivity extends AppCompatActivity {
-
-    @Override
-    protected void onCreate(Bundle savedInstanceState) {
-        super.onCreate(savedInstanceState);
-        setContentView(R.layout.activity_main);
-
-        String textOutput = "";
-        Tensor output;
-        output = setInputAndRunNaiveModel("lite_naive_model_opt.nb", this);
-        textOutput += "lite_naive_model output: " + output.getFloatData()[0] + ", "
-                + output.getFloatData()[1] + "\n";
-        textOutput += "expected: 50.2132, -28.8729\n";
-
-        Date start = new Date();
-        output = setInputAndRunImageModel("inception_v4_simple_opt.nb", this);
-        Date end = new Date();
-        textOutput += "\ninception_v4_simple test: " + testInceptionV4Simple(output) + "\n";
-        textOutput += "time: " + (end.getTime() - start.getTime()) + " ms\n";
-
-        start = new Date();
-        output = setInputAndRunImageModel("resnet50_opt.nb", this);
-        end = new Date();
-        textOutput += "\nresnet50 test: " + testResnet50(output) + "\n";
-        textOutput += "time: " + (end.getTime() - start.getTime()) + " ms\n";
-
-        start = new Date();
-        output = setInputAndRunImageModel("mobilenet_v1_opt.nb", this);
-        end = new Date();
-        textOutput += "\nmobilenet_v1 test: " + testMobileNetV1(output) + "\n";
-        textOutput += "time: " + (end.getTime() - start.getTime()) + " ms\n";
-
-        start = new Date();
-        output = setInputAndRunImageModel("mobilenet_v2_relu_opt.nb", this);
-        end = new Date();
-        textOutput += "\nmobilenet_v2 test: " + testMobileNetV2Relu(output) + "\n";
-        textOutput += "time: " + (end.getTime() - start.getTime()) + " ms\n";
-
-        TextView textView = findViewById(R.id.text_view);
-        textView.setText(textOutput);
-    }
-
-    public static String copyFromAssetsToCache(String modelPath, Context context) {
-        String newPath = context.getCacheDir() + "/" + modelPath;
-        // String newPath = "/sdcard/" + modelPath;
-        File desDir = new File(newPath);
-
-        try {
-            if (!desDir.exists()) {
-                desDir.mkdir();
-            }
-            for (String fileName : context.getAssets().list(modelPath)) {
-                InputStream stream = context.getAssets().open(modelPath + "/" + fileName);
-                OutputStream output = new BufferedOutputStream(new FileOutputStream(newPath + "/" + fileName));
-
-                byte data[] = new byte[1024];
-                int count;
-
-                while ((count = stream.read(data)) != -1) {
-                    output.write(data, 0, count);
-                }
-
-                output.flush();
-                output.close();
-                stream.close();
-            }
-
-        } catch (Exception e) {
-            throw new RuntimeException(e);
-        }
-
-        return desDir.getPath();
-    }
-
-    public static Tensor runModel(String modelName, long[] dims, float[] inputBuffer, Context context) {
-        String modelPath = copyFromAssetsToCache(modelName, context);
-
-        MobileConfig config = new MobileConfig();
-        config.setModelDir(modelPath);
-        config.setPowerMode(PowerMode.LITE_POWER_HIGH);
-        config.setThreads(1);
-        PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config);
-
-        Tensor input = predictor.getInput(0);
-        input.resize(dims);
-        input.setData(inputBuffer);
-        predictor.run();
-
-        Tensor output = predictor.getOutput(0);
-
-        return output;
-    }
-
-
-    public static Tensor setInputAndRunNaiveModel(String modelName, Context context) {
-        long[] dims = {100, 100};
-        float[] inputBuffer = new float[10000];
-        for (int i = 0; i < 10000; ++i) {
-            inputBuffer[i] = i;
-        }
-        return runModel(modelName, dims, inputBuffer, context);
-    }
-
-    /**
-     * Input size is 3 * 224 * 224
-     *
-     * @param modelName name of the model folder under assets
-     * @param context   context used to load the model from assets
-     * @return the first output tensor of the model
-     */
-    public static Tensor setInputAndRunImageModel(String modelName, Context context) {
-        long[] dims = {1, 3, 224, 224};
-        int item_size = 3 * 224 * 224;
-        float[] inputBuffer = new float[item_size];
-        for (int i = 0; i < item_size; ++i) {
-            inputBuffer[i] = 1;
-        }
-        return runModel(modelName, dims, inputBuffer, context);
-    }
-
-    public boolean equalsNear(float a, float b, float delta) {
-        return a >= b - delta && a <= b + delta;
-    }
-
-    public boolean expectedResult(float[] expected, Tensor result) {
-        if (expected.length != 20) {
-            return false;
-        }
-
-        long[] shape = result.shape();
-
-        if (shape.length != 2) {
-            return false;
-        }
-
-        if (shape[0] != 1 || shape[1] != 1000) {
-            return false;
-        }
-
-        float[] output = result.getFloatData();
-
-        if (output.length != 1000) {
-            return false;
-        }
-
-        int step = 50;
-        for (int i = 0; i < expected.length; ++i) {
-            if (!equalsNear(output[i * step], expected[i], 1e-6f)) {
-                return false;
-            }
-        }
-
-        return true;
-    }
-
-    public boolean testInceptionV4Simple(Tensor output) {
-        float[] expected = {0.0011684548f, 0.0010390386f, 0.0011301535f, 0.0010133048f,
-                0.0010259597f, 0.0010982729f, 0.00093195855f, 0.0009141837f,
-                0.00096620916f, 0.00089982944f, 0.0010064574f, 0.0010474789f,
-                0.0009782845f, 0.0009230255f, 0.0010548076f, 0.0010974824f,
-                0.0010612885f, 0.00089107914f, 0.0010112736f, 0.00097655767f};
-        return expectedResult(expected, output);
-    }
-
-    public boolean testResnet50(Tensor output) {
-        float[] expected = {0.00024139918f, 0.00020566184f, 0.00022418296f, 0.00041731037f,
-                0.0005366107f, 0.00016948722f, 0.00028638865f, 0.0009257241f,
-                0.00072681636f, 8.531815e-05f, 0.0002129998f, 0.0021168243f,
-                0.006387163f, 0.0037145028f, 0.0012812682f, 0.00045948103f,
-                0.00013535398f, 0.0002483765f, 0.00076759676f, 0.0002773295f};
-        return expectedResult(expected, output);
-    }
-
-    public boolean testMobileNetV1(Tensor output) {
-        float[] expected = {0.00019130898f, 9.467885e-05f, 0.00015971427f, 0.0003650665f,
-                0.00026431272f, 0.00060884043f, 0.0002107942f, 0.0015819625f,
-                0.0010323516f, 0.00010079765f, 0.00011006987f, 0.0017364529f,
-                0.0048292773f, 0.0013995157f, 0.0018453331f, 0.0002428986f,
-                0.00020211363f, 0.00013668182f, 0.0005855956f, 0.00025901722f};
-        return expectedResult(expected, output);
-    }
-
-    public boolean testMobileNetV2Relu(Tensor output) {
-        float[] expected = {0.00017082224f, 5.699624e-05f, 0.000260885f, 0.00016412718f,
-                0.00034818667f, 0.00015230637f, 0.00032959113f, 0.0014772735f,
-                0.0009059976f, 9.5378724e-05f, 5.386537e-05f, 0.0006427285f,
-                0.0070957416f, 0.0016094646f, 0.0018807327f, 0.00010506048f,
-                6.823785e-05f, 0.00012269315f, 0.0007806194f, 0.00022354358f};
-        return expectedResult(expected, output);
-    }
-
-}
-
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable-v24/ic_launcher_foreground.xml b/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable-v24/ic_launcher_foreground.xml
deleted file mode 100644
index 1f6bb29060..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable-v24/ic_launcher_foreground.xml
+++ /dev/null
@@ -1,34 +0,0 @@
-[34 lines of vector-drawable XML; markup lost in extraction]
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable/ic_launcher_background.xml b/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable/ic_launcher_background.xml
deleted file mode 100644
index 0d025f9bf6..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable/ic_launcher_background.xml
+++ /dev/null
@@ -1,170 +0,0 @@
-[170 lines of vector-drawable XML; markup lost in extraction]
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/layout/activity_main.xml b/lite/demo/java/android/PaddlePredictor/app/src/main/res/layout/activity_main.xml
deleted file mode 100644
index 0d1e60b97e..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/src/main/res/layout/activity_main.xml
+++ /dev/null
@@ -1,19 +0,0 @@
-[19 lines of layout XML; markup lost in extraction]
\ No newline at end of file
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml b/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml
deleted file mode 100644
index eca70cfe52..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml
+++ /dev/null
@@ -1,5 +0,0 @@
-[5 lines of adaptive-icon XML; markup lost in extraction]
\ No newline at end of file
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml b/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml
deleted file mode 100644
index eca70cfe52..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml
+++ /dev/null
@@ -1,5 +0,0 @@
-[5 lines of adaptive-icon XML; markup lost in extraction]
\ No newline at end of file
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-hdpi/ic_launcher.png b/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-hdpi/ic_launcher.png
deleted file mode 100644
index 898f3ed59ac9f3248734a00e5902736c9367d455..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2963
[binary PNG data omitted]
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-hdpi/ic_launcher_round.png b/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-hdpi/ic_launcher_round.png
deleted file mode 100644
index dffca3601eba7bf5f409bdd520820e2eb5122c75..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4905
[binary PNG data omitted]
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xhdpi/ic_launcher_round.png b/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xhdpi/ic_launcher_round.png
deleted file mode 100644
index 14ed0af35023e4f1901cf03487b6c524257b8483..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6895
[binary PNG data omitted]
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxhdpi/ic_launcher.png b/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxhdpi/ic_launcher.png
deleted file mode 100644
index b0907cac3bfd8fbfdc46e1108247f0a1055387ec..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6387
[binary PNG data omitted]
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.png b/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.png
deleted file mode 100644
index d8ae03154975f397f8ed1b84f2d4bf9783ecfa26..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 10413
[binary PNG data omitted]
zplw$Td3)1=B;S71raVS|C4XCE+i!)Y)YsxC zwr{1D2jEFPc?7RGyqCV#udVzd$BRCC0H?lu6o-;y!s{o=UxTz0REZZH+>J9|JAt3s zzmvYE+Eq#889~}zMJ*4&lX>bSjy`sXzE)_;9zIn!*Yltns(4batkeI%Q%T*?_v-l- zwzrm3eQo2^eRVjbFzZgQkn!Qr)?Qv-9>(^*n!7QC+Pie_+=cw@9hkfB2xJx-vh}yA zTVn@TmEvJ#1=R8YJWubbp>9m4%JS)VG&LMlUV!KB-HunhxDSsc$As6z%h&U3vo;k{ zO$HcWI*2C`VCj2X3Q12&RYlshwMk%k0G`!-Fx?$J^uSaSsW%wXr8mn$ z;~AVgF)0R8iD^b{(GvruXp?%J)1xrGDF!ki=FyCE)MFsSVjfM6Au&)Wu}Bi=^k|QH z6l$achszhr(CFcFXd8EPGdXzH1jvCdyxFM(++21qTCwm28srMxgw9+m)jJWN4erJ$ zfHVLZMJ&MMe#UxB{gzxExlj?R><7D^?>gd zIsvP#Th0rRf$)HO7NyhMYMKBt93Bp!1R5YW1IR#lv;!2+Z+#M@Fq;1OKH8?<-rZ>% zn<;qKH8R~3_2@bhB`p7*PXFr}owme&VS;Ayb&TsY1IP$?02pEJib{@y9PbYJ9-F0^9DWM#x0cd9E8d{Nhwu7<=K>8+N^$ZNE0c0dR zf&mgRx77?FBjITdP&~i&$sz#7EWzl}kQ~~U7Pda>u@Fr0w?{q5-~J?^euK+yOKh+@ zK-wS@FtV&4AYl`uO#r1C4No(GOn|2epc(>Df)>{$ZJ_HW%?-am+He4COHWJ0KH7U^ zJ}zBh%m57^@+5I(e{q>?{I1NR0BKHp2%Oha0+beGG(36%GGJC+2~b6`N$@BEs@DQg zX1pBgOSE*}Efmy$I&DJ>^}KXhp?36ES5Hqr^0%LO&a^z*cv>b}Ee=pNt0)6z*0lp< zSV{&gYQPJSfhidrK-D||#TlBCfycn$tyX}D>xy2C#ZNx60osnWp*w3+F|xu#VTHJL zgq)pW3H*WRxp}YA%HipiSp^_NAR?fQ+R6uz;rTqg02z_b!w-<*@IW1C1t<%~d{$u5 ztf~K`ZN{~oH)~6)SfAzrbq8wx0#N79V@ObTnO>*{L{8A*)}e#1H3DaS0kwz1l{q{-VIh)6$u;94s{*9U z5~XMZ$oNb`HGoXWBy0kx#3Xo{0hGz&9?~NdEngrPj~y9BU6+T4KW#fJ1kU3zQ!wON-a=10NQ87wwb%6LRQHnNzVok~O}hUVsF`(;T3r*TuC}N0kXv5o)1FlPiM+Bqt}hut8}4Q~S}Hl}cCEA^@pEl%fTo9TnOE z5;!qR0U`~r9Ux&7qZFX$wE$!QJWT-AasYwrihB-=rayj^whh-tom(<6q$B9d zZUq^P7R@|EduBNavK9kK0a0o+4?xA*0Wx4#9hQ{S4v_F!bx8Vx+?{3s83>O8AUKu; z7R5-2!lIdB=SZ6jp>5M1b)#+7g073t3W?bexF?D1dr=>Y&`=aP=RG=KRF>NSOQy95 zK)et|<53k_05UKoLpwl*rDX5|WCT1=*3s1jpuM#X5*RF;GwnaH88>Ycu5CP3rYl6q zMjop1khimkM{gLVb|XErK`9BJ!`9JjPoHdbLU(bm z;eEj(uqd?P&>oz1`XpVG5SEpLMGg41O+(c*@m(RvVTLqR$Rvb$EPmC{;Fw=5eU(@q zfM-E*{{K4m?)@;dfs>DWA9{;2*ESMcghxGlkqgj#6g@N7fPjz(bJITSk)MJkc}X&3 zx1n||Scj*RSZZ`#x$)as6IUTgi=&nY;DLm932`IpiqozPb@`WM;c2AddJtCz%c<}x zlTT7LK>|GFFhd$DOoH+&LAOZEBO#raL9xrfVDKn#VxV-BG6@wi5acWy8uM^nb<*3C zF2kbP(>^3_>j4H&AJ*e?wdPcXIU#bR%Y(SN^(B7;+qG*q9Lts!hUfDDKvSRB0+0c->J*@QZ2-mV0!U8Bd1526=;cl}bkQ8tzni+Ng#wO^Uu3(L_tPcUJ2^F{|sY8r}6)1CKU{y0Ag40i>Wq#8V$DMynRd zXk`mr#M7(*DR#7h*J;LQ680?4Yz~kS`8@mp>4Aq_pJ?eknRs%@Ca6=I+r!mym(~ss zA4IM+m~%${$kj2BJP&es;J(Eua`v~}s5PX5=yquq0SGoEfnRZ&amirK05UQetT{mO z+VYs?G@CFn3XA4Hby++zco~HU>eLzaW&yLSEe#Z!GbVCj-N~NF)fFHbEb;NWAI%Ow z1wNeH15|rvqs0JH3^oD)2Bu^v0V+y2DU+}Xpi&+1NE_($Rg19bsnD~MPM#C!sK1x% zAX=wf-MX~Km`A83YRASRU?Q&vfoLGi&p=!xesa=!(en8>x#^F@M!Hf~mK6a~LS$G< zhHij_&#Ef{sw!;`4kW-spbWV@OXl1ZKNeC#V@a6X;(mxdSet;y4)0u*1N9VQ6mnIhyQEZyBO%Gb%x{I6!oXH>p9h>Ks5dJOCM%k^un0ed6UHP%Pb8m@^LR*1I5nOkq_hdUc^+S%FHIjIFJs_SQx=R!_ z{|}V3f?1%o4b%2-m&4)?76nK(Cekx8+8iL`lEGk!m8tc$a$f-|$Uu0~PAo}G2sF?{mwdqxbK&cGQ$%gni}UaT%W z>{iFH*vN(TF1pf6baWg*dmhXpN!;AVi65PqEqZ491+;wOpOAS+8#RZ)#91aeU3opr zM1U0TES(RaEFAz5U^3zeEO9c{qvEDbq@;7OZ2q63IpG(?4?U1W%5uNL;yAjv45nq} z!0F2Bz~yd^b&Rz}5@xDhSt1nNKIG>}ewB_*u5Bn$utQM)S>h>^Dn$#P{*b_Qi}v2A zWlB&7DvMeu3e}jpavVlt4oQvyTVrcNloqGbjn8N#ujME$ULBYWcGoQFO`)jyw?y-1 zd?*fmxYA*8|JiWuY&?g$Do4)Z__4Bjv$8v>bkFVZm;oftBGK_9@@pl%lXjej!A!LC zh#}9ohCi{{ZQ-mp-B&KY>P}({57N+{xyjh8FctPfr+T!$Mn30oz09XHQwIB^dljb1 z$^SVOsXW(wZ+)uVGjE;TvtW(PvtX@k@RmZ^+(Uch12(V6o&_nG{11DO9u@4h`w=yp@yLR7+-F_P_1>{dzv%Vc z{4?EWO|R#D_cC>41Q@6rEpfZPY}Qsw(iu+VtM zk?VfLxt-`8D*o)6RH0G0sdlU^c5qq%Bu%TN3R6ec{q<$PcmS#o?ctDy1vk>p({m{8 zE>kOk6c$U>a;ZxBKlm)ODnpQ`%TPxJEO2ZmdS9GBJEt$ZhK?H0Xj&UPI5rAX2R88L z$%0cK7N~Y(7NHkw?B3M1K;whO01!A0WE#NW=*IvFVBhg)$LPV1*_EBco1N2*U4tE( zRtl2?YqWMOIBn0yR9sp7qyVcUb1gnBpzXq7P*oT9KOgqljw+zIvtzojb2zbcN;KS) z9hz1SlqysTupC)~JF~`b&#VTY6#sW--*Hp{MHLo1Fn0-5nsA9VKvNapXEcv<*FF9Z XdJ+W}DiIkV00000NkvXXu0mjfKBlg6 diff 
--git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png b/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png deleted file mode 100644 index 2c18de9e66108411737e910f5c1972476f03ddbf..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9128 zcmb`NcT^K!5btji2)!5SAPPuNq)Ls56s4*38hVo^(nUfO6%ZAH(6N9hNR=iCp@USV zNUs_|I-wKc#ou}5-}laWIcKxU$(_yIot@8o_s%{sGSH@@=As4w(CO-E-X`sF|29fE z>HYT9T?zm$_~>e0H4dIw&!!4C9vSZxNlr9*d^_s#H!1R~WS_6MVYz@X@%G!e zXHz-tb|VivQj`iFZDUWNj>i`*9rwT8VC9f`)ww2)D0tG&WBFX^J|oMigqUy#_eV)Q z<3?;pz6pkr(;Z)thNWZ3Tu^XIU(m2~K2{iFEAS`~Gy5VW_tC>i*Cl0kv`b9xtW+!e zPD_a1*)E4YGCWy+8(ZVrP7}Y9URLg*>8E8fyY^0u;VQCkoBQJ<_5zdXl(d!zb~b;b z)6|dkG)>oK`*erN6Q98nTc z*T4b)onLqyA@?UYxy_MYQjd+D&|e(Pm(0oT&BjWQ4@?kFIoB**?M#(;rSUW9SnG<- zSt-|WaL6iG_P3uZd9eIpr{TtNWC*$Hh2Qz?uBS}bIbRfO#e{zRE!IEy&YexD%F}@N zL-y@k#YdI*GK@^S9Mw$gu9^2z1mSnEkrdxz+MPN|ZNhhS)_oYvhM)cLTYGn3J-&{3 z*gO%dE$+F=!pgEJp;TQOxUvmXY0MZXd)l&aIQ@q%&TOO4FwrA~ak$>;=zXV4zzr%` z=0~OcyNxrVAu`L~2ctf1)jOUXrl5QhI{u_3cR4;2>t?n_c`o(TMz?xA14+Wh$Va%BY0&2$WKO9mM2sYf3h-OCY*=ZOJ$Ngw)1D_iorRZXHQZi4&2K7qT927nQC0Lrg3 z(#lL522bDvLQQ|!4#s}u&v;Yf6v=QytSm1*VR`JzNHPFHGlJ!`WMgHC3lNnE^`=*0 zy?^9tJWsJlLSn+d=%5(DNQYCcv%)omexK}hyZmUHWQF=7JRFKXB_b-*?UD4{x!=dVwazRjll3YN!e1GQ6{ViI{ zhkd)N+MWKT`q_V0)j;tA_oAca{;nI(Y$Pb7t7Zgb7)DUREOEf@igE4Q;TqcgkX-wd zJ;8G+7!?>DALr#bk)GNchOvQs{BBN~iU1F0&RMR&ou$CHl>C|ZrZ@PkAenI@K>Al% zQ7|N8uxRTq4vM*lnm?oa%}HLn-3G$yJC_b75?=65k%LM)%(H@{N`65=i4pdO>Mz+= zLeav25B?f086=X6O6;%!2@%ZP1|;Nvbnj_2aSc+8ZOx$k{x3Drh^ zc*UWh!@lFm$>1}Uo>u2rUqXSar;=W-2Mqo41Pl(rQD;>HWC;@e#W@Z29HUt(caNqC zC&6BqG(7E8;B^rX*m6|Ejm>-6L>RWQs{?%J*!{N&Cn3FMX$DmBS8~(Emio*Dj(^J_ zk~mE@d*561epZk|Er>78iC#q_4Sp0Y3GD6B@JKKrmyoJG4WGBh)HqTZZw>kH>(OJH zlp#iE)N?g*Z@4^*MV+s+H!!1LJlIN*`JxC#o-v0{2|BS}}kDUMqX8%d%;Zo1pF*{G_rVrzNd`M2ya!T0DJTesuRVwL9u7n&PS ze_~l@1G?`(riUCq#<3T)^gi`sw~pk^JSP})C#_iBKTD*{^N7d0$A0wJ3#IRYe;0q4 zA*$YJb_LE1lo-`!M^fB~U00SLiLywh>%-_CXgSb{ju=7v+FzB+78O;y>TeZvRv&RoWxTLP?d+9Zi&Ypua2+{3 z?&P=TOQKt{%~L~p0$j8^;iia9j_>fKovkcwq%sUQ@nh>Z!)%cfJ0$;z4CPrz6I0OU z@+^ZT$qbq`@V*LyaM7l>CZ1ZQo!IplAN5a81(Tt~ztAbYc(d{@u2@?f2YdnGcoX!#60Ixw-Nvix#$k1X*NJg)beTLqL8^6*<{2f@@ns|Q}RjZ!$JIHK8NbS8xrmu#@ z6ulfiVr7xxNb~dV#acSrSX_pQm;bUeyjdV!{OZy#M4(A` zwu81?V`O!?oZ`D{REMi+x!1hB*6Cy(I?k8T%kET=uKQWo39E}=ca$my=uHTEyP8y z54Nz1YH*)(w%#ztIo^C*PQOjte`Hel~gpFN_jZaXoFZnUzuu<)94E6T<5ZU?s4>c zpU3Uo@d?+!hgYmVil!6X(ly;KNm*OwbI8{z3v|%I_4HT>Nt&7^q0@@SPXaA`iAvAR zSr*v1muELwpeL3wqu$P7L5q4m)-N%|J6fE`4!V+xyrOkr+X2!LT$k#tFYksHJH=n z3F!I2Qe4B5pnFmAer;+($yQcgD*uHlDurPx@2dd)1-RjhQe(5`*~SLS`q|S9v+`3~ zQ>IMi+hcTX^%}_YWT=}koWlGSwSH~mOvRNJ&Sfrc>H__ux(6*kTUubhdoQN>V2}J< zR)ymBx4g=I%zlp1J+QjI7joltSLskIt}qG%d@lfB@0(d>+A&l+Glwv&La86NxDmfT zNv>`p7eT?@iBSF8R6M^wCx1D;HRt!F#6s8>2mF;&B-MF;2m~@G4CaiZ!p=4aG-$V0 zYR+PtSNvY$YwW0OPYxL-i+8&!G0&s(?(IcQ&Iv2 z0Nx*-7_~pZT6#2L-so8nF7QMgH5}#22w+dCGMyllm->HAO8q%eYuJ_BHB7343cyG+ zgo9$W05T7{CPl`Zw^P=q+#rx_`T2%M zMCeCJLfZT%fI{csusPnQ7Xv@XSzVNmPU{iX2w134>~=VfgQ82*rq^p^97wA647vgT`a# z85e!NpbSl#8uA*dnopv4RMby4F4MY{UFn^r{Li3l%Ume;QtBh5?8wCixw0*zSQ${* z6)@M`djm|Nz;H2K_j1ACvx90`pqKN#`9b8Cd=@J|$6R{ZYc5yw){(D1GtABWH=Zy` z-HxQuV(8LOB`UjI4iAOJ34LY@KVEmPb@XIC)FfA6m5B&*8T*hQyR{mweAL1#*kA9n z;O}eZUE%DcD;yjrQM!F!8~hPzPrCH2Fvr-ItjJE$$pV*gv9>ye(q2lsB=uQP$h%X% zlekK6q~fP4niGy&O9mR~_I;)G@;?e;L8#rja{}{3_rR(d$+fAsX?PiFx`2ashkOGP zw9A><#);kE3G}H}!W&WxH1$sg*P@*n!{=#L{PK)y~GHI;RsgpA$#8cpY~ zct*9kjG$l!k{*0T43n={dVV!idt6Zw;lPW%!2K;#E>?J>D|V%r^A`&*)MdYZJT>jL 
z*;x5TTDFevc8OARtqyN`Wyt;0MTTO-DDG|wtNxUqM1$~ye0&&wUtZ&eqI0=0|Y{WT*|Ia1An)J!bjzf9y3P874R^|FamuD zD47YqkS6Zsd3^fEq_zq1i3zN7fM#ldxb7Z@0Y;<&n|qFI`e8q;TO3t$s`geh?U*oK zp&F$0CKJFD-a%BYO^4KA!5J4T1f9rK@Izkpt4qui#^S_s8AE_pvL7$dKQ z*TXfMJYx+MCq$g?pCj@15ZQdjbAm~v`@A?MCg`$$;e!iKvcv423 z^QOF{_mgOGh3-cDZ={Gyr z_&&UYqVw>f(5K`SHp~Mm5XB0N9$~=XOXd$uQNj=bO95ChnZX9K@n&#T?vXPDfqt07xJZVvBuujM>H*4hP6HvbJ~#$K=z-vNQnRCryVz5?3YqR02@1#K{#%aX?h4VQ45b zcmM<+1V?|eCnx}P7(IWh<1mpP1d4*Z4r1WAfB;C4dhrfKPC^**Pz;nD$YOJ0I9i3T zdQ`v*UjtnCM$WL`J8L<$;~1_X+Oyzj(IKG(tLOn!YS8Vny{ z@>lc1XCA-~hhrD7h1@0O)T))gw+GcvsVwxcnaCv{EQzu|qcwKGyiwb`TTP(}njGXHh$KxOryTWq$B1F6I8!hh2O<$rL^FOXZoKME=~3M&0eN93bd- zfpL<(mU)+asMc@#Mvb?Ws^Rw;E;iny$Mb$bu)1ovt0lOm4f(~cAmY<65o0ePN*$EX zrmHUhGI1J_t=@d`{#mmFd?eV^Q&jw>g^;Pf)7JHdLzQB*87{77?Kto0xMvGjC=&M5EOW+c zXpXOY6|Uf)0am19ZLde+hX5J6c11*#mSinvk^A4NWc#m5P)?v~|Bppv*0~T;-^rI9{w3{`~5)bC}`nF?zGx z#@S`#(Q@kl-1Fmze)A@u^#@9=c>MA>$*eslP^G`Zvb5N|sKK{mQ*V?4eX_x+nT?*N zalRRl;P=w1HG57g+d^AJQCZh4&g{?mbJZuj*>jJpGL#!`*C>{MRd4-HML#+BNUG#EHx5`rs8QUMda13u9eMG(lKCYTHCS2gO0L&PIU zkkI-^jv5$aR|blKRsJ6xJ^?au7%A7>eD6+l!ALkEL&*RPl442Nll#UeUv)cn5=YV~ zP)$eQ=SZYMG+hSAy@o*c95}KXP7(~*M%`ovFuZos#RM5t0XkRn?DdjD!7zh+HMGoz6C^Gk*}xdzg{VaE0-2L4An_I# z_)DVjA|u=a+{fkuUkWg+!HA~@f87&ENbQ{u_}}LPin9T}}BZ5K1W#~XT5z0gcc+cy7@$?+tH6Ta*1qVBL@ zBwd%m=LAwRv8~~Cx3MfLmwax@N%=M`ciGYizcDPi#Qug{`#^)V(iZGpR*3ayNFiWv zCT;%Yg?Tn;SO3Pvyu6Dolgt$Pq@8;O(nD{uHM<__6!t9UUP@K#N73GQB){T~9Hpci z<4P6T>Kb;ktBMTne4`e~@)E&sIdENQj5G9OYu`7~bvsRTeRl1z?i^aI{)?VNlekCC zXJKVy+B;Z0|Abe1cpfcW)93y`*4%NW#+1!-OVtut{#3Q5fvBQ-b<*gu4x4f6pmz-x)Q8wc+4G^!kGq??b_{28Zdu9+dS0=wgR`1Va^@f*j96v zE?=;Q{AtjKXi>F3-EkrPfL<`s@S z(Cl$t|NBt^_k;7j{U(%~9iLt{7g5yFfhq?^mE$`_Z>W$9l{seeXUdzmz8$X$3_fz0 zNc_d*naeGkU7&S83}C%)Owd-QTjWCq)4F3puS?Y*tOH3*JX`9t7=HyB%;}BFw)~fX zP3M8Ef?E#|5Tf;EuVktd)#&vh7trJcyxkI{{O|eok{tE^hzi3_4LW$*rN)J?Qmy@$ z@GmJ)5nOLC0(h_C(Ayd(aO3hP5pxuMsRZfvoFgBCNNrsu!(1gLl_W1XDWi)1KiM4& z4TFIN4Z44?71-@F^TGn<^DjNF#jfDTD;qdJ36mB3{oK$>kk1T9x32)H^4{v<&J$?GFZQeeKn zog^e?9JHCkaVAg{99*Xytpn)yWZ-y+!;hT(I=Fwaat_Fckc87LJ*r7!)y;@7k^fUK zxl{eySNWG_U%a8X+L`q+Pwk<%iyJN!iw;Q%=1>$p(4~A8CwtPS13^pt$BA_79TEm3 z!hx@gB4KmstaCTszUdc8*ch3y0f@{;*awP0cxYg(J0u?XLQsFzBA;#(`vHd`I*lBM z;(99!j{626=)R8+$DgEz-MfuzaGI&_b*%9#-BUQaw^>IHgp<=gob@UA0r`@#>-qw0 zpfFP4HZ?#}t^J2jFG?J|6<^ALo3?t>Oz5`IuInteCESw+$NTFo3L77A?}>NbqA$vz z-v81kRTwtLT8^1Hkf#X&iRsn`fKmr-Mu&N{*qwp;$qBXyT}BAQ@L;wB^UWEXX)3_b zh&*ke8czIhFd!IxCi_N!jnrKGIQpfPR2xJo1%*JNF^PvDwB;>G~7@ zQVZ23Q}9_P0C|)?QPY(DS0!&Y!!b^`S|XCy zKNy*Kil!;HIXgI}+mn{ko*V0S7_|JPJm`{p{nOe9Vi^>B;a*toh zNY>_;v-=$AgIA44ebwp@a!75wJN7K9j;+SW z8uoQjVUb03=55d=@#Y_9`Fs=Ut|9xs?0ce>@0mn&q+oSJdb^!tTO8;mb$%l));(4- zKPebA@3lPn z@G1otTd9DCo-AAllf-ruy4anJn=H{RXLG>6j;g|@m(&__Lzek=U-sRZzRO1lOrtOJ zm+5k9slTfFKsku7%a$T6ENphjA3uy9eG=kh6ii90n}D&mc!E$-XY)ycsx6qljq9PY zpDzzbG!`4}xmvrE+7f*Jx351b!!}L5XmvDjt;&0$*g9U$nbVZwscA2!5>S?vG~K*d zPzXIIrnkt|yfEO5^dk>cVc0*&Hh$%zYA8nPL(Hwwk?vVuZpJ+&#LxCsujZ^dalGUq zk8X*2y(traI^+1KZEu-(_j%t<)w?tI>hVd#CUfisw!-|mSM{#>X=67C83>oRW^)Nc z_@hYvV5!q}p#c+`qTV9*kqk5GkA6Z;&)MXHw7m;gzS)ito45k#Ejt_oX>5cfTLfXUX@_N^+#UicK@ zbUwcCAj!Nyi??H{sraN8NiTB?aleSuG-iy_c^*{zg2xn*m1e+7rBnP~o!PuP9z$Gcf(C!4f_G&|`v9JI zHr460gE4qwW4yYiYMyx4c#(d_<1JDCcBZLe=D9DE4fC#q8)2D2Dpnaszf0h1)i*7) zxyKd8y*&dyiKySsH2Uj5(~gfdkoWmaI$)6ycN3CquawfZ+R8$$x+k;L>%Fd*;XYy0 zkq~3{maC~f(~h3ZUsXWo-EodvK!+KO{DW8g|IOnpPq%l@9Ky`Dd0%sz0@6$Ox`Aei I20H400LcNok^lez diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png 
b/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png deleted file mode 100644 index beed3cdd2c32af5114a7dc70b9ef5b698eb8797e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 15132 zcmZvDWmr_-8||54h>`B@4yC)hOQZ#cM!EzfhmdZRPLWXQlpaz*O1gvrk&^D_^84TW z@jlOq4`=WFp4extwb#3MjEilFPELs0YL1Js)Fn* zzr}qsbfZ_wbNOa4S@vf>;bE~>+%RD!>v%IFV#WTd^7(B=#T|Xno7mV6xS4f=u6692 zQq~7{i;;}Y46D{(Y+R?~SpnS3W=+e#JKDJX-SSUi>9(#}mwE5Tv-r0dn5ZY||9_k1 zWM~Q&Gt=O&6oAqZ3T;9&9$g)JWBOFs0NWF6vYJZJ24_?zn}`jXIHjr$^?F69z!2p< zy%t?XyTRP;!zMXPY^&6kR$$J?UW%?3bCC4XDqr@?ukqAzCEf6lUi%~QE1bZLYf8h# zNIFjy{z&gk+iBasaZQZklPN%Bhl~H-pewWJX`t_4w;I)?=gcrEWq1%u$-pwhg=Fn& zj3nJfbY`j%G4F^8@$CZRg?Lweh*w;b>{2YdOIAi*x9?W^yUNovn|q?NJ#6TPeU_fVowC-#v9#b~gYH6zAw5m28>MUeJ4Tj* znIVgljj#XhW$ zhiz?z_2X4xbgPrk6@%1I-IDPigjXj6D_rk=N!MHKhrgxgN|sX9wAG{r8mKBc5uYx! zD6;oWKPFPVaeKY+;_tfGk8dnA3*mxhD6c6ylsqfXvWFU-T3PF_*(Y_!aR4ycp@UiK zL{0B(1-*H{F=ezF{RJj(g)4PzJx50@A1Bg2>XU|TM&*KjHze0G!vbN}?9#L0`)Mh& zSDg1vm!sTu701b=n&--{Q{n2DpuDb{%No!D^gwg^bAW&J!~L20v4&-T0QrdY*80B?ozklkW% z0rk7=VB9&#oB_RdT&RhUD^ z<%mehua9i+?=)hn7$VmdJdx(xObB8b; zd)9+r z`yz+r{dSM5hDz=4ys1#(+WoWqC+KtBRNG8x2R zkNK+s#C-E*)s>kZCpyIRfB`}hQ6FwUXyKlgYs)!v{kjY>{yEe5^Qr5JEe^d*zcU@; zK#oE%1w&_PZ%A@P#G}S>`1qbU0tkHPO<2-5_Uhe0Y6$FovD9c;Ov~qVD?l$$zpcmn z8BGk}4~3UeEkzOUc<9FqtY1TqoY%qGS&?kSM=O3g}NY85}H(VQS~6J6eJsX=%$ zf%etV-q-i9X(#Qm$6xDNs6>@0-*1b4*6TC?1v|R@FkpbQLy%N<#0-I&1swvEMn?Y( zQKWmqz2#a=uq>R|^cdhnkaB3z*DB@@Q=Jpj%9EBXLuo{WDl~W0E}qH^aARnpD#`Dn zAO=+iepMRRSE1j%9nTDc{=3ACQK(De^37Zvsl54F9`aO8G+M-hmV$3r9l|3HavVov z=cO%-IOVsvo}L%}Jm> zX9gR60KV3P&h$KA;XH%c12K@uFzJy5i9S6?U7BKXLk4&WhD>E$HbfP_Ojp5OF9rfm zT$`)n#dWaGB<22Cl)AZ@Gv7i0;!*>IUJv7##H1X4+Wx!Jki<;jka&jGH6W2$nzJ4> z6yD|%yOMzcBZj~}DSWA5Qj5Q$P>edSrrCzs=X;k&irN=Q9KBAfO4RZ>klxjm*H%`2m5c(y7Pw zcP@DyYA!WftG!MB6T>V!I>_ym+&LEFyikRHI`-j@U5hGl(;JWZbO|orN^1|6{D4+0 z>5k@1pQ`!&UM0WB;(#4ds`}Zu6)B_YebI)X)jZRhJn}_frc0jF4SFi~JHS=t;knPP z&yEu(+8%qK>YIlcGahTfF6Ze^7edgT$J`6#2qm|n26OTFDY|d8s~3hl zpLtuXp@mq2GW8<6|E)D{#yU2)#iuPY!=|5Hmo-<*yo(QYr$3HQqx#%vtHjS|I7NiRxC6lDQq< zTXIalFx_Ncd(TZ(!iRaFymyh~tc4h-VJo_vaMKP(y_b-@V9j{@6aA&=*?g2r3#HBa z-Q(IP$--;P*a%%PO{^%D$`G{5nl&>sUgEN|s^PG}Jh>ISvD%;O|psp}p`-pKAK?pbIHTV?a9?u}(q*GCDRrVm> z0lC9`wd;C96R!Yg%?DnK2`W*_@jf%9IPnwdr@BgGxWS)z)J>cDasy)mt3Y7)p=txP zM)#~H^+!85n&7b%$l{U`iUrdD?1+BT#+yClM)OQek##8!6GFE0paMGl~ znJT5wR_VzqeBv^?U47rJ0!hXwG=8QSN^}EyUNDp2J?(D#FGFgCo^@;lRCMe2zczB^ zM%9XHn3ccHp;wqZ^Uy8mD<>D6R1W$5gqQ>%@AfWuiX0~?SIt2=9&6BS)f-v(V+-C6 zBfbm+ypV$sk2v=A1#JUeO~Sbved*o%-1Huvn%MCF?%m%fP5;xCPP|-(b1@laO;e4- zd6?k_0KN;j`6NXEVgi#X0MXBw38O@O`lZ=y4(f@Vx@QT9*Vpgk{{$@lzYwyh%?NrN zGtU^kn)F6?fKBPA{djTaw^L#(7F&HK0b>+C#os)3 zXBq#MC^QE6lzK^4733pD>UE36G;-{`GpU&0a|`(V-vTwp@G~>2EL6F$*&3YMPp-<3 z$pGu8`_-xR9b-}m{9;+irLXejrTbK_!ep%zGnh;U{^iGo^_=F2)RW>Gnr99OXB*dm zfO+ugGg0L-0>cKR_lG&~a#|_x2{kD1`&ncdCyi6M^Lm931EU`O+-XCCFYRAnjs5f6 zUa^V+z|fk5UB$rN`lRE$u7^I~$Cjw-;Cp6f)HA(2LU;};f)pd4T8-D?I2up+3G(m$&;vg0~+JOD};L`gqqk*eJg+xpbq{T}SE4${0xj>in~=ldQi1rE&?>CiYw2 z#vg0Xtv2hPZfP@t{cR}nkn`imMzN%Ni-Y?Fuhn*~A(k1`mx6vQI)vLRy&;WKU0n}B z@ZJ|)Fn=>TPu!<>B>2~#eYSLuW5D_)A)V?!{Y4XguE!i#eiyl1d{uE|RTBFea zM(g%RB^85qT#!n$qYwxcyR1CEXmt{nlJiLD0Zs8{OI%+d`MxVXSwT?e&2t6`t3 za4o!LrCv}!1now|E(qC6Hf>E@-0qF^3NbW7_qjxU<9CDT$8j)VXDt{8H;2Pzmw@Nb zJ}1NB7;d^GlLw5^EU`sTe0n9Pg~GmQIXwnxEAeh@zS%X#f?&FG!fvUXW1I^%m4Huq zFb9-|D>sEz%pg}Dy}4S#5$%jBg@1FfhQKlNSk?MlP{oDv8s=i*#C%7KTfKRpT((!vAA*0?h5%4doY~|3yq_DA32&6T2RHbNq-AItD)b&W z5)Ng>T|a!hlRxqb6(lwy3n#TR>Q{5$zoTQ(7Yp23btrx0L6lb;lMIld_ZsBm;X65W zhL~-DK~O*?iR1lG`e>ZDti=^0@Hu{22rk-ri$|Mhlfjx 
zz}x1wtNp{S65T4sftJev1F_{RMAe{B#a1+VB3lE#HN&bH7Rc8 z9d*c27p;2oA4ZYZSk)abazBuwEu8=L?5J?TG~{R3V8o868I?F z#Lt>o_|ohZd7psYl9Vtz6-np(@R&^Q6yKF@# zKK_Phwv=G^eE6%t(B0N4(**az{Z$|8Nab8SLz)m@0bPk@Wo;!3I&BJu}Fl z{}e^!Iy||DQ~DlD9=@%{OB>I8fpV4ZTC})4v8^-k&+wR4`hMI|wtCe3@xtk*M_gV& zT7}a{1ERd3c8RiWPPBvInQ4k+GPxSExF}CJt9v>(EoD>AsA|3ioYaprn4PVQ}7|zFbK2=iyU{SL8K#I2+N-*;IUC zGNwTD;XDPHkYcjzxc(jT?|J#?A9c3l*&Jc_`dkI4Rs7QC{PM6ty6TzkxCMvgm=@WZ zf59SoAflkydVV7?TYoT5`U(N`-HxGa2z_V)YRIz`HRRE3`12J1-lEtmojvMCPtH+1 z)V=IiqG9TR@`K%FOk2#6!1{1OD;*%xRAYo%)EDc|<)I;%EXi}?^()_B6K`pYE*`4Sg)tmZ&*^v8jAGJgK-rh(nO znii&AGyPojK+Ee9+EI?hH-rm&m>=`lAO7{E>D1JKm7n{&r&z%Cwi})WQZ*k0bJ6u=B0Pn1}ek~+ch_lXwn zuc_uu@YRZb$iGWq5BG|g|^Wd_oh(t2hEHAQ>~0CE_L3eNN1(NZ={TZ z*Q&K4gY{whUfZO+x8Pi73^^HTU(N+4u|z~}-7IGjQufEje1K4zazaTk96zyU#Oomt z{bZ_BZ#I(ren>G~3QNkj-ElHS()&+TCR+bjq4vO-*_o`jyU7mwVd?J!edfIxKubK~ znqmum7Gd^m1|fh?4|kW$?Yo6*!cTvq_fNlm%+Olmz3Wf^I(4mQ zO~z#3)9fPojD(VbPK-c6xq)}DM$borMa#X!P?x0&SBqzQG-BST1On6bd~bfeDWpmL zg;dMkgsT6muQ^9L>bR6T?+9!G07EA3XvMR&Q}8^MSfgNeA zEzFXFyts}my(yK#E3|dx>wH+PW-82HFn_p_ z{;sH%Izw2f?je+3ZGMKbJJ%-MUk6I$Q3lW`X#vZ{OC+X9zuDb|vQX4W2a2z2W*Oj)w$<7+lPbGYqEE4!Y z5j4*J(;o`UAc^wryi7M1qZAX{UySopT5y$cT@|8wdo0j-F+*z55(QN4-0X9E2(%0w z->Pj3_BQrPW?JjaUyorsqkqgQ;wow+pkug_qLB3byas`FE+^x`c+_Iv!A2o)GczmY zAV6d5;m~?7FDJ}pHp;5ORZwuDRq(s2BNghbg+aq0nsM$z_3LiUp~h}O&p9WQTkF%8 zM=j%0_<0RSBT*koU?wS=bWkoexJwQclztyKASoPa^=_gN4ebgz`-%PQ4pC%-=4Vq0 zfe#O}LUsDlrtPI4qXRa|3{g~nzfS$+u@EI(83`y$`zM*F4ZrP)V>J3FyYXx}ZGKDg zcnAHvt{Rs*n3G9nWAYgvN_?47{`Qg%8)$u7L&yUCg=`X~0xo?Nm zOT?BaawiXVZT^N9@PB8m9mlRme!pMhW#CUp&O)q1Ff49V5&%z22#hJ2F`M#8APaP0 z$_Rp4aJOUiQWa7(@mp|%WL)nG$d&Zv_rF<$bdOHX?n0#JYw}R-L?73ZR{Dh~d)_hC zut16KfP{BGRQ-I6p%4Q2bsb~&j&!tu<3}y`>iw3ht$>i661@OYn_Xr&XV#5d@S|oP zA@W{))lxW_UJQXd+s5{jYwPj)u*;o$QivH&LtwNF#bMPtindqcy_Sg_0jNOW`lS26z`VMFkJaH+Sv!=ug__rdCdmKpW)`?T6Ob{o>w!vsy+D z-B>}mgAw_|pUbN&6M&;nPF~<=LStpG+Z5n5r71uf?m?gQ-F4dx9x_V$5%CbECK$Gw zzJ2<^i95T446#0C`xOGneN913e!;7o!R%C)^uMCe0=Tn<*P?H{k7Z&~3QPz=NJW=T zj3CEU61-h1U6W|>zbw|;d_CCnt>k5|J0cEO>N_La+8&pSKU3E{M-On-Vw%ehQ{LlX zxIB8%LF!fTxKT!H6<|d62Qh9ehYjV*#xl%&Z~JpAI7ZChyU6I`b9k!^*geM*&r!)0 z`P_*C_$(P{7dfN3zXX2lZVtYo4StL|JW2|=e>3xO1G$K#=;n=dYTEcI0n01mkFdT* zZlxjCcP7Y5aQ>oPVpawo8YKRl#hc>oIaxO{*fKmVk?3H*sQ8bIy$$PNS zm^QUJj;!T<|8X&Tmhjigq?%e(ppMY%uLMndna;mU(!hA{kXVc%0H6AUgIMB;Y2q3as&sY398#kE0 zW83CIlm!|%OO&SzQ41d zS$iN9BrRi!79O=xyI?ngbQV~+RpO` zgt2WYwEdm=V<3qZ)gKkzTAP9Zf$LsE<)l0?cLpV{+UkiYYIQGnS~Bad;H{xUx0IA93P!Z$Ub zRs}&&XlPF1+UESgi+B-d`JNY2Bfq~xE9@Kpnx?;#;mg;m75vQ*?*d4Tztw|nTLS^Y zH-`iqEf>b-r);F3Q~_D`cZH$BGWu)siXg~pRDs3)1|az7kgqJm2#$NR_{p2Y23-4BY)ULyBEa^$KdzDc9uq0^ACB~H-gaD=Y4z@9VVD}V$kHmZY*Zd--RR|Y0w6WlPWsSq`9?!a)pOu312EGz zk4m+W%p>D^0mr(5WfHSjGm4$@-XbLhSU&;M=<@H`iuaG1?)qq49eVAA5|f{k5V){} z8uBYG8s*=a?&=i4q?=aPx<^%phdi8kO`X$JJFg~83BLUMcYF-+MJbGo^^{rW9Z@->vG69q4q3;`%j1PYG2lz1;eHLUAMDldZP&8yIZ=zAT!_W^5Gh_b#n%EiU zZ%Fin+oCFPL;K`A8?8xGtUp%fnKU^o)jCC>R2*P%Cfi#_LmHjMEJxhmc}|a?*)R;# zbyHfgLFFpb00`ZaHUnRQmT#aiiK}x0gu+pd23%n_RUjE4QhiC3{(j_k)DA`~jo|p# z#u5J(u73}=8;tpFvdM1RcA}^T|4=?G_T`x+6LdEhUm=K9erRBQI z%4?gf+wXzRB%6mX!*t}t3Kv1nsQ~!hZbTr0bFyUkaDfV!snDh2##9g(Hhul2EW747 zgi;TxQ%{3b>Mc4N=|y#vIG(4HW=>NnpTpmFun$Rj02m`#o`ex0ONfET z4F{r7@emkC;R~!#dbkG?-M#lhIS+y-buu?tP{T}iowTIQI|Q3D*0|PFM=K&Z8(ngl zIFhy237n_38l?NRLR4+dQiB2V$&rEkfgtk?a6l=H7ExIM41_<)P%KaggZNGFqMZAL zMY&tS8=|yPYSZZFA&!dSI@Tu^@(_*Fml5a%4cZC)7jK+63+eEuZ3PCX_~(AjQOo`= zNPnlQ)GVKn42^BzfT?X|&6O%hoWj^?UbjQVlhMl_0`x{xa=q49T>Mx-$^2R5#O^pn z>2!Sz?&CdJ65j%GFWASd4pIV3tzxpdURHySx^q=6dVRBZ3a7`JP?PSBjkcQPh@?pe)x&( zA66UTKY_1wx3-Ur8yZU 
zi(!nn?u&oDM9#cLFP7RGZ@liCG@JKro%!fz2GqHc@fk04klM@5*ths6nRZJ%lI|p) ztyuO1VIcggf?H~xX6i7k&p4~V9`G>zjntUEflyoQ^SD~$lBIr*#v)di`!hHHzZ~Wd zJ-QNEBRBq)fz4l2#_xXm8YV8KB%v!-2Is(P`1=|D+zIhS-F?ZUgd{4ZvFP};cKr74 zvi0T|HHv$hL!f3guj8b`g!f?>1v>B0gS~UEbJ?|HOB?fc^jFhtGDY1pfHBHP3X70`g0Pl;1%{(WPrw) zLA={hi)#y_&B|CHDe{&@tUa4*`Gx7EV=fZARJ1+2VgS0L3UZC@{Wc`R>bF^Y|J_=) z6@zu_xnjZE0yN`sSuL5S5%*$tR?_Sn;IN zk+q_-5?}{FkQtG0br0boxa+}qf_r@ocNJU^!H6bY#l--XDfxMU;d>>l#G-kxw=U|n z4oX{wIsAKre7G+PF-;OsE5di0T5MG_-(T zhUl%sTLJ_I(vT32H{#nS1y2{d~Bk*>z;1fMDT#15#7$-u6_Yo!o9QuS!|5#-{ zC0)T!;?6@2clqJa$)sMARqIYV;r+ zk0)L=B>56L%h)=EE^|VE0=oK*K#|t8- zuPFs$^fLQzLGuZ2ZmXe@id)*N@}ZDUnL1)Z8A52hime?+&Bx7u|5)K3ImXEMUQge< zM`(Zo{DDFnt^k6F1jF&@18xC^>12aHE)&2k zs@Nwb?4XI^>w*cbU-d#dTM%R#VlaWL2MW8>deH&l@xZNi1uJB>M`h5y{I|JcKhaAgcz;0;FDw2<~EhliI5igwCTS&^FLFZSoB$eD>H zD10LcRu|WoR}}rm2%pHJGsgh+eOu9q0~qG^b(v)v%8_%bfYg<>q0IYcTAhF-kNC49 zGRJPK;g!YDNi0#B-0xu-ox&gG{wQ(DTXtXWgzKH6KjnvR?85x$A$ZN+G0#8>XkFb9 z9zWb_5-`)TxAZ%jIz@ik!2)usZWY?tyjjOd<;04s^5^fjU8zy`7I$70NYN82zW6h| z$X=NbEUMsfM*!<{`)e40n^{H-)`KJX!(mZdv-cC!9L+JvSVnSO(VKcNP;t?UGtk!b zSPgVYsnD9ejE;FGyPg{6YW6R5Q$rGiy%J(H)2LXP4eT;Slga?wulT3;iy&;Ia=@Rj z!U(jtPyK}8ZWprMhYw6rMgQS66{Y=o_anEEOn1Vj*{8icX-1vaY{+vNoJDFj0{pO( zMG_NH%h3QMU|oF!Z9ocohL5ayn*Z36RiYk>2PU&{vAU1j? zkRdJ8tizF;3llfJ+zh|bK4_O(7pI-9w^Y4gTB0F9sU?J)5ad=AE{p>o;579Jw#@~5OWbag~+3Mnyph?f@wbwu8 z=fB{(_w#nycZtQsdzOuJ=!+1W3GvhPtLJ9m8OpCA&1MCEcLm9=MUSexJUgvMnqDuz zd3!`HT>912mxR#8IDT6FH+LT`QmrCDq@~pdJ?clm$SLSgUD~0uNXRqN&U+KZqw7Df zzDBzgap!mUAGRk7ciu7Jh?&{>=jdQn1ag0rfaz2*?e8k)dfhWih%4+tNn18&)E9RC<4z zeXoG((fW36d;|?kq_y=zW+bjMr=HBC9G6~Oz67sXY9iWf{^(T=lY^M^#K>_LyRTd# zP2auGUqc^`u^ubR5w4Vs@kxf)dChil)2=KRi>a|4o@pNTPdUTmaKG~`#_vwS6!#k6 z{+4VvCc;c#xdy8hCDR;Cl~`TpA&O_}1i*3^LT54QK|MZcr> z_WFbw0$>}L+Ody2Uo6A7WL7!Jjsi|{&4b%5B5BgX4~e|uY}|YIqYsLi98Q<{`IYRM zg6GJnsy+;=)vhXW#}ZcT6Xz)uFQxpe`U{DB-KsDH#Ubr*#odC)p9`{S*v9t${JC%W zNwRP4qvDI=x+u!)g-*90R-vYQbpgwWYEHiCSSi3znGDt6hfK_&?&t8e#l%}MMpBFl zxE>$Q97^qR@(KeM*(xar8JyGv7=1lKpu)}4U@!(Ggn@EP+h#cPr~OUH-`QqXhlhNd zjl-d^u9-i0$Gp!aVs!#8LeIRnr-PZYrSHxBwm7LpU-rGj%`%3{jJ$YGlC;!ih7QtL z?Zt!uX4Po`%PTiH$H>#58o08=3zvG`f%ntyD#+pAjuhI>e65GIil-1!j zY|&2)#*BgVwZTom3H=~rSH4u71~5Evh9-a_APuJ-&g8=GsZ%XZ`qc>;Jya=i6~{(4 zze`0_$3fz?k)M$&6Q&2k9O@)|ms0J}WX+PQI!AD_7a~rK?MmT=*{6>HgTC8@7F?wW zQvP*i_&d*0XyEkG>uvdgHGS``HxH~dcZ(_r(SdxGqHQ%PTNR$W9pbwF`p%+Ykchrg zd;ZKP$e_{BKpcRu)<0Yc9BtI9zz>QDE10>pjI*RY^gW>ul4rjnPF^nE9*z_fjWPsx z;rz(NO!21+*w8E;HQ$iEs5?KQdY&WrS6@)|)f2@QGGUNb`pZ9QAe|~5VNk^MzNK=| z;9mAK2uc9Z4dpSjUqcHr9b7A0l!Z0R|#ihlchp@I~KLoS?6Doh)_ zu=K%3UGOn9lpxZdn;Jp5l_rCG^PfI$I}&ztJSpaMC0Dy0lkx;${plYda`3~ne*P2} z9ns|~NVrt6b{V?dJkGZr?$|N@3Us`o=$|_;^#S3=1iixlG*FRl!;~WTtHWQYrv4vi zfe1%Iyo&Usa1;vcWijV9f7lG3%s-7n>1JhqP#>q+%Q)cm8&5xe%t7J#7D4;Pq!ZrW z*g^ioamw?yQzmW9rs}H{8t5HMq^f8a;yr5&UFlvWAEjU8sr=MHK{6`(@8X=pB5QW2 z)rThuRkfKID&7*$00)V;uz|kjA&u<%qJ(-ftQI~Y0{FUqmAQ!dX>BIlbU4uR1a+&@ zkmj#sFi6@RVdl;od8!Nb$k?GwV+%UZN9AD$I^SFxGhyZiYBo6^FlHMmi!Ic%74vOR zTbAhK$tdDL$9G>b!@nzjgEd46*Yv8FuSvFht22=+*rv|+4$3b zZ!3S9Pw}ln%eG1#?EZ^BG{yxDUxw|9&~c^5s(?Zdx-((jv z13BIiNg7v<)1Ffv6D%?fSr_TBhX^49!*M=iw(6`RQc?jsR0}$}pNjkz<6%^oMiYn`-l$ug_5e zS1DRhObQInw-Hk}ce)nOJZ9INf!2B`WzZ4KR@X3E!~FpiZ)K(=-8Jv@E0_O7vHoC^ z*mjWnD^9@x&n<51a}BtoDA5<;<}xSCC+OaWNZ$ME3m&cIdTfwC4Zm$M?e4xF(O$|$ zrSzuPFiN2WDjj&+{!K)`jnAnWe@$`zFB!7C_VUHc>G-^C$sIK&2Yo??dG8%0cY(-P z1rmXM{)O0gYP&rAn2vYb`0|l9nE3ECc_<5>4C^-IkP5A?DipVEh9TOz&DpiYx%6@C z#Dno^dc`iX8XU-yP(<05{clKW%B~$F$=^>896~*gwp&*&IxfA9fhpjF$7_{qs|GRM zLX+R8N{JxU6-9q%_r?JeOsI^WN_t7?pj&xEkHMow{;zu80jt}tvI 
zFD>(I?F<}NeZm5#`PrYw0M)P3Kz3*VPJFh2r$Th$n@AOsr`1dhA9WkD|k=MnY0PQDYtoFoJo3AVzoQ(6}uJ5 zwBXm2)hE`7bwu6b&XTa}cPj9p2ZnQpcF_$!1-P{a=mYqW?0lIKJ;w@^$6in|X0*YF`$DQZHSS134zF#>yPW_`4AM znjWs@7CMvwH&w=voOp3Nmp*fLCy%HIhrP5`8tIG_zpnAcnl=|XlAwc5huL$3P(55h z>c_yBe?U^0$VIy65!`OulJGuDnbnWNi(Y(X%(q+=wc|?Q2Wu_JnDJ&$*`0Aw!ZUIi zLNC5ADY4@dQNnc>jc?!5JbOc?nNQyEX>`M5$mfqT$&v=S?+6QQU0tZYtev?)e4p?- zY{z1l6g8L;7w5*j(|auG#MUb~C2FLD6F18@z+LutDU_~ID;*L^^u`B!#;k#f{-zo9?Ko4_oPY}^K;S}Z+?xf&NYM^|v z*pkvo9N^|^q7*<0z0x+Hj+W+}ccPQ$H(-$H-?fpVpC<>uExt9k+(1qEU9M}vo%HvX0RkxaW5 z=KK>pm4^BzfJRm1U%B1g>RZ@jDfLn$`jQ>x1y$v|mymsRDCL?c!YkXHKGa-HgE^c< z&YfRD-oQYl9&jEJOV>1l30cc7hM{sP6OEbF4?M=-nqywL<U9Y?sIr@s$(G5wcSm@dzPD$+RR=zaQD*X%5`4WL^3uN+b)z#*3hP*#P%bC@!UE zZ>`)nYW}1sbTh`W{0WJAY;H1vzX&xGt4PFK9HgIS)leN-3# diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/colors.xml b/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/colors.xml deleted file mode 100644 index 69b22338c6..0000000000 --- a/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/colors.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - #008577 - #00574B - #D81B60 - diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/strings.xml b/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/strings.xml deleted file mode 100644 index 168adfb0a0..0000000000 --- a/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/strings.xml +++ /dev/null @@ -1,3 +0,0 @@ - - PaddlePredictor - diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/styles.xml b/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/styles.xml deleted file mode 100644 index 5885930df6..0000000000 --- a/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/styles.xml +++ /dev/null @@ -1,11 +0,0 @@ - - - - - - diff --git a/lite/demo/java/android/PaddlePredictor/app/src/test/java/com/baidu/paddle/lite/ExampleUnitTest.java b/lite/demo/java/android/PaddlePredictor/app/src/test/java/com/baidu/paddle/lite/ExampleUnitTest.java deleted file mode 100644 index 99dc6d27b3..0000000000 --- a/lite/demo/java/android/PaddlePredictor/app/src/test/java/com/baidu/paddle/lite/ExampleUnitTest.java +++ /dev/null @@ -1,17 +0,0 @@ -package com.baidu.paddle.lite; - -import org.junit.Test; - -import static org.junit.Assert.*; - -/** - * Example local unit test, which will execute on the development machine (host). - * - * @see Testing documentation - */ -public class ExampleUnitTest { - @Test - public void addition_isCorrect() { - assertEquals(4, 2 + 2); - } -} \ No newline at end of file diff --git a/lite/demo/java/android/PaddlePredictor/build.gradle b/lite/demo/java/android/PaddlePredictor/build.gradle deleted file mode 100644 index 02199bb823..0000000000 --- a/lite/demo/java/android/PaddlePredictor/build.gradle +++ /dev/null @@ -1,27 +0,0 @@ -// Top-level build file where you can add configuration options common to all sub-projects/modules. 
-
-buildscript {
-    repositories {
-        google()
-        jcenter()
-
-    }
-    dependencies {
-        classpath 'com.android.tools.build:gradle:3.4.1'
-
-        // NOTE: Do not place your application dependencies here; they belong
-        // in the individual module build.gradle files
-    }
-}
-
-allprojects {
-    repositories {
-        google()
-        jcenter()
-
-    }
-}
-
-task clean(type: Delete) {
-    delete rootProject.buildDir
-}
diff --git a/lite/demo/java/android/PaddlePredictor/gradle.properties b/lite/demo/java/android/PaddlePredictor/gradle.properties
deleted file mode 100644
index 743d692ce1..0000000000
--- a/lite/demo/java/android/PaddlePredictor/gradle.properties
+++ /dev/null
@@ -1,13 +0,0 @@
-# Project-wide Gradle settings.
-# IDE (e.g. Android Studio) users:
-# Gradle settings configured through the IDE *will override*
-# any settings specified in this file.
-# For more details on how to configure your build environment visit
-# http://www.gradle.org/docs/current/userguide/build_environment.html
-# Specifies the JVM arguments used for the daemon process.
-# The setting is particularly useful for tweaking memory settings.
-org.gradle.jvmargs=-Xmx1536m
-# When configured, Gradle will run in incubating parallel mode.
-# This option should only be used with decoupled projects. More details, visit
-# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
-# org.gradle.parallel=true
diff --git a/lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.jar b/lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.jar
deleted file mode 100644
index f6b961fd5a86aa5fbfe90f707c3138408be7c718..0000000000000000000000000000000000000000
Binary files a/lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.jar and /dev/null differ
zqjFRDaQz!J-YGitV4}$*$hg`vv%N)@#UdzHFI2E<&_@0Uw@h_ZHf}7)G;_NUD3@18 zH5;EtugNT0*RXVK*by>WS>jaDDfe!A61Da=VpIK?mcp^W?!1S2oah^wowRnrYjl~`lgP-mv$?yb6{{S55CCu{R z$9;`dyf0Y>uM1=XSl_$01Lc1Iy68IosWN8Q9Op=~I(F<0+_kKfgC*JggjxNgK6 z-3gQm6;sm?J&;bYe&(dx4BEjvq}b`OT^RqF$J4enP1YkeBK#>l1@-K`ajbn05`0J?0daOtnzh@l3^=BkedW1EahZlRp;`j*CaT;-21&f2wU z+Nh-gc4I36Cw+;3UAc<%ySb`#+c@5y ze~en&bYV|kn?Cn|@fqmGxgfz}U!98$=drjAkMi`43I4R%&H0GKEgx-=7PF}y`+j>r zg&JF`jomnu2G{%QV~Gf_-1gx<3Ky=Md9Q3VnK=;;u0lyTBCuf^aUi?+1+`4lLE6ZK zT#(Bf`5rmr(tgTbIt?yA@y`(Ar=f>-aZ}T~>G32EM%XyFvhn&@PWCm#-<&ApLDCXT zD#(9m|V(OOo7PmE@`vD4$S5;+9IQm19dd zvMEU`)E1_F+0o0-z>YCWqg0u8ciIknU#{q02{~YX)gc_u;8;i233D66pf(IkTDxeN zL=4z2)?S$TV9=ORVr&AkZMl<4tTh(v;Ix1{`pPVqI3n2ci&4Dg+W|N8TBUfZ*WeLF zqCH_1Q0W&f9T$lx3CFJ$o@Lz$99 zW!G&@zFHxTaP!o#z^~xgF|(vrHz8R_r9eo;TX9}2ZyjslrtH=%6O)?1?cL&BT(Amp zTGFU1%%#xl&6sH-UIJk_PGk_McFn7=%yd6tAjm|lnmr8bE2le3I~L{0(ffo}TQjyo zHZZI{-}{E4ohYTlZaS$blB!h$Jq^Rf#(ch}@S+Ww&$b);8+>g84IJcLU%B-W?+IY& zslcZIR>+U4v3O9RFEW;8NpCM0w1ROG84=WpKxQ^R`{=0MZCubg3st z48AyJNEvyxn-jCPTlTwp4EKvyEwD3e%kpdY?^BH0!3n6Eb57_L%J1=a*3>|k68A}v zaW`*4YitylfD}ua8V)vb79)N_Ixw_mpp}yJGbNu+5YYOP9K-7nf*jA1#<^rb4#AcS zKg%zCI)7cotx}L&J8Bqo8O1b0q;B1J#B5N5Z$Zq=wX~nQFgUfAE{@u0+EnmK{1hg> zC{vMfFLD;L8b4L+B51&LCm|scVLPe6h02rws@kGv@R+#IqE8>Xn8i|vRq_Z`V;x6F zNeot$1Zsu`lLS92QlLWF54za6vOEKGYQMdX($0JN*cjG7HP&qZ#3+bEN$8O_PfeAb z0R5;=zXac2IZ?fxu59?Nka;1lKm|;0)6|#RxkD05P5qz;*AL@ig!+f=lW5^Jbag%2 z%9@iM0ph$WFlxS!`p31t92z~TB}P-*CS+1Oo_g;7`6k(Jyj8m8U|Q3Sh7o-Icp4kV zK}%qri5>?%IPfamXIZ8pXbm-#{ytiam<{a5A+3dVP^xz!Pvirsq7Btv?*d7eYgx7q zWFxrzb3-%^lDgMc=Vl7^={=VDEKabTG?VWqOngE`Kt7hs236QKidsoeeUQ_^FzsXjprCDd@pW25rNx#6x&L6ZEpoX9Ffzv@olnH3rGOSW( zG-D|cV0Q~qJ>-L}NIyT?T-+x+wU%;+_GY{>t(l9dI%Ximm+Kmwhee;FK$%{dnF;C% zFjM2&$W68Sz#d*wtfX?*WIOXwT;P6NUw}IHdk|)fw*YnGa0rHx#paG!m=Y6GkS4VX zX`T$4eW9k1W!=q8!(#8A9h67fw))k_G)Q9~Q1e3f`aV@kbcSv7!priDUN}gX(iXTy zr$|kU0Vn%*ylmyDCO&G0Z3g>%JeEPFAW!5*H2Ydl>39w3W+gEUjL&vrRs(xGP{(ze zy7EMWF14@Qh>X>st8_029||TP0>7SG9on_xxeR2Iam3G~Em$}aGsNt$iES9zFa<3W zxtOF*!G@=PhfHO!=9pVPXMUVi30WmkPoy$02w}&6A7mF)G6-`~EVq5CwD2`9Zu`kd)52``#V zNSb`9dG~8(dooi1*-aSMf!fun7Sc`-C$-E(3BoSC$2kKrVcI!&yC*+ff2+C-@!AT_ zsvlAIV+%bRDfd{R*TMF><1&_a%@yZ0G0lg2K;F>7b+7A6pv3-S7qWIgx+Z?dt8}|S z>Qbb6x(+^aoV7FQ!Ph8|RUA6vXWQH*1$GJC+wXLXizNIc9p2yLzw9 z0=MdQ!{NnOwIICJc8!+Jp!zG}**r#E!<}&Te&}|B4q;U57$+pQI^}{qj669zMMe_I z&z0uUCqG%YwtUc8HVN7?0GHpu=bL7&{C>hcd5d(iFV{I5c~jpX&!(a{yS*4MEoYXh z*X4|Y@RVfn;piRm-C%b@{0R;aXrjBtvx^HO;6(>i*RnoG0Rtcd25BT6edxTNOgUAOjn zJ2)l{ipj8IP$KID2}*#F=M%^n&=bA0tY98@+2I+7~A&T-tw%W#3GV>GTmkHaqftl)#+E zMU*P(Rjo>8%P@_@#UNq(_L{}j(&-@1iY0TRizhiATJrnvwSH0v>lYfCI2ex^><3$q znzZgpW0JlQx?JB#0^^s-Js1}}wKh6f>(e%NrMwS`Q(FhazkZb|uyB@d%_9)_xb$6T zS*#-Bn)9gmobhAtvBmL+9H-+0_0US?g6^TOvE8f3v=z3o%NcPjOaf{5EMRnn(_z8- z$|m0D$FTU zDy;21v-#0i)9%_bZ7eo6B9@Q@&XprR&oKl4m>zIj-fiRy4Dqy@VVVs?rscG| zmzaDQ%>AQTi<^vYCmv#KOTd@l7#2VIpsj?nm_WfRZzJako`^uU%Nt3e;cU*y*|$7W zLm%fX#i_*HoUXu!NI$ey>BA<5HQB=|nRAwK!$L#n-Qz;~`zACig0PhAq#^5QS<8L2 zS3A+8%vbVMa7LOtTEM?55apt(DcWh#L}R^P2AY*c8B}Cx=6OFAdMPj1f>k3#^#+Hk z6uW1WJW&RlBRh*1DLb7mJ+KO>!t^t8hX1#_Wk`gjDio9)9IGbyCAGI4DJ~orK+YRv znjxRMtshZQHc$#Y-<-JOV6g^Cr@odj&Xw5B(FmI)*qJ9NHmIz_r{t)TxyB`L-%q5l ztzHgD;S6cw?7Atg*6E1!c6*gPRCb%t7D%z<(xm+K{%EJNiI2N0l8ud0Ch@_av_RW? zIr!nO4dL5466WslE6MsfMss7<)-S!e)2@r2o=7_W)OO`~CwklRWzHTfpB)_HYwgz=BzLhgZ9S<{nLBOwOIgJU=94uj6r!m>Xyn9>&xP+=5!zG_*yEoRgM0`aYts z^)&8(>z5C-QQ*o_s(8E4*?AX#S^0)aqB)OTyX>4BMy8h(cHjA8ji1PRlox@jB*1n? 
zDIfyDjzeg91Ao(;Q;KE@zei$}>EnrF6I}q&Xd=~&$WdDsyH0H7fJX|E+O~%LS*7^Q zYzZ4`pBdY{b7u72gZm6^5~O-57HwzwAz{)NvVaowo`X02tL3PpgLjwA`^i9F^vSpN zAqH3mRjG8VeJNHZ(1{%!XqC+)Z%D}58Qel{_weSEHoygT9pN@i zi=G;!Vj6XQk2tuJC>lza%ywz|`f7TIz*EN2Gdt!s199Dr4Tfd_%~fu8gXo~|ogt5Q zlEy_CXEe^BgsYM^o@L?s33WM14}7^T(kqohOX_iN@U?u;$l|rAvn{rwy>!yfZw13U zB@X9)qt&4;(C6dP?yRsoTMI!j-f1KC!<%~i1}u7yLXYn)(#a;Z6~r>hp~kfP));mi zcG%kdaB9H)z9M=H!f>kM->fTjRVOELNwh1amgKQT=I8J66kI)u_?0@$$~5f`u%;zl zC?pkr^p2Fe=J~WK%4ItSzKA+QHqJ@~m|Cduv=Q&-P8I5rQ-#G@bYH}YJr zUS(~(w|vKyU(T(*py}jTUp%I%{2!W!K(i$uvotcPjVddW z8_5HKY!oBCwGZcs-q`4Yt`Zk~>K?mcxg51wkZlX5e#B08I75F7#dgn5yf&Hrp`*%$ zQ;_Qg>TYRzBe$x=T(@WI9SC!ReSas9vDm(yslQjBJZde5z8GDU``r|N(MHcxNopGr z_}u39W_zwWDL*XYYt>#Xo!9kL#97|EAGyGBcRXtLTd59x%m=3i zL^9joWYA)HfL15l9%H?q`$mY27!<9$7GH(kxb%MV>`}hR4a?+*LH6aR{dzrX@?6X4 z3e`9L;cjqYb`cJmophbm(OX0b)!AFG?5`c#zLagzMW~o)?-!@e80lvk!p#&CD8u5_r&wp4O0zQ>y!k5U$h_K;rWGk=U)zX!#@Q%|9g*A zWx)qS1?fq6X<$mQTB$#3g;;5tHOYuAh;YKSBz%il3Ui6fPRv#v62SsrCdMRTav)Sg zTq1WOu&@v$Ey;@^+_!)cf|w_X<@RC>!=~+A1-65O0bOFYiH-)abINwZvFB;hJjL_$ z(9iScmUdMp2O$WW!520Hd0Q^Yj?DK%YgJD^ez$Z^?@9@Ab-=KgW@n8nC&88)TDC+E zlJM)L3r+ZJfZW_T$;Imq*#2<(j+FIk8ls7)WJ6CjUu#r5PoXxQs4b)mZza<8=v{o)VlLRM<9yw^0En#tXAj`Sylxvki{<1DPe^ zhjHwx^;c8tb?Vr$6ZB;$Ff$+3(*oinbwpN-#F)bTsXq@Sm?43MC#jQ~`F|twI=7oC zH4TJtu#;ngRA|Y~w5N=UfMZi?s0%ZmKUFTAye&6Y*y-%c1oD3yQ%IF2q2385Zl+=> zfz=o`Bedy|U;oxbyb^rB9ixG{Gb-{h$U0hVe`J;{ql!s_OJ_>>eoQn(G6h7+b^P48 zG<=Wg2;xGD-+d@UMZ!c;0>#3nws$9kIDkK13IfloGT@s14AY>&>>^#>`PT7GV$2Hp zN<{bN*ztlZu_%W=&3+=#3bE(mka6VoHEs~0BjZ$+=0`a@R$iaW)6>wp2w)=v2@|2d z%?34!+iOc5S@;AAC4hELWLH56RGxo4jw8MDMU0Wk2k_G}=Vo(>eRFo(g3@HjG|`H3 zm8b*dK=moM*oB<)*A$M9!!5o~4U``e)wxavm@O_R(`P|u%9^LGi(_%IF<6o;NLp*0 zKsfZ0#24GT8(G`i4UvoMh$^;kOhl?`0yNiyrC#HJH=tqOH^T_d<2Z+ zeN>Y9Zn!X4*DMCK^o75Zk2621bdmV7Rx@AX^alBG4%~;G_vUoxhfhFRlR&+3WwF^T zaL)8xPq|wCZoNT^>3J0K?e{J-kl+hu2rZI>CUv#-z&u@`hjeb+bBZ>bcciQVZ{SbW zez04s9oFEgc8Z+Kp{XFX`MVf-s&w9*dx7wLen(_@y34}Qz@&`$2+osqfxz4&d}{Ql z*g1ag00Gu+$C`0avds{Q65BfGsu9`_`dML*rX~hyWIe$T>CsPRoLIr%MTk3pJ^2zH1qub1MBzPG}PO;Wmav9w%F7?%l=xIf#LlP`! z_Nw;xBQY9anH5-c8A4mME}?{iewjz(Sq-29r{fV;Fc>fv%0!W@(+{={Xl-sJ6aMoc z)9Q+$bchoTGTyWU_oI19!)bD=IG&OImfy;VxNXoIO2hYEfO~MkE#IXTK(~?Z&!ae! 
zl8z{D&2PC$Q*OBC(rS~-*-GHNJ6AC$@eve>LB@Iq;jbBZj`wk4|LGogE||Ie=M5g= z9d`uYQ1^Sr_q2wmZE>w2WG)!F%^KiqyaDtIAct?}D~JP4shTJy5Bg+-(EA8aXaxbd~BKMtTf2iQ69jD1o* zZF9*S3!v-TdqwK$%&?91Sh2=e63;X0Lci@n7y3XOu2ofyL9^-I767eHESAq{m+@*r zbVDx!FQ|AjT;!bYsXv8ilQjy~Chiu&HNhFXt3R_6kMC8~ChEFqG@MWu#1Q1#=~#ix zrkHpJre_?#r=N0wv`-7cHHqU`phJX2M_^{H0~{VP79Dv{6YP)oA1&TSfKPEPZn2)G z9o{U1huZBLL;Tp_0OYw@+9z(jkrwIGdUrOhKJUbwy?WBt zlIK)*K0lQCY0qZ!$%1?3A#-S70F#YyUnmJF*`xx?aH5;gE5pe-15w)EB#nuf6B*c~ z8Z25NtY%6Wlb)bUA$w%HKs5$!Z*W?YKV-lE0@w^{4vw;J>=rn?u!rv$&eM+rpU6rc=j9>N2Op+C{D^mospMCjF2ZGhe4eADA#skp2EA26%p3Ex9wHW8l&Y@HX z$Qv)mHM}4*@M*#*ll5^hE9M^=q~eyWEai*P;4z<9ZYy!SlNE5nlc7gm;M&Q zKhKE4d*%A>^m0R?{N}y|i6i^k>^n4(wzKvlQeHq{l&JuFD~sTsdhs`(?lFK@Q{pU~ zb!M3c@*3IwN1RUOVjY5>uT+s-2QLWY z4T2>fiSn>>Fob+%B868-v9D@AfWr#M8eM6w#eAlhc#zk6jkLxGBGk`E3$!A@*am!R zy>29&ptYK6>cvP`b!syNp)Q$0UOW|-O@)8!?94GOYF_}+zlW%fCEl|Tep_zx05g6q z>tp47e-&R*hSNe{6{H!mL?+j$c^TXT{C&@T-xIaesNCl05 z9SLb@q&mSb)I{VXMaiWa3PWj=Ed!>*GwUe;^|uk=Pz$njNnfFY^MM>E?zqhf6^{}0 zx&~~dA5#}1ig~7HvOQ#;d9JZBeEQ+}-~v$at`m!(ai z$w(H&mWCC~;PQ1$%iuz3`>dWeb3_p}X>L2LK%2l59Tyc}4m0>9A!8rhoU3m>i2+hl zx?*qs*c^j}+WPs>&v1%1Ko8_ivAGIn@QK7A`hDz-Emkcgv2@wTbYhkiwX2l=xz*XG zaiNg+j4F-I>9v+LjosI-QECrtKjp&0T@xIMKVr+&)gyb4@b3y?2CA?=ooN zT#;rU86WLh(e@#mF*rk(NV-qSIZyr z$6!ZUmzD)%yO-ot`rw3rp6?*_l*@Z*IB0xn4|BGPWHNc-1ZUnNSMWmDh=EzWJRP`) zl%d%J613oXzh5;VY^XWJi{lB`f#u+ThvtP7 zq(HK<4>tw(=yzSBWtYO}XI`S1pMBe3!jFxBHIuwJ(@%zdQFi1Q_hU2eDuHqXte7Ki zOV55H2D6u#4oTfr7|u*3p75KF&jaLEDpxk!4*bhPc%mpfj)Us3XIG3 zIKMX^s^1wt8YK7Ky^UOG=w!o5e7W-<&c|fw2{;Q11vm@J{)@N3-p1U>!0~sKWHaL= zWV(0}1IIyt1p%=_-Fe5Kfzc71wg}`RDDntVZv;4!=&XXF-$48jS0Sc;eDy@Sg;+{A zFStc{dXT}kcIjMXb4F7MbX~2%i;UrBxm%qmLKb|2=?uPr00-$MEUIGR5+JG2l2Nq` zkM{{1RO_R)+8oQ6x&-^kCj)W8Z}TJjS*Wm4>hf+4#VJP)OBaDF%3pms7DclusBUw} z{ND#!*I6h85g6DzNvdAmnwWY{&+!KZM4DGzeHI?MR@+~|su0{y-5-nICz_MIT_#FE zm<5f3zlaKq!XyvY3H`9s&T};z!cK}G%;~!rpzk9-6L}4Rg7vXtKFsl}@sT#U#7)x- z7UWue5sa$R>N&b{J61&gvKcKlozH*;OjoDR+elkh|4bJ!_3AZNMOu?n9&|L>OTD78 z^i->ah_Mqc|Ev)KNDzfu1P3grBIM#%`QZqj5W{qu(HocQhjyS;UINoP`{J+DvV?|1 z_sw6Yr3z6%e7JKVDY<$P=M)dbk@~Yw9|2!Cw!io3%j92wTD!c^e9Vj+7VqXo3>u#= zv#M{HHJ=e$X5vQ>>ML?E8#UlmvJgTnb73{PSPTf*0)mcj6C z{KsfUbDK|F$E(k;ER%8HMdDi`=BfpZzP3cl5yJHu;v^o2FkHNk;cXc17tL8T!CsYI zfeZ6sw@;8ia|mY_AXjCS?kUfxdjDB28)~Tz1dGE|{VfBS9`0m2!m1yG?hR})er^pl4c@9Aq+|}ZlDaHL)K$O| z%9Jp-imI-Id0|(d5{v~w6mx)tUKfbuVD`xNt04Mry%M+jXzE>4(TBsx#&=@wT2Vh) z1yeEY&~17>0%P(eHP0HB^|7C+WJxQBTG$uyOWY@iDloRIb-Cf!p<{WQHR!422#F34 zG`v|#CJ^G}y9U*7jgTlD{D&y$Iv{6&PYG>{Ixg$pGk?lWrE#PJ8KunQC@}^6OP!|< zS;}p3to{S|uZz%kKe|;A0bL0XxPB&Q{J(9PyX`+Kr`k~r2}yP^ND{8!v7Q1&vtk& z2Y}l@J@{|2`oA%sxvM9i0V+8IXrZ4;tey)d;LZI70Kbim<4=WoTPZy=Yd|34v#$Kh zx|#YJ8s`J>W&jt#GcMpx84w2Z3ur-rK7gf-p5cE)=w1R2*|0mj12hvapuUWM0b~dG zMg9p8FmAZI@i{q~0@QuY44&mMUNXd7z>U58shA3o`p5eVLpq>+{(<3->DWuSFVZwC zxd50Uz(w~LxC4}bgag#q#NNokK@yNc+Q|Ap!u>Ddy+df>v;j@I12CDNN9do+0^n8p zMQs7X#+FVF0C5muGfN{r0|Nkql%BQT|K(DDNdR2pzM=_ea5+GO|J67`05AV92t@4l z0Qno0078PIHdaQGHZ~Scw!dzgqjK~3B7kf>BcP__&lLyU(cu3B^uLo%{j|Mb0NR)tkeT7Hcwp4O# z)yzu>cvG(d9~0a^)eZ;;%3ksk@F&1eEBje~ zW+-_s)&RgiweQc!otF>4%vbXKaOU41{!hw?|2`Ld3I8$&#WOsq>EG)1ANb!{N4z9@ zsU!bPG-~-bqCeIDzo^Q;gnucB{tRzm{ZH^Orphm2U+REA!*<*J6YQV83@&xoDl%#wnl5qcBqCcAF-vX5{30}(oJrnSH z{RY85hylK2dMOh2%oO1J8%)0?8TOL%rS8)+CsDv}aQ>4D)Jv+DLK)9gI^n-T^$)Tc zFPUD75qJm!Y-KBqj;JP4dV4 z`X{lGmn<)1IGz330}s}Jrjtf{(lnuuNHe5(ezA(pYa=1|Ff-LhPFK8 zyJh_b{yzu0yll6ZkpRzRjezyYivjyjW7QwO;@6X`m;2Apn2EK2!~7S}-*=;5*7K$B z`x(=!^?zgj(-`&ApZJXI09aDLXaT@<;CH=?fBOY5d|b~wBA@@p^K#nxr`)?i?SqTupI_PJ(A3cx`z~9mX_*)>L F{|7XC?P&l2 diff --git 
a/lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.properties b/lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.properties deleted file mode 100644 index 2d135d7b25..0000000000 --- a/lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.properties +++ /dev/null @@ -1,6 +0,0 @@ -#Wed Jun 26 10:57:21 CST 2019 -distributionBase=GRADLE_USER_HOME -distributionPath=wrapper/dists -zipStoreBase=GRADLE_USER_HOME -zipStorePath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-5.1.1-all.zip diff --git a/lite/demo/java/android/PaddlePredictor/gradlew b/lite/demo/java/android/PaddlePredictor/gradlew deleted file mode 100755 index cccdd3d517..0000000000 --- a/lite/demo/java/android/PaddlePredictor/gradlew +++ /dev/null @@ -1,172 +0,0 @@ -#!/usr/bin/env sh - -############################################################################## -## -## Gradle start up script for UN*X -## -############################################################################## - -# Attempt to set APP_HOME -# Resolve links: $0 may be a link -PRG="$0" -# Need this for relative symlinks. -while [ -h "$PRG" ] ; do - ls=`ls -ld "$PRG"` - link=`expr "$ls" : '.*-> \(.*\)$'` - if expr "$link" : '/.*' > /dev/null; then - PRG="$link" - else - PRG=`dirname "$PRG"`"/$link" - fi -done -SAVED="`pwd`" -cd "`dirname \"$PRG\"`/" >/dev/null -APP_HOME="`pwd -P`" -cd "$SAVED" >/dev/null - -APP_NAME="Gradle" -APP_BASE_NAME=`basename "$0"` - -# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -DEFAULT_JVM_OPTS="" - -# Use the maximum available, or set MAX_FD != -1 to use that value. -MAX_FD="maximum" - -warn () { - echo "$*" -} - -die () { - echo - echo "$*" - echo - exit 1 -} - -# OS specific support (must be 'true' or 'false'). -cygwin=false -msys=false -darwin=false -nonstop=false -case "`uname`" in - CYGWIN* ) - cygwin=true - ;; - Darwin* ) - darwin=true - ;; - MINGW* ) - msys=true - ;; - NONSTOP* ) - nonstop=true - ;; -esac - -CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar - -# Determine the Java command to use to start the JVM. -if [ -n "$JAVA_HOME" ] ; then - if [ -x "$JAVA_HOME/jre/sh/java" ] ; then - # IBM's JDK on AIX uses strange locations for the executables - JAVACMD="$JAVA_HOME/jre/sh/java" - else - JAVACMD="$JAVA_HOME/bin/java" - fi - if [ ! -x "$JAVACMD" ] ; then - die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME - -Please set the JAVA_HOME variable in your environment to match the -location of your Java installation." - fi -else - JAVACMD="java" - which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. - -Please set the JAVA_HOME variable in your environment to match the -location of your Java installation." -fi - -# Increase the maximum file descriptors if we can. -if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then - MAX_FD_LIMIT=`ulimit -H -n` - if [ $? -eq 0 ] ; then - if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then - MAX_FD="$MAX_FD_LIMIT" - fi - ulimit -n $MAX_FD - if [ $? 
-ne 0 ] ; then - warn "Could not set maximum file descriptor limit: $MAX_FD" - fi - else - warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" - fi -fi - -# For Darwin, add options to specify how the application appears in the dock -if $darwin; then - GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" -fi - -# For Cygwin, switch paths to Windows format before running java -if $cygwin ; then - APP_HOME=`cygpath --path --mixed "$APP_HOME"` - CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` - JAVACMD=`cygpath --unix "$JAVACMD"` - - # We build the pattern for arguments to be converted via cygpath - ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` - SEP="" - for dir in $ROOTDIRSRAW ; do - ROOTDIRS="$ROOTDIRS$SEP$dir" - SEP="|" - done - OURCYGPATTERN="(^($ROOTDIRS))" - # Add a user-defined pattern to the cygpath arguments - if [ "$GRADLE_CYGPATTERN" != "" ] ; then - OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" - fi - # Now convert the arguments - kludge to limit ourselves to /bin/sh - i=0 - for arg in "$@" ; do - CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` - CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option - - if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition - eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` - else - eval `echo args$i`="\"$arg\"" - fi - i=$((i+1)) - done - case $i in - (0) set -- ;; - (1) set -- "$args0" ;; - (2) set -- "$args0" "$args1" ;; - (3) set -- "$args0" "$args1" "$args2" ;; - (4) set -- "$args0" "$args1" "$args2" "$args3" ;; - (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; - (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; - (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; - (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; - (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; - esac -fi - -# Escape application args -save () { - for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done - echo " " -} -APP_ARGS=$(save "$@") - -# Collect all arguments for the java command, following the shell quoting and substitution rules -eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" - -# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong -if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then - cd "$(dirname "$0")" -fi - -exec "$JAVACMD" "$@" diff --git a/lite/demo/java/android/PaddlePredictor/gradlew.bat b/lite/demo/java/android/PaddlePredictor/gradlew.bat deleted file mode 100644 index f9553162f1..0000000000 --- a/lite/demo/java/android/PaddlePredictor/gradlew.bat +++ /dev/null @@ -1,84 +0,0 @@ -@if "%DEBUG%" == "" @echo off -@rem ########################################################################## -@rem -@rem Gradle startup script for Windows -@rem -@rem ########################################################################## - -@rem Set local scope for the variables with windows NT shell -if "%OS%"=="Windows_NT" setlocal - -set DIRNAME=%~dp0 -if "%DIRNAME%" == "" set DIRNAME=. -set APP_BASE_NAME=%~n0 -set APP_HOME=%DIRNAME% - -@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
-set DEFAULT_JVM_OPTS= - -@rem Find java.exe -if defined JAVA_HOME goto findJavaFromJavaHome - -set JAVA_EXE=java.exe -%JAVA_EXE% -version >NUL 2>&1 -if "%ERRORLEVEL%" == "0" goto init - -echo. -echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. - -goto fail - -:findJavaFromJavaHome -set JAVA_HOME=%JAVA_HOME:"=% -set JAVA_EXE=%JAVA_HOME%/bin/java.exe - -if exist "%JAVA_EXE%" goto init - -echo. -echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. - -goto fail - -:init -@rem Get command-line arguments, handling Windows variants - -if not "%OS%" == "Windows_NT" goto win9xME_args - -:win9xME_args -@rem Slurp the command line arguments. -set CMD_LINE_ARGS= -set _SKIP=2 - -:win9xME_args_slurp -if "x%~1" == "x" goto execute - -set CMD_LINE_ARGS=%* - -:execute -@rem Setup the command line - -set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar - -@rem Execute Gradle -"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% - -:end -@rem End local scope for the variables with windows NT shell -if "%ERRORLEVEL%"=="0" goto mainEnd - -:fail -rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of -rem the _cmd.exe /c_ return code! -if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 -exit /b 1 - -:mainEnd -if "%OS%"=="Windows_NT" endlocal - -:omega diff --git a/lite/demo/java/android/PaddlePredictor/settings.gradle b/lite/demo/java/android/PaddlePredictor/settings.gradle deleted file mode 100644 index e7b4def49c..0000000000 --- a/lite/demo/java/android/PaddlePredictor/settings.gradle +++ /dev/null @@ -1 +0,0 @@ -include ':app' diff --git a/lite/demo/java/android/prepare_demo.bash b/lite/demo/java/android/prepare_demo.bash deleted file mode 100644 index e0dbdaf75f..0000000000 --- a/lite/demo/java/android/prepare_demo.bash +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -# Script to download model files and copy .Jar and JNI lib for Android demo -# $1 will be the arch name - -if [ x$1 != x ]; then - cp ../../../java/so/libpaddle_lite_jni.so PaddlePredictor/app/src/main/jniLibs/$1/ -else - echo "Warning: didn't copy JNI .so lib because arch name is empty" -fi - -MODELS=(inception_v4_simple_opt.nb lite_naive_model_opt.nb mobilenet_v1_opt.nb mobilenet_v2_relu_opt.nb resnet50_opt.nb) -MODELS_DIR=PaddlePredictor/app/src/main/assets/ - -for m in "${MODELS[@]}" -do - wget --no-check-certificate -q http://paddle-inference-dist.bj.bcebos.com/${m}.tar.gz \ - -O ${MODELS_DIR}${m}.tar.gz - tar xzf ${MODELS_DIR}${m}.tar.gz -C ${MODELS_DIR} - rm -rf ${MODELS_DIR}${m}.tar.gz -done - -cp ../../../java/jar/PaddlePredictor.jar PaddlePredictor/app/libs/ diff --git a/lite/fluid/CMakeLists.txt b/lite/fluid/CMakeLists.txt deleted file mode 100644 index 308dcb2c30..0000000000 --- a/lite/fluid/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -if (LITE_WITH_X86) -lite_cc_library(fluid_data_type SRCS data_type.cc DEPS framework_proto eigen3) -# lite_cc_library(selected_rows SRCS selected_rows.cc) -endif() diff --git a/lite/fluid/data_type.cc b/lite/fluid/data_type.cc deleted file mode 100644 index aa8971499f..0000000000 --- a/lite/fluid/data_type.cc +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2018 
PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/fluid/data_type.h" -#include -#include -#include - -using float16 = paddle::lite::fluid::float16; - -namespace paddle { -namespace lite { -namespace fluid { - -struct DataTypeMap { - std::unordered_map - cpp_to_proto_; - std::unordered_map proto_to_cpp_; - std::unordered_map proto_to_str_; - std::unordered_map proto_to_size_; -}; - -static DataTypeMap* InitDataTypeMap(); -// C++11 removes the need for manual locking. Concurrent execution shall wait if -// a static local variable is already being initialized. -// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex -static DataTypeMap& gDataTypeMap() { - static DataTypeMap* g_data_type_map_ = InitDataTypeMap(); - return *g_data_type_map_; -} - -template -static inline void RegisterType(DataTypeMap* map, - framework::proto::VarType::Type proto_type, - const std::string& name) { - map->proto_to_cpp_.emplace(static_cast(proto_type), typeid(T)); - map->cpp_to_proto_.emplace(typeid(T), proto_type); - map->proto_to_str_.emplace(static_cast(proto_type), name); - map->proto_to_size_.emplace(static_cast(proto_type), sizeof(T)); -} - -static DataTypeMap* InitDataTypeMap() { - auto retv = new DataTypeMap(); - -#define RegType(cc_type, proto_type) \ - RegisterType(retv, proto_type, #cc_type) - - _ForEachDataType_(RegType); - -#undef RegType - return retv; -} - -framework::proto::VarType::Type ToDataType(std::type_index type) { - auto it = gDataTypeMap().cpp_to_proto_.find(type); - if (it != gDataTypeMap().cpp_to_proto_.end()) { - return it->second; - } - PADDLE_THROW("Not support %s as tensor type", type.name()); -} - -std::type_index ToTypeIndex(framework::proto::VarType::Type type) { - auto it = gDataTypeMap().proto_to_cpp_.find(static_cast(type)); - if (it != gDataTypeMap().proto_to_cpp_.end()) { - return it->second; - } - PADDLE_THROW("Not support framework::proto::VarType::Type(%d) as tensor type", - static_cast(type)); -} - -std::string DataTypeToString(const framework::proto::VarType::Type type) { - auto it = gDataTypeMap().proto_to_str_.find(static_cast(type)); - if (it != gDataTypeMap().proto_to_str_.end()) { - return it->second; - } - PADDLE_THROW("Not support framework::proto::VarType::Type(%d) as tensor type", - static_cast(type)); -} - -size_t SizeOfType(framework::proto::VarType::Type type) { - auto it = gDataTypeMap().proto_to_size_.find(static_cast(type)); - if (it != gDataTypeMap().proto_to_size_.end()) { - return it->second; - } - PADDLE_THROW("Not support %s as tensor type", DataTypeToString(type)); -} - -} // namespace fluid -} // namespace lite -} // namespace paddle diff --git a/lite/fluid/data_type.h b/lite/fluid/data_type.h deleted file mode 100644 index a8b11ec465..0000000000 --- a/lite/fluid/data_type.h +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "lite/core/framework.pb.h" -#include "lite/fluid/float16.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace fluid { - -template -struct DataTypeTrait {}; - -// Stub handle for void -template <> -struct DataTypeTrait { - constexpr static auto DataType = framework::proto::VarType::RAW; -}; - -#define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \ - callback(cpp_type, ::paddle::framework::proto::VarType::proto_type); - -#define _ForEachDataType_(callback) \ - _ForEachDataTypeHelper_(callback, float, FP32); \ - _ForEachDataTypeHelper_(callback, ::paddle::lite::fluid::float16, FP16); \ - _ForEachDataTypeHelper_(callback, double, FP64); \ - _ForEachDataTypeHelper_(callback, int, INT32); \ - _ForEachDataTypeHelper_(callback, int64_t, INT64); \ - _ForEachDataTypeHelper_(callback, bool, BOOL); \ - _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ - _ForEachDataTypeHelper_(callback, int16_t, INT16); \ - _ForEachDataTypeHelper_(callback, int8_t, INT8) - -#define DefineDataTypeTrait(cpp_type, proto_type) \ - template <> \ - struct DataTypeTrait { \ - constexpr static auto DataType = proto_type; \ - } - -_ForEachDataType_(DefineDataTypeTrait); - -#undef DefineDataTypeTrait - -extern framework::proto::VarType::Type ToDataType(std::type_index type); -extern std::type_index ToTypeIndex(framework::proto::VarType::Type type); - -template -inline void VisitDataType(framework::proto::VarType::Type type, - Visitor visitor) { -#define VisitDataTypeCallback(cpp_type, proto_type) \ - do { \ - if (type == proto_type) { \ - visitor.template apply(); \ - return; \ - } \ - } while (0) - - _ForEachDataType_(VisitDataTypeCallback); -#undef VisitDataTypeCallback - PADDLE_THROW("Not supported %d", type); -} - -extern std::string DataTypeToString(const framework::proto::VarType::Type type); -extern size_t SizeOfType(framework::proto::VarType::Type type); -inline std::ostream& operator<<(std::ostream& out, - const framework::proto::VarType::Type& type) { - out << DataTypeToString(type); - return out; -} - -} // namespace fluid -} // namespace lite -} // namespace paddle diff --git a/lite/fluid/data_type_test.cc b/lite/fluid/data_type_test.cc deleted file mode 100644 index 2a380201f2..0000000000 --- a/lite/fluid/data_type_test.cc +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#include "paddle/fluid/framework/data_type.h"
-
-#include <string>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/tensor.h"
-
-TEST(DataType, float16) {
-  using paddle::framework::Tensor;
-  using paddle::platform::CPUPlace;
-  using paddle::platform::float16;
-  namespace f = paddle::framework;
-  f::proto::VarType::Type dtype = f::proto::VarType::FP16;
-
-  Tensor tensor;
-  CPUPlace cpu;
-  tensor.mutable_data(cpu, dtype);
-
-  // test fp16 tensor
-  EXPECT_EQ(tensor.type(), f::ToDataType(typeid(float16)));
-
-  // test fp16 size
-  EXPECT_EQ(f::SizeOfType(dtype), 2u);
-
-  // test debug info
-  std::string type = "::paddle::platform::float16";
-  EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str());
-}
diff --git a/lite/fluid/eigen.h b/lite/fluid/eigen.h
deleted file mode 100644
index f5d5e4b5e5..0000000000
--- a/lite/fluid/eigen.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "lite/core/tensor.h"
-#include "lite/fluid/float16.h"
-#include "lite/utils/paddle_enforce.h"
-#include "unsupported/Eigen/CXX11/Tensor"
-
-namespace paddle {
-namespace lite {
-namespace fluid {
-
-// EigenDim converts paddle::platform::DDim into Eigen::DSizes.
-template <int D>
-struct EigenDim {
-  using Type = Eigen::DSizes<Eigen::DenseIndex, D>;
-
-  static Type From(const lite::DDim& dims) {
-    PADDLE_ENFORCE(dims.size() == D, "D must match DDim::size");
-    Type ret;
-    for (int64_t d = 0; d < dims.size(); d++) {
-      ret[d] = dims[d];
-    }
-    return ret;
-  }
-};
-
-// Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor.
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-struct EigenTensor {
-  // TODO(qijun) Now the default type is unaligned; we will benchmark the
-  // speed of the aligned and unaligned versions in the future.
- using Type = Eigen::TensorMap>; - - using ConstType = - Eigen::TensorMap>; - - static Type From(Tensor& tensor, lite::DDim dims) { // NOLINT - return Type(const_cast(tensor.data()), - EigenDim::From(dims)); // NOLINT - } - - static Type From(Tensor& tensor) { // NOLINT - return From(tensor, tensor.dims()); - } // NOLINT - - static ConstType From(const Tensor& tensor, lite::DDim dims) { - return ConstType(tensor.data(), EigenDim::From(dims)); - } - - static ConstType From(const Tensor& tensor) { - return From(tensor, tensor.dims()); - } -}; - -template -struct EigenMatrix : public EigenTensor { - static typename EigenMatrix::Type Reshape(Tensor& tensor, // NOLINT - int num_col_dims) { - int rank = tensor.dims().size(); - PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, - "`num_col_dims` must be between (0, rank_of_tensor)."); - return EigenMatrix::From(tensor, tensor.dims().Flatten2D(num_col_dims)); - } - - static typename EigenMatrix::ConstType Reshape(const Tensor& tensor, - int num_col_dims) { - int rank = tensor.dims().size(); - PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, - "`num_col_dims` must be between (0, rank_of_tensor)."); - return EigenMatrix::From(tensor, tensor.dims().Flatten2D(num_col_dims)); - } -}; - -template -struct EigenVector : public EigenTensor { - // Flatten reshapes a Tensor into an EigenVector. - static typename EigenVector::Type Flatten(Tensor& tensor) { // NOLINT - return EigenVector::From( - tensor, lite::DDim(std::vector({tensor.dims().production()}))); - } - - static typename EigenVector::ConstType Flatten( - const Tensor& tensor) { // NOLINT - return EigenVector::From( - tensor, lite::DDim(std::vector({tensor.dims().production()}))); - } -}; - -template -struct EigenScalar { - // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. - using Type = Eigen::TensorMap< - Eigen::TensorFixedSize, MajorType, IndexType>>; - using ConstType = Eigen::TensorMap< - Eigen::TensorFixedSize, MajorType, IndexType>>; - - static Type From(Tensor& tensor) { return Type(tensor.data()); } // NOLINT - - static ConstType From(const Tensor& tensor) { - return ConstType(tensor.data()); - } -}; - -template -struct EigenDevice; - -template <> -struct EigenDevice { - using Type = ::Eigen::DefaultDevice; -}; - -template -using EigenDeviceType = typename EigenDevice::Type; - -} // namespace fluid -} // namespace lite -} // namespace paddle diff --git a/lite/fluid/float16.h b/lite/fluid/float16.h deleted file mode 100644 index d1ef6f7dc5..0000000000 --- a/lite/fluid/float16.h +++ /dev/null @@ -1,1100 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include - -#ifdef PADDLE_WITH_CUDA -#include -#endif // PADDLE_WITH_CUDA - -#ifdef __GNUC__ -#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__) -#else -#define PADDLE_GNUC_VER 0 -#endif // __GNUC__ - -#ifdef __clang__ -#define PADDLE_CLANG_VER (__clang_major__ * 10 + __clang_minor__) -#else -#define PADDLE_CLANG_VER 0 -#endif // __clang__ - -#if defined(__CUDACC__) && CUDA_VERSION >= 7050 -#define PADDLE_CUDA_FP16 -#include -#endif - -#if defined(__arm__) || defined(__aarch64__) -#define PADDLE_ARM -#endif - -#if defined(__ARM_NEON) || defined(__ARM_NEON__) -#define PADDLE_NEON -#include -#endif - -#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ - (PADDLE_GNUC_VER >= 62 || PADDLE_CLANG_VER >= 37) -#define PADDLE_WITH_NATIVE_FP16 -#endif - -#ifndef PADDLE_ARM -#include -#endif // PADDLE_ARM - -#if !defined(_WIN32) -#define PADDLE_ALIGN(x) __attribute__((aligned(x))) -#else -#define PADDLE_ALIGN(x) __declspec(align(x)) -#endif - -namespace paddle { -namespace lite { -namespace fluid { - -// Forward declare float16 for eigen.h -struct float16; - -} // namespace fluid -} // namespace lite -} // namespace paddle - -#include "lite/utils/macros.h" -#include "unsupported/Eigen/CXX11/Tensor" - -namespace paddle { -namespace lite { -namespace fluid { - -// Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated -// and aligned at least on a 2-byte boundary, which leads to efficient -// memory access of float16 struct and also makes float16 compatible -// with CUDA half, ARM float16_t, and Eigen::half data types. -struct PADDLE_ALIGN(2) float16 { - public: - uint16_t x; - - // The following defaulted special class member functions - // are added to make float16 pass the std::is_trivial test - float16() = default; - float16(const float16& o) = default; - float16& operator=(const float16& o) = default; - float16(float16&& o) = default; - float16& operator=(float16&& o) = default; - ~float16() = default; - -// Constructors -#ifdef PADDLE_CUDA_FP16 - HOSTDEVICE inline explicit float16(const half& h) { -#if CUDA_VERSION >= 9000 - x = reinterpret_cast<__half_raw*>(const_cast(&h))->x; -#else - x = h.x; -#endif // CUDA_VERSION >= 9000 - } -#endif // PADDLE_CUDA_FP16 - - HOSTDEVICE inline explicit float16(const Eigen::half& h) : x(h.x) {} - -#ifdef PADDLE_WITH_NATIVE_FP16 - // __fp16 is a native half precision data type for arm cpu, - // float16_t is an alias for __fp16 - HOSTDEVICE inline explicit float16(const float16_t& h) { - x = *reinterpret_cast(&h); - } -#endif - - HOSTDEVICE inline explicit float16(float val) { -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - half tmp = __float2half(val); - x = *reinterpret_cast(&tmp); - -#elif defined(PADDLE_WITH_NATIVE_FP16) - float32x4_t tmp = vld1q_dup_f32(&val); - float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0); - x = *reinterpret_cast(&res); - -#elif defined(__F16C__) - x = _cvtss_sh(val, 0); - -#else - // Conversion routine adapted from - // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion - Bits v, s; - v.f = val; - uint32_t sign = v.si & sigN; - v.si ^= sign; - sign >>= shiftSign; // logical shift - s.si = mulN; - s.si = s.f * v.f; // correct subnormals - v.si ^= (s.si ^ v.si) & -(minN > v.si); - v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); - v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); - v.ui >>= shift; // logical shift - v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); - v.si ^= ((v.si - minD) ^ 
 v.si) & -(v.si > subC);
-    x = v.ui | sign;
-
-#endif
-  }
-
-  HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {}
-
-  template <class T>
-  HOSTDEVICE inline explicit float16(const T& val)
-      : x(float16(static_cast<float>(val)).x) {}
-
-// Assignment operators
-#ifdef PADDLE_CUDA_FP16
-  HOSTDEVICE inline float16& operator=(const half& rhs) {
-#if CUDA_VERSION >= 9000
-    x = reinterpret_cast<__half_raw*>(const_cast<half*>(&rhs))->x;
-#else
-    x = rhs.x;
-#endif
-    return *this;
-  }
-#endif
-
-  HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) {
-    x = rhs.x;
-    return *this;
-  }
-
-#ifdef PADDLE_WITH_NATIVE_FP16
-  HOSTDEVICE inline float16& operator=(const float16_t& rhs) {
-    x = *reinterpret_cast<const uint16_t*>(&rhs);
-    return *this;
-  }
-#endif
-
-  HOSTDEVICE inline float16& operator=(bool b) {
-    x = b ? 0x3c00 : 0;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(int8_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(uint8_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(int16_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(uint16_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(int32_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(uint32_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(int64_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(uint64_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(float val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(double val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-// Conversion operators
-#ifdef PADDLE_CUDA_FP16
-  HOSTDEVICE inline explicit operator half() const {
-#if CUDA_VERSION >= 9000
-    __half_raw h;
-    h.x = x;
-    return half(h);
-#else
-    half h;
-    h.x = x;
-    return h;
-#endif  // CUDA_VERSION >= 9000
-  }
-#endif  // PADDLE_CUDA_FP16
-
-  HOSTDEVICE inline explicit operator Eigen::half() const {
-    Eigen::half h;
-    h.x = x;
-    return h;
-  }
-
-#ifdef PADDLE_WITH_NATIVE_FP16
-  HOSTDEVICE inline explicit operator float16_t() const {
-    return *reinterpret_cast<const float16_t*>(this);
-  }
-#endif
-
-  HOSTDEVICE inline explicit operator float() const {
-#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
-    half tmp = *reinterpret_cast<const half*>(this);
-    return __half2float(tmp);
-
-#elif defined(PADDLE_WITH_NATIVE_FP16)
-    float16x4_t res = vld1_dup_f16(reinterpret_cast<const float16_t*>(this));
-    return vgetq_lane_f32(vcvt_f32_f16(res), 0);
-
-#elif defined(__F16C__)
-    return _cvtsh_ss(this->x);
-
-#else
-    // Conversion routine adapted from
-    // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
-    Bits v;
-    v.ui = this->x;
-    int32_t sign = v.si & sigC;
-    v.si ^= sign;
-    sign <<= shiftSign;
-    v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
-    v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
-    Bits s;
-    s.si = mulC;
-    s.f *= v.si;
-    int32_t mask = -(norC > v.si);
-    v.si <<= shift;
-    v.si ^= (s.si ^ v.si) & mask;
-    v.si |= sign;
-    return v.f;
-
-#endif
-  }
-
-  HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; }
-
-  HOSTDEVICE inline explicit operator int8_t() const {
-    return static_cast<int8_t>(static_cast<float>(*this));
-  }
-
-  HOSTDEVICE inline explicit operator uint8_t() const {
-    return static_cast<uint8_t>(static_cast<float>(*this));
-  }
-
-  HOSTDEVICE inline
explicit operator int16_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint16_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator int32_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint32_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator int64_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint64_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator double() const { - return static_cast(static_cast(*this)); - } - - private: - union Bits { - float f; - int32_t si; - uint32_t ui; - }; - - static const int shift = 13; - static const int shiftSign = 16; - - static const int32_t infN = 0x7F800000; - static const int32_t maxN = 0x477FE000; // max flt16 as flt32 - static const int32_t minN = 0x38800000; // min flt16 normal as flt32 - static const int32_t sigN = 0x80000000; // sign bit - - static constexpr int32_t infC = infN >> shift; - static constexpr int32_t nanN = (infC + 1) - << shift; // minimum flt16 nan as float32 - static constexpr int32_t maxC = maxN >> shift; - static constexpr int32_t minC = minN >> shift; - static constexpr int32_t sigC = sigN >> shiftSign; - - static const int32_t mulN = 0x52000000; // (1 << 23) / minN - static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) - static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted - static const int32_t norC = 0x00400; // min flt32 normal downshifted - - static constexpr int32_t maxD = infC - maxC - 1; - static constexpr int32_t minD = minC - subC - 1; -}; - -// Arithmetic operators on GPU -// CUDA 9.0 provides built-in arithmetic operators for half while -// CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are -// for users to write similar CUDA code in CUDA 7.5 and 8.0 as in -// CUDA 9.0 regarding the half data type. 
-#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000 - -DEVICE inline half operator+(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hadd(a, b); -#else - float res = static_cast(float16(a)) + static_cast(float16(b)); - return half(float16(res)); -#endif -} - -DEVICE inline half operator-(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hsub(a, b); -#else - float res = static_cast(float16(a)) - static_cast(float16(b)); - return half(float16(res)); -#endif -} - -DEVICE inline half operator*(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hmul(a, b); -#else - float res = static_cast(float16(a)) * static_cast(float16(b)); - return half(float16(res)); -#endif -} - -DEVICE inline half operator/(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - float num = __half2float(a); - float denom = __half2float(b); - return __float2half(num / denom); -#else - float res = static_cast(float16(a)) / static_cast(float16(b)); - return half(float16(res)); -#endif -} - -DEVICE inline half operator-(const half& a) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hneg(a); -#else - float res = -static_cast(float16(a)); - return half(float16(res)); -#endif -} - -DEVICE inline half& operator+=(half& a, const half& b) { // NOLINT - a = a + b; - return a; -} - -DEVICE inline half& operator-=(half& a, const half& b) { // NOLINT - a = a - b; - return a; -} - -DEVICE inline half& operator*=(half& a, const half& b) { // NOLINT - a = a * b; - return a; -} - -DEVICE inline half& operator/=(half& a, const half& b) { // NOLINT - a = a / b; - return a; -} - -DEVICE inline bool operator==(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __heq(a, b); -#else - return static_cast(float16(a)) == static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator!=(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hne(a, b); -#else - return static_cast(float16(a)) != static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator<(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hlt(a, b); -#else - return static_cast(float16(a)) < static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator<=(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hle(a, b); -#else - return static_cast(float16(a)) <= static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator>(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hgt(a, b); -#else - return static_cast(float16(a)) > static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator>=(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hge(a, b); -#else - return static_cast(float16(a)) >= static_cast(float16(b)); -#endif -} - -#endif // PADDLE_CUDA_FP16 - -// Arithmetic operators for float16 on GPU -#if defined(PADDLE_CUDA_FP16) -HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hadd(half(a), half(b))); -#else - return float16(static_cast(a) + static_cast(b)); -#endif -} - -HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hsub(half(a), 
half(b))); -#else - return float16(static_cast(a) - static_cast(b)); -#endif -} - -HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hmul(half(a), half(b))); -#else - return float16(static_cast(a) * static_cast(b)); -#endif -} - -HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - // TODO(kexinzhao): check which cuda version starts to support __hdiv - float num = __half2float(half(a)); - float denom = __half2float(half(b)); - return float16(num / denom); -#else - return float16(static_cast(a) / static_cast(b)); -#endif -} - -HOSTDEVICE inline float16 operator-(const float16& a) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hneg(half(a))); -#else - float16 res; - res.x = a.x ^ 0x8000; - return res; -#endif -} - -HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { // NOLINT - a = a + b; - return a; -} - -HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { // NOLINT - a = a - b; - return a; -} - -HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { // NOLINT - a = a * b; - return a; -} - -HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT - a = a / b; - return a; -} - -HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __heq(half(a), half(b)); -#else - return static_cast(a) == static_cast(b); -#endif -} - -HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hne(half(a), half(b)); -#else - return static_cast(a) != static_cast(b); -#endif -} - -HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hlt(half(a), half(b)); -#else - return static_cast(a) < static_cast(b); -#endif -} - -HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hle(half(a), half(b)); -#else - return static_cast(a) <= static_cast(b); -#endif -} - -HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hgt(half(a), half(b)); -#else - return static_cast(a) > static_cast(b); -#endif -} - -HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hge(half(a), half(b)); -#else - return static_cast(a) >= static_cast(b); -#endif -} - -// Arithmetic operators for float16 on ARMv8.2-A CPU -#elif defined(PADDLE_WITH_NATIVE_FP16) -inline float16 operator+(const float16& a, const float16& b) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fadd h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0", "v1"); - return res; -} - -inline float16 operator-(const float16& a, const float16& b) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fsub h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0", "v1"); - return res; -} - -inline float16 
operator*(const float16& a, const float16& b) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fmul h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0", "v1"); - return res; -} - -inline float16 operator/(const float16& a, const float16& b) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fdiv h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0", "v1"); - return res; -} - -inline float16 operator-(const float16& a) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "fneg h0, h0\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [res_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0"); - return res; -} - -inline float16& operator+=(float16& a, const float16& b) { // NOLINT - a = a + b; - return a; -} - -inline float16& operator-=(float16& a, const float16& b) { // NOLINT - a = a - b; - return a; -} - -inline float16& operator*=(float16& a, const float16& b) { // NOLINT - a = a * b; - return a; -} - -inline float16& operator/=(float16& a, const float16& b) { // NOLINT - a = a / b; - return a; -} - -inline bool operator==(const float16& a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fcmeq h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -inline bool operator!=(const float16& a, const float16& b) { return !(a == b); } - -inline bool operator<(const float16& a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v1.h}[0], [%[a_ptr]]\n" - "ld1 {v0.h}[0], [%[b_ptr]]\n" - "fcmgt h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -inline bool operator<=(const float16& a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v1.h}[0], [%[a_ptr]]\n" - "ld1 {v0.h}[0], [%[b_ptr]]\n" - "fcmge h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -inline bool operator>(const float16& a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fcmgt h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -inline bool operator>=(const float16& a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fcmge h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -// Arithmetic operators for float16, software emulated on other CPU -#else -inline float16 
operator+(const float16& a, const float16& b) { - return float16(static_cast(a) + static_cast(b)); -} - -inline float16 operator-(const float16& a, const float16& b) { - return float16(static_cast(a) - static_cast(b)); -} - -inline float16 operator*(const float16& a, const float16& b) { - return float16(static_cast(a) * static_cast(b)); -} - -inline float16 operator/(const float16& a, const float16& b) { - return float16(static_cast(a) / static_cast(b)); -} - -inline float16 operator-(const float16& a) { - float16 res; - res.x = a.x ^ 0x8000; - return res; -} - -inline float16& operator+=(float16& a, const float16& b) { // NOLINT - a = float16(static_cast(a) + static_cast(b)); - return a; -} - -inline float16& operator-=(float16& a, const float16& b) { // NOLINT - a = float16(static_cast(a) - static_cast(b)); - return a; -} - -inline float16& operator*=(float16& a, const float16& b) { // NOLINT - a = float16(static_cast(a) * static_cast(b)); - return a; -} - -inline float16& operator/=(float16& a, const float16& b) { // NOLINT - a = float16(static_cast(a) / static_cast(b)); - return a; -} - -inline bool operator==(const float16& a, const float16& b) { - return static_cast(a) == static_cast(b); -} - -inline bool operator!=(const float16& a, const float16& b) { - return static_cast(a) != static_cast(b); -} - -inline bool operator<(const float16& a, const float16& b) { - return static_cast(a) < static_cast(b); -} - -inline bool operator<=(const float16& a, const float16& b) { - return static_cast(a) <= static_cast(b); -} - -inline bool operator>(const float16& a, const float16& b) { - return static_cast(a) > static_cast(b); -} - -inline bool operator>=(const float16& a, const float16& b) { - return static_cast(a) >= static_cast(b); -} -#endif - -HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) { - float16 res; - res.x = a; - return res; -} - -HOSTDEVICE inline bool(isnan)(const float16& a) { -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hisnan(half(a)); -#else - return (a.x & 0x7fff) > 0x7c00; -#endif -} - -HOSTDEVICE inline bool(isinf)(const float16& a) { - return (a.x & 0x7fff) == 0x7c00; -} - -HOSTDEVICE inline bool(isfinite)(const float16& a) { - return !((isnan)(a)) && !((isinf)(a)); -} - -inline std::ostream& operator<<(std::ostream& os, const float16& a) { - os << static_cast(a); - return os; -} - -} // namespace fluid -} // namespace lite -} // namespace paddle - -namespace std { - -// Override the std::is_pod::value for float16 -// The reason is that different compilers implemented std::is_pod based on -// different C++ standards. float16 class is a plain old data in C++11 given -// that it is both trivial and standard_layout. -// However, std::is_pod in nvcc 8.0 host c++ compiler follows C++0x and is -// more restricted in that you cannot provide any customized -// constructor in float16. Hence, we override is_pod here following C++11 -// so that .cu files can be successfully compiled by nvcc. 
-template <> -struct is_pod { - static const bool value = - is_trivial::value && - is_standard_layout::value; -}; - -template <> -struct is_floating_point - : std::integral_constant< - bool, - std::is_same::type>::value> {}; -template <> -struct is_signed { - static const bool value = true; -}; - -template <> -struct is_unsigned { - static const bool value = false; -}; - -inline bool isnan(const paddle::lite::fluid::float16& a) { - return paddle::lite::fluid::isnan(a); -} - -inline bool isinf(const paddle::lite::fluid::float16& a) { - return paddle::lite::fluid::isinf(a); -} - -template <> -struct numeric_limits { - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = true; - static const bool has_quiet_NaN = true; - static const bool has_signaling_NaN = true; - static const float_denorm_style has_denorm = denorm_present; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = std::round_to_nearest; - static const bool is_iec559 = false; - static const bool is_bounded = false; - static const bool is_modulo = false; - static const int digits = 11; - static const int digits10 = 3; - static const int max_digits10 = 5; - static const int radix = 2; - static const int min_exponent = -13; - static const int min_exponent10 = -4; - static const int max_exponent = 16; - static const int max_exponent10 = 4; - static const bool traps = true; - static const bool tinyness_before = false; - - static paddle::lite::fluid::float16(min)() { - return paddle::lite::fluid::raw_uint16_to_float16(0x400); - } - static paddle::lite::fluid::float16 lowest() { - return paddle::lite::fluid::raw_uint16_to_float16(0xfbff); - } - static paddle::lite::fluid::float16(max)() { - return paddle::lite::fluid::raw_uint16_to_float16(0x7bff); - } - static paddle::lite::fluid::float16 epsilon() { - return paddle::lite::fluid::raw_uint16_to_float16(0x0800); - } - static paddle::lite::fluid::float16 round_error() { - return paddle::lite::fluid::float16(0.5); - } - static paddle::lite::fluid::float16 infinity() { - return paddle::lite::fluid::raw_uint16_to_float16(0x7c00); - } - static paddle::lite::fluid::float16 quiet_NaN() { - return paddle::lite::fluid::raw_uint16_to_float16(0x7e00); - } - static paddle::lite::fluid::float16 signaling_NaN() { - return paddle::lite::fluid::raw_uint16_to_float16(0x7e00); - } - static paddle::lite::fluid::float16 denorm_min() { - return paddle::lite::fluid::raw_uint16_to_float16(0x1); - } -}; - -} // namespace std - -namespace Eigen { - -using float16 = paddle::lite::fluid::float16; - -template <> -struct NumTraits : GenericNumTraits { - enum { - IsSigned = true, - IsInteger = false, - IsComplex = false, - RequireInitialization = false - }; - - HOSTDEVICE static inline float16 epsilon() { - return paddle::lite::fluid::raw_uint16_to_float16(0x0800); - } - HOSTDEVICE static inline float16 dummy_precision() { return float16(1e-2f); } - HOSTDEVICE static inline float16 highest() { - return paddle::lite::fluid::raw_uint16_to_float16(0x7bff); - } - HOSTDEVICE static inline float16 lowest() { - return paddle::lite::fluid::raw_uint16_to_float16(0xfbff); - } - HOSTDEVICE static inline float16 infinity() { - return paddle::lite::fluid::raw_uint16_to_float16(0x7c00); - } - HOSTDEVICE static inline float16 quiet_NaN() { - return paddle::lite::fluid::raw_uint16_to_float16(0x7c01); - } -}; - -namespace numext { - -template <> 
-HOSTDEVICE inline bool(isnan)(const float16& a) { - return (paddle::lite::fluid::isnan)(a); -} - -template <> -HOSTDEVICE inline bool(isinf)(const float16& a) { - return (paddle::lite::fluid::isinf)(a); -} - -template <> -HOSTDEVICE inline bool(isfinite)(const float16& a) { - return (paddle::lite::fluid::isfinite)(a); -} - -template <> -HOSTDEVICE inline float16 exp(const float16& a) { - return float16(::expf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 erf(const float16& a) { - return float16(::erff(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 log(const float16& a) { - return float16(::logf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 tanh(const float16& a) { - return float16(::tanhf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 sqrt(const float16& a) { - return float16(::sqrtf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 ceil(const float16& a) { - return float16(::ceilf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 floor(const float16& a) { - return float16(::floorf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 round(const float16& a) { - return float16(::roundf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 pow(const float16& a, const float16& b) { - return float16(::powf(static_cast(a), static_cast(b))); -} - -template <> -HOSTDEVICE inline float16 abs(const float16& a) { - return float16(::fabs(static_cast(a))); -} - -} // namespace numext - -} // namespace Eigen diff --git a/lite/fluid/lod.h b/lite/fluid/lod.h deleted file mode 100644 index 68068ba1d0..0000000000 --- a/lite/fluid/lod.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -namespace paddle { -namespace lite { -namespace fluid { -using LoD = std::vector>; - -LoD ToAbsOffset(const LoD &in) { - // the lowest level stores relative offsets - if (in.empty() || in.size() == 1) return in; - LoD result = in; - for (auto level = static_cast(in.size() - 2); level >= 0; level--) { - for (size_t i = 0; i < in[level].size(); ++i) { - size_t index = in[level][i]; - result[level][i] = result[level + 1][index]; - } - } - return result; -} -} // namespace fluid -} // namespace lite -} // namespace paddle diff --git a/lite/fluid/math.h b/lite/fluid/math.h deleted file mode 100644 index 8cc24200d3..0000000000 --- a/lite/fluid/math.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/hostdevice.h" - -#include "math.h" // NOLINT - -namespace paddle { -namespace operators { - -inline HOSTDEVICE platform::float16 real_exp(platform::float16 x) { - return static_cast(::expf(static_cast(x))); -} - -inline HOSTDEVICE float real_exp(float x) { return ::expf(x); } - -inline HOSTDEVICE double real_exp(double x) { return ::exp(x); } - -inline HOSTDEVICE platform::float16 real_log(platform::float16 x) { - return static_cast(::logf(static_cast(x))); -} - -inline HOSTDEVICE float real_log(float x) { return ::logf(x); } - -inline HOSTDEVICE double real_log(double x) { return ::log(x); } - -} // namespace operators -} // namespace paddle diff --git a/lite/gen_code/CMakeLists.txt b/lite/gen_code/CMakeLists.txt deleted file mode 100644 index d83657ad3e..0000000000 --- a/lite/gen_code/CMakeLists.txt +++ /dev/null @@ -1,49 +0,0 @@ -if (LITE_ON_TYNY_PUBLISH) - return() -endif() - -lite_cc_library(gen_code SRCS gen_code.cc - DEPS program op scope - cpp_op_desc - HVY_DEPS operator) -lite_cc_library(paddle_infer_gencode SRCS paddle_infer.cc DEPS program utils) - -lite_cc_test(test_gen_code SRCS gen_code_test.cc - DEPS gen_code tensor ${host_kernels} ${ops} - compatible_pb - model_parser - X86_DEPS ${x86_kernels} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} - EXCLUDE_COMPILE_DEPS "ON" - ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) - -lite_cc_library(__generated_code__ - SRCS ${CMAKE_BINARY_DIR}/lite/gen_code/__generated_code__.cc - DEPS scope op kernel paddle_infer_gencode - EXCLUDE_COMPILE_DEPS "ON" -) -if(WITH_TESTING) - add_dependencies(__generated_code__ test_gen_code) - add_dependencies(__generated_code__ extern_lite_download_lite_naive_model_tar_gz) -endif(WITH_TESTING) - -lite_cc_binary(paddle_code_generator SRCS paddle_code_generator.cc DEPS model_parser gen_code gflags) - -# TODO(xxx): fix the gen code bug on ios -if(IOS) - return() -endif() - -lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_code__ - ${ops} ${host_kernels} - X86_DEPS ${x86_kernels} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} - EXCLUDE_COMPILE_DEPS "ON" -) - diff --git a/lite/gen_code/gen_code.cc b/lite/gen_code/gen_code.cc deleted file mode 100644 index 0d8f4d0d19..0000000000 --- a/lite/gen_code/gen_code.cc +++ /dev/null @@ -1,223 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/gen_code/gen_code.h" -#include -#include -#include - -namespace paddle { -namespace lite { -namespace gencode { - -void Module::AddWeight(const std::string &name, const TensorRepr &tensor) { - auto w_name = WeightUniqueName(); - Line(string_format("// Create weight: %s", name.c_str())); - // auto* w0 = scope.Var("w0")->GetMutable(); - Line(string_format("auto* %s = scope->Var(%s)->GetMutable();", - w_name.c_str(), - Repr(name).c_str())); - // lite::DDim w_ddim({1, 2}) - Line(string_format("lite::DDim %s_ddim(std::vector(%s));", - w_name.c_str(), - tensor.ddim.repr().c_str())); - // std::vector w_data({}); - auto w_data_repr = DataRepr( - std::string(static_cast(tensor.raw_data), tensor.num_bytes), - tensor.dtype); - Line(string_format("std::vector<%s> %s_data({%s});", - PrecisionToStr(tensor.dtype).c_str(), - w_name.c_str(), - w_data_repr.c_str())); - // w0->Assign(w0_data.data(), w0_ddim); - Line(string_format( - "%s->Assign<%s, lite::DDim, TARGET(kX86)>(%s_data.data(), %s_ddim);", - w_name.c_str(), - PrecisionToStr(tensor.dtype).c_str(), - w_name.c_str(), - w_name.c_str())); - Line(""); -} - -void Module::AddHeaderIncludeGenCode() { - Line(""); - Line("#include "); - Line("#include "); - Line("#include \"lite/core/tensor.h\""); - Line("#include \"lite/core/context.h\""); - Line("#include \"lite/gen_code/paddle_infer.h\""); - Line("#include \"lite/core/op_registry.h\""); - Line("#include \"lite/core/scope.h\""); - Line("#include \"lite/model_parser/cpp/op_desc.h\""); - Line(""); - Line(""); -} - -std::string Module::DataRepr(const std::string &raw_data, PrecisionType dtype) { - STL::stringstream ss; - switch (dtype) { - case PRECISION(kFloat): { - const float *raw = reinterpret_cast(raw_data.c_str()); - int num_elems = raw_data.size() / sizeof(float); - if (num_elems) { - for (int i = 0; i < num_elems - 1; i++) { - ss << raw[i] << ","; - } - ss << raw[num_elems - 1]; - } - } break; - - default: - LOG(FATAL) << "Unsupported type " << PrecisionToStr(dtype); - } - return ss.str(); -} - -void Module::AddOpDescHelper(const std::string &op_id, - const cpp::OpDesc &desc) { - std::string desc_var = op_id + "_desc"; - Line(string_format("lite::cpp::OpDesc %s;", desc_var.c_str())); - auto vec_str_repr = [](const std::vector &vec) { - return Repr(vec); - }; - for (auto &item : desc.inputs()) { - Line(string_format("%s.SetInput(%s, %s);", - desc_var.c_str(), - Repr(item.first).c_str(), - vec_str_repr(item.second).c_str())); - } - - for (auto &item : desc.outputs()) { - Line(string_format("%s.SetOutput(%s, %s);", - desc_var.c_str(), - Repr(item.first).c_str(), - vec_str_repr(item.second).c_str())); - } - - auto attr_repr = [&](const std::string &name) -> std::string { - using AttrType = OpDescAPI::AttrType; - auto type = desc.GetAttrType(name); - - switch (type) { - case AttrType::INT: - return std::to_string(desc.GetAttr(name)); - case AttrType::FLOAT: - return std::to_string(desc.GetAttr(name)); - case AttrType::BOOLEAN: - return std::to_string(desc.GetAttr(name)); - case AttrType::STRING: - return "\"" + desc.GetAttr(name) + "\""; - case AttrType::FLOATS: { - auto vals = desc.GetAttr>(name); - return "{" + Join(vals, ",") + "}"; - } - case AttrType::INTS: { - auto vals = desc.GetAttr>(name); - return "{" + Join(vals, ",") + "}"; - } - - case AttrType::STRINGS: { - std::vector tmp; - auto vals = desc.GetAttr>(name); - std::transform(vals.begin(), - vals.end(), - 
std::back_inserter(tmp), - [](const std::string &x) { return Repr(x); }); - return "{" + Join(tmp, ",") + "}"; - } - default: - LOG(FATAL) << "Unsupported attribute type: " << static_cast(type); - } - return ""; - }; - - auto attr_type_repr = [&](const std::string &name) -> std::string { - using AttrType = OpDescAPI::AttrType; - auto type = desc.GetAttrType(name); - - switch (type) { - case AttrType::INT: - return "int"; - case AttrType::FLOAT: - return "float"; - case AttrType::BOOLEAN: - return "bool"; - case AttrType::STRING: - return "std::string"; - case AttrType::FLOATS: - return "std::vector"; - case AttrType::STRINGS: - return "std::vector"; - case AttrType::INTS: - return "std::vector"; - default: - LOG(FATAL) << "Unsupported attribute type: " << static_cast(type); - } - - return "unk_t"; - }; - for (auto &item : desc.AttrNames()) { - // Drop the python information. - if (item == "op_callstack") continue; - auto attr_type = attr_type_repr(item); - auto attr_val = attr_repr(item); - Line(string_format("%s.SetAttr<%s>(%s, %s);", // - desc_var.c_str(), - attr_type.c_str(), - Repr(item).c_str(), - attr_val.c_str())); - } -} - -void Module::AddOp(const cpp::OpDesc &op) { - auto op_name = OpUniqueName(); - AddOpDescHelper(op_name, op); - - LOG(INFO) << "add op " << op_name; - - Line(string_format("// Create Op: %s", op.Type().c_str())); - - Line(string_format("auto %s = lite::LiteOpRegistry::Global().Create(\"%s\");", - op_name.c_str(), - op.Type().c_str())); - - CHECK(op.HasAttr(kKernelTypeAttr)) - << "the kernel type should be specified before generate code."; - auto kernel_type = op.GetAttr(kKernelTypeAttr); - Line(string_format("%s->Attach(%s, exec_scope);", - op_name.c_str(), - (op_name + "_desc").c_str())); - - // Create kernel - auto kernel_name = KernelUniqueName(); - Line(string_format( - "auto %s = std::move(%s->CreateKernels(valid_places, \"%s\").front());", - kernel_name.c_str(), - op_name.c_str(), - kernel_type.c_str())); - - // Set Context for kernel - // clang-format off - Line(string_format("%s->SetContext(lite::ContextScheduler::Global().NewContext(%s->target()));", kernel_name.c_str(), kernel_name.c_str())); // NOLINT - // clang-format on - - Line(string_format("ops.push_back(%s);", op_name.c_str())); - Line(string_format("kernels.push_back(std::move(%s));", kernel_name.c_str())); - - op_kinds_.insert(op.Type()); - kernel_kinds_.insert(kernel_type); -} -} // namespace gencode -} // namespace lite -} // namespace paddle diff --git a/lite/gen_code/gen_code.h b/lite/gen_code/gen_code.h deleted file mode 100644 index 7dea36636a..0000000000 --- a/lite/gen_code/gen_code.h +++ /dev/null @@ -1,258 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include -#include -#include -#include "lite/core/framework.pb.h" -#include "lite/core/program.h" -#include "lite/core/target_wrapper.h" -#include "lite/core/tensor.h" -#include "lite/model_parser/compatible_pb.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/desc_apis.h" -#include "lite/model_parser/pb/op_desc.h" -#include "lite/utils/all.h" - -namespace paddle { -namespace lite { -namespace gencode { - -struct TensorRepr { - TensorRepr() = default; - TensorRepr(PrecisionType dtype, - const std::vector &ddim, - void *raw_data, - size_t num_bytes) - : dtype(dtype), ddim(ddim), raw_data(raw_data), num_bytes(num_bytes) {} - - PrecisionType dtype; - lite::DDim ddim; - const void *raw_data; - size_t num_bytes{}; -}; - -class Module { - std::vector ops; - std::vector weights; - std::vector tmp_vars_; - STL::stringstream stream_; - std::set kernel_kinds_; - std::set op_kinds_; - - int line_indent_{}; - const int indent_unit_{2}; - - public: - void NewOp(const cpp::OpDesc &desc) { ops.push_back(desc); } - void NewWeight(const TensorRepr &x) { weights.push_back(x); } - void NewTmpVar(const std::string &x) { tmp_vars_.push_back(x); } - - STL::stringstream &stream() { return stream_; } - - void AddHeaderIncludeGenCode(); - - void AddNamespaceBegin() { - Line("namespace paddle {"); - Line("namespace gencode{"); - Line(""); - } - - void AddNamespaceEnd() { - Line(""); - Line("} // namespace gencode"); - Line("} // namespace paddle"); - } - - void AddInitFuncBegin() { - Line("void PaddlePredictor::Init() {"); - Line(""); - IncIndent(); - } - - void AddInitFuncEnd() { - DecIndent(); - Line(""); - Line("}"); - } - - void AddScopeDecl() { - Line("lite::Scope* scope = static_cast(raw_scope_);"); - - // clang-format off - Line("lite::Scope* exec_scope = static_cast(raw_exe_scope_);"); // NOLINT - // clang-format on - - // Create feed and fetch in exec_scope. 
- Line(string_format("exec_scope->Var(%s);", Repr("feed").c_str())); - Line(string_format("exec_scope->Var(%s);", Repr("fetch").c_str())); - } - - void AddValidPlaceDecl() { - // clang-format off - Line("std::vector valid_places({lite::Place({TARGET(kX86), PRECISION(kFloat), DATALAYOUT(kNCHW)}), lite::Place({TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)})});"); // NOLINT - // clang-format on - } - - void AddMemberCast() { - Line("// Cast the raw members"); - // clang-format off - Line(string_format("auto& ops = *static_cast>*>(raw_ops_);")); // NOLINT - Line(string_format("auto& kernels = *static_cast>*>(raw_kernels_);")); // NOLINT - // clang-format on - Line(""); - } - - void AddWeight(const std::string &name, const TensorRepr &tensor); - - void AddTmpVar(const std::string &x) { - Line(string_format("// Create temporary variable: %s", x.c_str())); - Line(string_format("exec_scope->Var(%s);", Repr(x).c_str())); - Line(""); - } - - void AddOp(const cpp::OpDesc &op); - - void AddOpDescHelper(const std::string &op_id, const cpp::OpDesc &desc); - - void AddOpCompileDeps() { - Line(""); - Line("// Add Operator compile deps"); - for (auto &op_type : op_kinds_) { - Line(string_format("USE_LITE_OP(%s)", op_type.c_str())); - } - Line(""); - } - void AddKernelCompileDeps() { - Line("// Add Kernel compile deps"); - - std::string op_type, alias; - Place place; - for (auto &kernel_type : kernel_kinds_) { - KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place); - Line(string_format("USE_LITE_KERNEL(%s, %s, %s, %s, %s)", // - op_type.c_str(), // - TargetRepr(place.target).c_str(), - PrecisionRepr(place.precision).c_str(), - DataLayoutRepr(place.layout).c_str(), - alias.c_str())); - } - } - - private: - std::string WeightUniqueName() const { - return "w_" + std::to_string(weight_counter_++); - } - std::string TmpVarUniqueName() const { - return "tmp_" + std::to_string(tmp_var_counter_++); - } - std::string OpUniqueName() const { - return "op_" + std::to_string(op_counter_++); - } - std::string KernelUniqueName() const { - return "kernel_" + std::to_string(kernel_counter_++); - } - - std::string DataRepr(const std::string &raw_data, PrecisionType dtype); - - void IncIndent() { line_indent_++; } - void DecIndent() { line_indent_--; } - - void Line(const std::string &x) { - std::string indent_str(line_indent_ * indent_unit_, ' '); - stream() << indent_str << x << "\n"; - } - - private: - mutable int weight_counter_{}; - mutable int tmp_var_counter_{}; - mutable int op_counter_{}; - mutable int kernel_counter_{}; -}; - -class ProgramCodeGenerator { - public: - ProgramCodeGenerator(const framework::proto::ProgramDesc &program, - const lite::Scope &exec_scope) - : program_(program), exec_scope_(exec_scope) {} - - std::string GenCode() { - Module m; - m.AddHeaderIncludeGenCode(); - m.AddNamespaceBegin(); - m.AddInitFuncBegin(); - m.AddMemberCast(); - m.AddScopeDecl(); - m.AddValidPlaceDecl(); - - AddWeights(&m); - AddTmpVars(&m); - AddOps(&m); - - m.AddInitFuncEnd(); - m.AddNamespaceEnd(); - - m.AddOpCompileDeps(); - m.AddKernelCompileDeps(); - - return m.stream().str(); - } - - void AddWeights(Module *m) { - for (auto &var : program_.blocks(0).vars()) { - if (var.persistable()) { - auto name = var.name(); - if (name == "feed" || name == "fetch") continue; - const auto &tensor = exec_scope_.FindVar(name)->Get(); - TensorRepr repr; - TensorToRepr(tensor, &repr); - m->AddWeight(name, repr); - } - } - } - void AddTmpVars(Module *m) { - for (auto &var : program_.blocks(0).vars()) { - if 
(!var.persistable()) { - m->AddTmpVar(var.name()); - } - } - } - void AddOps(Module *m) { - for (auto &pb_op : program_.blocks(0).ops()) { - auto op = pb_op; - lite::pb::OpDesc pb_desc(&op); - lite::cpp::OpDesc cpp_desc; - TransformOpDescAnyToCpp(pb_desc, &cpp_desc); - m->AddOp(cpp_desc); - } - } - - private: - void TensorToRepr(const lite::Tensor &tensor, TensorRepr *repr) { - repr->ddim = tensor.dims(); - // TODO(Superjomn) support other types. - repr->dtype = PRECISION(kFloat); - repr->raw_data = tensor.data(); - repr->num_bytes = repr->ddim.production() * sizeof(float); - } - - private: - const framework::proto::ProgramDesc &program_; - const lite::Scope &exec_scope_; -}; - -} // namespace gencode -} // namespace lite -} // namespace paddle diff --git a/lite/gen_code/gen_code_test.cc b/lite/gen_code/gen_code_test.cc deleted file mode 100644 index d0b1c1f8b2..0000000000 --- a/lite/gen_code/gen_code_test.cc +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/gen_code/gen_code.h" -#include -#include -#include -#include -#include -#include -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/core/context.h" -#include "lite/core/scope.h" -#include "lite/core/tensor.h" -#include "lite/model_parser/compatible_pb.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/model_parser.h" -#include "lite/model_parser/pb/program_desc.h" - -DEFINE_string(optimized_model, "", ""); -DEFINE_string(generated_code_file, "__generated_code__.cc", ""); - -namespace paddle { -namespace lite { -namespace gencode { - -// Manually construct a program. -TEST(gen_code, manual) { - // For holding the weights. - lite::Scope scope; - // For holding the temporary variables. - auto &tmp_scope = scope.NewScope(); - - // Create weight variables. - auto *w0 = scope.Var("w0")->GetMutable(); - // Create temporary variables. - auto *a = tmp_scope.Var("x")->GetMutable(); - tmp_scope.Var("out")->GetMutable(); - - // Set weights. 
- std::vector w0_data({0, 1, 2, 3}); - std::vector a_data({0, 1, 2, 3}); -#ifdef LITE_WITH_ARM - w0->Assign( - w0_data.data(), lite::DDim{std::vector({2, 2})}); - a->Assign( - a_data.data(), lite::DDim{std::vector({2, 2})}); -#else - w0->Assign( - w0_data.data(), lite::DDim{std::vector({2, 2})}); - a->Assign( - a_data.data(), lite::DDim{std::vector({2, 2})}); -#endif - - std::vector valid_places({ -#ifdef LITE_WITH_ARM - Place{TARGET(kARM), PRECISION(kFloat)}, -#else - Place{TARGET(kX86), PRECISION(kFloat)}, -#endif - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kHost), PRECISION(kAny)}, - }); - auto mul_op = LiteOpRegistry::Global().Create("mul"); - cpp::OpDesc mul_op_desc; - mul_op_desc.SetType("mul"); - mul_op_desc.SetInput("X", {"x"}); - mul_op_desc.SetInput("Y", {"w0"}); - mul_op_desc.SetAttr("x_num_col_dims", 1); - mul_op_desc.SetAttr("y_num_col_dims", 1); - mul_op_desc.SetOutput("Out", {"out"}); - - mul_op->Attach(mul_op_desc, &tmp_scope); - auto mul_kernel = std::move(mul_op->CreateKernels(valid_places).front()); -#ifdef LITE_WITH_ARM - auto fc_ctx = ContextScheduler::Global().NewContext(TARGET(kARM)); -#else - auto fc_ctx = ContextScheduler::Global().NewContext(TARGET(kX86)); -#endif - mul_op->CheckShape(); - mul_op->InferShape(); - mul_kernel->SetContext(std::move(fc_ctx)); - mul_kernel->Launch(); -} - -TEST(gen_code, auto_gen) { - std::vector w0_data({0, 1, 2, 3}); - TensorRepr w0(PRECISION(kFloat), - std::vector({2, 2}), - w0_data.data(), - w0_data.size() * sizeof(float)); - - std::vector w1_data({0.01, 1.2, 2.3, 3.4, 1.1, 2.2}); - TensorRepr w1(PRECISION(kFloat), - std::vector({3, 2}), - w1_data.data(), - w1_data.size() * sizeof(float)); - - cpp::OpDesc op0; - op0.SetType("mul"); - op0.SetInput("X", {"a", "b"}); - op0.SetOutput("Out", {"out0"}); - op0.SetAttr("desc", "this is a desc"); - op0.SetAttr("x_col", 1); - op0.SetAttr("y_col", 2); -#ifdef LITE_WITH_ARM - op0.SetAttr(kKernelTypeAttr, "arm"); -#else - op0.SetAttr(kKernelTypeAttr, "x86"); -#endif - - gencode::Module module; - module.AddHeaderIncludeGenCode(); - - module.AddNamespaceBegin(); - module.AddInitFuncBegin(); - - module.AddMemberCast(); - - module.AddWeight("w0", w0); - module.AddWeight("w1", w1); - module.AddTmpVar("a"); - module.AddTmpVar("b"); - - module.AddOp(op0); - - module.AddInitFuncEnd(); - module.AddNamespaceEnd(); - - LOG(INFO) << module.stream().str(); -} - -TEST(gen_code, optimized_program) { - lite::Scope scope; - cpp::ProgramDesc cpp_desc; - std::string model_file = FLAGS_optimized_model + "/model"; - std::string param_file = FLAGS_optimized_model + "/params"; - LoadModelPb( - FLAGS_optimized_model, model_file, param_file, &scope, &cpp_desc, true); - - framework::proto::ProgramDesc pb_proto_desc; - lite::pb::ProgramDesc pb_desc(&pb_proto_desc); - TransformProgramDescCppToAny(cpp_desc, &pb_desc); - - ProgramCodeGenerator codegen(pb_proto_desc, scope); - - std::ofstream file(FLAGS_generated_code_file); - - file << codegen.GenCode(); - - file.close(); -} - -} // namespace gencode -} // namespace lite -} // namespace paddle diff --git a/lite/gen_code/generated_code_test.cc b/lite/gen_code/generated_code_test.cc deleted file mode 100644 index 199ba579d4..0000000000 --- a/lite/gen_code/generated_code_test.cc +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "lite/gen_code/paddle_infer.h" -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { - -TEST(PaddlePredictor, Init) { - gencode::PaddlePredictor predictor; - predictor.Init(); -} - -#ifdef LITE_WITH_X86 -TEST(PaddlePredictor, RunX86) { - gencode::PaddlePredictor predictor; - predictor.Init(); - - LOG(INFO) << "run the generated code"; - auto input_tensor = predictor.GetInput(0); - input_tensor->Resize(std::vector({100, 100})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; - } - - predictor.Run(); - - auto output_tensor = predictor.GetOutput(0); - LOG(INFO) << "output: " << output_tensor->data()[0]; -} -#endif - -#ifdef LITE_WITH_ARM -TEST(PaddlePredictor, RunARM) { - gencode::PaddlePredictor predictor; - predictor.Init(); - - LOG(INFO) << "run the generated code"; - auto input_tensor = predictor.GetInput(0); - input_tensor->Resize(std::vector({1, 100})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100; i++) { - data[i] = 1; - } - - predictor.Run(); - - std::vector result({0.4350058, - -0.6048313, - -0.29346266, - 0.40377066, - -0.13400325, - 0.37114543, - -0.3407839, - 0.14574292, - 0.4104212, - 0.8938774}); - - auto output_tensor = predictor.GetOutput(0); - auto output_shape = output_tensor->shape(); - ASSERT_EQ(output_shape.size(), 2); - ASSERT_EQ(output_shape[0], 1); - ASSERT_EQ(output_shape[1], 500); - - int step = 50; - for (int i = 0; i < result.size(); i += step) { - EXPECT_NEAR(output_tensor->data()[i], result[i], 1e-6); - } -} -#endif - -} // namespace lite -} // namespace paddle diff --git a/lite/gen_code/paddle_code_generator.cc b/lite/gen_code/paddle_code_generator.cc deleted file mode 100644 index 578c869382..0000000000 --- a/lite/gen_code/paddle_code_generator.cc +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "lite/gen_code/gen_code.h" -#include "lite/model_parser/model_parser.h" -#include "lite/model_parser/pb/program_desc.h" - -DEFINE_string(optimized_model, "", ""); -DEFINE_string(generated_code_file, "__generated_code__.cc", ""); - -namespace paddle { -namespace lite { -namespace gencode { - -void GenCode(const std::string& model_dir, const std::string& out_file) { - lite::Scope scope; - cpp::ProgramDesc cpp_desc; - std::string model_file = model_dir + "/model"; - std::string param_file = model_dir + "/params"; - LoadModelPb(model_dir, model_file, param_file, &scope, &cpp_desc, true); - - framework::proto::ProgramDesc pb_proto_desc; - lite::pb::ProgramDesc pb_desc(&pb_proto_desc); - TransformProgramDescCppToAny(cpp_desc, &pb_desc); - - ProgramCodeGenerator codegen(pb_proto_desc, scope); - - std::ofstream file(out_file); - - file << codegen.GenCode(); - - file.close(); -} - -} // namespace gencode -} // namespace lite -} // namespace paddle - -int main(int argc, char** argv) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - paddle::lite::gencode::GenCode(FLAGS_optimized_model, - FLAGS_generated_code_file); - return 0; -} diff --git a/lite/gen_code/paddle_infer.cc b/lite/gen_code/paddle_infer.cc deleted file mode 100644 index 180e75e1a6..0000000000 --- a/lite/gen_code/paddle_infer.cc +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/gen_code/paddle_infer.h" -#include "lite/core/op_lite.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace gencode { - -void Tensor::Resize(const Tensor::ddim_t &shape) { - CHECK(raw_mutable_tensor_); - auto *tensor = static_cast(raw_mutable_tensor_); - tensor->Resize(shape); -} - -std::vector Tensor::shape() const { - CHECK(raw_tensor_); - auto *tensor = static_cast(raw_tensor_); - return tensor->dims().Vectorize(); -} - -#define FOR_EACH_TYPE(HANDLE) \ - HANDLE(int); \ - HANDLE(float); \ - HANDLE(int8_t); \ - HANDLE(int64_t); - -#define IMPL_DATA(T) \ - template <> \ - const T *Tensor::data() const { \ - CHECK(raw_tensor_); \ - const auto *tensor = static_cast(raw_tensor_); \ - return tensor->data(); \ - } -FOR_EACH_TYPE(IMPL_DATA); -#undef IMPL_DATA - -#define IMPL_MUTABLE_DATA(T) \ - template <> \ - T *Tensor::mutable_data() { \ - CHECK(raw_mutable_tensor_); \ - auto *tensor = static_cast(raw_mutable_tensor_); \ - return tensor->mutable_data(); \ - } -FOR_EACH_TYPE(IMPL_MUTABLE_DATA); -#undef IMPL_MUTABLE_DATA - -PaddlePredictor::PaddlePredictor() { - raw_ops_ = new std::vector>; - raw_kernels_ = new std::vector>; - raw_scope_ = new lite::Scope; - raw_exe_scope_ = &(static_cast(raw_scope_)->NewScope()); -} - -std::unique_ptr PaddlePredictor::GetTensor( - const std::string &id) const { - auto *exe_scope = static_cast(raw_exe_scope_); - const auto *var = exe_scope->FindVar(id); - const auto &tensor = var->Get(); - return std::unique_ptr(new Tensor(&tensor, nullptr)); -} - -std::unique_ptr PaddlePredictor::GetMutableTensor( - const std::string &id) { - auto *exe_scope = static_cast(raw_exe_scope_); - auto *var = exe_scope->FindVar(id); - auto *tensor = var->GetMutable(); - return std::unique_ptr(new Tensor(nullptr, tensor)); -} - -#define CAST_OPS \ - auto *ops = \ - static_cast> *>(raw_ops_); -#define CAST_KERNELS \ - auto *kernels = \ - static_cast> *>( \ - raw_kernels_); -#define CAST_SCOPE auto *scope = static_cast(raw_scope_); - -PaddlePredictor::~PaddlePredictor() { - CAST_OPS - CAST_KERNELS - CAST_SCOPE - - if (ops) { - delete ops; - } - if (kernels) { - delete kernels; - } - if (scope) { - delete scope; - } -} - -void PaddlePredictor::Run() { - CAST_OPS - CAST_KERNELS - - CHECK(ops); - CHECK(kernels); - CHECK_EQ(ops->size(), kernels->size()); - - for (size_t i = 0; i < ops->size(); i++) { - LOG(INFO) << "Running the " << i << "-th operator"; - ops->at(i)->InferShape(); - kernels->at(i)->Launch(); - } -} - -std::unique_ptr PaddlePredictor::GetInput(size_t offset) { - auto *exec_scope = static_cast(raw_exe_scope_); - auto *_feed_list = exec_scope->FindVar("feed"); - CHECK(_feed_list) << "no feed variable in exec_scope"; - auto *feed_list = _feed_list->GetMutable>(); - if (offset >= feed_list->size()) { - feed_list->resize(offset + 1); - } - - return std::unique_ptr(new Tensor(nullptr, &feed_list->at(offset))); -} - -std::unique_ptr PaddlePredictor::GetOutput(size_t offset) { - auto *exec_scope = static_cast(raw_exe_scope_); - auto *_fetch_list = exec_scope->FindVar("fetch"); - CHECK(_fetch_list) << "no fatch variable in exec_scope"; - auto &fetch_list = *_fetch_list->GetMutable>(); - CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow"; - return std::unique_ptr(new Tensor(&fetch_list.at(offset), nullptr)); -} - -} // namespace gencode -} // namespace paddle diff --git a/lite/gen_code/paddle_infer.h b/lite/gen_code/paddle_infer.h deleted file mode 100644 index e01ffc25e2..0000000000 --- a/lite/gen_code/paddle_infer.h +++ /dev/null 
@@ -1,72 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include - -namespace paddle { -namespace gencode { - -/// Zero Copy Tensor. -class Tensor { - public: - using ddim_t = std::vector; - - Tensor(const void *raw_tensor, void *raw_mutable_tensor) - : raw_tensor_(raw_tensor), raw_mutable_tensor_(raw_mutable_tensor) {} - - void Resize(const ddim_t &shape); - template - const T *data() const; - template - T *mutable_data(); - - ddim_t shape() const; - - private: - const void *raw_tensor_; - void *raw_mutable_tensor_{}; -}; - -/* - * Predictor for the generated code. - */ -class PaddlePredictor { - public: - void Init(); - - std::unique_ptr GetTensor(const std::string &id) const; - std::unique_ptr GetMutableTensor(const std::string &id); - - // Get offset-th col of feed. - std::unique_ptr GetInput(size_t offset); - - std::unique_ptr GetOutput(size_t offset); - - void Run(); - - PaddlePredictor(); - ~PaddlePredictor(); - - private: - void *raw_ops_; - void *raw_kernels_; - void *raw_scope_{}; - void *raw_exe_scope_{}; // raw_exe_scope is not owned. -}; - -} // namespace gencode -} // namespace paddle diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt deleted file mode 100644 index 1996f50133..0000000000 --- a/lite/kernels/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -message(STATUS "add lite kernels") - -set(lite_kernel_deps type_system kernel op op_registry context tensor any CACHE INTERNAL "" FORCE) - -add_subdirectory(host) -add_subdirectory(arm) -add_subdirectory(cuda) -add_subdirectory(x86) -add_subdirectory(opencl) -add_subdirectory(fpga) -add_subdirectory(npu) diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt deleted file mode 100644 index 91550476d6..0000000000 --- a/lite/kernels/arm/CMakeLists.txt +++ /dev/null @@ -1,95 +0,0 @@ -add_kernel(fc_compute_arm ARM basic SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(activation_compute_arm ARM basic SRCS activation_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(mul_compute_arm ARM basic SRCS mul_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(matmul_compute_arm ARM basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(scale_compute_arm ARM basic SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(softmax_compute_arm ARM basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(conv_compute_arm ARM basic SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(batch_norm_compute_arm ARM basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(elementwise_compute_arm ARM basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(lrn_compute_arm ARM basic SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(decode_bboxes_compute_arm ARM basic SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm) 
-add_kernel(pool_compute_arm ARM basic SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(split_compute_arm ARM basic SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(concat_compute_arm ARM basic SRCS concat_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(pad2d_compute_arm ARM basic SRCS pad2d_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(prior_box_compute_arm ARM basic SRCS prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(density_prior_box_compute_arm ARM basic SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(negative_compute_arm ARM basic SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(crop_compute_arm ARM basic SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(calib_compute_arm ARM basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(transpose_compute_arm ARM basic SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(power_compute_arm ARM basic SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(yolo_box_compute_arm ARM basic SRCS yolo_box_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(shuffle_channel_compute_arm ARM basic SRCS shuffle_channel_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(argmax_compute_arm ARM basic SRCS argmax_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(axpy_compute_arm ARM basic SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(conv_transpose_compute_arm ARM basic SRCS conv_transpose_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(norm_compute_arm ARM basic SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(interpolate_compute_arm ARM basic SRCS interpolate_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(box_coder_compute_arm ARM basic SRCS box_coder_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(shape_compute_arm ARM basic SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(slice_compute_arm ARM basic SRCS slice_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(cast_compute_arm ARM basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(squeeze_compute_arm ARM basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(expand_compute_arm ARM basic SRCS expand_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(reduce_max_compute_arm ARM basic SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(sequence_expand_compute_arm ARM basic SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(im2sequence_compute_arm ARM basic SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(sequence_pool_compute_arm ARM basic SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(reduce_mean_compute_arm ARM basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(stack_compute_arm ARM basic SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(assign_compute_arm ARM basic SRCS assign_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(affine_channel_compute_arm ARM basic SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(anchor_generator_compute_arm ARM basic SRCS anchor_generator_compute.cc DEPS ${lite_kernel_deps} math_arm) 
-add_kernel(generate_proposals_compute_arm ARM basic SRCS generate_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(roi_align_compute_arm ARM basic SRCS roi_align_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(box_clip_compute_arm ARM basic SRCS box_clip_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(assign_value_compute_arm ARM basic SRCS assign_value_compute.cc DEPS ${lite_kernel_deps} math_arm) - -# for OCR specific -add_kernel(gru_unit_compute_arm ARM extra SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(gru_compute_arm ARM extra SRCS gru_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(beam_search_decode_compute_arm ARM extra SRCS beam_search_decode_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(logical_compute_arm ARM extra SRCS logical_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(less_than_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(compare_compute_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(read_from_array_compute_arm ARM extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(fill_constant_compute_arm ARM extra SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(is_empty_compute_arm ARM extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps} math_arm) - -# NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered -# to the model_optimize_tool. 
-if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)) - return() -endif() - -message(STATUS "compile with lite ARM kernels") - - -lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm) -lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm) -lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm) -lite_cc_test(test_conv_compute_arm SRCS conv_compute_test.cc DEPS conv_compute_arm) -lite_cc_test(test_batch_norm_compute_arm SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_arm) -lite_cc_test(test_elementwise_compute_arm SRCS elementwise_compute_test.cc DEPS elementwise_compute_arm) -lite_cc_test(test_lrn_compute_arm SRCS lrn_compute_test.cc DEPS lrn_compute_arm) -lite_cc_test(test_decode_bboxes_compute_arm SRCS decode_bboxes_compute_test.cc DEPS decode_bboxes_compute_arm) -lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm) -lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm) -lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm) -lite_cc_test(test_concat_compute_arm SRCS concat_compute_test.cc DEPS concat_compute_arm) -lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm) -lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm COMPILE_LEVEL extra) -lite_cc_test(test_argmax_compute_arm SRCS argmax_compute_test.cc DEPS argmax_compute_arm) -lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm) -lite_cc_test(test_conv_transpose_compute_arm SRCS conv_transpose_compute_test.cc DEPS conv_transpose_compute_arm) diff --git a/lite/kernels/arm/activation_compute.cc b/lite/kernels/arm/activation_compute.cc deleted file mode 100644 index 406ec530ac..0000000000 --- a/lite/kernels/arm/activation_compute.cc +++ /dev/null @@ -1,247 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/kernels/arm/activation_compute.h" -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void ReluCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_relu( - x_data, output_data, x_dims.production(), ctx.threads()); -} - -void LeakyReluCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto alpha = param.Leaky_relu_alpha; - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_relu_neg( - x_data, output_data, x_dims.production(), alpha, ctx.threads()); -} - -void ReluClippedCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto coef = param.Relu_clipped_coef; - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_clipped_relu( - x_data, output_data, x_dims.production(), coef, ctx.threads()); -} - -void PReluCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto mode = param.Prelu_mode; - auto alpha_data = param.Prelu_alpha->data(); - auto output_data = param.Out->mutable_data(); - - int outer_size = x_dims[0]; - int channel_size = x_dims[1]; - int inner_size = x_dims.count(2, x_dims.size()); - - lite::arm::math::act_prelu(x_data, - output_data, - outer_size, - channel_size, - inner_size, - mode, - alpha_data, - ctx.threads()); -} - -void SigmoidCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_sigmoid( - x_data, output_data, x_dims.production(), ctx.threads()); -} - -void TanhCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_tanh( - x_data, output_data, x_dims.production(), ctx.threads()); -} - -void SwishCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto beta = param.Swish_beta; - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_swish( - x_data, output_data, x_dims.production(), beta, ctx.threads()); -} - -void Relu6Compute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - float coef = 6.; - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_clipped_relu( - x_data, output_data, x_dims.production(), coef, ctx.threads()); -} - -void LogCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_log( - x_data, output_data, x_dims.production(), ctx.threads()); -} - -void ExpCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto 
output_data = param.Out->mutable_data(); - lite::arm::math::act_exp( - x_data, output_data, x_dims.production(), ctx.threads()); -} - -void FloorCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_floor( - x_data, output_data, x_dims.production(), ctx.threads()); -} - -void HardSigmoidCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - float slope = param.hard_sigmoid_slope; - float offset = param.hard_sigmoid_offset; - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_hard_sigmoid( - x_data, output_data, x_dims.production(), slope, offset, ctx.threads()); -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL( - relu, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ReluCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL(leaky_relu, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::LeakyReluCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("alpha", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL(relu_clipped, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::ReluClippedCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Relu_clipped_coef", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL( - prelu, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::PReluCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("mode", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Alpha", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL(sigmoid, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::SigmoidCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL( - tanh, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::TanhCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL( - swish, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::SwishCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("beta", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL( - relu6, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::Relu6Compute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL( - log, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::LogCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL( - exp, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ExpCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - 
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL( - floor, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::FloorCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL(hard_sigmoid, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::HardSigmoidCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/activation_compute.h b/lite/kernels/arm/activation_compute.h deleted file mode 100644 index ac1b7ca177..0000000000 --- a/lite/kernels/arm/activation_compute.h +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "lite/core/kernel.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class ReluCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~ReluCompute() = default; -}; - -class LeakyReluCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~LeakyReluCompute() = default; -}; - -class ReluClippedCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~ReluClippedCompute() = default; -}; - -class PReluCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~PReluCompute() = default; -}; - -class SigmoidCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~SigmoidCompute() = default; -}; - -class TanhCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~TanhCompute() = default; -}; - -class SwishCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~SwishCompute() = default; -}; - -class Relu6Compute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~Relu6Compute() = default; -}; - -class LogCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~LogCompute() = default; -}; - -class ExpCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~ExpCompute() = default; -}; - -class FloorCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~FloorCompute() = default; -}; - -class HardSigmoidCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - 
void Run() override; - - virtual ~HardSigmoidCompute() = default; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/affine_channel_compute.cc b/lite/kernels/arm/affine_channel_compute.cc deleted file mode 100644 index 6781dab488..0000000000 --- a/lite/kernels/arm/affine_channel_compute.cc +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/affine_channel_compute.h" -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void AffineChannelCompute::Run() { - auto& param = Param(); - const lite::Tensor* x = param.X; - const lite::Tensor* scale = param.Scale; - const lite::Tensor* bias = param.Bias; - const std::string data_layout = param.data_layout; - lite::Tensor* out = param.Out; - - auto x_dims = x->dims(); - int num = x_dims[0]; - int channel = 0; - int h = 0; - int w = 0; - if (data_layout == "NCHW") { - channel = x_dims[1]; - h = x_dims[2]; - w = x_dims[3]; - } else if (data_layout == "NHWC") { - channel = x_dims[3]; - h = x_dims[1]; - w = x_dims[2]; - } - lite::arm::math::affine_channel_func(x->data(), - scale->data(), - bias->data(), - data_layout, - num, - channel, - h, - w, - out->mutable_data()); - return; -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(affine_channel, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::AffineChannelCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/affine_channel_compute.h b/lite/kernels/arm/affine_channel_compute.h deleted file mode 100644 index 5b50af7907..0000000000 --- a/lite/kernels/arm/affine_channel_compute.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
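
> For orientation while reading the deleted `AffineChannelCompute::Run` above: the op scales every channel of `X` by `Scale[c]` and shifts it by `Bias[c]`, with the channel axis chosen by `data_layout`. The patch export dropped the `<...>` template arguments (e.g. `data<float>()`), so below is a minimal standalone sketch of the same per-channel transform, assuming float data in NCHW layout; `affine_channel_ref` is an illustrative name, not part of the Lite API.

```cpp
// Reference semantics of affine_channel for NCHW float data:
//   out[n][c][h][w] = x[n][c][h][w] * scale[c] + bias[c]
void affine_channel_ref(const float* x, const float* scale, const float* bias,
                        int num, int channel, int h, int w, float* out) {
  const int hw = h * w;
  for (int n = 0; n < num; ++n) {
    for (int c = 0; c < channel; ++c) {
      const float* x_ptr = x + (n * channel + c) * hw;
      float* out_ptr = out + (n * channel + c) * hw;
      for (int i = 0; i < hw; ++i) {
        out_ptr[i] = x_ptr[i] * scale[c] + bias[c];
      }
    }
  }
}
```

> For NHWC the same math applies with a channel stride of 1 instead of `h*w`, which is why the deleted kernel only has to re-derive `channel`, `h`, and `w` from the layout string before calling the shared math routine.
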
- -#pragma once -#include -#include "lite/core/kernel.h" -#include "lite/operators/affine_channel_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class AffineChannelCompute - : public KernelLite { - public: - using param_t = operators::AffineChannelParam; - - void Run() override; - - virtual ~AffineChannelCompute() = default; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/anchor_generator_compute.cc b/lite/kernels/arm/anchor_generator_compute.cc deleted file mode 100644 index 3f31717475..0000000000 --- a/lite/kernels/arm/anchor_generator_compute.cc +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/anchor_generator_compute.h" -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void AnchorGeneratorCompute::Run() { - auto& param = Param(); - auto* anchors = param.Anchors; - auto* variances = param.Variances; - auto* input = param.Input; - - float* anchors_data = anchors->mutable_data(); - float* variances_data = variances->mutable_data(); - auto input_dims = input->dims(); - int feature_height = input_dims[2]; - int feature_width = input_dims[3]; - - lite::arm::math::anchor_generator_func(feature_height, - feature_width, - param.anchor_sizes, - param.aspect_ratios, - param.stride, - param.variances, - param.offset, - anchors_data, - variances_data); - return; -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(anchor_generator, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::AnchorGeneratorCompute, - def) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Anchors", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/anchor_generator_compute.h b/lite/kernels/arm/anchor_generator_compute.h deleted file mode 100644 index af6a6eef02..0000000000 --- a/lite/kernels/arm/anchor_generator_compute.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
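
> `AnchorGeneratorCompute::Run` above only extracts `feature_height`/`feature_width` and forwards everything to `lite::arm::math::anchor_generator_func`. What that call produces can be pictured as one box per (aspect_ratio, anchor_size) pair for every feature-map cell, centered at the cell position times the stride plus an offset. The sketch below is an illustration of that layout only; the box-shape formula (area-preserving aspect ratios) is an assumption for exposition, not the ARM implementation.

```cpp
#include <cmath>
#include <vector>

// Illustrative anchor layout. `anchors` must hold
// feat_h * feat_w * ratios.size() * sizes.size() * 4 floats,
// written as (xmin, ymin, xmax, ymax) per box.
void anchor_generator_sketch(int feat_h, int feat_w,
                             const std::vector<float>& sizes,
                             const std::vector<float>& ratios,
                             float stride, float offset, float* anchors) {
  int idx = 0;
  for (int y = 0; y < feat_h; ++y) {
    for (int x = 0; x < feat_w; ++x) {
      const float cx = (x + offset) * stride;  // box center in image space
      const float cy = (y + offset) * stride;
      for (float r : ratios) {
        for (float s : sizes) {
          // Keep the box area at s*s while giving it height/width ratio r.
          const float half_w = 0.5f * s * std::sqrt(1.f / r);
          const float half_h = 0.5f * s * std::sqrt(r);
          anchors[idx++] = cx - half_w;
          anchors[idx++] = cy - half_h;
          anchors[idx++] = cx + half_w;
          anchors[idx++] = cy + half_h;
        }
      }
    }
  }
}
```
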
- -#pragma once -#include -#include "lite/core/kernel.h" -#include "lite/operators/anchor_generator_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class AnchorGeneratorCompute - : public KernelLite { - public: - using param_t = operators::AnchorGeneratorParam; - - void Run() override; - - virtual ~AnchorGeneratorCompute() = default; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/argmax_compute.cc b/lite/kernels/arm/argmax_compute.cc deleted file mode 100644 index ad279e8f8e..0000000000 --- a/lite/kernels/arm/argmax_compute.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/argmax_compute.h" -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void ArgmaxCompute::Run() { - auto& param = Param(); - lite::Tensor* input = param.X; - lite::Tensor* output = param.Out; - int axis = param.Axis; - - lite::arm::math::argmax_func(input, axis, output); - return; -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(arg_max, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::ArgmaxCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/argmax_compute.h b/lite/kernels/arm/argmax_compute.h deleted file mode 100644 index c87f5a451b..0000000000 --- a/lite/kernels/arm/argmax_compute.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
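
> `ArgmaxCompute::Run` above hands the whole job to `lite::arm::math::argmax_func(input, axis, output)`. Conceptually the tensor is viewed as (outer, size, inner) around `axis`, and the index of the maximum over `size` is written for every (outer, inner) pair; the test file later in this patch does exactly this shape algebra with `dims.count()`. A minimal standalone sketch, assuming float input and float-encoded output indices as in that test (`argmax_ref` is an illustrative name):

```cpp
// argmax over the `size` dimension for each (outer, inner) position.
// x is viewed as [outer, size, inner]; out as [outer, inner].
void argmax_ref(const float* x, int outer, int size, int inner, float* out) {
  for (int o = 0; o < outer; ++o) {
    for (int i = 0; i < inner; ++i) {
      const float* in_ptr = x + o * size * inner + i;
      int max_idx = 0;
      float max_val = in_ptr[0];
      for (int k = 1; k < size; ++k) {
        const float v = in_ptr[k * inner];  // stride `inner` walks the axis
        if (v > max_val) {
          max_val = v;
          max_idx = k;
        }
      }
      out[o * inner + i] = static_cast<float>(max_idx);
    }
  }
}
```
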
- -#pragma once -#include -#include "lite/core/kernel.h" -#include "lite/operators/argmax_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class ArgmaxCompute : public KernelLite { - public: - using param_t = operators::ArgmaxParam; - - void Run() override; - - virtual ~ArgmaxCompute() = default; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/argmax_compute_test.cc b/lite/kernels/arm/argmax_compute_test.cc deleted file mode 100644 index 58bdf18474..0000000000 --- a/lite/kernels/arm/argmax_compute_test.cc +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/argmax_compute.h" -#include -#include -#include -#include -#include -#include -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -template -void argmax_compute_ref(const operators::ArgmaxParam& param) { - lite::Tensor* x = param.X; - lite::Tensor* output = param.Out; - int axis = param.Axis; - - auto x_data = x->data(); - auto output_data = output->mutable_data(); - DDim x_dims = x->dims(); - DDim output_dims = output->dims(); - - // int in_channel = x_dims - const int size = x_dims[axis]; - const int in_channel = x_dims.count(axis, x_dims.size()); - const int out_channel = output_dims.count(axis, output_dims.size()); - const int in_stride = x_dims.count(axis + 1, x_dims.size()); - const int out_stride = x_dims.count(0, axis); - - for (int n = 0; n < out_stride; n++) { - for (int k = 0; k < in_stride; k++) { - const dtype* in_ptr = x_data + n * in_channel + k; - std::vector> vec; - vec.resize(size); - for (int i = 0; i < size; i++) { - vec[i] = std::make_pair(in_ptr[i * in_stride], i); - } - // sort - std::partial_sort(vec.begin(), - vec.begin() + 1, - vec.end(), - std::greater>()); - - // out - dtype* out_ptr = output_data + n * out_channel + k; - *out_ptr = vec[0].second; - } - } -} - -TEST(argmax_arm, retrive_op) { - auto argmax = - KernelRegistry::Global().Create( - "arg_max"); - ASSERT_FALSE(argmax.empty()); - ASSERT_TRUE(argmax.front()); -} - -TEST(argmax_arm, init) { - ArgmaxCompute argmax; - ASSERT_EQ(argmax.precision(), PRECISION(kFloat)); - ASSERT_EQ(argmax.target(), TARGET(kARM)); -} -TEST(argmax_arm, compute) { - DeviceInfo::Init(); - for (auto n : {2, 3}) { - for (auto c : {3, 4 /*, 128*/}) { - for (auto h : {4, 5 /*, 56 , 112, 224, 512*/}) { - for (auto w : {5, 6 /*, 56, 112, 224, 512*/}) { - Tensor x; - Tensor output; - Tensor output_ref; - int axis = (n + c + h + w) % 4; - - // get tensor x data - x.Resize({n, c, h, w}); - auto* x_data = x.mutable_data(); - for (int i = 0; i < x.dims().production(); i++) { - float sign = i % 3 == 0 ? 
-1.0f : 1.0f; - x_data[i] = sign * static_cast(i % 128) * 0.013f; - } - - // resize output and output_ref - int nchw[] = {n, c, h, w}; - std::vector output_size(nchw, nchw + 4); - output_size.erase(output_size.begin() + axis); - output.Resize(output_size); - output_ref.Resize(output_size); - - // obtain output_data - ArgmaxCompute argmaxOp; - std::unique_ptr ctx(new KernelContext); - ctx->As(); - argmaxOp.SetContext(std::move(ctx)); - operators::ArgmaxParam param; - param.X = &x; - param.Out = &output; - param.Axis = axis; - argmaxOp.SetParam(param); - argmaxOp.Launch(); - auto* output_data = output.mutable_data(); - - // obtain output_ref_data - param.Out = &output_ref; - argmax_compute_ref(param); - auto* output_ref_data = output_ref.mutable_data(); - - // compare - for (int i = 0; i < output.dims().production(); i++) { - EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); - } - } - } - } - } -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle -USE_LITE_KERNEL(arg_max, kARM, kFloat, kNCHW, def); diff --git a/lite/kernels/arm/assign_compute.cc b/lite/kernels/arm/assign_compute.cc deleted file mode 100644 index b0a5529c36..0000000000 --- a/lite/kernels/arm/assign_compute.cc +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/assign_compute.h" -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void AssignCompute::PrepareForRun() { - // CHECK_OR_FALSE(param_t.Out); -} - -void AssignCompute::Run() { - // LOG(INFO) << "into kernel compute run"; - auto& param = Param(); - const lite::Tensor* input = param.X; - lite::Tensor* output = param.Out; - output->CopyDataFrom(*input); -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL( - assign, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::AssignCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/assign_compute.h b/lite/kernels/arm/assign_compute.h deleted file mode 100644 index 3f0dd8e281..0000000000 --- a/lite/kernels/arm/assign_compute.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "lite/core/kernel.h" -#include "lite/operators/assign_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class AssignCompute : public KernelLite { - public: - using param_t = operators::AssignParam; - void PrepareForRun() override; - void Run() override; - - virtual ~AssignCompute() = default; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/assign_value_compute.cc b/lite/kernels/arm/assign_value_compute.cc deleted file mode 100644 index 45f28ba363..0000000000 --- a/lite/kernels/arm/assign_value_compute.cc +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/assign_value_compute.h" -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" -#include "lite/core/types.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -template -void TensorFromVector(const std::vector& src, lite::Tensor* dst) { - auto* src_ptr = static_cast(src.data()); - auto* dst_ptr = static_cast(dst->mutable_data()); - auto size = src.size() * sizeof(T); - std::memcpy(dst_ptr, src_ptr, size); -} - -void AssignValueCompute::Run() { - auto& param = Param(); - int dtype = param.dtype; - std::vector fp32_values = param.fp32_values; - std::vector int32_values = param.int32_values; - auto* out = param.Out; - - if (dtype == static_cast(lite::core::FluidType::INT32)) { - TensorFromVector(int32_values, out); - } else if (dtype == static_cast(lite::core::FluidType::FP32)) { - TensorFromVector(fp32_values, out); - } else { - LOG(FATAL) << "Unsupported dtype for assign_value_op:" << dtype; - } - return; -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(assign_value, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::AssignValueCompute, - def) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/assign_value_compute.h b/lite/kernels/arm/assign_value_compute.h deleted file mode 100644 index f0c33f865b..0000000000 --- a/lite/kernels/arm/assign_value_compute.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "lite/core/kernel.h" -#include "lite/operators/assign_value_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class AssignValueCompute : public KernelLite { - public: - using param_t = operators::AssignValueParam; - - void Run() override; - - virtual ~AssignValueCompute() = default; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/axpy_compute.cc b/lite/kernels/arm/axpy_compute.cc deleted file mode 100644 index 705aa6a0f5..0000000000 --- a/lite/kernels/arm/axpy_compute.cc +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/axpy_compute.h" -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void AxpyCompute::Run() { - auto& param = Param(); - lite::Tensor* scale = param.Scale; - lite::Tensor* x = param.X; - lite::Tensor* bias = param.Bias; - lite::Tensor* out = param.Out; - - const float* scale_ptr = scale->data(); - const float* x_ptr = x->data(); - const float* bias_ptr = bias->data(); - float* out_ptr = out->mutable_data(); - - auto bias_dims = bias->dims(); - int num = bias_dims[0]; - int channel = bias_dims[1]; - int size = bias_dims[2] * bias_dims[3]; - int in_channel = channel * size; - - lite::arm::math::axpy_kernel_fp32( - scale_ptr, x_ptr, bias_ptr, out_ptr, num, channel, size, in_channel); - return; -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL( - axpy, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::AxpyCompute, def) - .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/axpy_compute.h b/lite/kernels/arm/axpy_compute.h deleted file mode 100644 index 29983bdb99..0000000000 --- a/lite/kernels/arm/axpy_compute.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "lite/core/kernel.h" -#include "lite/operators/axpy_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class AxpyCompute : public KernelLite { - public: - using param_t = operators::AxpyParam; - - void Run() override; - - virtual ~AxpyCompute() = default; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/axpy_compute_test.cc b/lite/kernels/arm/axpy_compute_test.cc deleted file mode 100644 index af145435eb..0000000000 --- a/lite/kernels/arm/axpy_compute_test.cc +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/axpy_compute.h" -#include -#include -#include -#include -#include -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -template -void axpy_compute_ref(const operators::AxpyParam& param) { - lite::Tensor* scale = param.Scale; - lite::Tensor* x = param.X; - lite::Tensor* bias = param.Bias; - lite::Tensor* output = param.Out; - - auto scale_data = scale->data(); - auto x_data = x->data(); - auto bias_data = bias->data(); - auto output_data = output->mutable_data(); - - DDim x_dims = x->dims(); - int num = x_dims[0]; - int channel = x_dims[1]; - int size = x_dims[2] * x_dims[3]; - int in_channel = channel * size; - - for (int i = 0; i < num; i++) { - auto scale_data_i = scale_data + i * channel; - auto x_data_i = x_data + i * in_channel; - auto bias_data_i = bias_data + i * in_channel; - auto output_data_i = output_data + i * in_channel; - for (int j = 0; j < channel; j++) { - auto scale_data_j = scale_data_i + j; - auto x_data_j = x_data_i + j * size; - auto bias_data_j = bias_data_i + j * size; - auto output_data_j = output_data_i + j * size; - for (int k = 0; k < size; k++) { - output_data_j[k] = scale_data_j[0] * x_data_j[k] + bias_data_j[k]; - } - } - } -} - -TEST(axpy_arm, retrive_op) { - auto axpy = - KernelRegistry::Global().Create("axpy"); - ASSERT_FALSE(axpy.empty()); - ASSERT_TRUE(axpy.front()); -} - -TEST(axpy_arm, init) { - AxpyCompute axpy; - ASSERT_EQ(axpy.precision(), PRECISION(kFloat)); - ASSERT_EQ(axpy.target(), TARGET(kARM)); -} -TEST(axpy_arm, compute) { - DeviceInfo::Init(); - int iter = 10; - for (int i = 0; i < iter; i++) { - Tensor scale; - Tensor x; - Tensor bias; - Tensor output; - Tensor output_ref; - - // set the dims of scale, x, bias and output_ref - int n = 2, c = 3, h = 4, w = 5; - scale.Resize({n, c}); 
- x.Resize({n, c, h, w}); - bias.Resize({n, c, h, w}); - output.Resize({n, c, h, w}); - output_ref.Resize({n, c, h, w}); - - // initialize the data of scale, x, bias - // initialize_random_data(scale); - // initialize_random_data(x); - // initialize_random_data(bias); - auto* scale_data = scale.mutable_data(); - for (int i = 0; i < scale.dims().production(); i++) { - float sign = i % 3 == 0 ? -1.0f : 1.0f; - scale_data[i] = sign * static_cast(i % 128) * 0.010f; - } - auto* x_data = x.mutable_data(); - for (int i = 0; i < x.dims().production(); i++) { - float sign = i % 4 == 0 ? -1.0f : 1.0f; - x_data[i] = sign * static_cast(i % 128) * 0.007f; - } - auto* bias_data = bias.mutable_data(); - for (int i = 0; i < bias.dims().production(); i++) { - float sign = i % 5 == 0 ? -1.0f : 1.0f; - bias_data[i] = sign * static_cast(i % 128) * 0.005f; - } - - // prepare kernel params and run to obtain output_data - AxpyCompute axpy_op; - std::unique_ptr ctx(new KernelContext); - ctx->As(); - axpy_op.SetContext(std::move(ctx)); - operators::AxpyParam param; - param.Scale = &scale; - param.X = &x; - param.Bias = &bias; - param.Out = &output; - axpy_op.SetParam(param); - axpy_op.Launch(); - auto* output_data = output.mutable_data(); - - // invoking ref implementation and compare results - param.Out = &output_ref; - axpy_compute_ref(param); - auto* output_ref_data = output_ref.mutable_data(); - - for (int i = 0; i < output.dims().production(); i++) { - EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); - } - } -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle -USE_LITE_KERNEL(axpy, kARM, kFloat, kNCHW, def); diff --git a/lite/kernels/arm/batch_norm_compute.cc b/lite/kernels/arm/batch_norm_compute.cc deleted file mode 100644 index 1519ad624e..0000000000 --- a/lite/kernels/arm/batch_norm_compute.cc +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
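
> The axpy test above checks `Out = Scale ⊗ X + Bias`, where `Scale` has shape [N, C] and each scalar is broadcast over its channel's H×W plane. A compact standalone restatement of that reference loop under the same float assumptions (`axpy_ref` is an illustrative name):

```cpp
// out[n][c][k] = scale[n][c] * x[n][c][k] + bias[n][c][k], k over h*w.
void axpy_ref(const float* scale, const float* x, const float* bias,
              float* out, int num, int channel, int size) {
  for (int n = 0; n < num; ++n) {
    for (int c = 0; c < channel; ++c) {
      const float s = scale[n * channel + c];
      const int base = (n * channel + c) * size;
      for (int k = 0; k < size; ++k) {
        out[base + k] = s * x[base + k] + bias[base + k];
      }
    }
  }
}
```
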
- -#include "lite/kernels/arm/batch_norm_compute.h" -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void BatchNormCompute::PrepareForRun() { - auto& param = this->Param(); - auto x_dims = param.x->dims(); - bool global_stats = param.is_test || param.use_global_stats; - if (global_stats) { - int64_t channel_size = 0; - switch (param.data_layout) { - case DATALAYOUT(kNCHW): - channel_size = x_dims[1]; - break; - // case DATALAYOUT(kNHWC): - // channel_size = x_dims[x_dims.size() - 1]; - // break; - default: - LOG(FATAL) << "Unknown storage order: " - << DataLayoutToStr(param.data_layout); - break; - } - new_scale.Resize({channel_size}); - new_bias.Resize({channel_size}); - auto* scale_data = param.scale->mutable_data(); - auto* bias_data = param.bias->mutable_data(); - auto* mean_data = param.mean->mutable_data(); - auto* variance_data = param.variance->mutable_data(); - auto* new_scale_data = new_scale.mutable_data(); - auto* new_bias_data = new_bias.mutable_data(); - for (int c = 0; c < channel_size; c++) { - float inv_scale = 1.f / (std::sqrt(variance_data[c] + param.epsilon)); - new_bias_data[c] = - bias_data[c] - inv_scale * scale_data[c] * mean_data[c]; - new_scale_data[c] = inv_scale * scale_data[c]; - } - } -} - -void BatchNormCompute::Run() { - auto& param = this->Param(); - auto x_dims = param.x->dims(); - auto x_data = param.x->mutable_data(); - auto y_data = param.y->mutable_data(); - bool global_stats = param.is_test || param.use_global_stats; - if (global_stats) { - auto* new_scale_data = new_scale.mutable_data(); - auto* new_bias_data = new_bias.mutable_data(); - int64_t outer_size = 0; - int64_t channel_size = 0; - int64_t inner_size = 0; - switch (param.data_layout) { - case DATALAYOUT(kNCHW): - outer_size = x_dims[0]; - channel_size = x_dims[1]; - inner_size = x_dims.Slice(2, x_dims.size()).production(); - lite::arm::math::scale(x_data, - y_data, - outer_size, - channel_size, - inner_size, - new_scale_data, - new_bias_data); - break; - // case DATALAYOUT(kNHWC): - // outer_size = x_dims.Slice(0, x_dims.size() - 1).production(); - // channel_size = x_dims[x_dims.size() - 1]; - // lite::arm::math::scale(x_data, y_data, outer_size, channel_size, - // new_scale_data, new_bias_data); - // break; - default: - LOG(FATAL) << "Unknown storage order: " - << DataLayoutToStr(param.data_layout); - break; - } - } else { - // TODO(hong19860320) calculate mean_out, variance_out, saved_mean and - // saved_variance - } -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(batch_norm, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::BatchNormCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Mean", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Variance", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/batch_norm_compute.h b/lite/kernels/arm/batch_norm_compute.h 
deleted file mode 100644 index 22553f55d5..0000000000 --- a/lite/kernels/arm/batch_norm_compute.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "lite/core/kernel.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class BatchNormCompute : public KernelLite { - public: - using param_t = operators::BatchNormParam; - - void PrepareForRun() override; - - void Run() override; - - virtual ~BatchNormCompute() = default; - - private: - Tensor new_scale; - Tensor new_bias; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/batch_norm_compute_test.cc b/lite/kernels/arm/batch_norm_compute_test.cc deleted file mode 100644 index c603a04d47..0000000000 --- a/lite/kernels/arm/batch_norm_compute_test.cc +++ /dev/null @@ -1,221 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
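
> `BatchNormCompute::PrepareForRun` above folds the four batch-norm inputs into the two per-channel tensors `new_scale` and `new_bias` declared in this header, so that `Run` reduces to a single scale op. The algebra, restated as a small standalone sketch (float, inference mode; `fold_batch_norm` is an illustrative name):

```cpp
#include <cmath>

// Fold (scale, bias, mean, variance, epsilon) into a*x + b form:
//   a[c] = scale[c] / sqrt(variance[c] + epsilon)
//   b[c] = bias[c] - a[c] * mean[c]
// so y = a[c]*x + b[c] equals (x - mean) / sqrt(var + eps) * scale + bias.
void fold_batch_norm(const float* scale, const float* bias, const float* mean,
                     const float* variance, float epsilon, int channels,
                     float* new_scale, float* new_bias) {
  for (int c = 0; c < channels; ++c) {
    const float inv_std = 1.f / std::sqrt(variance[c] + epsilon);
    new_scale[c] = scale[c] * inv_std;
    new_bias[c] = bias[c] - scale[c] * inv_std * mean[c];
  }
}
```

> This is also why only the `is_test`/`use_global_stats` path is implemented: folding is valid precisely when mean and variance are fixed.
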
- -#include "lite/kernels/arm/batch_norm_compute.h" -#include -#include -#include -#include -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -template -void batch_norm_compute_ref(const operators::BatchNormParam& param) { - DDim x_dims = param.x->dims(); - auto x_data = param.x->mutable_data(); - auto scale_data = param.scale->mutable_data(); - auto bias_data = param.bias->mutable_data(); - auto mean_data = param.mean->mutable_data(); - auto variance_data = param.variance->mutable_data(); - auto y_data = param.y->mutable_data(); - float epsilon = param.epsilon; - float momentum = param.momentum; - DataLayoutType data_layout = param.data_layout; - - bool global_stats = param.is_test || param.use_global_stats; - if (global_stats) { - int64_t outer_size = 0; - int64_t channel_size = 0; - int64_t inner_size = 0; - switch (data_layout) { - case DATALAYOUT(kNCHW): - outer_size = x_dims[0]; - channel_size = x_dims[1]; - inner_size = x_dims.Slice(2, x_dims.size()).production(); - break; - // case DATALAYOUT(kNHWC): - // outer_size = x_dims.Slice(0, x_dims.size() - 1).production(); - // channel_size = x_dims[x_dims.size() - 1]; - // inner_size = 1; - // break; - default: - LOG(FATAL) << "Unknown storage order: " << DataLayoutToStr(data_layout); - break; - } - auto x_ptr = x_data; - auto y_ptr = y_data; - for (int o = 0; o < outer_size; o++) { - for (int c = 0; c < channel_size; c++) { - for (int i = 0; i < inner_size; i++) { - dtype norm_x = - (*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon); - *y_ptr = norm_x * scale_data[c] + bias_data[c]; - x_ptr++; - y_ptr++; - } - } - } - } else { - // TODO(hong19860320) calculate mean_out, variance_out, saved_mean and - // saved_variance - } -} - -TEST(batch_norm_arm, retrive_op) { - auto batch_norm = - KernelRegistry::Global().Create( - "batch_norm"); - ASSERT_FALSE(batch_norm.empty()); - ASSERT_TRUE(batch_norm.front()); -} - -TEST(batch_norm_arm, init) { - BatchNormCompute batch_norm; - ASSERT_EQ(batch_norm.precision(), PRECISION(kFloat)); - ASSERT_EQ(batch_norm.target(), TARGET(kARM)); -} - -TEST(batch_norm_arm, compute) { - DeviceInfo::Init(); - for (auto n : {1, 2}) { - for (auto c : {6, 32 /*, 128*/}) { - for (auto h : {9, 18 /*, 56 , 112, 224, 512*/}) { - for (auto w : {9, 18 /*, 56, 112, 224, 512*/}) { - for (auto is_test : {/*false, */ true}) { - for (auto use_global_stats : {false, true}) { - for (auto epsilon : {1e-4f, 1e-5f}) { - for (auto momentum : {0.9f, 0.99f}) { - for (auto data_layout : - {DATALAYOUT(kNCHW) /*, DATALAYOUT(kNHWC)*/}) { - Tensor x; - Tensor scale; - Tensor bias; - Tensor mean; - Tensor variance; - Tensor y; - Tensor mean_out; - Tensor variance_out; - Tensor saved_mean; - Tensor saved_variance; - Tensor y_ref; - Tensor mean_out_ref; - Tensor variance_out_ref; - Tensor saved_mean_ref; - Tensor saved_variance_ref; - // set the dims of input, output, ref output tensors - std::vector in_out_shape; - switch (data_layout) { - case DATALAYOUT(kNCHW): - in_out_shape = {n, c, h, w}; - break; - // case DATALAYOUT(kNHWC): - // in_out_shape = {n, h, w, c}; - // break; - default: - LOG(FATAL) << "Unknown storage order: " - << DataLayoutToStr(data_layout); - break; - } - x.Resize(in_out_shape); - scale.Resize({c}); - bias.Resize({c}); - mean.Resize({c}); - variance.Resize({c}); - y.Resize(in_out_shape); - mean_out.Resize({c}); - variance_out.Resize({c}); - saved_mean.Resize({c}); - saved_variance.Resize({c}); - y_ref.Resize(in_out_shape); - 
mean_out_ref.Resize({c}); - variance_out_ref.Resize({c}); - saved_mean_ref.Resize({c}); - saved_variance_ref.Resize({c}); - // initialize the data of input tensors - auto* x_data = x.mutable_data(); - auto* scale_data = scale.mutable_data(); - auto* bias_data = bias.mutable_data(); - auto* mean_data = mean.mutable_data(); - auto* variance_data = variance.mutable_data(); - auto* y_data = y.mutable_data(); - for (int i = 0; i < x.dims().production(); i++) { - x_data[i] = static_cast(i % 64); - } - for (int i = 0; i < scale.dims().production(); i++) { - scale_data[i] = static_cast(i) * 0.01f + 0.03f; - } - for (int i = 0; i < bias.dims().production(); i++) { - bias_data[i] = static_cast(i) * 0.065f + 0.1f; - } - for (int i = 0; i < mean.dims().production(); i++) { - mean_data[i] = static_cast(i) * 0.0565f; - } - for (int i = 0; i < variance.dims().production(); i++) { - variance_data[i] = static_cast(i) * 2.08f + 1.5f; - } - // prepare kernel params and run - BatchNormCompute batch_norm; - std::unique_ptr ctx(new KernelContext); - ctx->As(); - batch_norm.SetContext(std::move(ctx)); - operators::BatchNormParam param; - param.x = &x; - param.scale = &scale; - param.bias = &bias; - param.mean = &mean; - param.variance = &variance; - param.is_test = is_test; - param.use_global_stats = use_global_stats; - param.epsilon = epsilon; - param.momentum = momentum; - param.data_layout = data_layout; - param.y = &y; - param.mean_out = &mean_out; - param.variance_out = &variance_out; - param.saved_mean = &saved_mean; - param.saved_variance = &saved_variance; - batch_norm.SetParam(param); - batch_norm.Launch(); - // invoking ref implementation and compare results - param.y = &y_ref; - param.mean_out = &mean_out_ref; - param.variance_out = &variance_out_ref; - param.saved_mean = &saved_mean_ref; - param.saved_variance = &saved_variance_ref; - batch_norm_compute_ref(param); - auto* y_ref_data = y_ref.mutable_data(); - for (int i = 0; i < y.dims().production(); i++) { - EXPECT_NEAR(y_data[i], y_ref_data[i], 1e-5); - } - } - } - } - } - } - } - } - } - } -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def); diff --git a/lite/kernels/arm/beam_search_compute.cc b/lite/kernels/arm/beam_search_compute.cc deleted file mode 100644 index 5ac53b3b96..0000000000 --- a/lite/kernels/arm/beam_search_compute.cc +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
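
> The batch-norm sweep above compares the kernel against `batch_norm_compute_ref` elementwise with a 1e-5 tolerance. The equivalence it relies on — direct normalization versus the folded a*x + b form from `PrepareForRun` — can be sanity-checked in isolation. A tiny self-contained check under the same float assumptions (the numeric values are hypothetical, not taken from the test):

```cpp
#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  // One channel's statistics (hypothetical values).
  const float x = 3.f, mean = 1.f, var = 4.f, eps = 1e-5f;
  const float gamma = 0.5f, beta = 2.f;
  // Direct normalization, as in batch_norm_compute_ref.
  const float direct = (x - mean) / std::sqrt(var + eps) * gamma + beta;
  // Folded form, as prepared in BatchNormCompute::PrepareForRun.
  const float a = gamma / std::sqrt(var + eps);
  const float b = beta - a * mean;
  const float folded = a * x + b;
  assert(std::fabs(direct - folded) < 1e-5f);
  std::printf("direct=%f folded=%f\n", direct, folded);
  return 0;
}
```
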
-
-#include "lite/kernels/arm/beam_search_compute.h"
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-void BeamSearchCompute::PrepareForRun() {}
-
-void BeamSearchCompute::Run() {
-  auto& ctx = this->ctx_->template As<ARMContext>();
-  auto& param = this->Param<operators::BeamSearchParam>();
-  lite::arm::math::beam_search(param.pre_ids,
-                               param.pre_scores,
-                               param.ids,
-                               param.scores,
-                               param.selected_ids,
-                               param.selected_scores,
-                               param.parent_idx,
-                               param.level,
-                               param.beam_size,
-                               param.end_id,
-                               param.is_accumulated,
-                               &ctx);
-}
-
-}  // namespace arm
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_LITE_KERNEL(beam_search,
-                     kARM,
-                     kFloat,
-                     kNCHW,
-                     paddle::lite::kernels::arm::BeamSearchCompute,
-                     def)
-    .BindInput("pre_ids", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindInput("pre_scores", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindInput("ids", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindInput("scores", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindOutput("selected_ids", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindOutput("selected_scores", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindOutput("parent_idx", {LiteType::GetTensorTy(TARGET(kARM))})
-    .Finalize();
diff --git a/lite/kernels/arm/beam_search_compute.h b/lite/kernels/arm/beam_search_compute.h
deleted file mode 100644
index ebd72732bb..0000000000
--- a/lite/kernels/arm/beam_search_compute.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include
-#include "lite/backends/arm/math/type_trans.h"
-#include "lite/core/kernel.h"
-#include "lite/core/op_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-class BeamSearchCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
- public:
-  using param_t = operators::BeamSearchParam;
-
-  void PrepareForRun() override;
-
-  void Run() override;
-
-  ~BeamSearchCompute() {}
-
- private:
-};
-
-}  // namespace arm
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/kernels/arm/beam_search_decode_compute.cc b/lite/kernels/arm/beam_search_decode_compute.cc
deleted file mode 100644
index a417baa6d7..0000000000
--- a/lite/kernels/arm/beam_search_decode_compute.cc
+++ /dev/null
@@ -1,296 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
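
> The beam-search kernel above is deliberately thin: `PrepareForRun` is empty and `Run` forwards every field of `BeamSearchParam` to `lite::arm::math::beam_search`. The step it performs is: for each source sentence, merge the candidate (score, id) pairs of all its live beams and keep only the `beam_size` best. A toy illustration of that selection, ignoring LoD bookkeeping and `end_id` handling (`beam_step` is an illustrative name):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <utility>
#include <vector>

// Keep the `beam_size` highest-scoring (score, id) candidates of one source.
std::vector<std::pair<float, long>> beam_step(
    std::vector<std::pair<float, long>> candidates, std::size_t beam_size) {
  const std::size_t k = std::min(beam_size, candidates.size());
  std::partial_sort(candidates.begin(), candidates.begin() + k,
                    candidates.end(),
                    [](const std::pair<float, long>& a,
                       const std::pair<float, long>& b) {
                      return a.first > b.first;  // higher score first
                    });
  candidates.resize(k);
  return candidates;
}

int main() {
  auto kept = beam_step({{0.1f, 7}, {0.9f, 3}, {0.5f, 11}, {0.7f, 2}}, 2);
  for (const auto& c : kept) std::printf("id=%ld score=%f\n", c.second, c.first);
  return 0;  // keeps ids 3 and 2
}
```
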
- -#include "lite/kernels/arm/beam_search_decode_compute.h" -#include -#include -#include "lite/api/paddle_place.h" -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -using LoDTensor = lite::Tensor; -using LoDTensorArray = std::vector; - -// all the lod have 2 levels. -// The first is source level, the second is sentence level. -// source level describe how many prefixes (branchs) for each source sentece -// (beam). sentence level describe how these candidates belong to the prefixes. -const size_t kSourceLevel = 0; -const size_t kSentenceLevel = 1; - -template -struct Sentence { - std::vector word_ids; - std::vector scores; -}; - -template -using SentenceVector = std::vector>; - -template -struct BeamSearchDecoder { - BeamSearchDecoder(size_t beam_size, int end_id) - : beam_size_(beam_size), end_id_(end_id) {} - - /** - * convert the result sentence_vector for each source sentence into two - * LodTensor. - * One is all candidate sentences with word id, one is all candidate sentences - * with word score. - * Param: - * sentence_vector_list: sentence_vector for each source sentence. - * id_tensor: result LoDTensor for sentences of id. - * score_tensor: result LoDTensor for sentences of score. - * reverse: whether ids of sentence in sentence_vector_list is reversed - * sort_by_score: whether to sort hypotheses of each sentence by scores. - */ - void ConvertSentenceVectorToLodTensor( - std::vector> sentence_vector_list, - LoDTensor* id_tensor, - LoDTensor* score_tensor, - bool reverse = true, - bool sort_by_score = true) const { - size_t src_num = sentence_vector_list.size(); - CHECK_GT(src_num, 0) << "src_num should not be 0"; - - std::vector source_level_lod = {0}; - std::vector sentence_level_lod = {0}; - std::vector id_data; - std::vector score_data; - - for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { - if (sort_by_score) { - sort(sentence_vector_list[src_idx].begin(), - sentence_vector_list[src_idx].end(), - [reverse](const Sentence& a, const Sentence& b) { - if (reverse) - return a.scores.front() > b.scores.front(); - else - return a.scores.back() > b.scores.back(); - }); - } - for (Sentence& sentence : sentence_vector_list[src_idx]) { - if (reverse) { - id_data.insert(id_data.end(), - sentence.word_ids.rbegin(), - sentence.word_ids.rend()); - score_data.insert(score_data.end(), - sentence.scores.rbegin(), - sentence.scores.rend()); - } else { - id_data.insert(id_data.end(), - sentence.word_ids.begin(), - sentence.word_ids.end()); - score_data.insert( - score_data.end(), sentence.scores.begin(), sentence.scores.end()); - } - - sentence_level_lod.push_back(sentence_level_lod.back() + - sentence.word_ids.size()); - } - source_level_lod.push_back(source_level_lod.back() + - sentence_vector_list[src_idx].size()); - } - - LoD lod; - lod.push_back(source_level_lod); - lod.push_back(sentence_level_lod); - - *(id_tensor->mutable_lod()) = lod; - - id_tensor->Resize({static_cast(id_data.size())}); - auto id_ptr = id_tensor->mutable_data(); - TargetCopy( - TARGET(kARM), id_ptr, id_data.data(), id_data.size() * sizeof(float)); - - *(score_tensor->mutable_lod()) = lod; - score_tensor->Resize({static_cast(score_data.size())}); - auto score_ptr = score_tensor->mutable_data(); - TargetCopy(TARGET(kARM), - score_ptr, - score_data.data(), - score_data.size() * sizeof(T)); - } - - /** - * Gather the hypotheses for each 
source sentence by backtrace though the - * LoDTensorArray step_ids whose lods reserve the path in the tree. - */ - void Backtrace(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, - LoDTensor* id_tensor, - LoDTensor* score_tensor) const { - CHECK(!step_ids.empty()) << "step num should be larger than 0"; - CHECK_EQ(step_ids.size(), step_scores.size()) - << "step_ids and step_scores should be the same"; - const size_t step_num = step_ids.size(); - const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1; - std::vector> sentence_vector_list( - src_num, SentenceVector(beam_size_)); - std::vector> prefix_idx_vector_list(src_num); - for (int step_id = step_num - 1; step_id >= 0; --step_id) { - auto& cur_ids = step_ids.at(step_id); - auto& cur_scores = step_scores.at(step_id); - for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { - // for each source sentence - auto& sentence_vector = sentence_vector_list.at(src_idx); - auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx); - size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx]; - size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1]; - if (prefix_idx_vector.empty()) { // be finished and pruned at this step - // or the last time step - for (size_t prefix_idx = src_prefix_start; - prefix_idx < src_prefix_end; - ++prefix_idx) { - size_t candidate_start = - cur_ids.lod().at(kSentenceLevel)[prefix_idx]; - size_t candidate_end = - cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1]; - for (size_t candidate_idx = candidate_start; - candidate_idx < candidate_end; - ++candidate_idx) { - prefix_idx_vector.push_back(prefix_idx); - size_t idx = prefix_idx_vector.size() - 1; - auto cur_id = cur_ids.data()[candidate_idx]; - auto cur_score = cur_scores.data()[candidate_idx]; - sentence_vector.at(idx).word_ids.push_back(cur_id); - sentence_vector.at(idx).scores.push_back(cur_score); - } - } - } else { // use prefix_idx_vector to backtrace - size_t src_candidate_start = - cur_ids.lod().at(kSentenceLevel)[src_prefix_start]; - size_t prefix_idx = src_prefix_start; - size_t candidate_num = - cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - - cur_ids.lod().at(kSentenceLevel)[prefix_idx]; - for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) { - auto candidate_idx = prefix_idx_vector.at(idx); - auto cur_id = cur_ids.data()[candidate_idx]; - auto cur_score = cur_scores.data()[candidate_idx]; - if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) { - // to skip redundant end tokens - sentence_vector.at(idx).word_ids.push_back(cur_id); - sentence_vector.at(idx).scores.push_back(cur_score); - } - - while (src_candidate_start + candidate_num <= - candidate_idx) { // search the corresponding prefix - prefix_idx++; - candidate_num += - cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - - cur_ids.lod().at(kSentenceLevel)[prefix_idx]; - } - prefix_idx_vector.at(idx) = prefix_idx; - } - } - } - } - - ConvertSentenceVectorToLodTensor( - sentence_vector_list, id_tensor, score_tensor, true, true); - } - - size_t beam_size_; - int end_id_; -}; - -struct BeamSearchDecodeFunctor { - BeamSearchDecodeFunctor(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, - LoDTensor* id_tensor, - LoDTensor* score_tensor, - size_t beam_size, - int end_id) - : beam_size_(beam_size), - end_id_(end_id), - step_ids_(step_ids), - step_scores_(step_scores), - id_tensor_(id_tensor), - score_tensor_(score_tensor) {} - - template - void apply() const { - BeamSearchDecoder 
beam_search_decoder(beam_size_, end_id_); - beam_search_decoder.Backtrace( - step_ids_, step_scores_, id_tensor_, score_tensor_); - } - - size_t beam_size_; - int end_id_; - const LoDTensorArray& step_ids_; - const LoDTensorArray& step_scores_; - LoDTensor* id_tensor_; - LoDTensor* score_tensor_; -}; - -template <> -void BeamSearchDecodeFunctor::apply() const { - LOG(FATAL) << "beam search decode op does not support bool!"; -} - -void BeamSearchDecodeCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - // inputs - auto ids = param.ids; - auto scores = param.scores; - // outputs - auto sentence_ids = param.sentence_ids; - auto sentence_scores = param.sentence_scores; - - const size_t step_num = ids->size(); - CHECK_GT(step_num, 0UL) << "beam search steps should be larger than 0"; - const size_t source_num = ids->at(0).lod().at(0).size() - 1; - CHECK_GT(source_num, 0UL) << "source num should be larger than 0"; - - for (size_t i = 0; i < step_num; ++i) { - CHECK_EQ(ids->at(i).lod().size(), 2UL) << "Level of LodTensor should be 2"; - } - - //! fixme - // only support float score now - BeamSearchDecodeFunctor func(*ids, - *scores, - sentence_ids, - sentence_scores, - param.beam_size, - param.end_id); - - func.apply(); -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(beam_search_decode, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::BeamSearchDecodeCompute, - def) - .BindInput("Ids", {LiteType::GetTensorListTy(TARGET(kARM))}) - .BindInput("Scores", {LiteType::GetTensorListTy(TARGET(kARM))}) - .BindOutput("SentenceIds", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("SentenceScores", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/beam_search_decode_compute.h b/lite/kernels/arm/beam_search_decode_compute.h deleted file mode 100644 index db1961ad93..0000000000 --- a/lite/kernels/arm/beam_search_decode_compute.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "lite/core/kernel.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class BeamSearchDecodeCompute - : public KernelLite { - public: - using param_t = operators::BeamSearchDecodeParam; - - BeamSearchDecodeCompute() = default; - - void Run() override; - - virtual ~BeamSearchDecodeCompute() = default; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/box_clip_compute.cc b/lite/kernels/arm/box_clip_compute.cc deleted file mode 100644 index 9591302c58..0000000000 --- a/lite/kernels/arm/box_clip_compute.cc +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/kernels/arm/box_clip_compute.h"
-#include <algorithm>
-#include <cmath>
-#include "lite/backends/arm/math/funcs.h"
-#include "lite/core/op_registry.h"
-#include "lite/core/tensor.h"
-#include "lite/core/type_system.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-template <typename T>
-void ClipTiledBoxes(const Tensor& im_info,
-                    const Tensor& input_boxes,
-                    Tensor* out) {
-  T* out_data = out->mutable_data<T>();
-  const T* im_info_data = im_info.data<T>();
-  const T* input_boxes_data = input_boxes.data<T>();
-  T zero(0);
-  T im_w = round(im_info_data[1] / im_info_data[2]);
-  T im_h = round(im_info_data[0] / im_info_data[2]);
-  for (int64_t i = 0; i < input_boxes.numel(); ++i) {
-    if (i % 4 == 0) {
-      out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero);
-    } else if (i % 4 == 1) {
-      out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero);
-    } else if (i % 4 == 2) {
-      out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero);
-    } else {
-      out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero);
-    }
-  }
-}
-
-void BoxClipCompute::Run() {
-  auto& param = Param<operators::BoxClipParam>();
-  const auto* input = param.Input;
-  const auto* im_info = param.ImInfo;
-  auto* output = param.Output;
-  output->mutable_data<float>();
-  if (input->lod().size() > 1) {
-    LOG(FATAL) << "Only support 0 and 1 level of LoD.";
-  }
-
-  auto box_lod = input->lod().back();
-  int64_t n = static_cast<int64_t>(box_lod.size() - 1);
-  for (int i = 0; i < n; ++i) {
-    Tensor im_info_slice = im_info->Slice(i, i + 1);
-    auto* im_info_slice_data = im_info_slice.data<float>();
-    Tensor box_slice = input->Slice(box_lod[i], box_lod[i + 1]);
-    Tensor output_slice = output->Slice(box_lod[i], box_lod[i + 1]);
-    ClipTiledBoxes<float>(im_info_slice, box_slice, &output_slice);
-  }
-  return;
-}
-
-}  // namespace arm
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_LITE_KERNEL(box_clip,
-                     kARM,
-                     kFloat,
-                     kNCHW,
-                     paddle::lite::kernels::arm::BoxClipCompute,
-                     def)
-    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindInput("ImInfo", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kARM))})
-    .Finalize();
diff --git a/lite/kernels/arm/box_clip_compute.h b/lite/kernels/arm/box_clip_compute.h
deleted file mode 100644
index 460921b2d0..0000000000
--- a/lite/kernels/arm/box_clip_compute.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <algorithm>
-#include "lite/core/kernel.h"
-#include "lite/operators/box_clip_op.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-class BoxClipCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
- public:
-  using param_t = operators::BoxClipParam;
-
-  void Run() override;
-
-  virtual ~BoxClipCompute() = default;
-};
-
-}  // namespace arm
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/kernels/arm/box_coder_compute.cc b/lite/kernels/arm/box_coder_compute.cc
deleted file mode 100644
index 81e79a83f2..0000000000
--- a/lite/kernels/arm/box_coder_compute.cc
+++ /dev/null
@@ -1,241 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/kernels/arm/box_coder_compute.h"
-#include <string>
-#include <vector>
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-void EncodeCenterSize(const Tensor* target_box,
-                      const Tensor* prior_box,
-                      const Tensor* prior_box_var,
-                      const bool normalized,
-                      const std::vector<float> variance,
-                      float* output) {
-  int64_t row = target_box->dims()[0];
-  int64_t col = prior_box->dims()[0];
-  int64_t len = prior_box->dims()[1];
-  for (int64_t i = 0; i < row; ++i) {
-    for (int64_t j = 0; j < col; ++j) {
-      auto* target_box_data = target_box->data<float>();
-      auto* prior_box_data = prior_box->data<float>();
-      int64_t offset = i * col * len + j * len;
-      float prior_box_width = prior_box_data[j * len + 2] -
-                              prior_box_data[j * len] + (normalized == false);
-      float prior_box_height = prior_box_data[j * len + 3] -
-                               prior_box_data[j * len + 1] +
-                               (normalized == false);
-      float prior_box_center_x = prior_box_data[j * len] + prior_box_width / 2;
-      float prior_box_center_y =
-          prior_box_data[j * len + 1] + prior_box_height / 2;
-
-      float target_box_center_x =
-          (target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
-      float target_box_center_y =
-          (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
-      float target_box_width = target_box_data[i * len + 2] -
-                               target_box_data[i * len] + (normalized == false);
-      float target_box_height = target_box_data[i * len + 3] -
-                                target_box_data[i * len + 1] +
-                                (normalized == false);
-
-      output[offset] =
-          (target_box_center_x - prior_box_center_x) / prior_box_width;
-      output[offset + 1] =
-          (target_box_center_y - prior_box_center_y) / prior_box_height;
-      output[offset + 2] =
-          std::log(std::fabs(target_box_width / prior_box_width));
-      output[offset + 3] =
-          std::log(std::fabs(target_box_height / prior_box_height));
-    }
-  }
-
-  if (prior_box_var) {
-    const float* prior_box_var_data = prior_box_var->data<float>();
-    for (int64_t i = 0; i < row; ++i) {
-      for (int64_t j = 0; j < col; ++j) {
-        for (int k = 0; k < 4; ++k) {
-          int64_t offset = i * col * len + j * len;
-          int64_t prior_var_offset = j * len;
-          output[offset + k] /= prior_box_var_data[prior_var_offset + k];
-        }
-      }
-    }
-  } else if (!(variance.empty())) {
-    for (int64_t i = 0; i < row; ++i) {
-      for (int64_t j = 0; j < col; ++j) {
-        for (int k = 0; k < 4; ++k) {
-          int64_t offset = i * col * len + j * len;
-          output[offset + k] /= static_cast<float>(variance[k]);
-        }
-      }
-    }
-  }
-}
-
-template <int axis, int var_size>
-void DecodeCenterSize(const Tensor* target_box,
-                      const Tensor* prior_box,
-                      const Tensor* prior_box_var,
-                      const bool normalized,
-                      std::vector<float> variance,
-                      float* output) {
-  int64_t row = target_box->dims()[0];
-  int64_t col = target_box->dims()[1];
-  int64_t len = target_box->dims()[2];
-
-  for (int64_t i = 0; i < row; ++i) {
-    for (int64_t j = 0; j < col; ++j) {
-      auto* target_box_data = target_box->data<float>();
-      auto* prior_box_data = prior_box->data<float>();
-
-      float var_data[4] = {1., 1., 1., 1.};
-      float* var_ptr = var_data;
-      int64_t offset = i * col * len + j * len;
-      int64_t prior_box_offset = axis == 0 ? j * len : i * len;
-
-      float prior_box_width = prior_box_data[prior_box_offset + 2] -
-                              prior_box_data[prior_box_offset] +
-                              (normalized == false);
-      float prior_box_height = prior_box_data[prior_box_offset + 3] -
-                               prior_box_data[prior_box_offset + 1] +
-                               (normalized == false);
-      float prior_box_center_x =
-          prior_box_data[prior_box_offset] + prior_box_width / 2;
-      float prior_box_center_y =
-          prior_box_data[prior_box_offset + 1] + prior_box_height / 2;
-
-      float target_box_center_x = 0, target_box_center_y = 0;
-      float target_box_width = 0, target_box_height = 0;
-      int64_t prior_var_offset = axis == 0 ? j * len : i * len;
-      if (var_size == 2) {
-        std::memcpy(var_ptr,
-                    prior_box_var->data<float>() + prior_var_offset,
-                    4 * sizeof(float));
-      } else if (var_size == 1) {
-        var_ptr = reinterpret_cast<float*>(variance.data());
-      }
-      float box_var_x = *var_ptr;
-      float box_var_y = *(var_ptr + 1);
-      float box_var_w = *(var_ptr + 2);
-      float box_var_h = *(var_ptr + 3);
-
-      target_box_center_x =
-          box_var_x * target_box_data[offset] * prior_box_width +
-          prior_box_center_x;
-      target_box_center_y =
-          box_var_y * target_box_data[offset + 1] * prior_box_height +
-          prior_box_center_y;
-      target_box_width =
-          std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width;
-      target_box_height =
-          std::exp(box_var_h * target_box_data[offset + 3]) * prior_box_height;
-
-      output[offset] = target_box_center_x - target_box_width / 2;
-      output[offset + 1] = target_box_center_y - target_box_height / 2;
-      output[offset + 2] =
-          target_box_center_x + target_box_width / 2 - (normalized == false);
-      output[offset + 3] =
-          target_box_center_y + target_box_height / 2 - (normalized == false);
-    }
-  }
-}
-
-void BoxCoderCompute::Run() {
-  /*
-  auto& param = Param<operators::BoxCoderParam>();
-  int axis = param.axis;
-  bool box_normalized = param.box_normalized;
-  std::string code_type = param.code_type;
-
-  lite::arm::math::box_coder(param.proposals,
-                             param.prior_box,
-                             param.prior_box_var,
-                             param.target_box,
-                             code_type,
-                             box_normalized,
-                             axis);
-  */
-  auto& param = Param<operators::BoxCoderParam>();
-  auto* prior_box = param.prior_box;
-  auto* prior_box_var = param.prior_box_var;
-  auto* target_box = param.target_box;
-  auto* output_box = param.proposals;
-  std::vector<float> variance = param.variance;
-  const int axis = param.axis;
-  std::string code_type = param.code_type;
-  bool normalized = param.box_normalized;
-
-  auto row = target_box->dims()[0];
-  auto col = prior_box->dims()[0];
-  if (code_type == "decode_center_size") {
-    col = target_box->dims()[1];
-  }
-  auto len = prior_box->dims()[1];
-  output_box->Resize({row, col, len});
-  auto* output = output_box->mutable_data<float>();
-
-  if (code_type == "encode_center_size") {
-    EncodeCenterSize(
-        target_box, prior_box, prior_box_var, normalized, variance, output);
-  } else if (code_type == "decode_center_size") {
-    if (prior_box_var) {
-      if (axis == 0) {
-        DecodeCenterSize<0, 2>(
-            target_box, prior_box, prior_box_var, normalized, variance, output);
-      } else {
-        DecodeCenterSize<1, 2>(
-            target_box, prior_box, prior_box_var, normalized, variance, output);
-      }
-    } else if (!(variance.empty())) {
-      if (axis == 0) {
-        DecodeCenterSize<0, 1>(
-            target_box, prior_box, prior_box_var, normalized, variance, output);
-      } else {
-        DecodeCenterSize<1, 1>(
-            target_box, prior_box, prior_box_var, normalized, variance, output);
-      }
-    } else {
-      if (axis == 0) {
-        DecodeCenterSize<0, 0>(
-            target_box, prior_box, prior_box_var, normalized, variance, output);
-      } else {
-        DecodeCenterSize<1, 0>(
-            target_box, prior_box, prior_box_var, normalized, variance, output);
-      }
-    }
-  }
-}
-
-}  // namespace arm
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_LITE_KERNEL(box_coder,
-                     kARM,
-                     kFloat,
-                     kNCHW,
-                     paddle::lite::kernels::arm::BoxCoderCompute,
-                     def)
-    .BindInput("PriorBox", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindInput("PriorBoxVar", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindInput("TargetBox", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindOutput("OutputBox", {LiteType::GetTensorTy(TARGET(kARM))})
-    .Finalize();
diff --git a/lite/kernels/arm/box_coder_compute.h b/lite/kernels/arm/box_coder_compute.h
deleted file mode 100644
index 0279af4ea5..0000000000
--- a/lite/kernels/arm/box_coder_compute.h
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "lite/core/kernel.h"
-#include "lite/core/op_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-class BoxCoderCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
- public:
-  using param_t = operators::BoxCoderParam;
-
-  void Run() override;
-
-  virtual ~BoxCoderCompute() = default;
-};
-
-}  // namespace arm
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/kernels/arm/calib_compute.cc b/lite/kernels/arm/calib_compute.cc
deleted file mode 100644
index 525e5aefd6..0000000000
--- a/lite/kernels/arm/calib_compute.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "lite/kernels/arm/calib_compute.h" -#include -#include "lite/backends/arm/math/type_trans.h" -#include "lite/core/op_registry.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void CalibComputeFp32ToInt8::Run() { - auto& param = this->Param(); - std::vector scale = {param.scale}; - const auto* din = param.input->data(); - auto* dout = param.output->mutable_data(); - lite::arm::math::fp32_to_int8( - din, dout, scale.data(), 1, 1, param.input->numel()); - return; -} - -void CalibComputeInt8ToFp32::Run() { - auto& param = this->Param(); - const auto* din = param.input->data(); - std::vector scale = {param.scale}; - auto* dout = param.output->mutable_data(); - lite::arm::math::int8_to_fp32( - din, dout, scale.data(), 1, 1, param.input->numel()); - return; -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(calib, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeFp32ToInt8, - fp32_to_int8) - .BindInput("Input", - {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .Finalize(); - -REGISTER_LITE_KERNEL(calib, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeInt8ToFp32, - int8_to_fp32) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) - .Finalize(); -REGISTER_LITE_KERNEL(calib_once, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeFp32ToInt8, - fp32_to_int8) - .BindInput("Input", - {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .Finalize(); - -REGISTER_LITE_KERNEL(calib_once, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeInt8ToFp32, - int8_to_fp32) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) - .Finalize(); diff --git a/lite/kernels/arm/calib_compute.h b/lite/kernels/arm/calib_compute.h deleted file mode 100644 index 8d9a32bc24..0000000000 --- a/lite/kernels/arm/calib_compute.h +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-
-#pragma once
-#include "lite/core/kernel.h"
-#include "lite/operators/calib_op.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-class CalibComputeFp32ToInt8
-    : public KernelLite<TARGET(kARM), PRECISION(kInt8)> {
- public:
-  using param_t = operators::CalibParam;
-
-  void Run() override;
-
-  ~CalibComputeFp32ToInt8() override{};
-
- private:
-};
-
-class CalibComputeInt8ToFp32
-    : public KernelLite<TARGET(kARM), PRECISION(kInt8)> {
- public:
-  using param_t = operators::CalibParam;
-
-  void Run() override;
-
-  ~CalibComputeInt8ToFp32() override{};
-
- private:
-};
-
-}  // namespace arm
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/kernels/arm/calib_compute_test.cc b/lite/kernels/arm/calib_compute_test.cc
deleted file mode 100644
index 27049cc2c6..0000000000
--- a/lite/kernels/arm/calib_compute_test.cc
+++ /dev/null
@@ -1,156 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/kernels/arm/calib_compute.h"
-#include <gtest/gtest.h>
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-#include "lite/backends/arm/math/funcs.h"
-#include "lite/core/op_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-static int get_rand(int start, int end) {
-  int i = rand();  // NOLINT
-  i = (i % (end - start)) + start;
-  return i;
-}
-
-static void int8_to_fp32_basic(const int8_t* din,
-                               float* dout,
-                               const float* scale,
-                               int axis_size,
-                               int64_t outer_size,
-                               int64_t inner_size) {
-  int loop_size = axis_size * outer_size;
-  for (int i = 0; i < loop_size; ++i) {
-    float scale_in = scale[i % axis_size];
-    for (int j = 0; j < inner_size; ++j) {
-      dout[j] = din[j] * scale_in;
-    }
-    dout += inner_size;
-    din += inner_size;
-  }
-}
-
-static void fp32_to_int8_basic(const float* din,
-                               int8_t* dout,
-                               const float* scale,
-                               int axis_size,
-                               int64_t outer_size,
-                               int64_t inner_size) {
-  int loop_size = axis_size * outer_size;
-  for (int i = 0; i < loop_size; ++i) {
-    float inv_scale = 1.f / scale[i % axis_size];
-    for (int j = 0; j < inner_size; ++j) {
-      dout[j] = static_cast<int8_t>(roundf(din[j] * inv_scale));
-    }
-    dout += inner_size;
-    din += inner_size;
-  }
-}
-
-void calib_ref(const operators::CalibParam& param) {
-  std::vector<float> scale = {param.in_scale};
-  if (param.in_dtype == PRECISION(kFloat) &&
-      param.out_dtype == PRECISION(kInt8)) {
-    const auto* din = param.input->data<float>();
-    auto* dout = param.output->mutable_data<int8_t>();
-    fp32_to_int8_basic(din, dout, scale.data(), 1, 1, param.input->numel());
-    return;
-  }
-  if (param.in_dtype == PRECISION(kInt8) &&
-      param.out_dtype == PRECISION(kFloat)) {
-    const auto* din = param.input->data<int8_t>();
-    auto* dout = param.output->mutable_data<float>();
-    int8_to_fp32_basic(din, dout, scale.data(), 1, 1, param.input->numel());
-    return;
-  }
-  LOG(FATAL) << "Unsupported dtype.";
-}
-
-TEST(calib_arm, retrive_op) {
-  auto calib =
-      KernelRegistry::Global()
-          .Create<TARGET(kARM), PRECISION(kInt8), DATALAYOUT(kNCHW)>("calib");
-  ASSERT_FALSE(calib.empty());
-  ASSERT_TRUE(calib.front());
-}
-
-TEST(calib_arm, init) {
-  CalibCompute calib;
-  ASSERT_EQ(calib.precision(), PRECISION(kInt8));
-  ASSERT_EQ(calib.target(), TARGET(kARM));
-}
-
-TEST(calib_arm, int8_to_fp32) {
-  DeviceInfo::Init();
-  for (auto n : {1, 2}) {
-    for (auto c : {6, 32 /*, 128*/}) {
-      for (auto h : {9, 18 /*, 56 , 112, 224, 512*/}) {
-        for (auto w : {9, 18 /*, 56, 112, 224, 512*/}) {
-          Tensor x;
-          Tensor output;
-          Tensor output_ref;
-          // set the dims of input, output, ref output tensors
-          x.Resize({n, c, h, w});
-          output.Resize({n, c, h, w});
-          output_ref.Resize({n, c, h, w});
-          // initialize the data of input tensors
-          auto* x_data = x.mutable_data<int8_t>();
-          auto* output_data = output.mutable_data<float>();
-          for (int i = 0; i < x.dims().production(); i++) {
-            float sign = i % 3 == 0 ? -1.0f : 1.0f;
-            x_data[i] = sign * static_cast<float>(i % 128) * 0.013f;
-          }
-          // prepare kernel params and run
-          CalibCompute calib;
-          std::unique_ptr<KernelContext> ctx(new KernelContext);
-          ctx->As<ARMContext>();
-          calib.SetContext(std::move(ctx));
-          operators::CalibParam param;
-          param.in_scale = get_rand(0, 100) * 0.1f;
-          param.in_dtype = PRECISION(kInt8);
-          param.out_dtype = PRECISION(kFloat);
-          param.input = &x;
-          param.output = &output;
-          calib.SetParam(param);
-          calib.Launch();
-          // invoking ref implementation and compare results
-          param.output = &output_ref;
-          calib_ref(param);
-          auto* output_ref_data = output_ref.mutable_data<float>();
-          for (int i = 0; i < output.dims().production(); i++) {
-            EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // namespace arm
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
-
-USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32);
-USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8);
diff --git a/lite/kernels/arm/cast_compute.cc b/lite/kernels/arm/cast_compute.cc
deleted file mode 100644
index 8b6971ec13..0000000000
--- a/lite/kernels/arm/cast_compute.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "lite/kernels/arm/cast_compute.h" -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -template -out_type TransOp(in_type in) { - return static_cast(in); -} - -void CastCompute::PrepareForRun() {} - -void CastCompute::Run() { - auto& ctx = this->ctx_->template As(); - auto& param = this->Param(); - - auto input_dims = param.X->dims(); - - // BOOL = 0;INT16 = 1;INT32 = 2;INT64 = 3;FP16 = 4;FP32 = 5;FP64 = 6; - // SIZE_T = 19;UINT8 = 20;INT8 = 21; - if (param.in_dtype == param.out_dtype && param.in_dtype == 2) { - const auto* x_data = param.X->data(); - auto* o_data = param.Out->mutable_data(); - memcpy(o_data, x_data, sizeof(float) * param.X->numel()); - } else if (param.in_dtype == 21 && param.out_dtype == 5) { // int8->float32 - const char* x_data_begin = param.X->data(); - const char* x_data_end = x_data_begin + param.X->numel(); - float* out_data = param.Out->mutable_data(); - std::transform(x_data_begin, x_data_end, out_data, TransOp); - } else { - LOG(FATAL) << "other has not been implemented"; - } -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL( - cast, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::CastCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/cast_compute.h b/lite/kernels/arm/cast_compute.h deleted file mode 100644 index d342a405ad..0000000000 --- a/lite/kernels/arm/cast_compute.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "lite/backends/arm/math/type_trans.h" -#include "lite/core/kernel.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class CastCompute : public KernelLite { - public: - using param_t = operators::CastParam; - - void PrepareForRun() override; - - void Run() override; - - ~CastCompute() {} - - private: -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/compare_compute.cc b/lite/kernels/arm/compare_compute.cc deleted file mode 100644 index fe4b3d6587..0000000000 --- a/lite/kernels/arm/compare_compute.cc +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/kernels/arm/compare_compute.h"
-#include <vector>
-#include "lite/api/paddle_place.h"
-#include "lite/backends/arm/math/funcs.h"
-#include "lite/core/op_registry.h"
-#include "lite/core/type_system.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-#define COMPARE_FUNCTOR(name, op)                                           \
-  template <typename T>                                                     \
-  struct _##name##Functor {                                                 \
-    inline bool operator()(const T &a, const T &b) const { return a op b; } \
-  };
-
-COMPARE_FUNCTOR(Equal, ==);
-COMPARE_FUNCTOR(NotEqual, !=);
-COMPARE_FUNCTOR(LessThan, <);
-COMPARE_FUNCTOR(LessEqual, <=);
-COMPARE_FUNCTOR(GreaterThan, >);
-COMPARE_FUNCTOR(GreaterEqual, >=);
-
-template <>
-struct _EqualFunctor<float> {
-  inline bool operator()(const float &a, const float &b) const {
-    // It is safe to cast a and b to double.
-    return fabs(static_cast<double>(a - b)) < 1e-8;
-  }
-};
-
-template <>
-struct _NotEqualFunctor<float> {
-  inline bool operator()(const float &a, const float &b) const {
-    return !_EqualFunctor<float>()(a, b);
-  }
-};
-
-inline void get_mid_dims(const lite::DDim &x_dims,
-                         const lite::DDim &y_dims,
-                         const int axis,
-                         int *pre,
-                         int *n,
-                         int *post) {
-  *pre = 1;
-  *n = 1;
-  *post = 1;
-  for (int i = 0; i < axis; ++i) {
-    (*pre) *= x_dims[i];
-  }
-
-  for (int i = 0; i < y_dims.size(); ++i) {
-    (*n) *= y_dims[i];
-  }
-
-  for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
-    (*post) *= x_dims[i];
-  }
-}
-template