From 36004a9aa4bc963bf406299235c86d0d42ebf0d5 Mon Sep 17 00:00:00 2001 From: "baolei.an" Date: Thu, 9 Jan 2020 15:12:14 +0800 Subject: [PATCH] update the newest version --- README.md | 75 +- cmake/cross_compiling/ios.cmake | 1 + cmake/cross_compiling/npu.cmake | 2 +- lite/CMakeLists.txt | 16 +- lite/api/CMakeLists.txt | 14 +- lite/api/cxx_api.cc | 6 +- lite/api/cxx_api_impl.cc | 4 +- lite/api/lite_multithread_test.cc | 0 lite/api/model_optimize_tool.cc | 220 +- lite/api/model_test.cc | 1 + lite/api/paddle_api.h | 14 +- lite/api/test_step_rnn_lite_x86.cc | 4 +- .../arm/math/conv3x3s1_depthwise_fp32.cc | 538 -- .../arm/math/conv3x3s2_depthwise_fp32.cc | 361 -- .../backends/arm/math/conv_depthwise_3x3p0.cc | 4178 -------------- .../backends/arm/math/conv_depthwise_3x3p1.cc | 4850 ----------------- .../backends/arm/math/conv_depthwise_3x3s1.cc | 2539 --------- .../backends/arm/math/conv_depthwise_3x3s2.cc | 1862 ------- lite/backends/arm/math/reduce_prod.cc | 0 lite/backends/arm/math/reduce_prod.h | 0 .../arm/math/split_merge_lod_tenosr.cc | 0 .../arm/math/split_merge_lod_tenosr.h | 0 lite/backends/fpga/KD/debugger.hpp | 0 lite/backends/fpga/KD/dl_engine.cpp | 0 lite/backends/fpga/KD/dl_engine.hpp | 0 lite/backends/fpga/KD/llapi/zynqmp_api.cpp | 0 lite/backends/fpga/KD/llapi/zynqmp_api.h | 0 lite/backends/fpga/KD/pes/conv_process.hpp | 0 lite/backends/fpga/KD/pes/crop_pe.cpp | 0 .../fpga/KD/pes/depthwise_conv_pe.hpp | 0 .../fpga/KD/pes/elementwise_mul_pe.hpp | 0 .../fpga/KD/pes/fully_connected_pe.hpp | 0 lite/backends/fpga/KD/pes/gru_pe.hpp | 0 lite/backends/fpga/KD/pes/gru_util.hpp | 0 lite/backends/fpga/KD/pes/output_pe.hpp | 0 lite/backends/fpga/KD/pes/pooling_pe.hpp | 0 lite/backends/fpga/KD/pes/scale_pe.hpp | 0 lite/backends/fpga/lite_tensor.cc | 0 lite/backends/npu/builder.cc | 192 - lite/backends/npu/builder.h | 145 - lite/backends/npu/device.cc | 0 lite/backends/npu/device.h | 4 +- lite/backends/npu/runtime.cc | 60 - lite/backends/npu/runtime.h | 50 - .../cl_kernel/image/conv2d_1x1_kernel.cl | 0 .../opencl/cl_kernel/image/reshape_kernel.cl | 0 lite/backends/x86/jit/README.en.md | 2 +- lite/backends/x86/jit/README.md | 2 +- lite/backends/x86/jit/gen/CMakeLists.txt | 54 +- lite/backends/x86/jit/gen/act.cc | 12 +- lite/backends/x86/jit/gen/blas.cc | 14 +- lite/backends/x86/jit/gen/embseqpool.cc | 2 +- lite/backends/x86/jit/gen/gru.cc | 6 +- lite/backends/x86/jit/gen/hopv.cc | 4 +- lite/backends/x86/jit/gen/lstm.cc | 4 +- lite/backends/x86/jit/gen/matmul.cc | 2 +- lite/backends/x86/jit/gen/seqpool.cc | 2 +- lite/backends/x86/jit/gen/sgd.cc | 2 +- lite/backends/x86/jit/gen/vbroadcast.cc | 2 +- lite/backends/x86/jit/more/CMakeLists.txt | 4 +- .../x86/jit/more/intrinsic/CMakeLists.txt | 4 +- lite/backends/x86/jit/more/mix/CMakeLists.txt | 16 +- lite/backends/x86/jit/more/mkl/CMakeLists.txt | 30 +- lite/backends/x86/jit/refer/CMakeLists.txt | 66 +- lite/backends/x86/jit/refer/refer.cc | 2 +- lite/backends/x86/jit/registry.h | 122 +- lite/backends/x86/parallel.h | 0 lite/backends/xpu/builder.cc | 189 - lite/backends/xpu/builder.h | 60 - lite/backends/xpu/device.cc | 7 +- lite/backends/xpu/device.h | 22 +- lite/backends/xpu/runtime.cc | 46 - lite/backends/xpu/runtime.h | 69 - lite/core/CMakeLists.txt | 10 +- lite/core/arena/CMakeLists.txt | 2 +- lite/core/framework.proto | 1 - lite/core/kernel.h | 9 +- ...elementwise_mul_constant_eliminate_pass.cc | 0 .../elementwise_add_activation_fuse_pass.cc | 4 +- lite/core/mir/fusion/fc_fuse_pass.cc | 11 +- lite/core/mir/fusion/fc_fuse_pass_test.cc | 1 + 
lite/core/mir/fusion/fc_fuser.cc | 16 +- lite/core/mir/fusion/fc_fuser.h | 2 + .../fusion/sequence_pool_concat_fuse_pass.cc | 0 .../fusion/sequence_pool_concat_fuse_pass.h | 0 .../mir/fusion/sequence_pool_concat_fuser.cc | 0 .../mir/fusion/sequence_pool_concat_fuser.h | 0 .../var_conv_2d_activation_fuse_pass.cc | 0 .../fusion/var_conv_2d_activation_fuse_pass.h | 0 .../fusion/var_conv_2d_activation_fuser.cc | 0 .../mir/fusion/var_conv_2d_activation_fuser.h | 0 lite/core/mir/generate_program_pass.cc | 1 - lite/core/mir/subgraph/CMakeLists.txt | 2 +- lite/core/mir/subgraph/subgraph_detector.cc | 2 +- lite/core/mir/subgraph/subgraph_detector.h | 0 .../mir/subgraph/subgraph_detector_test.cc | 0 lite/core/mir/subgraph/subgraph_pass.cc | 6 +- lite/core/mir/subgraph/subgraph_pass.h | 0 lite/core/mir/subgraph/subgraph_pass_test.cc | 2 +- lite/core/profile/profiler.cc | 78 +- lite/core/profile/profiler.h | 26 +- lite/core/profile/test_timer.cc | 6 +- lite/core/program.cc | 16 +- lite/core/program.h | 13 +- lite/core/tensor.h | 21 +- lite/demo/cxx/README.md | 110 +- .../mobile_classify/Makefile.android.armv7 | 0 .../mobile_classify/Makefile.android.armv8 | 0 .../Makefile.android.armv7 | 12 +- .../Makefile.android.armv8 | 12 +- .../makefiles/test_cv/Makefile.android.armv7 | 71 + .../makefiles/test_cv/Makefile.android.armv8 | 70 + .../yolov3_detection/Makefile.android.armv7 | 61 + .../yolov3_detection/Makefile.android.armv8 | 61 + .../cxx/mobile_classify/mobile_classify.cc | 2 +- lite/demo/cxx/mobile_detection/test.jpg | Bin 127499 -> 0 bytes .../ssd_detection.cc} | 2 +- lite/demo/cxx/test_cv/README.md | 131 + lite/demo/cxx/test_cv/test_img_prepross.cc | 389 ++ lite/demo/cxx/test_cv/test_model_cv.cc | 224 + .../cxx/yolov3_detection/yolov3_detection.cc | 238 + lite/kernels/arm/CMakeLists.txt | 2 +- .../arm/collect_fpn_proposals_compute.cc | 0 .../arm/collect_fpn_proposals_compute.h | 0 lite/kernels/arm/conditional_block_compute.cc | 0 lite/kernels/arm/conditional_block_compute.h | 0 lite/kernels/arm/conv_compute.cc | 6 +- .../arm/conv_transpose_compute_test.cc | 371 -- .../arm/distribute_fpn_proposals_compute.cc | 0 .../arm/distribute_fpn_proposals_compute.h | 0 lite/kernels/arm/grid_sampler_compute.cc | 0 lite/kernels/arm/grid_sampler_compute.h | 0 lite/kernels/arm/instance_norm_compute.cc | 0 lite/kernels/arm/instance_norm_compute.h | 0 lite/kernels/arm/merge_lod_tensor_compute.cc | 0 lite/kernels/arm/merge_lod_tensor_compute.h | 0 .../arm/merge_lod_tensor_compute_test.cc | 0 lite/kernels/arm/reduce_prod_compute.cc | 0 lite/kernels/arm/reduce_prod_compute.h | 0 lite/kernels/arm/split_lod_tensor_compute.cc | 0 lite/kernels/arm/split_lod_tensor_compute.h | 0 .../arm/split_lod_tensor_compute_test.cc | 0 lite/kernels/arm/yolo_box_compute.cc | 2 + lite/kernels/cuda/CMakeLists.txt | 2 +- lite/kernels/cuda/conv_compute_test.cc | 1 + lite/kernels/cuda/elementwise_add_compute.cu | 139 - lite/kernels/cuda/elementwise_add_compute.h | 53 - .../cuda/elementwise_add_compute_test.cc | 166 - lite/kernels/cuda/mul_compute.h | 1 - .../cuda/sequence_pool_concat_compute.cu | 0 .../cuda/sequence_pool_concat_compute.h | 0 lite/kernels/cuda/yolo_box_compute.cu | 2 +- lite/kernels/fpga/CMakeLists.txt | 2 +- lite/kernels/fpga/calib_compute.cc | 0 lite/kernels/fpga/conv_compute.cc | 0 lite/kernels/fpga/conv_compute.h | 0 lite/kernels/fpga/dropout_compute.cc | 0 lite/kernels/fpga/elementwise_compute.cc | 0 lite/kernels/fpga/fc_compute.h | 0 lite/kernels/fpga/feed_compute.cc | 0 lite/kernels/fpga/feed_compute.h | 0 
lite/kernels/fpga/fetch_compute.h | 0 lite/kernels/fpga/gru_compute.h | 0 lite/kernels/fpga/im2sequence_compute.cc | 0 lite/kernels/fpga/im2sequence_compute.h | 0 lite/kernels/fpga/mul_compute.h | 0 lite/kernels/fpga/multiclass_nms_compute.cc | 0 lite/kernels/fpga/norm_compute.cc | 0 lite/kernels/fpga/norm_compute.h | 0 lite/kernels/fpga/pooling_compute_test.cc | 0 lite/kernels/fpga/prior_box_compute.cc | 0 lite/kernels/fpga/prior_box_compute.h | 0 lite/kernels/fpga/reshape_compute.cc | 0 lite/kernels/fpga/scale_compute.cc | 0 lite/kernels/fpga/scale_compute.h | 0 lite/kernels/fpga/softmax_compute.cc | 0 lite/kernels/fpga/transpose_compute.cc | 0 lite/kernels/npu/bridges/CMakeLists.txt | 3 +- lite/kernels/npu/bridges/act_op.cc | 59 +- lite/kernels/npu/bridges/argmax_op.cc | 21 +- lite/kernels/npu/bridges/argmax_op_test.cc | 0 lite/kernels/npu/bridges/batch_norm_op.cc | 41 +- .../kernels/npu/bridges/batch_norm_op_test.cc | 168 - lite/kernels/npu/bridges/concat_op.cc | 23 +- lite/kernels/npu/bridges/conv_op.cc | 153 +- lite/kernels/npu/bridges/conv_transpose_op.cc | 78 +- lite/kernels/npu/bridges/elementwise_ops.cc | 77 +- lite/kernels/npu/bridges/engine.cc | 6 +- lite/kernels/npu/bridges/engine.h | 0 lite/kernels/npu/bridges/fc_op.cc | 59 +- lite/kernels/npu/bridges/graph.cc | 54 +- lite/kernels/npu/bridges/graph.h | 217 +- lite/kernels/npu/bridges/interpolate_op.cc | 48 +- lite/kernels/npu/bridges/mul_op.cc | 51 +- lite/kernels/npu/bridges/pad2d_op.cc | 33 +- lite/kernels/npu/bridges/paddle_use_bridges.h | 70 +- .../npu/bridges/paddle_use_npu_bridges.h | 55 - lite/kernels/npu/bridges/pool_op.cc | 34 +- lite/kernels/npu/bridges/pool_op_test.cc | 252 - lite/kernels/npu/bridges/reduce_mean_op.cc | 36 +- lite/kernels/npu/bridges/registry.cc | 24 +- lite/kernels/npu/bridges/registry.h | 30 +- lite/kernels/npu/bridges/reshape_op.cc | 68 +- lite/kernels/npu/bridges/scale_op.cc | 38 +- .../kernels/npu/bridges/shuffle_channel_op.cc | 20 +- .../npu/bridges/shuffle_channel_op_test.cc | 117 - lite/kernels/npu/bridges/softmax_op.cc | 31 +- lite/kernels/npu/bridges/split_op.cc | 37 +- lite/kernels/npu/bridges/sqrt_op.cc | 17 +- lite/kernels/npu/bridges/square_op.cc | 17 +- lite/kernels/npu/bridges/transpose_op.cc | 29 +- lite/kernels/npu/bridges/transpose_op_test.cc | 153 - lite/kernels/npu/bridges/unsqueeze_op.cc | 26 +- lite/kernels/npu/bridges/unsqueeze_op_test.cc | 139 - lite/kernels/npu/bridges/utility.cc | 18 +- lite/kernels/npu/bridges/utility.h | 66 +- lite/kernels/npu/graph_compute.cc | 145 - lite/kernels/npu/graph_compute.h | 54 - lite/kernels/npu/subgraph_compute.cc | 46 +- lite/kernels/npu/subgraph_compute.h | 2 +- lite/kernels/opencl/CMakeLists.txt | 10 +- lite/kernels/opencl/conv2d_1x1_compute.cc | 0 .../kernels/opencl/conv2d_1x1_compute_test.cc | 0 lite/kernels/opencl/reshape_compute.cc | 0 lite/kernels/opencl/reshape_compute_test.cc | 0 lite/kernels/x86/fc_compute_test.cc | 100 - lite/kernels/x86/layer_norm_compute.h | 2 +- lite/kernels/x86/relu_compute.cc | 25 - lite/kernels/x86/relu_compute.h | 52 - lite/kernels/xpu/bridges/act_op.cc | 21 +- lite/kernels/xpu/bridges/act_op_test.cc | 102 - lite/kernels/xpu/bridges/batch_norm_op.cc | 38 +- .../kernels/xpu/bridges/batch_norm_op_test.cc | 164 - lite/kernels/xpu/bridges/conv_op.cc | 51 +- lite/kernels/xpu/bridges/dropout_op.cc | 22 +- lite/kernels/xpu/bridges/elementwise_ops.cc | 32 +- lite/kernels/xpu/bridges/gather_op.cc | 46 +- lite/kernels/xpu/bridges/graph.cc | 107 +- lite/kernels/xpu/bridges/graph.h | 183 +- 
lite/kernels/xpu/bridges/layer_norm_op.cc | 56 +- lite/kernels/xpu/bridges/lookup_table_op.cc | 43 +- lite/kernels/xpu/bridges/matmul_op.cc | 75 +- lite/kernels/xpu/bridges/mul_op.cc | 47 +- lite/kernels/xpu/bridges/paddle_use_bridges.h | 44 +- .../xpu/bridges/paddle_use_xpu_bridges.h | 26 - lite/kernels/xpu/bridges/pool_op.cc | 26 +- lite/kernels/xpu/bridges/registry.cc | 41 - lite/kernels/xpu/bridges/registry.h | 93 - lite/kernels/xpu/bridges/reshape_op.cc | 32 +- lite/kernels/xpu/bridges/scale_op.cc | 18 +- lite/kernels/xpu/bridges/slice_op.cc | 18 +- lite/kernels/xpu/bridges/softmax_op.cc | 14 +- lite/kernels/xpu/bridges/stack_op.cc | 20 +- lite/kernels/xpu/bridges/transpose_op.cc | 26 +- lite/kernels/xpu/bridges/utility.cc | 4 +- lite/kernels/xpu/bridges/utility.h | 1 - lite/kernels/xpu/graph_compute.cc | 99 - lite/kernels/xpu/graph_compute.h | 47 - lite/kernels/xpu/subgraph_compute.cc | 46 +- lite/kernels/xpu/subgraph_compute.h | 0 lite/model_parser/naive_buffer/naive_buffer.h | 33 +- lite/model_parser/naive_buffer/param_desc.cc | 9 +- lite/operators/CMakeLists.txt | 3 +- lite/operators/collect_fpn_proposals_op.cc | 0 lite/operators/collect_fpn_proposals_op.h | 0 lite/operators/compare_op.cc | 2 +- lite/operators/conditional_block_op.cc | 0 lite/operators/conditional_block_op.h | 0 lite/operators/distribute_fpn_proposals_op.cc | 0 lite/operators/distribute_fpn_proposals_op.h | 0 lite/operators/dropout_op.cc | 2 +- lite/operators/fc_op.cc | 2 +- lite/operators/graph_op.cc | 58 - lite/operators/graph_op.h | 52 - lite/operators/grid_sampler_op.cc | 0 lite/operators/grid_sampler_op.h | 0 lite/operators/instance_norm_op.cc | 0 lite/operators/instance_norm_op.h | 0 lite/operators/merge_lod_tensor_op.cc | 0 lite/operators/merge_lod_tensor_op.h | 0 lite/operators/reduce_prod_op.cc | 0 lite/operators/reduce_prod_op.h | 0 lite/operators/sequence_pool_concat_op.cc | 0 lite/operators/sequence_pool_concat_op.h | 0 lite/operators/split_lod_tensor_op.cc | 0 lite/operators/split_lod_tensor_op.h | 0 lite/operators/subgraph_op.cc | 0 lite/operators/subgraph_op.h | 0 lite/tests/cv/CMakeLists.txt | 2 +- lite/tests/cv/cv_basic.h | 61 +- lite/tests/cv/image_convert_test.cc | 136 +- lite/tests/kernels/CMakeLists.txt | 130 +- lite/tests/kernels/batch_norm_compute_test.cc | 181 + lite/tests/kernels/dropout_compute_test.cc | 0 lite/tests/kernels/gather_compute_test.cc | 0 .../kernels/grid_sampler_compute_test.cc | 0 .../kernels/instance_norm_compute_test.cc | 0 lite/tests/kernels/layer_norm_compute_test.cc | 0 .../kernels/lookup_table_compute_test.cc | 0 lite/tests/kernels/mul_compute_test.cc | 0 lite/tests/kernels/pool_compute_test.cc | 367 ++ .../tests/kernels/reduce_prod_compute_test.cc | 0 lite/tests/kernels/reshape_compute_test.cc | 38 +- lite/tests/kernels/scale_compute_test.cc | 57 +- .../kernels/shuffle_channel_compute_test.cc | 72 +- lite/tests/kernels/softmax_compute_test.cc | 47 +- lite/tests/kernels/transpose_compute_test.cc | 44 +- lite/tests/kernels/unsqueeze_compute_test.cc | 88 +- lite/tests/utils/timer.h | 105 - lite/tools/build_bm.sh | 112 - lite/tools/build_xpu.sh | 5 + lite/tools/ci_build.sh | 57 + .../create_fake_kernel_registry.py | 3 + .../cmake_tools/parse_kernel_registry.py | 4 + lite/tools/cmake_tools/parse_op_registry.py | 4 + .../cmake_tools/record_supported_kernel_op.py | 129 + lite/utils/cv/CMakeLists.txt | 3 +- lite/utils/cv/image2tensor.cc | 154 +- lite/utils/cv/image_convert.cc | 302 +- lite/utils/cv/image_flip.cc | 26 + lite/utils/cv/image_flip.h | 9 + 
lite/utils/cv/image_resize.cc | 9 + lite/utils/cv/image_resize.h | 10 + lite/utils/cv/image_rotate.cc | 43 +- lite/utils/cv/image_rotate.h | 10 + lite/utils/cv/paddle_image_preprocess.cc | 53 +- lite/utils/cv/paddle_image_preprocess.h | 27 +- lite/utils/env.h | 0 mobile/src/common/log.h | 14 +- mobile/src/fpga/V2/image.cpp | 0 mobile/src/fpga/V2/pe.cpp | 0 mobile/src/framework/cl/cl_deleter.h | 20 +- mobile/src/framework/cl/cl_engine.cpp | 61 +- mobile/src/framework/cl/cl_engine.h | 61 +- mobile/src/framework/cl/cl_helper.h | 4 +- mobile/src/framework/cl/cl_image.h | 50 +- mobile/src/framework/cl/cl_scope.h | 44 +- mobile/src/framework/context.h | 10 +- mobile/src/framework/executor.cpp | 7 +- mobile/src/framework/loader.cpp | 3 +- mobile/src/framework/operator.cpp | 78 +- mobile/src/io/opencl_interface.cpp | 18 + mobile/src/io/opencl_interface.h | 1 + mobile/src/io/paddle_mobile.h | 13 +- mobile/src/operators/expand_op.cpp | 0 mobile/src/operators/expand_op.h | 0 mobile/src/operators/grid_sampler_op.cpp | 0 mobile/src/operators/grid_sampler_op.h | 0 .../kernel/cl/cl_kernel/conv_kernel.inc.cl | 0 .../cl/cl_kernel/elementwise_mul_kernel.cl | 18 + .../cl/cl_kernel/elementwise_sub_kernel.cl | 0 .../operators/kernel/cl/cl_kernel/expend.cl | 0 .../cl/cl_kernel/grid_sampler_kernel.cl | 0 .../kernel/cl/conv_transpose_kernel.cpp | 8 +- .../kernel/cl/elementwise_mul_kernel.cpp | 178 +- .../kernel/cl/elementwise_sub_kernel.cpp | 0 .../src/operators/kernel/cl/expand_kernel.cpp | 0 .../kernel/cl/grid_sampler_kernel.cpp | 0 mobile/src/operators/kernel/expand_kernel.h | 0 .../kernel/fpga/V2/elementwise_add_kernel.cpp | 0 .../fpga/V2/elementwise_add_relu_kernel.cpp | 0 .../kernel/fpga/V2/reshape2_kernel.cpp | 0 .../operators/kernel/fpga/V2/slice_kernel.cpp | 0 .../operators/kernel/grid_sampler_kernel.h | 0 mobile/src/operators/op_param.h | 40 +- mobile/src/pass/memory_optimize_cl.cpp | 13 +- mobile/test/CMakeLists.txt | 6 + mobile/test/executor_for_test_opencl.h | 0 mobile/test/net/test_inference_api_v2.cpp | 0 mobile/test/net/test_mobilenet_male2fe.cpp | 66 + mobile/test/net/test_net_multi_feed.cpp | 0 mobile/test/operators/test_expend_op.cpp | 0 .../tools/python/fluidtools/run_multi_feed.py | 0 373 files changed, 5457 insertions(+), 20291 deletions(-) mode change 100755 => 100644 lite/api/lite_multithread_test.cc delete mode 100644 lite/backends/arm/math/conv3x3s1_depthwise_fp32.cc delete mode 100644 lite/backends/arm/math/conv3x3s2_depthwise_fp32.cc delete mode 100644 lite/backends/arm/math/conv_depthwise_3x3p0.cc delete mode 100644 lite/backends/arm/math/conv_depthwise_3x3p1.cc delete mode 100644 lite/backends/arm/math/conv_depthwise_3x3s1.cc delete mode 100644 lite/backends/arm/math/conv_depthwise_3x3s2.cc mode change 100755 => 100644 lite/backends/arm/math/reduce_prod.cc mode change 100755 => 100644 lite/backends/arm/math/reduce_prod.h mode change 100755 => 100644 lite/backends/arm/math/split_merge_lod_tenosr.cc mode change 100755 => 100644 lite/backends/arm/math/split_merge_lod_tenosr.h mode change 100755 => 100644 lite/backends/fpga/KD/debugger.hpp mode change 100644 => 100755 lite/backends/fpga/KD/dl_engine.cpp mode change 100644 => 100755 lite/backends/fpga/KD/dl_engine.hpp mode change 100644 => 100755 lite/backends/fpga/KD/llapi/zynqmp_api.cpp mode change 100644 => 100755 lite/backends/fpga/KD/llapi/zynqmp_api.h mode change 100644 => 100755 lite/backends/fpga/KD/pes/conv_process.hpp mode change 100644 => 100755 lite/backends/fpga/KD/pes/crop_pe.cpp mode change 100644 => 100755 
lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp mode change 100755 => 100644 lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp mode change 100644 => 100755 lite/backends/fpga/KD/pes/fully_connected_pe.hpp mode change 100755 => 100644 lite/backends/fpga/KD/pes/gru_pe.hpp mode change 100755 => 100644 lite/backends/fpga/KD/pes/gru_util.hpp mode change 100644 => 100755 lite/backends/fpga/KD/pes/output_pe.hpp mode change 100644 => 100755 lite/backends/fpga/KD/pes/pooling_pe.hpp mode change 100755 => 100644 lite/backends/fpga/KD/pes/scale_pe.hpp mode change 100644 => 100755 lite/backends/fpga/lite_tensor.cc delete mode 100644 lite/backends/npu/builder.cc delete mode 100644 lite/backends/npu/builder.h mode change 100755 => 100644 lite/backends/npu/device.cc mode change 100755 => 100644 lite/backends/npu/device.h delete mode 100644 lite/backends/npu/runtime.cc delete mode 100644 lite/backends/npu/runtime.h mode change 100755 => 100644 lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl mode change 100755 => 100644 lite/backends/opencl/cl_kernel/image/reshape_kernel.cl mode change 100755 => 100644 lite/backends/x86/parallel.h delete mode 100644 lite/backends/xpu/builder.cc delete mode 100644 lite/backends/xpu/builder.h mode change 100755 => 100644 lite/backends/xpu/device.cc mode change 100755 => 100644 lite/backends/xpu/device.h delete mode 100644 lite/backends/xpu/runtime.cc delete mode 100644 lite/backends/xpu/runtime.h mode change 100755 => 100644 lite/core/mir/elimination/elementwise_mul_constant_eliminate_pass.cc mode change 100755 => 100644 lite/core/mir/fusion/sequence_pool_concat_fuse_pass.cc mode change 100755 => 100644 lite/core/mir/fusion/sequence_pool_concat_fuse_pass.h mode change 100755 => 100644 lite/core/mir/fusion/sequence_pool_concat_fuser.cc mode change 100755 => 100644 lite/core/mir/fusion/sequence_pool_concat_fuser.h mode change 100755 => 100644 lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.cc mode change 100755 => 100644 lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.h mode change 100755 => 100644 lite/core/mir/fusion/var_conv_2d_activation_fuser.cc mode change 100755 => 100644 lite/core/mir/fusion/var_conv_2d_activation_fuser.h mode change 100755 => 100644 lite/core/mir/subgraph/subgraph_detector.cc mode change 100755 => 100644 lite/core/mir/subgraph/subgraph_detector.h mode change 100755 => 100644 lite/core/mir/subgraph/subgraph_detector_test.cc mode change 100755 => 100644 lite/core/mir/subgraph/subgraph_pass.cc mode change 100755 => 100644 lite/core/mir/subgraph/subgraph_pass.h mode change 100755 => 100644 lite/core/mir/subgraph/subgraph_pass_test.cc mode change 100755 => 100644 lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7 mode change 100755 => 100644 lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8 rename lite/demo/cxx/makefiles/{mobile_detection => ssd_detection}/Makefile.android.armv7 (90%) rename lite/demo/cxx/makefiles/{mobile_detection => ssd_detection}/Makefile.android.armv8 (89%) create mode 100644 lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 create mode 100644 lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 create mode 100644 lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv7 create mode 100644 lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv8 mode change 100755 => 100644 lite/demo/cxx/mobile_classify/mobile_classify.cc delete mode 100644 lite/demo/cxx/mobile_detection/test.jpg rename lite/demo/cxx/{mobile_detection/mobile_detection.cc => 
ssd_detection/ssd_detection.cc} (98%) create mode 100644 lite/demo/cxx/test_cv/README.md create mode 100644 lite/demo/cxx/test_cv/test_img_prepross.cc create mode 100644 lite/demo/cxx/test_cv/test_model_cv.cc create mode 100644 lite/demo/cxx/yolov3_detection/yolov3_detection.cc mode change 100755 => 100644 lite/kernels/arm/collect_fpn_proposals_compute.cc mode change 100755 => 100644 lite/kernels/arm/collect_fpn_proposals_compute.h mode change 100755 => 100644 lite/kernels/arm/conditional_block_compute.cc mode change 100755 => 100644 lite/kernels/arm/conditional_block_compute.h delete mode 100644 lite/kernels/arm/conv_transpose_compute_test.cc mode change 100755 => 100644 lite/kernels/arm/distribute_fpn_proposals_compute.cc mode change 100755 => 100644 lite/kernels/arm/distribute_fpn_proposals_compute.h mode change 100755 => 100644 lite/kernels/arm/grid_sampler_compute.cc mode change 100755 => 100644 lite/kernels/arm/grid_sampler_compute.h mode change 100755 => 100644 lite/kernels/arm/instance_norm_compute.cc mode change 100755 => 100644 lite/kernels/arm/instance_norm_compute.h mode change 100755 => 100644 lite/kernels/arm/merge_lod_tensor_compute.cc mode change 100755 => 100644 lite/kernels/arm/merge_lod_tensor_compute.h mode change 100755 => 100644 lite/kernels/arm/merge_lod_tensor_compute_test.cc mode change 100755 => 100644 lite/kernels/arm/reduce_prod_compute.cc mode change 100755 => 100644 lite/kernels/arm/reduce_prod_compute.h mode change 100755 => 100644 lite/kernels/arm/split_lod_tensor_compute.cc mode change 100755 => 100644 lite/kernels/arm/split_lod_tensor_compute.h mode change 100755 => 100644 lite/kernels/arm/split_lod_tensor_compute_test.cc delete mode 100644 lite/kernels/cuda/elementwise_add_compute.cu delete mode 100644 lite/kernels/cuda/elementwise_add_compute.h delete mode 100644 lite/kernels/cuda/elementwise_add_compute_test.cc mode change 100755 => 100644 lite/kernels/cuda/sequence_pool_concat_compute.cu mode change 100755 => 100644 lite/kernels/cuda/sequence_pool_concat_compute.h mode change 100644 => 100755 lite/kernels/fpga/CMakeLists.txt mode change 100644 => 100755 lite/kernels/fpga/calib_compute.cc mode change 100644 => 100755 lite/kernels/fpga/conv_compute.cc mode change 100644 => 100755 lite/kernels/fpga/conv_compute.h mode change 100755 => 100644 lite/kernels/fpga/dropout_compute.cc mode change 100644 => 100755 lite/kernels/fpga/elementwise_compute.cc mode change 100644 => 100755 lite/kernels/fpga/fc_compute.h mode change 100644 => 100755 lite/kernels/fpga/feed_compute.cc mode change 100644 => 100755 lite/kernels/fpga/feed_compute.h mode change 100644 => 100755 lite/kernels/fpga/fetch_compute.h mode change 100755 => 100644 lite/kernels/fpga/gru_compute.h mode change 100755 => 100644 lite/kernels/fpga/im2sequence_compute.cc mode change 100755 => 100644 lite/kernels/fpga/im2sequence_compute.h mode change 100755 => 100644 lite/kernels/fpga/mul_compute.h mode change 100755 => 100644 lite/kernels/fpga/multiclass_nms_compute.cc mode change 100755 => 100644 lite/kernels/fpga/norm_compute.cc mode change 100755 => 100644 lite/kernels/fpga/norm_compute.h mode change 100644 => 100755 lite/kernels/fpga/pooling_compute_test.cc mode change 100755 => 100644 lite/kernels/fpga/prior_box_compute.cc mode change 100755 => 100644 lite/kernels/fpga/prior_box_compute.h mode change 100755 => 100644 lite/kernels/fpga/reshape_compute.cc mode change 100644 => 100755 lite/kernels/fpga/scale_compute.cc mode change 100644 => 100755 lite/kernels/fpga/scale_compute.h mode change 100644 => 
100755 lite/kernels/fpga/softmax_compute.cc mode change 100755 => 100644 lite/kernels/fpga/transpose_compute.cc mode change 100755 => 100644 lite/kernels/npu/bridges/argmax_op.cc mode change 100755 => 100644 lite/kernels/npu/bridges/argmax_op_test.cc delete mode 100644 lite/kernels/npu/bridges/batch_norm_op_test.cc mode change 100755 => 100644 lite/kernels/npu/bridges/engine.cc mode change 100755 => 100644 lite/kernels/npu/bridges/engine.h mode change 100755 => 100644 lite/kernels/npu/bridges/graph.cc mode change 100755 => 100644 lite/kernels/npu/bridges/graph.h mode change 100755 => 100644 lite/kernels/npu/bridges/paddle_use_bridges.h delete mode 100644 lite/kernels/npu/bridges/paddle_use_npu_bridges.h delete mode 100644 lite/kernels/npu/bridges/pool_op_test.cc delete mode 100644 lite/kernels/npu/bridges/shuffle_channel_op_test.cc delete mode 100644 lite/kernels/npu/bridges/transpose_op_test.cc mode change 100755 => 100644 lite/kernels/npu/bridges/unsqueeze_op.cc delete mode 100755 lite/kernels/npu/bridges/unsqueeze_op_test.cc mode change 100755 => 100644 lite/kernels/npu/bridges/utility.cc mode change 100755 => 100644 lite/kernels/npu/bridges/utility.h delete mode 100644 lite/kernels/npu/graph_compute.cc delete mode 100644 lite/kernels/npu/graph_compute.h mode change 100755 => 100644 lite/kernels/npu/subgraph_compute.cc mode change 100755 => 100644 lite/kernels/npu/subgraph_compute.h mode change 100755 => 100644 lite/kernels/opencl/conv2d_1x1_compute.cc mode change 100755 => 100644 lite/kernels/opencl/conv2d_1x1_compute_test.cc mode change 100755 => 100644 lite/kernels/opencl/reshape_compute.cc mode change 100755 => 100644 lite/kernels/opencl/reshape_compute_test.cc delete mode 100644 lite/kernels/x86/fc_compute_test.cc delete mode 100644 lite/kernels/x86/relu_compute.cc delete mode 100644 lite/kernels/x86/relu_compute.h delete mode 100644 lite/kernels/xpu/bridges/act_op_test.cc delete mode 100644 lite/kernels/xpu/bridges/batch_norm_op_test.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/dropout_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/gather_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/graph.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/graph.h mode change 100755 => 100644 lite/kernels/xpu/bridges/layer_norm_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/lookup_table_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/matmul_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/paddle_use_bridges.h delete mode 100644 lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h delete mode 100644 lite/kernels/xpu/bridges/registry.cc delete mode 100644 lite/kernels/xpu/bridges/registry.h mode change 100755 => 100644 lite/kernels/xpu/bridges/reshape_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/scale_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/slice_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/stack_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/transpose_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/utility.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/utility.h delete mode 100644 lite/kernels/xpu/graph_compute.cc delete mode 100644 lite/kernels/xpu/graph_compute.h mode change 100755 => 100644 lite/kernels/xpu/subgraph_compute.cc mode change 100755 => 100644 lite/kernels/xpu/subgraph_compute.h mode change 100755 => 100644 lite/operators/collect_fpn_proposals_op.cc mode change 100755 => 100644 lite/operators/collect_fpn_proposals_op.h mode 
change 100755 => 100644 lite/operators/conditional_block_op.cc mode change 100755 => 100644 lite/operators/conditional_block_op.h mode change 100755 => 100644 lite/operators/distribute_fpn_proposals_op.cc mode change 100755 => 100644 lite/operators/distribute_fpn_proposals_op.h delete mode 100644 lite/operators/graph_op.cc delete mode 100644 lite/operators/graph_op.h mode change 100755 => 100644 lite/operators/grid_sampler_op.cc mode change 100755 => 100644 lite/operators/grid_sampler_op.h mode change 100755 => 100644 lite/operators/instance_norm_op.cc mode change 100755 => 100644 lite/operators/instance_norm_op.h mode change 100755 => 100644 lite/operators/merge_lod_tensor_op.cc mode change 100755 => 100644 lite/operators/merge_lod_tensor_op.h mode change 100755 => 100644 lite/operators/reduce_prod_op.cc mode change 100755 => 100644 lite/operators/reduce_prod_op.h mode change 100755 => 100644 lite/operators/sequence_pool_concat_op.cc mode change 100755 => 100644 lite/operators/sequence_pool_concat_op.h mode change 100755 => 100644 lite/operators/split_lod_tensor_op.cc mode change 100755 => 100644 lite/operators/split_lod_tensor_op.h mode change 100755 => 100644 lite/operators/subgraph_op.cc mode change 100755 => 100644 lite/operators/subgraph_op.h create mode 100644 lite/tests/kernels/batch_norm_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/dropout_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/gather_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/grid_sampler_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/instance_norm_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/layer_norm_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/lookup_table_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/mul_compute_test.cc create mode 100644 lite/tests/kernels/pool_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/reduce_prod_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/reshape_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/softmax_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/transpose_compute_test.cc delete mode 100644 lite/tests/utils/timer.h delete mode 100755 lite/tools/build_bm.sh create mode 100644 lite/tools/cmake_tools/record_supported_kernel_op.py mode change 100755 => 100644 lite/utils/env.h mode change 100755 => 100644 mobile/src/fpga/V2/image.cpp mode change 100755 => 100644 mobile/src/fpga/V2/pe.cpp mode change 100755 => 100644 mobile/src/operators/expand_op.cpp mode change 100755 => 100644 mobile/src/operators/expand_op.h mode change 100755 => 100644 mobile/src/operators/grid_sampler_op.cpp mode change 100755 => 100644 mobile/src/operators/grid_sampler_op.h mode change 100755 => 100644 mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl mode change 100755 => 100644 mobile/src/operators/kernel/cl/cl_kernel/elementwise_sub_kernel.cl mode change 100755 => 100644 mobile/src/operators/kernel/cl/cl_kernel/expend.cl mode change 100755 => 100644 mobile/src/operators/kernel/cl/cl_kernel/grid_sampler_kernel.cl mode change 100755 => 100644 mobile/src/operators/kernel/cl/elementwise_sub_kernel.cpp mode change 100755 => 100644 mobile/src/operators/kernel/cl/expand_kernel.cpp mode change 100755 => 100644 mobile/src/operators/kernel/cl/grid_sampler_kernel.cpp mode change 100755 => 100644 mobile/src/operators/kernel/expand_kernel.h mode change 100755 => 100644 
mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp mode change 100755 => 100644 mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp mode change 100755 => 100644 mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp mode change 100755 => 100644 mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp mode change 100755 => 100644 mobile/src/operators/kernel/grid_sampler_kernel.h mode change 100755 => 100644 mobile/test/executor_for_test_opencl.h mode change 100755 => 100644 mobile/test/net/test_inference_api_v2.cpp create mode 100644 mobile/test/net/test_mobilenet_male2fe.cpp mode change 100755 => 100644 mobile/test/net/test_net_multi_feed.cpp mode change 100755 => 100644 mobile/test/operators/test_expend_op.cpp mode change 100755 => 100644 mobile/tools/python/fluidtools/run_multi_feed.py diff --git a/README.md b/README.md index 83d0a986da..22b8488829 100644 --- a/README.md +++ b/README.md @@ -1 +1,74 @@ -编译方法: ./lite/tools/build_bm.sh --target_name=bm --bm_sdk_root=/Paddle-Lite/third-party/bmnnsdk2-bm1684_v2.0.1 bm +[中文版](./README_cn.md) + +# Paddle Lite + + +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddlepaddle.github.io/Paddle-Lite/) +[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) + + + +Paddle Lite is an updated version of Paddle-Mobile, an open-open source deep learning framework designed to make it easy to perform inference on mobile, embeded, and IoT devices. It is compatible with PaddlePaddle and pre-trained models from other sources. + +For tutorials, please see [PaddleLite Document](https://paddlepaddle.github.io/Paddle-Lite/). + +## Key Features + +### Light Weight + +On mobile devices, execution module can be deployed without third-party libraries, because our excecution module and analysis module are decoupled. + +On ARM V7, only 800KB are taken up, while on ARM V8, 1.3MB are taken up with the 80 operators and 85 kernels in the dynamic libraries provided by Paddle Lite. + +Paddle Lite enables immediate inference without extra optimization. + +### High Performance + +Paddle Lite enables device-optimized kernels, maximizing ARM CPU performance. + +It also supports INT8 quantizations with [PaddleSlim model compression tools](https://github.com/PaddlePaddle/models/tree/v1.5/PaddleSlim), reducing the size of models and increasing the performance of models. + +On Huawei NPU and FPGA, the performance is also boosted. + +The latest benchmark is located at [benchmark](https://paddlepaddle.github.io/Paddle-Lite/develop/benchmark/) + +### High Compatibility + +Hardware compatibility: Paddle Lite supports a diversity of hardwares — ARM CPU, Mali GPU, Adreno GPU, Huawei NPU and FPGA. In the near future, we will also support AI microchips from Cambricon and Bitmain. + +Model compatibility: The Op of Paddle Lite is fully compatible to that of PaddlePaddle. The accuracy and performance of 18 models (mostly CV models and OCR models) and 85 operators have been validated. In the future, we will also support other models. + +Framework compatibility: In addition to models trained on PaddlePaddle, those trained on Caffe and TensorFlow can also be converted to be used on Paddle Lite, via [X2Paddle](https://github.com/PaddlePaddle/X2Paddle). In the future to come, we will also support models of ONNX format. 
+ +## Architecture + +Paddle Lite is designed to support a wide range of hardwares and devices, and it enables mixed execution of a single model on multiple devices, optimization on various phases, and leight-weighted applications on devices. + +![img](https://user-images.githubusercontent.com/45189361/70908123-6ce4fd00-2045-11ea-97e1-ad08446c5c86.png) + +As is shown in the figure above, analysis phase includes Machine IR module, and it enables optimizations like Op fusion and redundant computation pruning. Besides, excecution phase only involves Kernal exevution, so it can be deployed on its own to ensure maximized light-weighted deployment. + +## Key Info about the Update + +The earlier Paddle-Mobile was designed to be compatible with PaddlePaddle and multiple hardwares, including ARM CPU, Mali GPU, Adreno GPU, FPGA, ARM-Linux and Apple's GPU Metal. Within Baidu, inc, many product lines have been using Paddle-Mobile. For more details, please see: [mobile/README](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/README.md). + +As an update of Paddle-Mobile, Paddle Lite has incorporated many older capabilities into the [new architecture](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite). For the time being, the code of Paddle-mobile will be kept under the directory `mobile/`, before complete transfer to Paddle Lite. + +For demands of Apple's GPU Metal and web front end inference, please see `./metal` and `./web` . These two modules will be further developed and maintained. + +## Special Thanks + +Paddle Lite has referenced the following open-source projects: + +- [ARM compute library](http://agroup.baidu.com/paddle-infer/md/article/%28https://github.com/ARM-software/ComputeLibrary%29) +- [Anakin](https://github.com/PaddlePaddle/Anakin). The optimizations under Anakin has been incorporated into Paddle Lite, and so there will not be any future updates of Anakin. As another high-performance inference project under PaddlePaddle, Anakin has been forward-looking and helpful to the making of Paddle Lite. + + +## Feedback and Community Support + +- Questions, reports, and suggestions are welcome through Github Issues! +- Forum: Opinions and questions are welcome at our [PaddlePaddle Forum](https://ai.baidu.com/forum/topic/list/168)! +- WeChat Official Account: PaddlePaddle +- QQ Group Chat: 696965088 +

     

+

  WeChat Official Account           QQ Group Chat     

diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake index 76f62765af..0597ef0cc4 100644 --- a/cmake/cross_compiling/ios.cmake +++ b/cmake/cross_compiling/ios.cmake @@ -120,6 +120,7 @@ # ## Lite settings +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flto") if (ARM_TARGET_OS STREQUAL "ios") set(PLATFORM "OS") elseif(ARM_TARGET_OS STREQUAL "ios64") diff --git a/cmake/cross_compiling/npu.cmake b/cmake/cross_compiling/npu.cmake index 25aa4d2bc8..c22bb1db4f 100644 --- a/cmake/cross_compiling/npu.cmake +++ b/cmake/cross_compiling/npu.cmake @@ -30,7 +30,7 @@ if(NOT NPU_DDK_INC) message(FATAL_ERROR "Can not find HiAiModelManagerService.h in ${NPU_DDK_ROOT}/include") endif() -include_directories("${NPU_DDK_ROOT}") +include_directories("${NPU_DDK_ROOT}/include") set(NPU_SUB_LIB_PATH "lib64") if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index c053d4ec2b..cb6a872e06 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -224,10 +224,14 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" - COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/ssd_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/ssd_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/ssd_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/yolov3_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/yolov3_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/yolov3_detection/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile" ) add_dependencies(publish_inference_android_cxx_demos logging gflags) add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) @@ -239,10 +243,14 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/README.md" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" 
"${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" - COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/ssd_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/ssd_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/ssd_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/yolov3_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/yolov3_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/yolov3_detection/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile" ) add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos) endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index d57496487a..a1fde4c152 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -35,6 +35,7 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGE NPU_DEPS ${npu_kernels}) target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels}) + if (LITE_WITH_NPU) # Strips the symbols of our protobuf functions to fix the conflicts during # loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so) @@ -45,8 +46,8 @@ else() if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux")) add_library(paddle_light_api_shared SHARED "") target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc) - set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") - add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) + set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") + add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) if (LITE_WITH_NPU) # Need to add HIAI runtime libs (libhiai.so) dependency target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs}) @@ -91,6 +92,7 @@ if (NOT LITE_ON_TINY_PUBLISH) SRCS cxx_api.cc DEPS ${cxx_api_deps} ${ops} ${host_kernels} program X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels} ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} @@ -129,7 +131,9 @@ if(WITH_TESTING) DEPS cxx_api mir_passes lite_api_test_helper ${ops} ${host_kernels} X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels} ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} @@ -293,12 +297,13 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL) message(STATUS "Compiling model_optimize_tool") lite_cc_binary(model_optimize_tool SRCS 
model_optimize_tool.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS gflags kernel op optimizer mir_passes utils) - add_dependencies(model_optimize_tool op_list_h kernel_list_h all_kernel_faked_cc) + add_dependencies(model_optimize_tool op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h) endif(LITE_ON_MODEL_OPTIMIZE_TOOL) lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle_api_light ${ops} ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} @@ -327,13 +332,14 @@ if(NOT IOS) lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) - lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils + lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index 990d08f18f..c1e9fc4224 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -201,7 +201,11 @@ void Predictor::Build(const lite_api::CxxConfig &config, const std::string &model_file = config.model_file(); const std::string ¶m_file = config.param_file(); const bool model_from_memory = config.model_from_memory(); - LOG(INFO) << "load from memory " << model_from_memory; + if (model_from_memory) { + LOG(INFO) << "Load model from memory."; + } else { + LOG(INFO) << "Load model from file."; + } Build(model_path, model_file, diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index 3e6e10103e..81ea60eac6 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -42,11 +42,11 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) - int num_threads = config.cpu_math_library_num_threads(); + int num_threads = config.x86_math_library_num_threads(); int real_num_threads = num_threads > 1 ? 
num_threads : 1; paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads); omp_set_num_threads(real_num_threads); - VLOG(3) << "set_cpu_math_library_math_threads() is set successfully and the " + VLOG(3) << "set_x86_math_library_math_threads() is set successfully and the " "number of threads is:" << num_threads; #endif diff --git a/lite/api/lite_multithread_test.cc b/lite/api/lite_multithread_test.cc old mode 100755 new mode 100644 diff --git a/lite/api/model_optimize_tool.cc b/lite/api/model_optimize_tool.cc index b678c7ecd2..fc23e0b54b 100644 --- a/lite/api/model_optimize_tool.cc +++ b/lite/api/model_optimize_tool.cc @@ -16,8 +16,9 @@ #ifdef PADDLE_WITH_TESTING #include #endif -// "all_kernel_faked.cc" and "kernel_src_map.h" are created automatically during -// model_optimize_tool's compiling period +// "supported_kernel_op_info.h", "all_kernel_faked.cc" and "kernel_src_map.h" +// are created automatically during model_optimize_tool's compiling period +#include #include "all_kernel_faked.cc" // NOLINT #include "kernel_src_map.h" // NOLINT #include "lite/api/cxx_api.h" @@ -25,8 +26,11 @@ #include "lite/api/paddle_use_ops.h" #include "lite/api/paddle_use_passes.h" #include "lite/core/op_registry.h" +#include "lite/model_parser/compatible_pb.h" +#include "lite/model_parser/pb/program_desc.h" #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" +#include "supported_kernel_op_info.h" // NOLINT DEFINE_string(model_dir, "", @@ -62,10 +66,16 @@ DEFINE_string(valid_targets, "The targets this model optimized for, should be one of (arm, " "opencl, x86), splitted by space"); DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels"); +DEFINE_bool(print_supported_ops, + false, + "Print supported operators on the inputed target"); +DEFINE_bool(print_all_ops, + false, + "Print all the valid operators of Paddle-Lite"); +DEFINE_bool(print_model_ops, false, "Print operators in the input model"); namespace paddle { namespace lite_api { - //! Display the kernel information. void DisplayKernels() { LOG(INFO) << ::paddle::lite::KernelRegistry::Global().DebugString(); @@ -130,9 +140,7 @@ void RunOptimize(const std::string& model_dir, config.set_model_dir(model_dir); config.set_model_file(model_file); config.set_param_file(param_file); - config.set_valid_places(valid_places); - auto predictor = lite_api::CreatePaddlePredictor(config); LiteModelType model_type; @@ -168,6 +176,202 @@ void CollectModelMetaInfo(const std::string& output_dir, lite::WriteLines(std::vector(total.begin(), total.end()), output_path); } +void PrintOpsInfo(std::set valid_ops = {}) { + std::vector targets = {"kHost", + "kX86", + "kCUDA", + "kARM", + "kOpenCL", + "kFPGA", + "kNPU", + "kXPU", + "kAny", + "kUnk"}; + int maximum_optype_length = 0; + for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { + maximum_optype_length = it->first.size() > maximum_optype_length + ? 
it->first.size() + : maximum_optype_length; + } + std::cout << std::setiosflags(std::ios::internal); + std::cout << std::setw(maximum_optype_length) << "OP_name"; + for (int i = 0; i < targets.size(); i++) { + std::cout << std::setw(10) << targets[i].substr(1); + } + std::cout << std::endl; + if (valid_ops.empty()) { + for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { + std::cout << std::setw(maximum_optype_length) << it->first; + auto ops_valid_places = it->second; + for (int i = 0; i < targets.size(); i++) { + if (std::find(ops_valid_places.begin(), + ops_valid_places.end(), + targets[i]) != ops_valid_places.end()) { + std::cout << std::setw(10) << "Y"; + } else { + std::cout << std::setw(10) << " "; + } + } + std::cout << std::endl; + } + } else { + for (auto op = valid_ops.begin(); op != valid_ops.end(); op++) { + std::cout << std::setw(maximum_optype_length) << *op; + // Check: If this kernel doesn't match any operator, we will skip it. + if (supported_ops.find(*op) == supported_ops.end()) { + continue; + } + // Print OP info. + auto ops_valid_places = supported_ops.at(*op); + for (int i = 0; i < targets.size(); i++) { + if (std::find(ops_valid_places.begin(), + ops_valid_places.end(), + targets[i]) != ops_valid_places.end()) { + std::cout << std::setw(10) << "Y"; + } else { + std::cout << std::setw(10) << " "; + } + } + std::cout << std::endl; + } + } +} +/// Print help information +void PrintHelpInfo() { + // at least one argument should be inputed + const char help_info[] = + "At least one argument should be inputed. Valid arguments are listed " + "below:\n" + " Arguments of model optimization:\n" + " `--model_dir=`\n" + " `--model_file=`\n" + " `--param_file=`\n" + " `--optimize_out_type=(protobuf|naive_buffer)`\n" + " `--optimize_out=`\n" + " `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" + " `--prefer_int8_kernel=(true|false)`\n" + " `--record_tailoring_info=(true|false)`\n" + " Arguments of model checking and ops information:\n" + " `--print_all_ops=true` Display all the valid operators of " + "Paddle-Lite\n" + " `--print_supported_ops=true " + "--valid_targets=(arm|opencl|x86|npu|xpu)`" + " Display valid operators of input targets\n" + " `--print_model_ops=true --model_dir= " + "--valid_targets=(arm|opencl|x86|npu|xpu)`" + " Display operators in the input model\n"; + std::cout << help_info << std::endl; + exit(1); +} + +// Parse Input command +void ParseInputCommand() { + if (FLAGS_print_all_ops) { + std::cout << "All OPs supported by Paddle-Lite: " << supported_ops.size() + << " ops in total." << std::endl; + PrintOpsInfo(); + exit(1); + } else if (FLAGS_print_supported_ops) { + auto valid_places = paddle::lite_api::ParserValidPlaces(); + // get valid_targets string + std::vector target_types = {}; + for (int i = 0; i < valid_places.size(); i++) { + target_types.push_back(valid_places[i].target); + } + std::string targets_str = TargetToStr(target_types[0]); + for (int i = 1; i < target_types.size(); i++) { + targets_str = targets_str + TargetToStr(target_types[i]); + } + + std::cout << "Supported OPs on '" << targets_str << "': " << std::endl; + target_types.push_back(TARGET(kHost)); + target_types.push_back(TARGET(kUnk)); + + std::set valid_ops; + for (int i = 0; i < target_types.size(); i++) { + auto ops = supported_ops_target[static_cast(target_types[i])]; + valid_ops.insert(ops.begin(), ops.end()); + } + PrintOpsInfo(valid_ops); + exit(1); + } +} +// test whether this model is supported +void CheckIfModelSupported() { + // 1. 
parse valid places and valid targets + auto valid_places = paddle::lite_api::ParserValidPlaces(); + // set valid_ops + auto valid_ops = supported_ops_target[static_cast(TARGET(kHost))]; + auto valid_unktype_ops = supported_ops_target[static_cast(TARGET(kUnk))]; + valid_ops.insert( + valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end()); + for (int i = 0; i < valid_places.size(); i++) { + auto target = valid_places[i].target; + auto ops = supported_ops_target[static_cast(target)]; + valid_ops.insert(valid_ops.end(), ops.begin(), ops.end()); + } + // get valid ops + std::set valid_ops_set(valid_ops.begin(), valid_ops.end()); + + // 2.Load model into program to get ops in model + std::string prog_path = FLAGS_model_dir + "/__model__"; + if (!FLAGS_model_file.empty() && !FLAGS_param_file.empty()) { + prog_path = FLAGS_model_file; + } + lite::cpp::ProgramDesc cpp_prog; + framework::proto::ProgramDesc pb_proto_prog = + *lite::LoadProgram(prog_path, false); + lite::pb::ProgramDesc pb_prog(&pb_proto_prog); + // Transform to cpp::ProgramDesc + lite::TransformProgramDescAnyToCpp(pb_prog, &cpp_prog); + + std::set unsupported_ops; + std::set input_model_ops; + for (int index = 0; index < cpp_prog.BlocksSize(); index++) { + auto current_block = cpp_prog.GetBlock(index); + for (size_t i = 0; i < current_block->OpsSize(); ++i) { + auto& op_desc = *current_block->GetOp(i); + auto op_type = op_desc.Type(); + input_model_ops.insert(op_type); + if (valid_ops_set.count(op_type) == 0) { + unsupported_ops.insert(op_type); + } + } + } + // 3. Print ops_info of input model and check if this model is supported + if (FLAGS_print_model_ops) { + std::cout << "OPs in the input model include:\n"; + PrintOpsInfo(input_model_ops); + } + if (!unsupported_ops.empty()) { + std::string unsupported_ops_str = *unsupported_ops.begin(); + for (auto op_str = ++unsupported_ops.begin(); + op_str != unsupported_ops.end(); + op_str++) { + unsupported_ops_str = unsupported_ops_str + ", " + *op_str; + } + std::vector targets = {}; + for (int i = 0; i < valid_places.size(); i++) { + targets.push_back(valid_places[i].target); + } + std::sort(targets.begin(), targets.end()); + targets.erase(unique(targets.begin(), targets.end()), targets.end()); + std::string targets_str = TargetToStr(targets[0]); + for (int i = 1; i < targets.size(); i++) { + targets_str = targets_str + "," + TargetToStr(targets[i]); + } + + LOG(ERROR) << "Error: This model is not supported, because " + << unsupported_ops.size() << " ops are not supported on '" + << targets_str << "'. These unsupported ops are: '" + << unsupported_ops_str << "'."; + exit(1); + } + if (FLAGS_print_model_ops) { + std::cout << "Paddle-Lite supports this model!" << std::endl; + exit(1); + } +} void Main() { if (FLAGS_display_kernels) { @@ -241,7 +445,13 @@ void Main() { } // namespace paddle int main(int argc, char** argv) { + // If there is none input argument, print help info. 
+ if (argc < 2) { + paddle::lite_api::PrintHelpInfo(); + } google::ParseCommandLineFlags(&argc, &argv, false); + paddle::lite_api::ParseInputCommand(); + paddle::lite_api::CheckIfModelSupported(); paddle::lite_api::Main(); return 0; } diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc index dc9fac96ee..5b063a8ef1 100644 --- a/lite/api/model_test.cc +++ b/lite/api/model_test.cc @@ -86,6 +86,7 @@ void Run(const std::vector>& input_shapes, for (int i = 0; i < input_shapes[j].size(); ++i) { input_num *= input_shapes[j][i]; } + for (int i = 0; i < input_num; ++i) { input_data[i] = 1.f; } diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index a014719c57..6308699ac9 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -133,7 +133,9 @@ class LITE_API CxxConfig : public ConfigBase { std::string model_file_; std::string param_file_; bool model_from_memory_{false}; - int cpu_math_library_math_threads_ = 1; +#ifdef LITE_WITH_X86 + int x86_math_library_math_threads_ = 1; +#endif public: void set_valid_places(const std::vector& x) { valid_places_ = x; } @@ -153,12 +155,14 @@ class LITE_API CxxConfig : public ConfigBase { std::string param_file() const { return param_file_; } bool model_from_memory() const { return model_from_memory_; } - void set_cpu_math_library_num_threads(int threads) { - cpu_math_library_math_threads_ = threads; +#ifdef LITE_WITH_X86 + void set_x86_math_library_num_threads(int threads) { + x86_math_library_math_threads_ = threads; } - int cpu_math_library_num_threads() const { - return cpu_math_library_math_threads_; + int x86_math_library_num_threads() const { + return x86_math_library_math_threads_; } +#endif }; /// MobileConfig is the config for the light weight predictor, it will skip diff --git a/lite/api/test_step_rnn_lite_x86.cc b/lite/api/test_step_rnn_lite_x86.cc index 075d314df6..013fd82b19 100644 --- a/lite/api/test_step_rnn_lite_x86.cc +++ b/lite/api/test_step_rnn_lite_x86.cc @@ -30,7 +30,9 @@ TEST(Step_rnn, test_step_rnn_lite_x86) { std::string model_dir = FLAGS_model_dir; lite_api::CxxConfig config; config.set_model_dir(model_dir); - config.set_cpu_math_library_num_threads(1); +#ifdef LITE_WITH_X86 + config.set_x86_math_library_num_threads(1); +#endif config.set_valid_places({lite_api::Place{TARGET(kX86), PRECISION(kInt64)}, lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); diff --git a/lite/backends/arm/math/conv3x3s1_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1_depthwise_fp32.cc deleted file mode 100644 index 99aeea8bde..0000000000 --- a/lite/backends/arm/math/conv3x3s1_depthwise_fp32.cc +++ /dev/null @@ -1,538 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/operators/op_params.h" -#ifdef ARM_WITH_OMP -#include -#endif - -namespace paddle { -namespace lite { -namespace arm { -namespace math { -void conv_3x3s1_depthwise_fp32(const float* i_data, - float* o_data, - int bs, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx) { - int threads = ctx->threads(); - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; - const int out_c_block = 4; - const int out_h_kernel = 2; - const int out_w_kernel = 4; - const int win_ext = ow + 2; - const int ow_round = ROUNDUP(ow, 4); - const int win_round = ROUNDUP(win_ext, 4); - const int hin_round = oh + 2; - const int prein_size = win_round * hin_round * out_c_block; - auto workspace_size = - threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/; - ctx->ExtendWorkspace(sizeof(float) * workspace_size); - - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - - /// get workspace - float* ptr_zero = ctx->workspace_data(); - memset(ptr_zero, 0, sizeof(float) * win_round); - float* ptr_write = ptr_zero + win_round; - - int size_in_channel = win * ih; - int size_out_channel = ow * oh; - - int ws = -pad_w; - int we = ws + win_round; - int hs = -pad_h; - int he = hs + hin_round; - int w_loop = ow_round / 4; - auto remain = w_loop * 4 - ow; - bool flag_remain = remain > 0; - remain = 4 - remain; - remain = remain > 0 ? remain : 0; - int row_len = win_round * out_c_block; - - for (int n = 0; n < bs; ++n) { - const float* din_batch = i_data + n * ic * size_in_channel; - float* dout_batch = o_data + n * oc * size_out_channel; -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < oc; c += out_c_block) { -#ifdef ARM_WITH_OMP - float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size; -#else - float* pre_din = ptr_write + ow_round; -#endif - /// const array size - float pre_out[out_c_block * out_w_kernel * out_h_kernel]; // NOLINT - prepack_input_nxwc4_dw( - din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero); - const float* weight_c = weights + c * 9; // kernel_w * kernel_h - float* dout_c00 = dout_batch + c * size_out_channel; - float bias_local[4] = {0, 0, 0, 0}; - if (flag_bias) { - bias_local[0] = bias[c]; - bias_local[1] = bias[c + 1]; - bias_local[2] = bias[c + 2]; - bias_local[3] = bias[c + 3]; - } - float32x4_t vbias = vld1q_f32(bias_local); -#ifdef __aarch64__ - float32x4_t w0 = vld1q_f32(weight_c); // w0, v23 - float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24 - float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25 - float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26 - float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27 - float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28 - float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29 - float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30 - float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31 -#endif - for (int h = 0; h < oh; h += out_h_kernel) { - float* outc00 = dout_c00 + h * ow; - float* outc01 = outc00 + ow; - float* outc10 = outc00 + size_out_channel; - float* outc11 = outc10 + ow; - float* outc20 = outc10 + size_out_channel; - float* outc21 = outc20 + ow; - float* outc30 = outc20 + size_out_channel; - float* outc31 = outc30 + ow; - const float* inr0 = pre_din + h * row_len; - const 
float* inr1 = inr0 + row_len; - const float* inr2 = inr1 + row_len; - const float* inr3 = inr2 + row_len; - if (c + out_c_block > oc) { - switch (c + out_c_block - oc) { - case 3: - outc10 = ptr_write; - outc11 = ptr_write; - case 2: - outc20 = ptr_write; - outc21 = ptr_write; - case 1: - outc30 = ptr_write; - outc31 = ptr_write; - default: - break; - } - } - if (h + out_h_kernel > oh) { - outc01 = ptr_write; - outc11 = ptr_write; - outc21 = ptr_write; - outc31 = ptr_write; - } - float* outl[] = {outc00, - outc10, - outc20, - outc30, - outc01, - outc11, - outc21, - outc31, - reinterpret_cast(bias_local), - reinterpret_cast(flag_relu)}; - void* outl_ptr = reinterpret_cast(outl); - for (int w = 0; w < w_loop; ++w) { - bool flag_mask = (w == w_loop - 1) && flag_remain; - float* out0 = pre_out; -// clang-format off -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ - "ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ - "ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ - "ldp q8, q9, [%[inr1]], #32\n" /* load input r1*/ - "ldp q4, q5, [%[inr0]]\n" /* load input r0*/ - "ldp q10, q11, [%[inr1]]\n" /* load input r1*/ - /* r0, r1, mul w0, get out r0, r1 */ - "fmul v15.4s , %[w0].4s, v0.4s\n" /* outr00 = w0 * r0, 0*/ - "fmul v16.4s , %[w0].4s, v1.4s\n" /* outr01 = w0 * r0, 1*/ - "fmul v17.4s , %[w0].4s, v2.4s\n" /* outr02 = w0 * r0, 2*/ - "fmul v18.4s , %[w0].4s, v3.4s\n" /* outr03 = w0 * r0, 3*/ - "fmul v19.4s , %[w0].4s, v6.4s\n" /* outr10 = w0 * r1, 0*/ - "fmul v20.4s , %[w0].4s, v7.4s\n" /* outr11 = w0 * r1, 1*/ - "fmul v21.4s , %[w0].4s, v8.4s\n" /* outr12 = w0 * r1, 2*/ - "fmul v22.4s , %[w0].4s, v9.4s\n" /* outr13 = w0 * r1, 3*/ - /* r0, r1, mul w1, get out r0, r1 */ - "fmla v15.4s , %[w1].4s, v1.4s\n" /* outr00 = w1 * r0[1]*/ - "ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ - "fmla v16.4s , %[w1].4s, v2.4s\n" /* outr01 = w1 * r0[2]*/ - "fmla v17.4s , %[w1].4s, v3.4s\n" /* outr02 = w1 * r0[3]*/ - "fmla v18.4s , %[w1].4s, v4.4s\n" /* outr03 = w1 * r0[4]*/ - "fmla v19.4s , %[w1].4s, v7.4s\n" /* outr10 = w1 * r1[1]*/ - "fmla v20.4s , %[w1].4s, v8.4s\n" /* outr11 = w1 * r1[2]*/ - "fmla v21.4s , %[w1].4s, v9.4s\n" /* outr12 = w1 * r1[3]*/ - "fmla v22.4s , %[w1].4s, v10.4s\n"/* outr13 = w1 * r1[4]*/ - /* r0, r1, mul w2, get out r0, r1 */ - "fmla v15.4s , %[w2].4s, v2.4s\n" /* outr00 = w2 * r0[2]*/ - "fmla v16.4s , %[w2].4s, v3.4s\n" /* outr01 = w2 * r0[3]*/ - "ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ - "fmla v17.4s , %[w2].4s, v4.4s\n" /* outr02 = w2 * r0[4]*/ - "fmla v18.4s , %[w2].4s, v5.4s\n" /* outr03 = w2 * r0[5]*/ - "ldp q4, q5, [%[inr2]]\n" /* load input r2*/ - "fmla v19.4s , %[w2].4s, v8.4s\n" /* outr10 = w2 * r1[2]*/ - "fmla v20.4s , %[w2].4s, v9.4s\n" /* outr11 = w2 * r1[3]*/ - "fmla v21.4s , %[w2].4s, v10.4s\n"/* outr12 = w2 * r1[4]*/ - "fmla v22.4s , %[w2].4s, v11.4s\n"/* outr13 = w2 * r1[5]*/ - /* r1, r2, mul w3, get out r0, r1 */ - "fmla v15.4s , %[w3].4s, v6.4s\n" /* outr00 = w3 * r1[0]*/ - "fmla v16.4s , %[w3].4s, v7.4s\n" /* outr01 = w3 * r1[1]*/ - "fmla v17.4s , %[w3].4s, v8.4s\n" /* outr02 = w3 * r1[2]*/ - "fmla v18.4s , %[w3].4s, v9.4s\n" /* outr03 = w3 * r1[3]*/ - "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr10 = w3 * r2[0]*/ - "fmla v20.4s , %[w3].4s, v1.4s\n" /* outr11 = w3 * r2[1]*/ - "fmla v21.4s , %[w3].4s, v2.4s\n" /* outr12 = w3 * r2[2]*/ - "fmla v22.4s , %[w3].4s, v3.4s\n" /* outr13 = w3 * r2[3]*/ - /* r1, r2, mul w4, get out r0, r1 */ - "fmla v15.4s , %[w4].4s, v7.4s\n" /* outr00 = w4 * r1[1]*/ - "ldp q6, q7, [%[inr3]], #32\n" 
/* load input r3*/ - "fmla v16.4s , %[w4].4s, v8.4s\n" /* outr01 = w4 * r1[2]*/ - "fmla v17.4s , %[w4].4s, v9.4s\n" /* outr02 = w4 * r1[3]*/ - "fmla v18.4s , %[w4].4s, v10.4s\n"/* outr03 = w4 * r1[4]*/ - "ldp x0, x1, [%[outl]] \n" - "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr10 = w4 * r2[1]*/ - "fmla v20.4s , %[w4].4s, v2.4s\n" /* outr11 = w4 * r2[2]*/ - "fmla v21.4s , %[w4].4s, v3.4s\n" /* outr12 = w4 * r2[3]*/ - "fmla v22.4s , %[w4].4s, v4.4s\n" /* outr13 = w4 * r2[4]*/ - /* r1, r2, mul w5, get out r0, r1 */ - "fmla v15.4s , %[w5].4s, v8.4s\n" /* outr00 = w5 * r1[2]*/ - "fmla v16.4s , %[w5].4s, v9.4s\n" /* outr01 = w5 * r1[3]*/ - "ldp q8, q9, [%[inr3]], #32\n" /* load input r3*/ - "fmla v17.4s , %[w5].4s, v10.4s\n"/* outr02 = w5 * r1[4]*/ - "fmla v18.4s , %[w5].4s, v11.4s\n"/* outr03 = w5 * r1[5]*/ - "ldp q10, q11, [%[inr3]]\n" /* load input r3*/ - "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr10 = w5 * r2[2]*/ - "fmla v20.4s , %[w5].4s, v3.4s\n" /* outr11 = w5 * r2[3]*/ - "fmla v21.4s , %[w5].4s, v4.4s\n" /* outr12 = w5 * r2[4]*/ - "fmla v22.4s , %[w5].4s, v5.4s\n" /* outr13 = w5 * r2[5]*/ - /* r2, r3, mul w6, get out r0, r1 */ - "fmla v15.4s , %[w6].4s, v0.4s\n" /* outr00 = w6 * r2[0]*/ - "fmla v16.4s , %[w6].4s, v1.4s\n" /* outr01 = w6 * r2[1]*/ - "fmla v17.4s , %[w6].4s, v2.4s\n" /* outr02 = w6 * r2[2]*/ - "fmla v18.4s , %[w6].4s, v3.4s\n" /* outr03 = w6 * r2[3]*/ - "ldp x2, x3, [%[outl], #16] \n" - "fmla v19.4s , %[w6].4s, v6.4s\n" /* outr10 = w6 * r3[0]*/ - "fmla v20.4s , %[w6].4s, v7.4s\n" /* outr11 = w6 * r3[1]*/ - "fmla v21.4s , %[w6].4s, v8.4s\n" /* outr12 = w6 * r3[2]*/ - "fmla v22.4s , %[w6].4s, v9.4s\n" /* outr13 = w6 * r3[3]*/ - /* r2, r3, mul w7, get out r0, r1 */ - "fmla v15.4s , %[w7].4s, v1.4s\n" /* outr00 = w7 * r2[1]*/ - "fmla v16.4s , %[w7].4s, v2.4s\n" /* outr01 = w7 * r2[2]*/ - "fmla v17.4s , %[w7].4s, v3.4s\n" /* outr02 = w7 * r2[3]*/ - "fmla v18.4s , %[w7].4s, v4.4s\n" /* outr03 = w7 * r2[4]*/ - "ldp x4, x5, [%[outl], #32] \n" - "fmla v19.4s , %[w7].4s, v7.4s\n" /* outr10 = w7 * r3[1]*/ - "fmla v20.4s , %[w7].4s, v8.4s\n" /* outr11 = w7 * r3[2]*/ - "fmla v21.4s , %[w7].4s, v9.4s\n" /* outr12 = w7 * r3[3]*/ - "fmla v22.4s , %[w7].4s, v10.4s\n"/* outr13 = w7 * r3[4]*/ - /* r2, r3, mul w8, get out r0, r1 */ - "fmla v15.4s , %[w8].4s, v2.4s\n" /* outr00 = w8 * r2[2]*/ - "fmla v16.4s , %[w8].4s, v3.4s\n" /* outr01 = w8 * r2[3]*/ - "fmla v17.4s , %[w8].4s, v4.4s\n" /* outr02 = w8 * r2[0]*/ - "fmla v18.4s , %[w8].4s, v5.4s\n" /* outr03 = w8 * r2[1]*/ - "ldp x6, x7, [%[outl], #48] \n" - "fmla v19.4s , %[w8].4s, v8.4s\n" /* outr10 = w8 * r3[2]*/ - "fmla v20.4s , %[w8].4s, v9.4s\n" /* outr11 = w8 * r3[3]*/ - "fmla v21.4s , %[w8].4s, v10.4s\n"/* outr12 = w8 * r3[0]*/ - "fmla v22.4s , %[w8].4s, v11.4s\n"/* outr13 = w8 * r3[1]*/ - - "fadd v15.4s, v15.4s, %[vbias].4s\n"/* add bias */ - "fadd v16.4s, v16.4s, %[vbias].4s\n"/* add bias */ - "fadd v17.4s, v17.4s, %[vbias].4s\n"/* add bias */ - "fadd v18.4s, v18.4s, %[vbias].4s\n"/* add bias */ - "fadd v19.4s, v19.4s, %[vbias].4s\n"/* add bias */ - "fadd v20.4s, v20.4s, %[vbias].4s\n"/* add bias */ - "fadd v21.4s, v21.4s, %[vbias].4s\n"/* add bias */ - "fadd v22.4s, v22.4s, %[vbias].4s\n"/* add bias */ - - /* transpose */ - "trn1 v0.4s, v15.4s, v16.4s\n" /* r0: a0a1c0c1*/ - "trn2 v1.4s, v15.4s, v16.4s\n" /* r0: b0b1d0d1*/ - "trn1 v2.4s, v17.4s, v18.4s\n" /* r0: a2a3c2c3*/ - "trn2 v3.4s, v17.4s, v18.4s\n" /* r0: b2b3d2d3*/ - "trn1 v4.4s, v19.4s, v20.4s\n" /* r1: a0a1c0c1*/ - "trn2 v5.4s, v19.4s, v20.4s\n" /* r1: b0b1d0d1*/ - "trn1 v6.4s, 
v21.4s, v22.4s\n" /* r1: a2a3c2c3*/ - "trn2 v7.4s, v21.4s, v22.4s\n" /* r1: b2b3d2d3*/ - "trn1 v15.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ - "trn2 v19.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ - "trn1 v17.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ - "trn2 v21.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/ - "trn1 v16.2d, v4.2d, v6.2d\n" /* r1: a0a1a2a3*/ - "trn2 v20.2d, v4.2d, v6.2d\n" /* r1: c0c1c2c3*/ - "trn1 v18.2d, v5.2d, v7.2d\n" /* r1: b0b1b2b3*/ - "trn2 v22.2d, v5.2d, v7.2d\n" /* r1: d0d1d2d3*/ - - "cbz %w[flag_relu], 0f\n" /* skip relu*/ - "movi v0.4s, #0\n" /* for relu */ - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "0:\n" - "cbnz %w[flag_mask], 1f\n" - "str q15, [x0]\n" /* save outc00 */ - "str q16, [x4]\n" /* save outc01 */ - "str q17, [x1]\n" /* save outc10 */ - "str q18, [x5]\n" /* save outc11 */ - "str q19, [x2]\n" /* save outc20 */ - "str q20, [x6]\n" /* save outc21 */ - "str q21, [x3]\n" /* save outc30 */ - "str q22, [x7]\n" /* save outc31 */ - "b 2f\n" - "1:\n" - "str q15, [%[out]], #16 \n" /* save remain to pre_out */ - "str q17, [%[out]], #16 \n" /* save remain to pre_out */ - "str q19, [%[out]], #16 \n" /* save remain to pre_out */ - "str q21, [%[out]], #16 \n" /* save remain to pre_out */ - "str q16, [%[out]], #16 \n" /* save remain to pre_out */ - "str q18, [%[out]], #16 \n" /* save remain to pre_out */ - "str q20, [%[out]], #16 \n" /* save remain to pre_out */ - "str q22, [%[out]], #16 \n" /* save remain to pre_out */ - "2:\n" - :[inr0] "+r"(inr0), [inr1] "+r"(inr1), - [inr2] "+r"(inr2), [inr3] "+r"(inr3), - [out]"+r"(out0) - :[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), - [w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5), - [w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8), - [vbias]"w" (vbias), [outl] "r" (outl_ptr), - [flag_mask] "r" (flag_mask), [flag_relu] "r" (flag_relu) - : "cc", "memory", - "v0","v1","v2","v3","v4","v5","v6","v7", - "v8", "v9", "v10", "v11", "v15", - "v16","v17","v18","v19","v20","v21","v22", - "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7" - ); -#else - asm volatile( - /* load weights */ - "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, w1, to q5, q6\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, to q7\n" - /* load r0, r1 */ - "vld1.32 {d0-d3}, [%[r0]]! @ load r0, q0, q1\n" - "vld1.32 {d4-d7}, [%[r0]]! @ load r0, q2, q3\n" - /* main loop */ - "0: @ main loop\n" - /* mul r0 with w0, w1, w2, get out r0 */ - "vmul.f32 q8, q5, q0 @ w0 * inr00\n" - "vmul.f32 q9, q5, q1 @ w0 * inr01\n" - "vmul.f32 q10, q5, q2 @ w0 * inr02\n" - "vmul.f32 q11, q5, q3 @ w0 * inr03\n" - "vmla.f32 q8, q6, q1 @ w1 * inr01\n" - "vld1.32 {d0-d3}, [%[r0]] @ load r0, q0, q1\n" - "vmla.f32 q9, q6, q2 @ w1 * inr02\n" - "vmla.f32 q10, q6, q3 @ w1 * inr03\n" - "vmla.f32 q11, q6, q0 @ w1 * inr04\n" - "vmla.f32 q8, q7, q2 @ w2 * inr02\n" - "vmla.f32 q9, q7, q3 @ w2 * inr03\n" - "vld1.32 {d4-d7}, [%[r1]]! @ load r0, q2, q3\n" - "vmla.f32 q10, q7, q0 @ w2 * inr04\n" - "vmla.f32 q11, q7, q1 @ w2 * inr05\n" - "vld1.32 {d0-d3}, [%[r1]]! @ load r0, q0, q1\n" - "vld1.32 {d8-d9}, [%[wc0]]! @ load w3 to q4\n" - /* mul r1 with w0-w5, get out r0, r1 */ - "vmul.f32 q12, q5, q2 @ w0 * inr10\n" - "vmul.f32 q13, q5, q3 @ w0 * inr11\n" - "vmul.f32 q14, q5, q0 @ w0 * inr12\n" - "vmul.f32 q15, q5, q1 @ w0 * inr13\n" - "vld1.32 {d10-d11}, [%[wc0]]! 
@ load w4 to q5\n" - "vmla.f32 q8, q4, q2 @ w3 * inr10\n" - "vmla.f32 q9, q4, q3 @ w3 * inr11\n" - "vmla.f32 q10, q4, q0 @ w3 * inr12\n" - "vmla.f32 q11, q4, q1 @ w3 * inr13\n" - /* mul r1 with w1, w4, get out r1, r0 */ - "vmla.f32 q8, q5, q3 @ w4 * inr11\n" - "vmla.f32 q12, q6, q3 @ w1 * inr11\n" - "vld1.32 {d4-d7}, [%[r1]] @ load r1, q2, q3\n" - "vmla.f32 q9, q5, q0 @ w4 * inr12\n" - "vmla.f32 q13, q6, q0 @ w1 * inr12\n" - "vmla.f32 q10, q5, q1 @ w4 * inr13\n" - "vmla.f32 q14, q6, q1 @ w1 * inr13\n" - "vmla.f32 q11, q5, q2 @ w4 * inr14\n" - "vmla.f32 q15, q6, q2 @ w1 * inr14\n" - "vld1.32 {d12-d13}, [%[wc0]]! @ load w5 to q6\n" - /* mul r1 with w2, w5, get out r1, r0 */ - "vmla.f32 q12, q7, q0 @ w2 * inr12\n" - "vmla.f32 q13, q7, q1 @ w2 * inr13\n" - "vmla.f32 q8, q6, q0 @ w5 * inr12\n" - "vmla.f32 q9, q6, q1 @ w5 * inr13\n" - "vld1.32 {d0-d3}, [%[r2]]! @ load r2, q0, q1\n" - "vmla.f32 q14, q7, q2 @ w2 * inr14\n" - "vmla.f32 q15, q7, q3 @ w2 * inr15\n" - "vmla.f32 q10, q6, q2 @ w5 * inr14\n" - "vmla.f32 q11, q6, q3 @ w5 * inr15\n" - "vld1.32 {d4-d7}, [%[r2]]! @ load r2, q0, q1\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w6, to q7\n" - /* mul r2 with w3-w8, get out r0, r1 */ - "vmla.f32 q12, q4, q0 @ w3 * inr20\n" - "vmla.f32 q13, q4, q1 @ w3 * inr21\n" - "vmla.f32 q14, q4, q2 @ w3 * inr22\n" - "vmla.f32 q15, q4, q3 @ w3 * inr23\n" - "vld1.32 {d8-d9}, [%[wc0]]! @ load w7, to q4\n" - "vmla.f32 q8, q7, q0 @ w6 * inr20\n" - "vmla.f32 q9, q7, q1 @ w6 * inr21\n" - "vmla.f32 q10, q7, q2 @ w6 * inr22\n" - "vmla.f32 q11, q7, q3 @ w6 * inr23\n" - /* mul r2 with w4, w7, get out r1, r0 */ - "vmla.f32 q8, q4, q1 @ w7 * inr21\n" - "vmla.f32 q12, q5, q1 @ w4 * inr21\n" - "vld1.32 {d0-d3}, [%[r2]] @ load r2, q0, q1\n" - "vmla.f32 q9, q4, q2 @ w7 * inr22\n" - "vmla.f32 q13, q5, q2 @ w4 * inr22\n" - "vmla.f32 q10, q4, q3 @ w7 * inr23\n" - "vmla.f32 q14, q5, q3 @ w4 * inr23\n" - "vmla.f32 q11, q4, q0 @ w7 * inr24\n" - "vmla.f32 q15, q5, q0 @ w4 * inr24\n" - "vld1.32 {d10-d11}, [%[wc0]]! @ load w8 to q5\n" - /* mul r1 with w5, w8, get out r1, r0 */ - "vmla.f32 q12, q6, q2 @ w5 * inr22\n" - "vmla.f32 q13, q6, q3 @ w5 * inr23\n" - "vmla.f32 q8, q5, q2 @ w8 * inr22\n" - "vmla.f32 q9, q5, q3 @ w8 * inr23\n" - "vld1.32 {d4-d7}, [%[r3]]! @ load r3, q2, q3\n" - "ldr r4, [%[outl], #32] @ load bias addr to r4\n" - "vmla.f32 q14, q6, q0 @ w5 * inr24\n" - "vmla.f32 q15, q6, q1 @ w5 * inr25\n" - "vmla.f32 q10, q5, q0 @ w8 * inr24\n" - "vmla.f32 q11, q5, q1 @ w8 * inr25\n" - "vld1.32 {d0-d3}, [%[r3]]! 
@ load r3, q0, q1\n" - "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" - /* mul r3 with w6, w7, w8, get out r1 */ - "vmla.f32 q12, q7, q2 @ w6 * inr30\n" - "vmla.f32 q13, q7, q3 @ w6 * inr31\n" - "vmla.f32 q14, q7, q0 @ w6 * inr32\n" - "vmla.f32 q15, q7, q1 @ w6 * inr33\n" - "vmla.f32 q12, q4, q3 @ w7 * inr31\n" - "vld1.32 {d4-d7}, [%[r3]] @ load r3, q2, q3\n" - "vld1.32 {d12-d13}, [r4] @ load bias\n" - "vmla.f32 q13, q4, q0 @ w7 * inr32\n" - "vmla.f32 q14, q4, q1 @ w7 * inr33\n" - "vmla.f32 q15, q4, q2 @ w7 * inr34\n" - "ldr r0, [%[outl]] @ load outc00 to r0\n" - "vmla.f32 q12, q5, q0 @ w8 * inr32\n" - "vmla.f32 q13, q5, q1 @ w8 * inr33\n" - "ldr r5, [%[outl], #36] @ load flag_relu to r5\n" - "vmla.f32 q14, q5, q2 @ w8 * inr34\n" - "vmla.f32 q15, q5, q3 @ w8 * inr35\n" - "ldr r1, [%[outl], #4] @ load outc10 to r1\n" - "vadd.f32 q8, q8, q6 @ r00 add bias\n" - "vadd.f32 q9, q9, q6 @ r01 add bias\n" - "vadd.f32 q10, q10, q6 @ r02 add bias\n" - "vadd.f32 q11, q11, q6 @ r03 add bias\n" - "ldr r2, [%[outl], #8] @ load outc20 to r2\n" - "vadd.f32 q12, q12, q6 @ r10 add bias\n" - "vadd.f32 q13, q13, q6 @ r11 add bias\n" - "vadd.f32 q14, q14, q6 @ r12 add bias\n" - "vadd.f32 q15, q15, q6 @ r13 add bias\n" - "ldr r3, [%[outl], #12] @ load outc30 to r3\n" - "vmov.u32 q7, #0 @ mov zero to q7\n" - "cmp r5, #0 @ cmp flag relu\n" - "beq 1f @ skip relu\n" - "vmax.f32 q8, q8, q7 @ r00 relu\n" - "vmax.f32 q9, q9, q7 @ r01 relu\n" - "vmax.f32 q10, q10, q7 @ r02 relu\n" - "vmax.f32 q11, q11, q7 @ r03 relu\n" - "vmax.f32 q12, q12, q7 @ r10 relu\n" - "vmax.f32 q13, q13, q7 @ r11 relu\n" - "vmax.f32 q14, q14, q7 @ r12 relu\n" - "vmax.f32 q15, q15, q7 @ r13 relu\n" - "1:\n" - "ldr r4, [%[outl], #16] @ load outc01 to r4\n" - "vtrn.32 q8, q9 @ r0: q8 : a0a1c0c1, q9 : b0b1d0d1\n" - "vtrn.32 q10, q11 @ r0: q10: a2a3c2c3, q11: b2b3d2d3\n" - "vtrn.32 q12, q13 @ r1: q12: a0a1c0c1, q13: b0b1d0d1\n" - "vtrn.32 q14, q15 @ r1: q14: a2a3c2c3, q15: b2b3d2d3\n" - "ldr r5, [%[outl], #20] @ load outc11 to r5\n" - "vswp d17, d20 @ r0: q8 : a0a1a2a3, q10: c0c1c2c3 \n" - "vswp d19, d22 @ r0: q9 : b0b1b2b3, q11: d0d1d2d3 \n" - "vswp d25, d28 @ r1: q12: a0a1a2a3, q14: c0c1c2c3 \n" - "vswp d27, d30 @ r1: q13: b0b1b2b3, q15: d0d1d2d3 \n" - "cmp %[flag_mask], #0 @ cmp flag mask\n" - "bne 2f\n" - "vst1.32 {d16-d17}, [r0] @ save outc00\n" - "vst1.32 {d18-d19}, [r1] @ save outc10\n" - "vst1.32 {d20-d21}, [r2] @ save outc20\n" - "vst1.32 {d22-d23}, [r3] @ save outc30\n" - "vst1.32 {d24-d25}, [r4] @ save outc01\n" - "vst1.32 {d26-d27}, [r5] @ save outc11\n" - "ldr r0, [%[outl], #24] @ load outc21 to r0\n" - "ldr r1, [%[outl], #28] @ load outc31 to r1\n" - "vst1.32 {d28-d29}, [r0] @ save outc21\n" - "vst1.32 {d30-d31}, [r1] @ save outc31\n" - "b 3f @ branch end\n" - "2: \n" - "vst1.32 {d16-d17}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d18-d19}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d20-d21}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d22-d23}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d24-d25}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d26-d27}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d28-d29}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d30-d31}, [%[out0]]! 
@ save remain to pre_out\n" - "3: \n" - : [r0] "+r"(inr0), [r1] "+r"(inr1), - [r2] "+r"(inr2), [r3] "+r"(inr3), - [out0] "+r"(out0), [wc0] "+r"(weight_c) - : [flag_mask] "r" (flag_mask), [outl] "r" (outl_ptr) - : "cc", "memory", - "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13","q14", "q15", "r0", "r1", "r2", "r3", "r4", "r5" - ); -#endif // __arch64__ - // clang-format on - outl[0] += 4; - outl[1] += 4; - outl[2] += 4; - outl[3] += 4; - outl[4] += 4; - outl[5] += 4; - outl[6] += 4; - outl[7] += 4; - if (flag_mask) { - memcpy(outl[0] - 4, pre_out, remain * sizeof(float)); - memcpy(outl[1] - 4, pre_out + 4, remain * sizeof(float)); - memcpy(outl[2] - 4, pre_out + 8, remain * sizeof(float)); - memcpy(outl[3] - 4, pre_out + 12, remain * sizeof(float)); - memcpy(outl[4] - 4, pre_out + 16, remain * sizeof(float)); - memcpy(outl[5] - 4, pre_out + 20, remain * sizeof(float)); - memcpy(outl[6] - 4, pre_out + 24, remain * sizeof(float)); - memcpy(outl[7] - 4, pre_out + 28, remain * sizeof(float)); - } - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s2_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2_depthwise_fp32.cc deleted file mode 100644 index 2d75323a96..0000000000 --- a/lite/backends/arm/math/conv3x3s2_depthwise_fp32.cc +++ /dev/null @@ -1,361 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
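The stride-1 kernel deleted above rounds the output width up to a multiple of 4, runs the final NEON block into the scratch buffer pre_out, and copies back only the `remain` valid floats; the stride-2 variant that follows does the same, differing only in its input extent (win_ext = ow * 2 + 1 versus ow + 2 for stride 1). A plain-C++ sketch of that working-set arithmetic and tail copy, with no NEON: RoundUp, InputExtent, and WriteRow are illustrative stand-ins, and the source row is assumed padded to the rounded width, as the pre-pack step guarantees in the real kernels.

#include <cassert>
#include <cstring>

// Round n up to a multiple of m, as the kernels' ROUNDUP macro does.
static int RoundUp(int n, int m) { return ((n + m - 1) / m) * m; }

// Input columns touched by a k-tap, stride-s window producing `out` pixels.
static int InputExtent(int out, int k, int s) { return s * (out - 1) + k; }

// Copy one output row in 4-wide blocks; the final partial block goes through
// a scratch buffer so the 4-wide store never writes past `ow` elements.
static void WriteRow(float* out, const float* src, int ow) {
  const int w_loop = RoundUp(ow, 4) / 4;
  const int tail = ow % 4;  // valid lanes in the last block (0 = full block)
  float pre_out[4];         // scratch block, like the kernels' pre_out
  for (int w = 0; w < w_loop; ++w) {
    const bool masked = (w == w_loop - 1) && tail > 0;
    // Stand-in for the NEON compute+store; src is padded to RoundUp(ow, 4).
    std::memcpy(masked ? pre_out : out + 4 * w, src + 4 * w,
                4 * sizeof(float));
    if (masked) std::memcpy(out + 4 * w, pre_out, tail * sizeof(float));
  }
}

int main() {
  const int ow = 8;
  assert(InputExtent(ow, 3, 1) == ow + 2);      // stride 1: win_ext = ow + 2
  assert(InputExtent(ow, 3, 2) == 2 * ow + 1);  // stride 2: win_ext = 2*ow + 1
  float src[12] = {0};
  float dst[10];
  WriteRow(dst, src, 10);  // 3 blocks; the last copies back only 2 lanes
  return 0;
}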
- -#include -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/operators/op_params.h" -#ifdef ARM_WITH_OMP -#include -#endif - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_3x3s2_depthwise_fp32(const float* i_data, - float* o_data, - int bs, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx) { - int threads = ctx->threads(); - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; - const int out_c_block = 4; - const int out_h_kernel = 1; - const int out_w_kernel = 4; - const int win_ext = ow * 2 + 1; - const int ow_round = ROUNDUP(ow, 4); - const int win_round = ROUNDUP(win_ext, 4); - const int hin_round = oh * 2 + 1; - const int prein_size = win_round * hin_round * out_c_block; - auto workspace_size = - threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/; - ctx->ExtendWorkspace(sizeof(float) * workspace_size); - - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - - /// get workspace - auto ptr_zero = ctx->workspace_data(); - memset(ptr_zero, 0, sizeof(float) * win_round); - float* ptr_write = ptr_zero + win_round; - - int size_in_channel = win * ih; - int size_out_channel = ow * oh; - - int ws = -pad_w; - int we = ws + win_round; - int hs = -pad_h; - int he = hs + hin_round; - int w_loop = ow_round / 4; - auto remain = w_loop * 4 - ow; - bool flag_remain = remain > 0; - remain = 4 - remain; - remain = remain > 0 ? remain : 0; - int row_len = win_round * out_c_block; - - for (int n = 0; n < bs; ++n) { - const float* din_batch = i_data + n * ic * size_in_channel; - float* dout_batch = o_data + n * oc * size_out_channel; -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < oc; c += out_c_block) { -#ifdef ARM_WITH_OMP - float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size; -#else - float* pre_din = ptr_write + ow_round; -#endif - /// const array size - prepack_input_nxwc4_dw( - din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero); - const float* weight_c = weights + c * 9; // kernel_w * kernel_h - float* dout_c00 = dout_batch + c * size_out_channel; - float bias_local[4] = {0, 0, 0, 0}; - if (flag_bias) { - bias_local[0] = bias[c]; - bias_local[1] = bias[c + 1]; - bias_local[2] = bias[c + 2]; - bias_local[3] = bias[c + 3]; - } -#ifdef __aarch64__ - float32x4_t w0 = vld1q_f32(weight_c); // w0, v23 - float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24 - float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25 - float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26 - float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27 - float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28 - float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29 - float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30 - float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31 -#endif - for (int h = 0; h < oh; h += out_h_kernel) { - float* outc0 = dout_c00 + h * ow; - float* outc1 = outc0 + size_out_channel; - float* outc2 = outc1 + size_out_channel; - float* outc3 = outc2 + size_out_channel; - const float* inr0 = pre_din + h * 2 * row_len; - const float* inr1 = inr0 + row_len; - const float* inr2 = inr1 + row_len; - if (c + out_c_block > oc) { - switch (c + out_c_block - oc) { - case 3: - outc1 = ptr_write; - case 2: - outc2 = ptr_write; - case 1: - outc3 = ptr_write; - 
default: - break; - } - } - auto c0 = outc0; - auto c1 = outc1; - auto c2 = outc2; - auto c3 = outc3; - float pre_out[16]; - for (int w = 0; w < w_loop; ++w) { - bool flag_mask = (w == w_loop - 1) && flag_remain; - if (flag_mask) { - c0 = outc0; - c1 = outc1; - c2 = outc2; - c3 = outc3; - outc0 = pre_out; - outc1 = pre_out + 4; - outc2 = pre_out + 8; - outc3 = pre_out + 12; - } -// clang-format off -#ifdef __aarch64__ - asm volatile( - "ldr q8, [%[bias]]\n" /* load bias */ - "ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ - "and v19.16b, v8.16b, v8.16b\n" - "ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ - "and v20.16b, v8.16b, v8.16b\n" - "ldp q4, q5, [%[inr0]], #32\n" /* load input r0*/ - "and v21.16b, v8.16b, v8.16b\n" - "ldp q6, q7, [%[inr0]], #32\n" /* load input r0*/ - "and v22.16b, v8.16b, v8.16b\n" - "ldr q8, [%[inr0]]\n" /* load input r0*/ - /* r0 mul w0-w2, get out */ - "fmla v19.4s , %[w0].4s, v0.4s\n" /* outr0 = w0 * r0, 0*/ - "fmla v20.4s , %[w0].4s, v2.4s\n" /* outr1 = w0 * r0, 2*/ - "fmla v21.4s , %[w0].4s, v4.4s\n" /* outr2 = w0 * r0, 4*/ - "fmla v22.4s , %[w0].4s, v6.4s\n" /* outr3 = w0 * r0, 6*/ - "fmla v19.4s , %[w1].4s, v1.4s\n" /* outr0 = w1 * r0, 1*/ - "ldp q0, q1, [%[inr1]], #32\n" /* load input r1*/ - "fmla v20.4s , %[w1].4s, v3.4s\n" /* outr1 = w1 * r0, 3*/ - "fmla v21.4s , %[w1].4s, v5.4s\n" /* outr2 = w1 * r0, 5*/ - "fmla v22.4s , %[w1].4s, v7.4s\n" /* outr3 = w1 * r0, 7*/ - "fmla v19.4s , %[w2].4s, v2.4s\n" /* outr0 = w0 * r0, 2*/ - "ldp q2, q3, [%[inr1]], #32\n" /* load input r1*/ - "fmla v20.4s , %[w2].4s, v4.4s\n" /* outr1 = w0 * r0, 4*/ - "ldp q4, q5, [%[inr1]], #32\n" /* load input r1*/ - "fmla v21.4s , %[w2].4s, v6.4s\n" /* outr2 = w0 * r0, 6*/ - "ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ - "fmla v22.4s , %[w2].4s, v8.4s\n" /* outr3 = w0 * r0, 8*/ - "ldr q8, [%[inr1]]\n" /* load input r1*/ - /* r1, mul w3-w5, get out */ - "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr0 = w3 * r1, 0*/ - "fmla v20.4s , %[w3].4s, v2.4s\n" /* outr1 = w3 * r1, 2*/ - "fmla v21.4s , %[w3].4s, v4.4s\n" /* outr2 = w3 * r1, 4*/ - "fmla v22.4s , %[w3].4s, v6.4s\n" /* outr3 = w3 * r1, 6*/ - "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr0 = w4 * r1, 1*/ - "ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ - "fmla v20.4s , %[w4].4s, v3.4s\n" /* outr1 = w4 * r1, 3*/ - "fmla v21.4s , %[w4].4s, v5.4s\n" /* outr2 = w4 * r1, 5*/ - "fmla v22.4s , %[w4].4s, v7.4s\n" /* outr3 = w4 * r1, 7*/ - "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr0 = w5 * r1, 2*/ - "ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ - "fmla v20.4s , %[w5].4s, v4.4s\n" /* outr1 = w5 * r1, 4*/ - "ldp q4, q5, [%[inr2]], #32\n" /* load input r2*/ - "fmla v21.4s , %[w5].4s, v6.4s\n" /* outr2 = w5 * r1, 6*/ - "ldp q6, q7, [%[inr2]], #32\n" /* load input r2*/ - "fmla v22.4s , %[w5].4s, v8.4s\n" /* outr3 = w5 * r1, 8*/ - "ldr q8, [%[inr2]]\n" /* load input r2*/ - /* r2, mul w6-w8, get out r0, r1 */ - "fmla v19.4s , %[w6].4s, v0.4s\n" /* outr0 = w6 * r2, 0*/ - "fmla v20.4s , %[w6].4s, v2.4s\n" /* outr1 = w6 * r2, 2*/ - "fmla v21.4s , %[w6].4s, v4.4s\n" /* outr2 = w6 * r2, 4*/ - "fmla v22.4s , %[w6].4s, v6.4s\n" /* outr3 = w6 * r2, 6*/ - "fmla v19.4s , %[w7].4s, v1.4s\n" /* outr0 = w7 * r2, 1*/ - "fmla v20.4s , %[w7].4s, v3.4s\n" /* outr1 = w7 * r2, 3*/ - "fmla v21.4s , %[w7].4s, v5.4s\n" /* outr2 = w7 * r2, 5*/ - "fmla v22.4s , %[w7].4s, v7.4s\n" /* outr3 = w7 * r2, 7*/ - "fmla v19.4s , %[w8].4s, v2.4s\n" /* outr0 = w8 * r2, 2*/ - "fmla v20.4s , %[w8].4s, v4.4s\n" /* outr1 = w8 * r2, 4*/ - "fmla v21.4s , %[w8].4s, v6.4s\n" /* outr2 = 
w8 * r2, 6*/ - "fmla v22.4s , %[w8].4s, v8.4s\n" /* outr3 = w8 * r2, 8*/ - /* transpose */ - "trn1 v0.4s, v19.4s, v20.4s\n" /* r0: a0a1c0c1*/ - "trn2 v1.4s, v19.4s, v20.4s\n" /* r0: b0b1d0d1*/ - "trn1 v2.4s, v21.4s, v22.4s\n" /* r0: a2a3c2c3*/ - "trn2 v3.4s, v21.4s, v22.4s\n" /* r0: b2b3d2d3*/ - "trn1 v19.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ - "trn2 v21.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ - "trn1 v20.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ - "trn2 v22.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/ - /* relu */ - "cbz %w[flag_relu], 0f\n" /* skip relu*/ - "movi v0.4s, #0\n" /* for relu */ - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - /* save result */ - "0:\n" - "str q19, [%[outc0]], #16\n" - "str q20, [%[outc1]], #16\n" - "str q21, [%[outc2]], #16\n" - "str q22, [%[outc3]], #16\n" - :[inr0] "+r"(inr0), [inr1] "+r"(inr1), - [inr2] "+r"(inr2), - [outc0]"+r"(outc0), [outc1]"+r"(outc1), - [outc2]"+r"(outc2), [outc3]"+r"(outc3) - :[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), - [w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5), - [w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8), - [bias] "r" (bias_local), [flag_relu]"r"(flag_relu) - : "cc", "memory", - "v0","v1","v2","v3","v4","v5","v6","v7", - "v8", "v19","v20","v21","v22" - ); -#else - asm volatile( - /* fill with bias */ - "vld1.32 {d16-d17}, [%[bias]]\n" /* load bias */ - /* load weights */ - "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w0-2, to q9-11 */ - "vld1.32 {d0-d3}, [%[r0]]!\n" /* load input r0, 0,1*/ - "vand.i32 q12, q8, q8\n" - "vld1.32 {d4-d7}, [%[r0]]!\n" /* load input r0, 2,3*/ - "vand.i32 q13, q8, q8\n" - "vld1.32 {d8-d11}, [%[r0]]!\n" /* load input r0, 4,5*/ - "vand.i32 q14, q8, q8\n" - "vld1.32 {d12-d15}, [%[r0]]!\n" /* load input r0, 6,7*/ - "vand.i32 q15, q8, q8\n" - "vld1.32 {d16-d17}, [%[r0]]\n" /* load input r0, 8*/ - /* mul r0 with w0, w1, w2 */ - "vmla.f32 q12, q9, q0 @ w0 * inr0\n" - "vmla.f32 q13, q9, q2 @ w0 * inr2\n" - "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w2, to q11 */ - "vmla.f32 q14, q9, q4 @ w0 * inr4\n" - "vmla.f32 q15, q9, q6 @ w0 * inr6\n" - "vmla.f32 q12, q10, q1 @ w1 * inr1\n" - "vld1.32 {d0-d3}, [%[r1]]! @ load r1, 0, 1\n" - "vmla.f32 q13, q10, q3 @ w1 * inr3\n" - "vmla.f32 q14, q10, q5 @ w1 * inr5\n" - "vmla.f32 q15, q10, q7 @ w1 * inr7\n" - "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w3-4, to q9-10 */ - "vmla.f32 q12, q11, q2 @ w2 * inr2\n" - "vld1.32 {d4-d7}, [%[r1]]! @ load r1, 2, 3\n" - "vmla.f32 q13, q11, q4 @ w2 * inr4\n" - "vld1.32 {d8-d11}, [%[r1]]! @ load r1, 4, 5\n" - "vmla.f32 q14, q11, q6 @ w2 * inr6\n" - "vld1.32 {d12-d15}, [%[r1]]! @ load r1, 6, 7\n" - "vmla.f32 q15, q11, q8 @ w2 * inr8\n" - /* mul r1 with w3, w4, w5 */ - "vmla.f32 q12, q9, q0 @ w3 * inr0\n" - "vmla.f32 q13, q9, q2 @ w3 * inr2\n" - "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w5, to q11 */ - "vmla.f32 q14, q9, q4 @ w3 * inr4\n" - "vmla.f32 q15, q9, q6 @ w3 * inr6\n" - "vld1.32 {d16-d17}, [%[r1]]\n" /* load input r1, 8*/ - "vmla.f32 q12, q10, q1 @ w4 * inr1\n" - "vld1.32 {d0-d3}, [%[r2]]! @ load r2, 0, 1\n" - "vmla.f32 q13, q10, q3 @ w4 * inr3\n" - "vmla.f32 q14, q10, q5 @ w4 * inr5\n" - "vmla.f32 q15, q10, q7 @ w4 * inr7\n" - "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w6-7, to q9-10 */ - "vmla.f32 q12, q11, q2 @ w5 * inr2\n" - "vld1.32 {d4-d7}, [%[r2]]! @ load r2, 2, 3\n" - "vmla.f32 q13, q11, q4 @ w5 * inr4\n" - "vld1.32 {d8-d11}, [%[r2]]! @ load r2, 4, 5\n" - "vmla.f32 q14, q11, q6 @ w5 * inr6\n" - "vld1.32 {d12-d15}, [%[r2]]! 
@ load r2, 6, 7\n" - "vmla.f32 q15, q11, q8 @ w5 * inr8\n" - /* mul r2 with w6, w7, w8 */ - "vmla.f32 q12, q9, q0 @ w6 * inr0\n" - "vmla.f32 q13, q9, q2 @ w6 * inr2\n" - "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w8, to q11 */ - "vmla.f32 q14, q9, q4 @ w6 * inr4\n" - "vmla.f32 q15, q9, q6 @ w6 * inr6\n" - "vld1.32 {d16-d17}, [%[r2]]\n" /* load input r2, 8*/ - "vmla.f32 q12, q10, q1 @ w7 * inr1\n" - "vmla.f32 q13, q10, q3 @ w7 * inr3\n" - "vmla.f32 q14, q10, q5 @ w7 * inr5\n" - "vmla.f32 q15, q10, q7 @ w7 * inr7\n" - "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" - "vmla.f32 q12, q11, q2 @ w8 * inr2\n" - "vmla.f32 q13, q11, q4 @ w8 * inr4\n" - "vmla.f32 q14, q11, q6 @ w8 * inr6\n" - "vmla.f32 q15, q11, q8 @ w8 * inr8\n" - /* transpose */ - "vtrn.32 q12, q13\n" /* a0a1c0c1, b0b1d0d1*/ - "vtrn.32 q14, q15\n" /* a2a3c2c3, b2b3d2d3*/ - "vswp d25, d28\n" /* a0a1a2a3, c0c1c2c3*/ - "vswp d27, d30\n" /* b0b1b2b3, d0d1d2d3*/ - "cmp %[flag_relu], #0\n" - "beq 0f\n" /* skip relu*/ - "vmov.u32 q0, #0\n" - "vmax.f32 q12, q12, q0\n" - "vmax.f32 q13, q13, q0\n" - "vmax.f32 q14, q14, q0\n" - "vmax.f32 q15, q15, q0\n" - "0:\n" - "vst1.32 {d24-d25}, [%[outc0]]!\n" /* save outc0*/ - "vst1.32 {d26-d27}, [%[outc1]]!\n" /* save outc1*/ - "vst1.32 {d28-d29}, [%[outc2]]!\n" /* save outc2*/ - "vst1.32 {d30-d31}, [%[outc3]]!\n" /* save outc3*/ - :[r0] "+r"(inr0), [r1] "+r"(inr1), - [r2] "+r"(inr2), [wc0] "+r" (weight_c), - [outc0]"+r"(outc0), [outc1]"+r"(outc1), - [outc2]"+r"(outc2), [outc3]"+r"(outc3) - :[bias] "r" (bias_local), - [flag_relu]"r"(flag_relu) - :"cc", "memory", - "q0","q1","q2","q3","q4","q5","q6","q7", - "q8", "q9","q10","q11","q12","q13","q14","q15" - ); -#endif // __arch64__ - // clang-format off - if (flag_mask) { - for (int i = 0; i < remain; ++i) { - c0[i] = pre_out[i]; - c1[i] = pre_out[i + 4]; - c2[i] = pre_out[i + 8]; - c3[i] = pre_out[i + 12]; - } - } - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_3x3p0.cc b/lite/backends/arm/math/conv_depthwise_3x3p0.cc deleted file mode 100644 index 0c050ffe6f..0000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3p0.cc +++ /dev/null @@ -1,4178 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/conv_depthwise.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_3x3s1p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! 
for input width <= 4 -void conv_depthwise_3x3s1p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s1p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3p0_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - if (stride == 1) { - if (flag_relu) { - if (w_in > 5) { - conv_depthwise_3x3s1p0_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p0_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 5) { - conv_depthwise_3x3s1p0_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p0_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } else { //! 
stride = 2 - if (flag_relu) { - if (w_in > 8) { - conv_depthwise_3x3s2p0_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p0_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 8) { - conv_depthwise_3x3s2p0_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p0_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ -// 4line -void conv_depthwise_3x3s1p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = w_out >> 2; - int remain = w_out % 4; - - unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); - const int remian_idx[4] = {0, 1, 2, 3}; - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - // wr0 = vsetq_lane_f32(0.f, wr0, 3); - // wr1 = vsetq_lane_f32(0.f, wr1, 3); - // wr2 = vsetq_lane_f32(0.f, wr2, 3); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_out; i += 4) { - //! 
process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 >= h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - case 0: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = tile_w; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - - // mid - // "cmp %[cnt], #1 \n" - // "blt 5f \n" - "4: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - 
"fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r5 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 4b \n" - - // right - "5: \n" - "cmp %[remain], #1 \n" - "blt 0f \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, 
v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v18.4s}, [%[rmask]] \n" - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla 
v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - // end - "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_out; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - //! process bottom pad - if (i + 3 >= h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - case 0: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = tile_w; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d29}, [%[din3_ptr]]! 
@ load din r3\n" - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "cmp %[remain], #1 @ check whether has " - "mid cols\n" - "blt 0f @ jump to main loop start " - "point\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - "0: \n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! 
end of processing mid rows - } -#endif - } -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2 - */ -// w_in > 7 -void conv_depthwise_3x3s2p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - - int tile_w = w_out >> 2; - int cnt_remain = w_out % 4; - - unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3)); - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - - float32x4_t wbias; - float bias_c = 0.f; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - bias_c = bias[i]; - } else { - wbias = vdupq_n_f32(0.f); - } - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_out; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - dr0 = dr4; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 >= h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - case 0: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = tile_w; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. 
- "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // 
pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v13.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_out; i++) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = tile_w; - unsigned int* mask_ptr = dmask; - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r1\n" - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - // mid - "2: \n" - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "subs %[cnt], #1 \n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! 
\n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} - -// 4line -void conv_depthwise_3x3s1p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = w_out >> 2; - int remain = w_out % 4; - - unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); - const int remian_idx[4] = {0, 1, 2, 3}; - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - // wr0 = vsetq_lane_f32(0.f, wr0, 3); - // wr1 = vsetq_lane_f32(0.f, wr1, 3); - // wr2 = vsetq_lane_f32(0.f, wr2, 3); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_out; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! 
process bottom pad - if (i + 5 >= h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - case 0: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = tile_w; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - - // mid - "4: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, 
%[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /* relu */ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v13.4s, v13.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - // r5 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "fmax v15.4s, v15.4s, %[vzero].4s \n" /* relu */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 4b \n" - - // right - "5: \n" - "cmp %[remain], #1 \n" - "blt 0f \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 
\n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v18.4s}, [%[rmask]] \n" - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v13.4s, v13.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , 
v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /* relu */ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - // end - "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_out; i += 2) { - //! process top pad pad_h = 1 - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - //! process bottom pad - if (i + 3 >= h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - case 0: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = tile_w; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d29}, [%[din3_ptr]]! 
@ load din r3\n" - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "cmp %[remain], #1 @ check whether has " - "mid cols\n" - "blt 0f @ jump to main loop start " - "point\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - "0: \n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! 
end of processing mid rows - } -#endif - } -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, with reulu - */ -// w_in > 7 -void conv_depthwise_3x3s2p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - - int tile_w = w_out >> 2; - int cnt_remain = w_out % 4; - - unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3)); - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - - float32x4_t wbias; - float bias_c = 0.f; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - bias_c = bias[i]; - } else { - wbias = vdupq_n_f32(0.f); - } - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_out; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - dr0 = dr4; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 >= h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - case 0: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = tile_w; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. 
- "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext 
v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v14.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_out; i++) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = tile_w; - unsigned int* mask_ptr = dmask; - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r1\n" - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - // mid - "2: \n" - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "subs %[cnt], #1 \n" - "vmax.f32 q3, q3, q9 @ relu \n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! 
\n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! for 4x6 convolution window - const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp1 = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); - uint32x4_t vmask_rp2 = - vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_out; j += 2) { - const float* dr0 = din_channel + j * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - doutr0 = dout_channel + j * w_out; - doutr1 = doutr0 + w_out; - - if (j + 3 >= h_in) { - switch (j + 3 - h_in) { - case 3: - dr1 = zero_ptr; - case 2: - dr2 = zero_ptr; - case 1: - dr3 = zero_ptr; - doutr1 = trash_buf; - case 0: - dr3 = zero_ptr; - doutr1 = trash_buf; - default: - break; - } - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s, v1.4s}, [%[din0]]\n" - "ld1 {v2.4s, v3.4s}, [%[din1]]\n" - "ld1 {v4.4s, v5.4s}, [%[din2]]\n" - "ld1 {v6.4s, v7.4s}, [%[din3]]\n" - - "bif v0.16b, %[zero].16b, %[mask1].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask2].16b\n" // d0_1234 - - "bif v2.16b, %[zero].16b, %[mask1].16b\n" // d1_1234 - "bif v3.16b, %[zero].16b, %[mask2].16b\n" // d1_1234 - - "bif v4.16b, %[zero].16b, %[mask1].16b\n" // d2_1234 - "bif v5.16b, %[zero].16b, %[mask2].16b\n" // d2_1234 - - "bif v6.16b, %[zero].16b, %[mask1].16b\n" // d3_1234 - "bif v7.16b, %[zero].16b, %[mask2].16b\n" // d3_1234 - - "ext v8.16b, v0.16b, v1.16b, #4\n" // d1_2345 - "ext v9.16b, v0.16b, v1.16b, 
#8\n" // d1_3450 - - "and v12.16b, %[vbias].16b, %[vbias].16b \n" // v12 = vbias - "and v13.16b, %[vbias].16b, %[vbias].16b \n" // v13 = vbias - - // r0 - "fmul v10.4s, v0.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmul v11.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v12.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v2.16b, v3.16b, #4\n" // d1_2345 - "ext v9.16b, v2.16b, v3.16b, #8\n" // d1_3450 - - // r1 - "fmul v14.4s, v2.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmla v10.4s, v2.4s, %[wr1].s[0]\n" // d0_1234 * w0[0] - - "fmul v15.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v4.16b, v5.16b, #4\n" // d1_2345 - "ext v9.16b, v4.16b, v5.16b, #8\n" // d1_3450 - - // r2 - "fmla v14.4s, v4.4s, %[wr1].s[0]\n" // d0_1234 * w0[0] - "fmla v10.4s, v4.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v6.16b, v7.16b, #4\n" // d1_2345 - "ext v9.16b, v6.16b, v7.16b, #8\n" // d1_3450 - - // r3 - "fmla v14.4s, v6.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fadd v12.4s, v12.4s, v10.4s\n" - - "fmla v13.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "fadd v12.4s, v12.4s, v11.4s\n" // out1 - "fadd v13.4s, v13.4s, v14.4s\n" // out2 - "fadd v13.4s, v13.4s, v15.4s\n" // out2 - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); -#else - unsigned int* vmask_ptr = vmask; - float bias_val = flag_bias ? bias[i] : 0.f; - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - "vadd.f32 q4, q4, q10 @ q4 += q10 \n" - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - "vadd.f32 q4, q4, q11 @ q4 += q10 \n" - - "vadd.f32 q5, q5, q8 @ q4 += q10 \n" - "vadd.f32 q5, q5, q9 @ q4 += q10 \n" - - "vst1.32 {d8-d9}, [%[out1]] @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[out2]] @ store result, add pointer\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 - */ - -void conv_depthwise_3x3s2p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 
2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - float out_buf[4]; - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - for (int j = 0; j < h_out; ++j) { - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v13={0,2,4,6} - // v12={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - "and v4.16b, %[bias].16b, %[bias].16b \n" // v10 = vbias - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v10.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v7.16b, v12.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v8.16b, v14.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - - "fmla v4.4s, v10.4s, %[wr0].s[0] \n" // 0246 * w00 - "fmul v5.4s, v11.4s, %[wr0].s[1] \n" // 1357 * w01 - "fmul v16.4s, v6.4s, %[wr0].s[2] \n" // 2468 * w02 - - "fmla v4.4s, v12.4s, %[wr1].s[0] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[1] \n" // v13 * w12 - "fmla v16.4s, v7.4s, %[wr1].s[2] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, %[wr2].s[0] \n" // v14 * w20 - "fmla v5.4s, v15.4s, %[wr2].s[1] \n" // v15 * w21 - "fmla v16.4s, v8.4s, %[wr2].s[2] \n" // v8 * w22 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v16.4s \n" - - // "fadd v4.4s, v4.4s, %[bias].4s \n" - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! 
@ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // q13={0,2,4,6} q12={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,0} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q7 = {2,4,6,0} - "vext.32 q8, q14, q9, #1 @ shift left 1 \n" // q8 = {2,4,6,0} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // {0,2,4,6} - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // {1,3,5,7} - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // {2,4,6,0} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(dmask) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! 
for 4x6 convolution window - const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp1 = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); - uint32x4_t vmask_rp2 = - vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_out; j += 2) { - const float* dr0 = din_channel + j * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - doutr0 = dout_channel + j * w_out; - doutr1 = doutr0 + w_out; - - if (j + 3 >= h_in) { - switch (j + 3 - h_in) { - case 3: - dr1 = zero_ptr; - case 2: - dr2 = zero_ptr; - case 1: - dr3 = zero_ptr; - doutr1 = trash_buf; - case 0: - dr3 = zero_ptr; - doutr1 = trash_buf; - default: - break; - } - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s, v1.4s}, [%[din0]]\n" - "ld1 {v2.4s, v3.4s}, [%[din1]]\n" - "ld1 {v4.4s, v5.4s}, [%[din2]]\n" - "ld1 {v6.4s, v7.4s}, [%[din3]]\n" - - "bif v0.16b, %[zero].16b, %[mask1].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask2].16b\n" // d0_1234 - - "bif v2.16b, %[zero].16b, %[mask1].16b\n" // d1_1234 - "bif v3.16b, %[zero].16b, %[mask2].16b\n" // d1_1234 - - "bif v4.16b, %[zero].16b, %[mask1].16b\n" // d2_1234 - "bif v5.16b, %[zero].16b, %[mask2].16b\n" // d2_1234 - - "bif v6.16b, %[zero].16b, %[mask1].16b\n" // d3_1234 - "bif v7.16b, %[zero].16b, %[mask2].16b\n" // d3_1234 - - "ext v8.16b, v0.16b, v1.16b, #4\n" // d1_2345 - "ext v9.16b, v0.16b, v1.16b, #8\n" // d1_3450 - - "and v12.16b, %[vbias].16b, %[vbias].16b \n" // v12 = vbias - "and v13.16b, %[vbias].16b, %[vbias].16b \n" // v13 = vbias - - // r0 - "fmul v10.4s, v0.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmul v11.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v12.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v2.16b, v3.16b, #4\n" // d1_2345 - "ext v9.16b, v2.16b, v3.16b, #8\n" // d1_3450 - - // r1 - "fmul v14.4s, v2.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmla v10.4s, v2.4s, %[wr1].s[0]\n" // d0_1234 * w0[0] - - "fmul v15.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v4.16b, v5.16b, #4\n" // d1_2345 - "ext v9.16b, v4.16b, v5.16b, #8\n" // d1_3450 - - // r2 - "fmla v14.4s, v4.4s, %[wr1].s[0]\n" // 
d0_1234 * w0[0] - "fmla v10.4s, v4.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v6.16b, v7.16b, #4\n" // d1_2345 - "ext v9.16b, v6.16b, v7.16b, #8\n" // d1_3450 - - // r3 - "fmla v14.4s, v6.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fadd v12.4s, v12.4s, v10.4s\n" - - "fmla v13.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "fadd v12.4s, v12.4s, v11.4s\n" // out1 - "fadd v13.4s, v13.4s, v14.4s\n" // out2 - "fadd v13.4s, v13.4s, v15.4s\n" // out2 - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - "fmax v12.4s, v12.4s, %[zero].4s \n" - "fmax v13.4s, v13.4s, %[zero].4s \n" - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); -#else - unsigned int* vmask_ptr = vmask; - float bias_val = flag_bias ? bias[i] : 0.f; - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - "vadd.f32 q4, q4, q10 @ q4 += q10 \n" - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - "vadd.f32 q4, q4, q11 @ q4 += q10 \n" - - "vadd.f32 q5, q5, q8 @ q4 += q10 \n" - "vadd.f32 q5, q5, q9 @ q4 += q10 \n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vst1.32 {d8-d9}, [%[out1]] @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[out2]] @ store result, add pointer\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - // doutr0 = doutr1; - // doutr1 += w_out; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 7 - */ -void conv_depthwise_3x3s2p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int 
ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - float out_buf[4]; - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - for (int j = 0; j < h_out; ++j) { - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]] \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v13={0,2,4,6} - // v12={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - "and v4.16b, %[bias].16b, %[bias].16b \n" // v10 = vbias - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v10.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v7.16b, v12.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v8.16b, v14.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - - "fmla v4.4s, v10.4s, %[wr0].s[0] \n" // 0246 * w00 - "fmul v5.4s, v11.4s, %[wr0].s[1] \n" // 1357 * w01 - "fmul v16.4s, v6.4s, %[wr0].s[2] \n" // 2468 * w02 - - "fmla v4.4s, v12.4s, %[wr1].s[0] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[1] \n" // v13 * w12 - "fmla v16.4s, v7.4s, %[wr1].s[2] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, %[wr2].s[0] \n" // v14 * w20 - "fmla v5.4s, v15.4s, %[wr2].s[1] \n" // v15 * w21 - "fmla v16.4s, v8.4s, %[wr2].s[2] \n" // v8 * w22 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v16.4s \n" - "fmax v4.4s, v4.4s, v9.4s \n" - - // "fadd v4.4s, v4.4s, %[bias].4s \n" - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf), - [mask_ptr] "r"(mask_ptr) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. 
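// In outline, the ARMv7 block below mirrors the AArch64 path above:
// q9 is zeroed and the right-pad mask is loaded into q6/q7 while the bias
// is broadcast into q3; vld2 de-interleaves each of the three input rows
// into even columns {0,2,4,6} and odd columns {1,3,5,7}; vbif zeroes the
// lanes beyond w_in; vext builds the shifted {2,4,6,0} column; each row is
// then multiply-accumulated against its weight row (wr0/wr1/wr2), the
// partial sums in q4/q5 are folded into q3, vmax against q9 applies the
// ReLU, and the four results are stored to out_buf.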
- "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // q13={0,2,4,6} q12={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,0} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q7 = {2,4,6,0} - "vext.32 q8, q14, q9, #1 @ shift left 1 \n" // q8 = {2,4,6,0} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // {0,2,4,6} - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // {1,3,5,7} - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // {2,4,6,0} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(mask_ptr) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_3x3p1.cc b/lite/backends/arm/math/conv_depthwise_3x3p1.cc deleted file mode 100644 index 6f28d48d6d..0000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3p1.cc +++ /dev/null @@ -1,4850 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/backends/arm/math/conv_depthwise.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_3x3s1p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s1p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3p1_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - if (stride == 1) { - if (flag_relu) { - if (w_in > 4) { - conv_depthwise_3x3s1p1_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 4) { - conv_depthwise_3x3s1p1_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } else { //! 
stride = 2 - if (flag_relu) { - if (w_in > 7) { - conv_depthwise_3x3s2p1_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p1_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 7) { - conv_depthwise_3x3s2p1_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p1_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ -// 4line -void conv_depthwise_3x3s1p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - // printf("conv3x3_dw start \n"); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 3) >> 2; - int cnt_col = tile_w - 2; - - unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in); - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_in; i += 4) { - //! 
process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - din_ptr4 = dr3; - din_ptr5 = dr4; - dr0 = dr3; - dr1 = dr4; - dr2 = dr5; - } else { - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - } - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 > h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = cnt_col; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - - // left - // r0 - "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * - w0[1]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * - w0[0]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * - w0[2]*/ - - "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * - w1[1]*/ - "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ - - "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla 
v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ - - // r5 - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" 
/*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ - "cmp %[cnt], #1 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "blt 3f \n" - // mid - "1: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += 
din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 1b \n" - - // right - "3: \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v18.4s}, [%[rmask]] \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, 
%[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? 
bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - // unsigned int* rst_mask = rmask; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" - "vext.32 q7, q8, q9, #1 @ 1234\n" - - // left - // r0 - "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" - - "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" - "vext.32 q7, q10, q11, #1 @ 1234\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" - "vext.32 q7, q12, q13, #1 @ 1234\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" - "vext.32 q7, q14, q15, #1 @ 1234\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "cmp %[cnt], #1 @ check whether has " - "mid cols\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - "blt 3f @ jump to main loop start " - "point\n" - - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "vld1.32 {d19}, [%[vmask]]! 
@ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! @ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! 
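The two scratch rows set up at the top of each of these kernels do all of the boundary work: `zero_ptr` (one zeroed row) stands in for input rows that fall outside the image, and `write_ptr` absorbs the store when the tile's second output row does not exist, so the asm body never branches on edge conditions. A minimal C++ sketch of that driver pattern, with a hypothetical `compute_two_rows` standing in for the asm block above (only the padding logic is taken from the surrounding code):

    // Hypothetical stand-in for the inline-asm tile above: consumes four
    // input rows and produces two output rows of a 3x3 pad-1 depthwise conv.
    void compute_two_rows(const float* rows[4], float* out0, float* out1);

    void dw3x3s1p1_rows_sketch(const float* din, float* dout,
                               int h_in, int w_in, int h_out, int w_out,
                               float* zero_ptr, float* write_ptr) {
      for (int r = 0; r < h_in; r += 2) {
        const float* rows[4];
        for (int j = 0; j < 4; ++j) {
          int src = r - 1 + j;  // pad_h == 1 shifts the window up one row
          rows[j] = (src < 0 || src >= h_in) ? zero_ptr : din + src * w_in;
        }
        // When only one of the two output rows exists, the second store is
        // redirected into the scratch row instead of being skipped.
        float* out1 = (r + 2 > h_out) ? write_ptr : dout + w_out;
        compute_two_rows(rows, dout, out1);
        dout += 2 * w_out;
      }
    }

Redirecting the dead store into `write_ptr` wastes one row of arithmetic on the last tile but keeps the store path branch-free, which is the trade the rotation of `dr0`..`dr3` above is making.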
end of processing mid rows - } -#endif - } -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2 - */ -// w_in > 7 -void conv_depthwise_3x3s2p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - int size_pad_bottom = h_out * 2 - h_in; - - int cnt_col = (w_out >> 2) - 2; - int size_right_remain = w_in - (7 + cnt_col * 8); - if (size_right_remain >= 9) { - cnt_col++; - size_right_remain -= 8; - } - int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // - - int size_right_pad = w_out * 2 - w_in; - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - - float32x4_t wbias; - float bias_c = 0.f; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - bias_c = bias[i]; - } else { - wbias = vdupq_n_f32(0.f); - } - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_in; i += 4) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - din4_ptr = dr3; - dr0 = dr3; - dr1 = dr4; - } else { - dr0 = dr4; - dr1 = dr0 + w_in; - } - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 > h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i / 2 + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = cnt_col; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. 
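The `ld2` loads that follow are the heart of the stride-2 scheme: `ld2` deinterleaves each row into an even-column vector and an odd-column vector, so the three filter taps line up with the even lanes, the odd lanes, and the even lanes shifted by one pair, and a whole row reduces to three vector multiply-accumulates. A minimal A64 intrinsics sketch of one "mid" row (assumptions: `wr` carries one filter row in lanes 0..2, and `r[8]` is readable; the left and right blocks differ only in building the shifted vector with `ext` against zero or a mask):

    #include <arm_neon.h>

    // One row's contribution to four stride-2 outputs: out[j] uses input
    // columns 2j, 2j+1, 2j+2. vld2q_f32 is the intrinsic form of ld2.
    static inline float32x4_t dw3x3s2_row_mid(const float* r, float32x4_t wr) {
      float32x4x2_t v = vld2q_f32(r);          // val[0]={0,2,4,6} val[1]={1,3,5,7}
      float32x4_t nxt = vld1q_dup_f32(r + 8);  // column 8, for the third tap
      float32x4_t sh = vextq_f32(v.val[0], nxt, 1);        // {2,4,6,8}
      float32x4_t acc = vmulq_laneq_f32(v.val[0], wr, 0);  // w0 * even cols
      acc = vfmaq_laneq_f32(acc, v.val[1], wr, 1);         // w1 * odd cols
      acc = vfmaq_laneq_f32(acc, sh, wr, 2);               // w2 * even+2 cols
      return acc;
    }

The asm accumulates three such rows (weights `wr0`..`wr2`) on top of the broadcast bias, and emits two output rows per pass by sharing the middle input rows between them.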
- "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" // v10 = {0,1,3,5} - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmul v12.4s, v1.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr0], %[inptr0], #4 \n" - "sub %[inptr1], %[inptr1], #4 \n" - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v12.4s, v3.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr2], %[inptr2], #4 \n" - "sub %[inptr3], %[inptr3], #4 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmla v11.4s, v4.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - - "fmul v14.4s, v5.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v12.4s, v5.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - - "fmla v17.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - "fmla v16.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr4], %[inptr4], #4 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v7.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" // v10 = {0,1,3,5} - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v9.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "cmp %[cnt], #1 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "blt 1f \n" - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" 
// {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, 
v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v13.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_in; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = cnt_col; - unsigned int* mask_ptr = dmask; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q10, q11 - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q12, q13 - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v13={0,2,4,6} v14={1,3,5,7}, q14, q15 - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vext.32 q6, q9, q11, #3 @ shift right 1 " - "data\n" // q2 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "sub %[din0_ptr], #4 @ inpitr0 - 1\n" - "sub %[din1_ptr], #4 @ inpitr1 - 1\n" - "sub %[din2_ptr], #4 @ inpitr2 - 1\n" - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "vld2.32 {d24-d27}, [%[din1_ptr]]! 
@ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, " - "out1\n" // q0 * w01 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, " - "out1\n" // q1 * w02 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, " - "out1\n" // q2 * w00 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "cmp %[cnt], #1 \n" - "blt 1f \n" - // mid - "2: \n" - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "subs %[cnt], #1 \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! 
@ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} - -// 4line -void conv_depthwise_3x3s1p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! 
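The stride-1 ReLU kernel that begins here uses a taller tile on aarch64: each pass keeps six input rows live in registers and emits four output rows, so every loaded row is reused by up to three different output rows without returning to memory. Schematically (a compilable sketch, not the kernel's actual register allocation; only the center-tap term is shown, and the `ext`-shifted terms for the outer taps are elided):

    #include <arm_neon.h>

    // 4x6 tiling: output row j accumulates input rows j, j+1, j+2 against
    // filter rows wr[0..2]; out[] arrives pre-seeded with the bias.
    static inline void tile_4x6_center(const float32x4_t in[6],
                                       const float32x4_t wr[3],
                                       float32x4_t out[4]) {
      for (int j = 0; j < 4; ++j)
        for (int k = 0; k < 3; ++k)
          out[j] = vfmaq_laneq_f32(out[j], in[j + k], wr[k], 1);
    }

The armv7 fallback later in this function keeps the narrower two-row tile, since 32-bit NEON has half as many quad registers.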
for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - // printf("conv3x3_dw start \n"); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 3) >> 2; - int tile_h = (h_in + 3) >> 2; - int cnt_col = tile_w - 2; - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in); - int size_pad_bottom = (unsigned int)(1 + (tile_h << 2) - h_in); - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_in; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - din_ptr4 = dr3; - din_ptr5 = dr4; - dr0 = dr3; - dr1 = dr4; - dr2 = dr5; - } else { - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - } - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 > h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
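The bottom-pad switch just above leans on deliberate case fall-through: no case has a `break` before `default`, so an overhang of k rows past the bottom edge blanks the last k input pointers with a single jump. Spelled out without fall-through (a sketch; `overhang` and `tail` are illustrative names):

    // Equivalent to the fall-through switch: blank the trailing rows that
    // would read past h_in, pointing them at the shared zero row instead.
    int overhang = i + 5 - h_in;  // how many of the six rows are out of range
    const float** tail[5] = {&din_ptr1, &din_ptr2, &din_ptr3, &din_ptr4,
                             &din_ptr5};
    for (int k = 0; k < overhang; ++k)
      *tail[5 - overhang + k] = zero_ptr;

The output-side overhang right below uses the same idiom in the other direction, parking the dead rows' stores on `write_ptr`.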
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = cnt_col; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - - // left - // r0 - "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * - w0[1]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * - w0[0]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * - w0[2]*/ - - "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * - w1[1]*/ - "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ - - "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* 
outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - // r5 - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ - "cmp %[cnt], #1 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "blt 3f \n" - // mid - "1: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - 
w0[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" 
/* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "subs %[cnt], %[cnt], #1 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 1b \n" - - // right - "3: \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v18.4s}, [%[rmask]] \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext 
v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? 
bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - // unsigned int* rst_mask = rmask; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" - "vext.32 q7, q8, q9, #1 @ 1234\n" - - // left - // r0 - "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" - - "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" - "vext.32 q7, q10, q11, #1 @ 1234\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" - "vext.32 q7, q12, q13, #1 @ 1234\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" - "vext.32 q7, q14, q15, #1 @ 1234\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "cmp %[cnt], #1 @ check whether has " - "mid cols\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - "blt 3f @ jump to main loop start " - "point\n" - - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! @ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add "
-          "pointer\n"
-
-          : [dout_ptr1] "+r"(doutr0),
-            [dout_ptr2] "+r"(doutr1),
-            [din0_ptr] "+r"(din0_ptr),
-            [din1_ptr] "+r"(din1_ptr),
-            [din2_ptr] "+r"(din2_ptr),
-            [din3_ptr] "+r"(din3_ptr),
-            [cnt] "+r"(cnt),
-            [rmask] "+r"(rmask_ptr),
-            [vmask] "+r"(vmask_ptr)
-          : [wr0] "w"(wr0),
-            [wr1] "w"(wr1),
-            [wr2] "w"(wr2),
-            [bias_val] "r"(bias_val),
-            [vzero] "w"(vzero)
-          : "cc",
-            "memory",
-            "q4",
-            "q5",
-            "q6",
-            "q7",
-            "q8",
-            "q9",
-            "q10",
-            "q11",
-            "q12",
-            "q13",
-            "q14",
-            "q15");
-        dout_channel += 2 * w_out;
-      }  //! end of processing mid rows
-    }
-#endif
-  }
-}
-/**
- * \brief depthwise convolution kernel 3x3, stride 2, with relu
- */
-// w_in > 7
-void conv_depthwise_3x3s2p1_bias_relu(float* dout,
-                                      const float* din,
-                                      const float* weights,
-                                      const float* bias,
-                                      bool flag_bias,
-                                      const int num,
-                                      const int ch_in,
-                                      const int h_in,
-                                      const int w_in,
-                                      const int h_out,
-                                      const int w_out,
-                                      ARMContext* ctx) {
-  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
-  int out_pad_idx[4] = {0, 1, 2, 3};
-  int size_pad_bottom = h_out * 2 - h_in;
-
-  int cnt_col = (w_out >> 2) - 2;
-  int size_right_remain = w_in - (7 + cnt_col * 8);
-  if (size_right_remain >= 9) {
-    cnt_col++;
-    size_right_remain -= 8;
-  }
-  int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4);  //
-
-  int size_right_pad = w_out * 2 - w_in;
-
-  uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain),
-                                   vld1q_s32(right_pad_idx));  // 0 2 4 6
-  uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain),
-                                   vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
-  uint32x4_t wmask =
-      vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx));  // 0 1 2 3
-  int size_in_channel = w_in * h_in;
-  int size_out_channel = w_out * h_out;
-
-  float* zero_ptr = ctx->workspace_data<float>();
-  memset(zero_ptr, 0, w_in * sizeof(float));
-  float* write_ptr = zero_ptr + w_in;
-
-  unsigned int dmask[12];
-
-  vst1q_u32(dmask, vmask_rp1);
-  vst1q_u32(dmask + 4, vmask_rp2);
-  vst1q_u32(dmask + 8, wmask);
-
-  for (int n = 0; n < num; ++n) {
-    const float* din_batch = din + n * ch_in * size_in_channel;
-    float* dout_batch = dout + n * ch_in * size_out_channel;
-#pragma omp parallel for
-    for (int i = 0; i < ch_in; ++i) {
-      const float* din_channel = din_batch + i * size_in_channel;
-      float* dout_channel = dout_batch + i * size_out_channel;
-
-      const float* weight_ptr = weights + i * 9;
-      float32x4_t wr0 = vld1q_f32(weight_ptr);
-      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
-      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
-
-      float32x4_t vzero = vdupq_n_f32(0.f);
-
-      float32x4_t wbias;
-      float bias_c = 0.f;
-      if (flag_bias) {
-        wbias = vdupq_n_f32(bias[i]);
-        bias_c = bias[i];
-      } else {
-        wbias = vdupq_n_f32(0.f);
-      }
-
-      const float* dr0 = din_channel;
-      const float* dr1 = dr0 + w_in;
-      const float* dr2 = dr1 + w_in;
-      const float* dr3 = dr2 + w_in;
-      const float* dr4 = dr3 + w_in;
-
-      const float* din0_ptr = dr0;
-      const float* din1_ptr = dr1;
-      const float* din2_ptr = dr2;
-      const float* din3_ptr = dr3;
-      const float* din4_ptr = dr4;
-
-      float* doutr0 = dout_channel;
-      float* doutr0_ptr = nullptr;
-      float* doutr1_ptr = nullptr;
-
-#ifdef __aarch64__
-      for (int i = 0; i < h_in; i += 4) {
-        din0_ptr = dr0;
-        din1_ptr = dr1;
-        din2_ptr = dr2;
-        din3_ptr = dr3;
-        din4_ptr = dr4;
-
-        doutr0_ptr = doutr0;
-        doutr1_ptr = doutr0 + w_out;
-
-        if (i == 0) {
-          din0_ptr = zero_ptr;
-          din1_ptr = dr0;
-          din2_ptr = dr1;
-          din3_ptr = dr2;
-          din4_ptr = dr3;
-          dr0 = dr3;
-          dr1 = dr4;
-        } else {
-          dr0 = dr4;
-          dr1 = dr0 + w_in;
-        }
-        dr2 = dr1 +
w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 > h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i / 2 + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = cnt_col; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" // v10 = {0,1,3,5} - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmul v12.4s, v1.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr0], %[inptr0], #4 \n" - "sub %[inptr1], %[inptr1], #4 \n" - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v12.4s, v3.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr2], %[inptr2], #4 \n" - "sub %[inptr3], %[inptr3], #4 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmla v11.4s, v4.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - - "fmul v14.4s, v5.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v12.4s, v5.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - - "fmla v17.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - "fmla v16.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr4], %[inptr4], #4 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v7.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" // v10 = {0,1,3,5} - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v9.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "cmp %[cnt], #1 \n" - - 
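The epilogue pattern of this relu variant is the same at every store: the accumulator is seeded from the broadcast bias (`and v16.16b, %[vbias].16b, ...` is just a register copy), the two partial sums are folded in with `fadd`, and the activation is a single `fmax` against zero immediately before `st1`, so no separate ReLU pass ever touches the output buffer. The same sequence as intrinsics (a sketch; the function name is illustrative):

    #include <arm_neon.h>

    // Fused bias + ReLU store: valid on both armv7 and aarch64 NEON.
    static inline void store_bias_relu(float* out, float32x4_t part0,
                                       float32x4_t part1, float32x4_t vbias,
                                       float32x4_t vzero) {
      float32x4_t sum = vaddq_f32(vaddq_f32(vbias, part0), part1);
      sum = vmaxq_f32(sum, vzero);  // the "fmax ... %[vzero].4s" step
      vst1q_f32(out, sum);
    }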
"and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "blt 1f \n" - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "subs %[cnt], %[cnt], #1 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext 
v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v14.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - - for (int i = 0; i < h_in; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = cnt_col; - - unsigned int* mask_ptr = dmask; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q10, q11 - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q12, q13 - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r1\n" // v13={0,2,4,6} v14={1,3,5,7}, q14, q15 - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vext.32 q6, q9, q11, #3 @ shift right 1 " - "data\n" // q2 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "sub %[din0_ptr], #4 @ inpitr0 - 1\n" - "sub %[din1_ptr], #4 @ inpitr1 - 1\n" - "sub %[din2_ptr], #4 @ inpitr2 - 1\n" - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, " - "out1\n" // q0 * w01 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, " - "out1\n" // q1 * w02 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, " - "out1\n" // q2 * w00 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "cmp %[cnt], #1 \n" - "blt 1f \n" - // mid - "2: \n" - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "subs %[cnt], #1 \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! 
@ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! 
for 4x6 convolution window - const int right_pad_idx[4] = {3, 2, 1, 0}; - const float zero[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - int hs = -1; - int he = 3; - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - int h_cnt = (h_out + 1) >> 1; - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_cnt; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - if (hs == -1) { - dr0 = zero; - } - - switch (he - h_in) { - case 2: - dr2 = zero; - doutr1 = trash_buf; - case 1: - dr3 = zero; - default: - break; - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s}, [%[din0]], #16\n" - "ld1 {v1.4s}, [%[din1]], #16\n" - "ld1 {v2.4s}, [%[din2]], #16\n" - "ld1 {v3.4s}, [%[din3]], #16\n" - - "bif v0.16b, %[zero].16b, %[mask].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask].16b\n" // d1_1234 - "bif v2.16b, %[zero].16b, %[mask].16b\n" // d2_1234 - "bif v3.16b, %[zero].16b, %[mask].16b\n" // d3_1234 - - "ext v4.16b, %[zero].16b, v0.16b, #12\n" // d0_0123 - "ext v5.16b, %[zero].16b, v1.16b, #12\n" // d1_0123 - "ext v6.16b, %[zero].16b, v2.16b, #12\n" // d2_0123 - "ext v7.16b, %[zero].16b, v3.16b, #12\n" // d3_0123 - - "ext v8.16b, v0.16b, %[zero].16b, #4\n" // d0_2340 - "ext v9.16b, v1.16b, %[zero].16b, #4\n" // d1_2340 - "ext v10.16b, v2.16b, %[zero].16b, #4\n" // d2_2340 - "ext v11.16b, v3.16b, %[zero].16b, #4\n" // d3_2340 - - "fmul v12.4s, v0.4s, %[wr0].s[1]\n" - "fmul v13.4s, v1.4s, %[wr0].s[1]\n" - - "fmul v14.4s, v1.4s, %[wr1].s[1]\n" - "fmul v15.4s, v2.4s, %[wr1].s[1]\n" - - "fmul v16.4s, v2.4s, %[wr2].s[1]\n" - "fmul v17.4s, v3.4s, %[wr2].s[1]\n" - - "fmla v12.4s, v4.4s, %[wr0].s[0]\n" - "fmla v13.4s, v5.4s, %[wr0].s[0]\n" - - "fmla v14.4s, v5.4s, %[wr1].s[0]\n" - "fmla v15.4s, v6.4s, %[wr1].s[0]\n" - - "fmla v16.4s, v6.4s, %[wr2].s[0]\n" - "fmla v17.4s, v7.4s, %[wr2].s[0]\n" - - "fmla v12.4s, v8.4s, %[wr0].s[2]\n" - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" - - "fmla v14.4s, v9.4s, %[wr1].s[2]\n" - "fmla v15.4s, v10.4s, %[wr1].s[2]\n" - - "fmla v16.4s, v10.4s, %[wr2].s[2]\n" - "fmla v17.4s, v11.4s, %[wr2].s[2]\n" - - "fadd v12.4s, v12.4s, v14.4s\n" - "fadd v12.4s, v12.4s, v16.4s\n" - - "fadd v13.4s, v13.4s, v15.4s\n" // out1 - "fadd v13.4s, v13.4s, v17.4s\n" // out2 - - "fadd v12.4s, v12.4s, %[bias].4s\n" // out1 add bias - "fadd v13.4s, v13.4s, %[bias].4s\n" // out2 add bias - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, 
[%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); -#else - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d12-d13}, [%[din0]]!\n" - "vld1.32 {d14-d15}, [%[din1]]!\n" - "vld1.32 {d16-d17}, [%[din2]]!\n" - "vld1.32 {d18-d19}, [%[din3]]!\n" - - "vbif q6, %q[zero], %q[mask]\n" // d0_1234 - "vbif q7, %q[zero], %q[mask]\n" // d1_1234 - "vbif q8, %q[zero], %q[mask]\n" // d2_1234 - "vbif q9, %q[zero], %q[mask]\n" // d3_1234 - - "vmul.f32 q14, q6, %e[wr0][1]\n" - "vmul.f32 q15, q7, %e[wr0][1]\n" - - "vmla.f32 q14, q7, %e[wr1][1]\n" - "vmla.f32 q15, q8, %e[wr1][1]\n" - - "vmla.f32 q14, q8, %e[wr2][1]\n" - "vmla.f32 q15, q9, %e[wr2][1]\n" - - "vext.32 q10, %q[zero], q6, #3\n" // d0_0123 - "vext.32 q11, %q[zero], q7, #3\n" // d1_0123 - "vext.32 q12, %q[zero], q8, #3\n" // d2_0123 - "vext.32 q13, %q[zero], q9, #3\n" // d3_0123 - - "vmla.f32 q14, q10, %e[wr0][0]\n" - "vmla.f32 q15, q11, %e[wr0][0]\n" - - "vmla.f32 q14, q11, %e[wr1][0]\n" - "vmla.f32 q15, q12, %e[wr1][0]\n" - - "vmla.f32 q14, q12, %e[wr2][0]\n" - "vmla.f32 q15, q13, %e[wr2][0]\n" - - "vext.32 q10, q6, %q[zero], #1\n" // d0_2340 - "vext.32 q11, q7, %q[zero], #1\n" // d1_2340 - "vext.32 q12, q8, %q[zero], #1\n" // d2_2340 - "vext.32 q13, q9, %q[zero], #1\n" // d3_2340 - - "vmla.f32 q14, q10, %f[wr0][0]\n" - "vmla.f32 q15, q11, %f[wr0][0]\n" - - "vmla.f32 q14, q11, %f[wr1][0]\n" - "vmla.f32 q15, q12, %f[wr1][0]\n" - - "vmla.f32 q14, q12, %f[wr2][0]\n" // out1 - "vmla.f32 q15, q13, %f[wr2][0]\n" // out2 - - "vadd.f32 q14, q14, %q[bias]\n" // out1 add bias - "vadd.f32 q15, q15, %q[bias]\n" // out2 add bias - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vst1.32 {d28-d29}, [%[out1]]\n" - "vst1.32 {d30-d31}, [%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - doutr0 = doutr1; - doutr1 += w_out; - hs += 2; - he += 2; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 - */ - -void conv_depthwise_3x3s2p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - 
vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - int hs = -1; - int he = 2; - float out_buf[4]; - for (int j = 0; j < h_out; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - if (hs == -1) { - dr0 = zeros; - } - if (he > h_in) { - dr2 = zeros; - } - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v12={0,2,4,6} - // v13={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v9.16b, v11.16b, #12 \n" // v6 = - // {0,1,3,5} - "ext v7.16b, v9.16b, v13.16b, #12 \n" // v7 = - // {0,1,3,5} - "ext v8.16b, v9.16b, v15.16b, #12 \n" // v8 = - // {0,1,3,5} - - "fmul v4.4s, v10.4s, %[wr0].s[1] \n" // v10 * w01 - "fmul v5.4s, v11.4s, %[wr0].s[2] \n" // v11 * w02 - "fmul v6.4s, v6.4s, %[wr0].s[0] \n" // v6 * w00 - - "fmla v4.4s, v12.4s, %[wr1].s[1] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[2] \n" // v13 * w12 - "fmla v6.4s, v7.4s, %[wr1].s[0] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, %[wr2].s[1] \n" // v14 * w21 - "fmla v5.4s, v15.4s, %[wr2].s[2] \n" // v15 * w22 - "fmla v6.4s, v8.4s, %[wr2].s[0] \n" // v8 * w20 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v6.4s \n" - - "fadd v4.4s, v4.4s, %[bias].4s \n" // out add bias - - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // q12={0,2,4,6} q13={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q9, q11, #3 @ shift left 1 \n" // q6 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift left 1 \n" // q7 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift left 1 \n" // q8 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 0, " - "out0\n" // q10 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 0, " - "out0\n" // q11 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w00 - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - hs += 2; - he += 2; - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! 
for 4x6 convolution window - const int right_pad_idx[4] = {3, 2, 1, 0}; - const float zero[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - int hs = -1; - int he = 3; - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - int h_cnt = (h_out + 1) >> 1; - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_cnt; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - if (hs == -1) { - dr0 = zero; - } - - switch (he - h_in) { - case 2: - dr2 = zero; - doutr1 = trash_buf; - case 1: - dr3 = zero; - default: - break; - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s}, [%[din0]], #16\n" - "ld1 {v1.4s}, [%[din1]], #16\n" - "ld1 {v2.4s}, [%[din2]], #16\n" - "ld1 {v3.4s}, [%[din3]], #16\n" - - "bif v0.16b, %[zero].16b, %[mask].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask].16b\n" // d1_1234 - "bif v2.16b, %[zero].16b, %[mask].16b\n" // d2_1234 - "bif v3.16b, %[zero].16b, %[mask].16b\n" // d3_1234 - - "ext v4.16b, %[zero].16b, v0.16b, #12\n" // d0_0123 - "ext v5.16b, %[zero].16b, v1.16b, #12\n" // d1_0123 - "ext v6.16b, %[zero].16b, v2.16b, #12\n" // d2_0123 - "ext v7.16b, %[zero].16b, v3.16b, #12\n" // d3_0123 - - "ext v8.16b, v0.16b, %[zero].16b, #4\n" // d0_2340 - "ext v9.16b, v1.16b, %[zero].16b, #4\n" // d1_2340 - "ext v10.16b, v2.16b, %[zero].16b, #4\n" // d2_2340 - "ext v11.16b, v3.16b, %[zero].16b, #4\n" // d3_2340 - - "fmul v12.4s, v0.4s, %[wr0].s[1]\n" - "fmul v13.4s, v1.4s, %[wr0].s[1]\n" - - "fmul v14.4s, v1.4s, %[wr1].s[1]\n" - "fmul v15.4s, v2.4s, %[wr1].s[1]\n" - - "fmul v16.4s, v2.4s, %[wr2].s[1]\n" - "fmul v17.4s, v3.4s, %[wr2].s[1]\n" - - "fmla v12.4s, v4.4s, %[wr0].s[0]\n" - "fmla v13.4s, v5.4s, %[wr0].s[0]\n" - - "fmla v14.4s, v5.4s, %[wr1].s[0]\n" - "fmla v15.4s, v6.4s, %[wr1].s[0]\n" - - "fmla v16.4s, v6.4s, %[wr2].s[0]\n" - "fmla v17.4s, v7.4s, %[wr2].s[0]\n" - - "fmla v12.4s, v8.4s, %[wr0].s[2]\n" - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" - - "fmla v14.4s, v9.4s, %[wr1].s[2]\n" - "fmla v15.4s, v10.4s, %[wr1].s[2]\n" - - "fmla v16.4s, v10.4s, %[wr2].s[2]\n" - "fmla v17.4s, v11.4s, %[wr2].s[2]\n" - - "fadd v12.4s, v12.4s, v14.4s\n" - "fadd v12.4s, v12.4s, v16.4s\n" - - "fadd v13.4s, v13.4s, v15.4s\n" // out1 - "fadd v13.4s, v13.4s, v17.4s\n" // out2 - - "fadd v12.4s, v12.4s, %[bias].4s\n" // out1 add bias - "fadd v13.4s, v13.4s, %[bias].4s\n" // out2 add bias - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - - "fmax v12.4s, v12.4s, %[zero].4s\n" // out1 -> relu - 
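- // relu is fused in-register: both output rows are clamped at zero here before being staged to out_buf1/out_buf2.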
"fmax v13.4s, v13.4s, %[zero].4s\n" // out2 -> relu - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); -#else - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d12-d13}, [%[din0]]!\n" - "vld1.32 {d14-d15}, [%[din1]]!\n" - "vld1.32 {d16-d17}, [%[din2]]!\n" - "vld1.32 {d18-d19}, [%[din3]]!\n" - - "vbif q6, %q[zero], %q[mask]\n" // d0_1234 - "vbif q7, %q[zero], %q[mask]\n" // d1_1234 - "vbif q8, %q[zero], %q[mask]\n" // d2_1234 - "vbif q9, %q[zero], %q[mask]\n" // d3_1234 - - "vmul.f32 q14, q6, %e[wr0][1]\n" - "vmul.f32 q15, q7, %e[wr0][1]\n" - - "vmla.f32 q14, q7, %e[wr1][1]\n" - "vmla.f32 q15, q8, %e[wr1][1]\n" - - "vmla.f32 q14, q8, %e[wr2][1]\n" - "vmla.f32 q15, q9, %e[wr2][1]\n" - - "vext.32 q10, %q[zero], q6, #3\n" // d0_0123 - "vext.32 q11, %q[zero], q7, #3\n" // d1_0123 - "vext.32 q12, %q[zero], q8, #3\n" // d2_0123 - "vext.32 q13, %q[zero], q9, #3\n" // d3_0123 - - "vmla.f32 q14, q10, %e[wr0][0]\n" - "vmla.f32 q15, q11, %e[wr0][0]\n" - - "vmla.f32 q14, q11, %e[wr1][0]\n" - "vmla.f32 q15, q12, %e[wr1][0]\n" - - "vmla.f32 q14, q12, %e[wr2][0]\n" - "vmla.f32 q15, q13, %e[wr2][0]\n" - - "vext.32 q10, q6, %q[zero], #1\n" // d0_2340 - "vext.32 q11, q7, %q[zero], #1\n" // d1_2340 - "vext.32 q12, q8, %q[zero], #1\n" // d2_2340 - "vext.32 q13, q9, %q[zero], #1\n" // d3_2340 - - "vmla.f32 q14, q10, %f[wr0][0]\n" - "vmla.f32 q15, q11, %f[wr0][0]\n" - - "vmla.f32 q14, q11, %f[wr1][0]\n" - "vmla.f32 q15, q12, %f[wr1][0]\n" - - "vmla.f32 q14, q12, %f[wr2][0]\n" // out1 - "vmla.f32 q15, q13, %f[wr2][0]\n" // out2 - - "vadd.f32 q14, q14, %q[bias]\n" // out1 add bias - "vadd.f32 q15, q15, %q[bias]\n" // out2 add bias - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vmax.f32 q14, q14, %q[zero]\n" // out1 -> relu - "vmax.f32 q15, q15, %q[zero]\n" // out2 -> relu - - "vst1.32 {d28-d29}, [%[out1]]\n" - "vst1.32 {d30-d31}, [%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - doutr0 = doutr1; - doutr1 += w_out; - hs += 2; - he += 2; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 7 - */ -void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - 
uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - int hs = -1; - int he = 2; - float out_buf[4]; - for (int j = 0; j < h_out; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - if (hs == -1) { - dr0 = zeros; - } - if (he > h_in) { - dr2 = zeros; - } - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v12={0,2,4,6} - // v13={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v9.16b, v11.16b, #12 \n" // v6 = - // {0,1,3,5} - "ext v7.16b, v9.16b, v13.16b, #12 \n" // v7 = - // {0,1,3,5} - "ext v8.16b, v9.16b, v15.16b, #12 \n" // v8 = - // {0,1,3,5} - - "fmul v4.4s, v10.4s, %[wr0].s[1] \n" // v10 * w01 - "fmul v5.4s, v11.4s, %[wr0].s[2] \n" // v11 * w02 - "fmul v6.4s, v6.4s, %[wr0].s[0] \n" // v6 * w00 - - "fmla v4.4s, v12.4s, %[wr1].s[1] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[2] \n" // v13 * w12 - "fmla v6.4s, v7.4s, %[wr1].s[0] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, %[wr2].s[1] \n" // v14 * w21 - "fmla v5.4s, v15.4s, %[wr2].s[2] \n" // v15 * w22 - "fmla v6.4s, v8.4s, %[wr2].s[0] \n" // v8 * w20 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v6.4s \n" - - "fadd v4.4s, v4.4s, %[bias].4s \n" // out add bias - "fmax v4.4s, v4.4s, v9.4s \n" // relu - - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // q12={0,2,4,6} q13={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q9, q11, #3 @ shift left 1 \n" // q6 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift left 1 \n" // q7 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift left 1 \n" // q8 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 0, " - "out0\n" // q10 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 0, " - "out0\n" // q11 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w00 - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu\n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - hs += 2; - he += 2; - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_3x3s1.cc b/lite/backends/arm/math/conv_depthwise_3x3s1.cc deleted file mode 100644 index 8d0ebb58ad..0000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3s1.cc +++ /dev/null @@ -1,2539 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/backends/arm/math/conv_depthwise.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_3x3s1p0_bias(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx); - -void conv_depthwise_3x3s1p0_bias_s(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx); - -void conv_depthwise_3x3s1p1_bias(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx); - -void conv_depthwise_3x3s1p1_bias_s(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx); - -void conv_depthwise_3x3s1_fp32(const float *din, - float *dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float *weights, - const float *bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext *ctx) { - if (pad == 0) { - if (w_in > 5) { - conv_depthwise_3x3s1p0_bias(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p0_bias_s(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - if (pad == 1) { - if (w_in > 4) { - conv_depthwise_3x3s1p1_bias(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } -} - -#ifdef __aarch64__ -#define INIT_S1 \ - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" \ - "movi v21.4s, #0x0\n" /* out0 = 0 */ \ - \ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - -#define LEFT_COMPUTE_S1 \ - "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ /* r0 */ \ - "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * w0[1]*/ \ - \ - "ld1 
{v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ \ - "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ \ - \ - "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * w0[0]*/ \ - \ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ \ - "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ \ - \ - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * w0[2]*/ \ - \ - "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ /* r1 */ \ - "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * w1[1]*/ \ - "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ \ - "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ \ - \ - "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16=1234 */ \ - "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ \ - \ - /* r2 */ \ - "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ - \ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * w0[1]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ - \ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ \ - 
"ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ - -#define LEFT_RESULT_S1 \ - /* r4 */ \ - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ - \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ /* r5 */ \ - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ - "cmp %w[cnt], #1 \n" \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "blt 3f \n" - -#define MID_COMPUTE_S1 \ - "1: \n" /* r0 */ \ - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v12.4s , v16.4s, 
%[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - -#define MID_RESULT_S1 \ - /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" \ - \ - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" \ - \ - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" \ - \ - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ - \ - "subs %w[cnt], %w[cnt], #1 \n" \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "bne 1b \n" - -#define RIGHT_COMPUTE_S1 \ - "3: \n" \ - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" \ - "ld1 {v22.4s}, [%[doutr0]] \n" \ - "ld1 {v23.4s}, [%[doutr1]] \n" \ - "ld1 {v24.4s}, [%[doutr2]] \n" \ - "ld1 {v25.4s}, [%[doutr3]] \n" \ - \ - "bif v0.16b, %[vzero].16b, v18.16b \n" \ - "bif v1.16b, %[vzero].16b, v19.16b \n" \ - "bif v2.16b, %[vzero].16b, v18.16b \n" \ - "bif v3.16b, %[vzero].16b, v19.16b \n" \ - \ - "bif v4.16b, %[vzero].16b, v18.16b \n" \ - "bif 
v5.16b, %[vzero].16b, v19.16b \n" \ - "bif v6.16b, %[vzero].16b, v18.16b \n" \ - "bif v7.16b, %[vzero].16b, v19.16b \n" \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ /* r0 */ \ - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "bif v8.16b, %[vzero].16b, v18.16b \n" \ - "bif v9.16b, %[vzero].16b, v19.16b \n" \ - "bif v10.16b, %[vzero].16b, v18.16b \n" \ - "bif v11.16b, %[vzero].16b, v19.16b \n" \ - \ - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v18.4s}, [%[rmask]] \n" \ - \ - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - -#define RIGHT_RESULT_S1 \ - /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "bif v12.16b, v22.16b, v18.16b \n" \ - \ - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "bif v13.16b, v23.16b, v18.16b \n" \ - \ - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 
+= din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "bif v14.16b, v24.16b, v18.16b \n" \ - \ - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "bif v15.16b, v25.16b, v18.16b \n" \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" - -#define LEFT_RESULT_S1_RELU \ - /* r4 */ \ - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ - \ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ - \ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ - \ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ \ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ /* r5*/ \ - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ - \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ - \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ - \ - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ - "cmp %w[cnt], #1 \n" \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "blt 3f \n" - -#define MID_RESULT_S1_RELU \ - /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" \ - \ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += 
din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" \ - \ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ - \ - /* r3 */ \ - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" \ - \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ - \ - "subs %w[cnt], %w[cnt], #1 \n" \ - \ - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "bne 1b \n" - -#define RIGHT_RESULT_S1_RELU \ - /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "bif v12.16b, v22.16b, v18.16b \n" \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" \ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "bif v13.16b, v23.16b, v18.16b \n" \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ 
\ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ - \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \ - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "bif v14.16b, v24.16b, v18.16b \n" \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" \ - \ - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ - \ - "bif v15.16b, v25.16b, v18.16b \n" \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" - -#define COMPUTE_S_S1 \ - "prfm pldl1keep, [%[din0]]\n" \ - "prfm pldl1keep, [%[din1]]\n" \ - "prfm pldl1keep, [%[din2]]\n" \ - "prfm pldl1keep, [%[din3]]\n" \ - \ - "ld1 {v0.4s}, [%[din0]], #16\n" \ - "ld1 {v1.4s}, [%[din1]], #16\n" \ - "ld1 {v2.4s}, [%[din2]], #16\n" \ - "ld1 {v3.4s}, [%[din3]], #16\n" \ - \ - "bif v0.16b, %[zero].16b, %[mask].16b\n" \ - "bif v1.16b, %[zero].16b, %[mask].16b\n" \ - "bif v2.16b, %[zero].16b, %[mask].16b\n" \ - "bif v3.16b, %[zero].16b, %[mask].16b\n" \ - \ - "ext v4.16b, %[zero].16b, v0.16b, #12\n" \ - "ext v5.16b, %[zero].16b, v1.16b, #12\n" \ - "ext v6.16b, %[zero].16b, v2.16b, #12\n" \ - "ext v7.16b, %[zero].16b, v3.16b, #12\n" \ - \ - "ext v8.16b, v0.16b, %[zero].16b, #4\n" \ - "ext v9.16b, v1.16b, %[zero].16b, #4\n" \ - "ext v10.16b, v2.16b, %[zero].16b, #4\n" \ - "ext v11.16b, v3.16b, %[zero].16b, #4\n" \ - \ - "fmul v12.4s, v0.4s, %[wr0].s[1]\n" \ - "fmul v13.4s, v1.4s, %[wr0].s[1]\n" \ - \ - "fmul v14.4s, v1.4s, %[wr1].s[1]\n" \ - "fmul v15.4s, v2.4s, %[wr1].s[1]\n" \ - \ - "fmul v16.4s, v2.4s, %[wr2].s[1]\n" \ - "fmul v17.4s, v3.4s, %[wr2].s[1]\n" \ - \ - "fmla v12.4s, v4.4s, %[wr0].s[0]\n" \ - "fmla v13.4s, v5.4s, %[wr0].s[0]\n" \ - \ - "fmla v14.4s, v5.4s, %[wr1].s[0]\n" \ - "fmla v15.4s, v6.4s, %[wr1].s[0]\n" \ - \ - "fmla v16.4s, v6.4s, %[wr2].s[0]\n" \ - "fmla v17.4s, v7.4s, %[wr2].s[0]\n" \ - \ - "fmla v12.4s, v8.4s, %[wr0].s[2]\n" \ - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ - \ - "fmla v14.4s, v9.4s, %[wr1].s[2]\n" \ - "fmla v15.4s, v10.4s, %[wr1].s[2]\n" \ - \ - "fmla v16.4s, v10.4s, %[wr2].s[2]\n" \ - "fmla v17.4s, v11.4s, %[wr2].s[2]\n" \ - \ - "fadd v12.4s, v12.4s, v14.4s\n" \ - "fadd v12.4s, v12.4s, v16.4s\n" \ - \ - "fadd v13.4s, v13.4s, v15.4s\n" \ - "fadd v13.4s, v13.4s, v17.4s\n" \ - \ - "fadd v12.4s, v12.4s, %[bias].4s\n" \ - "fadd v13.4s, v13.4s, %[bias].4s\n" - -#define RESULT_S_S1 \ - "prfm pldl1keep, [%[out1]]\n" \ - "prfm pldl1keep, [%[out2]]\n" \ - \ - "st1 {v12.4s}, [%[out1]]\n" \ - "st1 {v13.4s}, [%[out2]]\n" - -#define RESULT_S_S1_RELU \ - "prfm pldl1keep, [%[out1]]\n" \ - "prfm pldl1keep, [%[out2]]\n" \ - \ - "fmax v12.4s, v12.4s, %[zero].4s\n" \ - "fmax v13.4s, v13.4s, %[zero].4s\n" \ - \ - "st1 {v12.4s}, [%[out1]]\n" \ - "st1 {v13.4s}, [%[out2]]\n" - -#define COMPUTE_S_S1_P0 \ - "prfm pldl1keep, [%[din0]]\n" \ - "prfm pldl1keep, [%[din1]]\n" \ - "prfm pldl1keep, [%[din2]]\n" \ - "prfm pldl1keep, [%[din3]]\n" \ - \ - "ld1 {v0.4s, v1.4s}, [%[din0]]\n" \ - "ld1 {v2.4s, v3.4s}, [%[din1]]\n" \ - "ld1 {v4.4s, v5.4s}, [%[din2]]\n" \ - "ld1 {v6.4s, v7.4s}, [%[din3]]\n" \ - \ - "bif v0.16b, %[zero].16b, %[mask1].16b\n" \ - "bif v1.16b, %[zero].16b, %[mask2].16b\n" \ - \ - "bif v2.16b, %[zero].16b, %[mask1].16b\n" \ - "bif v3.16b, %[zero].16b, %[mask2].16b\n" \ - \ - "bif v4.16b, %[zero].16b, %[mask1].16b\n" \ - "bif v5.16b, %[zero].16b, %[mask2].16b\n" \ - \ - "bif v6.16b, %[zero].16b, %[mask1].16b\n" \ - "bif 
v7.16b, %[zero].16b, %[mask2].16b\n" \ - \ - "ext v8.16b, v0.16b, v1.16b, #4\n" \ - "ext v9.16b, v0.16b, v1.16b, #8\n" \ - \ - "and v12.16b, %[vbias].16b, %[vbias].16b \n" \ - "and v13.16b, %[vbias].16b, %[vbias].16b \n" /* r0 */ \ - "fmul v10.4s, v0.4s, %[wr0].s[0]\n" \ - "fmul v11.4s, v8.4s, %[wr0].s[1]\n" \ - "fmla v12.4s, v9.4s, %[wr0].s[2]\n" \ - \ - "ext v8.16b, v2.16b, v3.16b, #4\n" \ - "ext v9.16b, v2.16b, v3.16b, #8\n" /* r1 */ \ - "fmul v14.4s, v2.4s, %[wr0].s[0]\n" \ - "fmla v10.4s, v2.4s, %[wr1].s[0]\n" \ - \ - "fmul v15.4s, v8.4s, %[wr0].s[1]\n" \ - "fmla v11.4s, v8.4s, %[wr1].s[1]\n" \ - \ - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ - "fmla v12.4s, v9.4s, %[wr1].s[2]\n" \ - \ - "ext v8.16b, v4.16b, v5.16b, #4\n" \ - "ext v9.16b, v4.16b, v5.16b, #8\n" /* r2 */ \ - "fmla v14.4s, v4.4s, %[wr1].s[0]\n" \ - "fmla v10.4s, v4.4s, %[wr2].s[0]\n" \ - \ - "fmla v15.4s, v8.4s, %[wr1].s[1]\n" \ - "fmla v11.4s, v8.4s, %[wr2].s[1]\n" \ - \ - "fmla v13.4s, v9.4s, %[wr1].s[2]\n" \ - "fmla v12.4s, v9.4s, %[wr2].s[2]\n" \ - \ - "ext v8.16b, v6.16b, v7.16b, #4\n" \ - "ext v9.16b, v6.16b, v7.16b, #8\n" \ - \ - "fmla v14.4s, v6.4s, %[wr2].s[0]\n" \ - \ - "fmla v15.4s, v8.4s, %[wr2].s[1]\n" \ - \ - "fadd v12.4s, v12.4s, v10.4s\n" \ - \ - "fmla v13.4s, v9.4s, %[wr2].s[2]\n" \ - \ - "fadd v12.4s, v12.4s, v11.4s\n" \ - "fadd v13.4s, v13.4s, v14.4s\n" \ - "fadd v13.4s, v13.4s, v15.4s\n" // \ - // "prfm pldl1keep, [%[out1]]\n" \ - // "prfm pldl1keep, [%[out2]]\n" \ - // \ - // "st1 {v12.4s}, [%[out1]]\n" \ - // "st1 {v13.4s}, [%[out2]]\n" \ - - -#else -#define INIT_S1 \ - "pld [%[din0_ptr]] @ preload data\n" \ - "pld [%[din1_ptr]] @ preload data\n" \ - "pld [%[din2_ptr]] @ preload data\n" \ - "pld [%[din3_ptr]] @ preload data\n" \ - \ - "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" \ - "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" \ - "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" \ - "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" \ - \ - "vdup.32 q4, %[bias_val] @ and \n" \ - "vdup.32 q5, %[bias_val] @ and \n" - -#define LEFT_COMPUTE_S1 \ - "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" \ - "vext.32 q7, q8, q9, #1 @ 1234\n" /* r0 */ \ - "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" \ - "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" \ - "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" \ - "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" \ - \ - "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "pld [%[din0_ptr]] @ preload data\n" \ - "pld [%[din1_ptr]] @ preload data\n" \ - "pld [%[din2_ptr]] @ preload data\n" \ - "pld [%[din3_ptr]] @ preload data\n" \ - \ - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" \ - "vext.32 q7, q10, q11, #1 @ 1234\n" \ - \ - /* r1 */ \ - "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ - "vld1.32 {d20-d21}, [%[din1_ptr]]! 
@ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" \ - "vext.32 q7, q12, q13, #1 @ 1234\n" \ - \ - /* r2 */ \ - "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" \ - "vext.32 q7, q14, q15, #1 @ 1234\n" - -#define LEFT_RESULT_S1 \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ - "vdup.32 q4, %[bias_val] @ and \n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" \ - "cmp %[cnt], #1 @ check whether has mid cols\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ - \ - "vdup.32 q5, %[bias_val] @ and \n" \ - "blt 3f @ jump to main loop start point\n" - -#define MID_COMPUTE_S1 \ - "1: @ right pad entry\n" /* r0 */ \ - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "pld [%[din0_ptr]] @ preload data\n" \ - "pld [%[din1_ptr]] @ preload data\n" \ - "pld [%[din2_ptr]] @ preload data\n" \ - "pld [%[din3_ptr]] @ preload data\n" \ - \ - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ - \ - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ - \ - "vext.32 q6, q10, q11, #1 @ 1234\n" \ - "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q12, q13, #1 @ 1234\n" \ - "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q14, q15, #1 @ 1234\n" \ - "vext.32 q7, q14, q15, #2 @ 2345\n" - -#define MID_RESULT_S1 \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ - "vdup.32 q4, %[bias_val] @ and \n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ - \ - "subs %[cnt], #1 @ loop count minus 1\n" \ - \ - "vdup.32 q5, %[bias_val] @ and \n" \ - \ - "bne 1b @ jump to main loop start point\n" - -#define RIGHT_COMPUTE_S1 \ - "3: @ right pad entry\n" \ - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ - \ - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" \ - "vld1.32 {d31}, [%[vmask]]! @ load din r0\n" \ - \ - "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vext.32 q6, q10, q11, #1 @ 1234\n" \ - "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" \ - "vld1.32 {d23}, [%[rmask]]! 
@ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" \ - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q12, q13, #1 @ 1234\n" \ - "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q14, q15, #1 @ 1234\n" \ - "vext.32 q7, q14, q15, #2 @ 2345\n" - -#define RIGHT_RESULT_S1 \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ - "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ - "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" - -#define LEFT_RESULT_S1_RELU \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ - "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ - \ - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" \ - "vdup.32 q4, %[bias_val] @ and \n" \ - \ - "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ - \ - "cmp %[cnt], #1 @ check whether has mid cols\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ - \ - "vdup.32 q5, %[bias_val] @ and \n" \ - "blt 3f @ jump to main loop start point\n" - -#define MID_RESULT_S1_RELU \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ - "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ - \ - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" \ - "vdup.32 q4, %[bias_val] @ and \n" \ - \ - "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add pointer\n" \ - \ - "subs %[cnt], #1 @ loop count minus 1\n" \ - \ - "vdup.32 q5, %[bias_val] @ and \n" \ - \ - "bne 1b @ jump to main loop start point\n" - -#define RIGHT_RESULT_S1_RELU \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ - \ - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ - "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ - \ - "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ - "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" - -#define COMPUTE_S_S1 \ - "pld [%[din0]]\n" \ - "pld [%[din1]]\n" \ - "pld [%[din2]]\n" \ - "pld [%[din3]]\n" \ - \ - "vld1.32 {d12-d13}, [%[din0]]!\n" \ - "vld1.32 {d14-d15}, [%[din1]]!\n" \ - "vld1.32 {d16-d17}, [%[din2]]!\n" \ - "vld1.32 {d18-d19}, [%[din3]]!\n" \ - \ - "vbif q6, %q[vzero], %q[mask]\n" \ - "vbif q7, %q[vzero], %q[mask]\n" \ - "vbif q8, %q[vzero], %q[mask]\n" \ - "vbif q9, %q[vzero], %q[mask]\n" \ - \ - "vmul.f32 q14, q6, %e[wr0][1]\n" \ - "vmul.f32 q15, q7, %e[wr0][1]\n" \ - \ - "vmla.f32 q14, q7, %e[wr1][1]\n" \ - "vmla.f32 q15, q8, %e[wr1][1]\n" \ - \ - "vmla.f32 q14, q8, %e[wr2][1]\n" \ - "vmla.f32 q15, q9, %e[wr2][1]\n" \ - \ - "vext.32 q10, %q[vzero], q6, #3\n" \ - "vext.32 q11, %q[vzero], q7, #3\n" \ - "vext.32 q12, %q[vzero], q8, #3\n" \ - "vext.32 q13, %q[vzero], q9, #3\n" \ - \ - "vmla.f32 q14, q10, %e[wr0][0]\n" \ - "vmla.f32 q15, q11, %e[wr0][0]\n" \ - \ - "vmla.f32 q14, q11, %e[wr1][0]\n" \ - "vmla.f32 q15, q12, %e[wr1][0]\n" \ - \ - "vmla.f32 q14, q12, %e[wr2][0]\n" \ - "vmla.f32 q15, q13, %e[wr2][0]\n" \ - \ - "vext.32 q10, q6, %q[vzero], #1\n" \ - "vext.32 q11, q7, %q[vzero], #1\n" \ - "vext.32 q12, q8, %q[vzero], #1\n" \ - "vext.32 q13, q9, %q[vzero], #1\n" \ - \ - "vmla.f32 q14, q10, %f[wr0][0]\n" \ - "vmla.f32 q15, q11, %f[wr0][0]\n" \ - \ - "vmla.f32 q14, q11, %f[wr1][0]\n" \ - "vmla.f32 q15, q12, %f[wr1][0]\n" \ - \ - "vmla.f32 q14, q12, %f[wr2][0]\n" \ - "vmla.f32 q15, q13, %f[wr2][0]\n" \ - \ - "vadd.f32 q14, q14, %q[bias]\n" \ - "vadd.f32 q15, q15, %q[bias]\n" - -#define RESULT_S_S1 \ - "pld [%[out1]]\n" \ - "pld [%[out2]]\n" \ - \ - "vst1.32 {d28-d29}, [%[out1]]\n" \ - "vst1.32 {d30-d31}, [%[out2]]\n" - -#define RESULT_S_S1_RELU \ - "pld [%[out1]]\n" \ - "pld [%[out2]]\n" \ - \ - "vmax.f32 q14, q14, %q[vzero]\n" \ - "vmax.f32 q15, q15, %q[vzero]\n" \ - \ - "vst1.32 {d28-d29}, [%[out1]]\n" \ - "vst1.32 {d30-d31}, [%[out2]]\n" - -#define COMPUTE_S_S1_P0 \ - "pld [%[din0]]\n" \ - "pld [%[din1]]\n" \ - "pld [%[din2]]\n" \ - "pld [%[din3]]\n" \ - "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" \ - "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" \ - "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" \ - "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" \ - \ - "vdup.32 q4, %[bias_val] @ and \n" \ - "vdup.32 q5, %[bias_val] @ and \n" \ - \ - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ - \ - "vld1.32 {d27}, [%[vmask]]! 
@ load din r0\n" \ - \ - "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ - \ - "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ - \ - "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ - "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vext.32 q6, q10, q11, #1 @ 1234\n" \ - "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ - "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q12, q13, #1 @ 1234\n" \ - "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q14, q15, #1 @ 1234\n" \ - "vext.32 q7, q14, q15, #2 @ 2345\n" /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - "vadd.f32 q4, q4, q10 @ q4 += q10 \n" \ - \ - "pld [%[out1]]\n" \ - "pld [%[out2]]\n" \ - \ - "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - "vadd.f32 q14, q4, q11 @ q4 += q10 \n" \ - \ - "vadd.f32 q5, q5, q8 @ q4 += q10 \n" \ - "vadd.f32 q15, q5, q9 @ q4 += q10 \n" - -#endif -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ -void conv_depthwise_3x3s1p1_bias(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! 
for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float *zero_ptr = ctx->workspace_data<float>(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float *write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 3) >> 2; - int cnt_col = tile_w - 2; - - unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in); - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * ch_in * size_in_channel; - float *dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - float *dout_ptr = dout_batch + c * size_out_channel; - - const float *din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float *wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - - float *doutr0 = dout_ptr; - float *doutr1 = doutr0 + w_out; - float *doutr2 = doutr1 + w_out; - float *doutr3 = doutr2 + w_out; - - const float *dr0 = din_ch_ptr; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - const float *dr4 = dr3 + w_in; - const float *dr5 = dr4 + w_in; - - const float *din_ptr0 = dr0; - const float *din_ptr1 = dr1; - const float *din_ptr2 = dr2; - const float *din_ptr3 = dr3; - const float *din_ptr4 = dr4; - const float *din_ptr5 = dr5; - float *ptr_zero = const_cast<float *>(zero); -#ifdef __aarch64__ - for (int i = 0; i < h_in; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - din_ptr4 = dr3; - din_ptr5 = dr4; - dr0 = dr3; - dr1 = dr4; - dr2 = dr5; - } else { - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - } - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 > h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = cnt_col; - if (flag_relu) { - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 - MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - } else { - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 - MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - } - dout_ptr = dout_ptr + 4 * w_out; - } -#else - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - - doutr0 = dout_ptr; - doutr1 = dout_ptr + w_out; - // unsigned int* rst_mask = rmask; - - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din_ptr1 = zero_ptr; - case 2: - din_ptr2 = zero_ptr; - case 1: - din_ptr3 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; - unsigned int *rmask_ptr = rmask; - unsigned int *vmask_ptr = vmask; - if (flag_relu) { - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 - MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 - MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - dout_ptr += 2 * w_out; - } //! end of processing mid rows -#endif - } - } -} - -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p1_bias_s(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! 
for 4x6 convolution window - const int right_pad_idx[4] = {3, 2, 1, 0}; - const float zero[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * ch_in * size_in_channel; - float *dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float *dout_channel = dout_batch + i * size_out_channel; - const float *din_channel = din_batch + i * size_in_channel; - const float *weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - int hs = -1; - int he = 3; - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - int h_cnt = (h_out + 1) >> 1; - float *doutr0 = dout_channel; - float *doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_cnt; ++j) { - const float *dr0 = din_channel + hs * w_in; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - - if (hs == -1) { - dr0 = zero; - } - - switch (he - h_in) { - case 2: - dr2 = zero; - doutr1 = trash_buf; - case 1: - dr3 = zero; - default: - break; - } -#ifdef __aarch64__ - if (flag_relu) { - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); - } else { - asm volatile(COMPUTE_S_S1 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); - } -#else - if (flag_relu) { - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(COMPUTE_S_S1 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } -#endif - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - doutr0 = doutr1; - doutr1 += w_out; - hs += 2; - he += 2; - } // end of processing heights - } // end of processing channels - } // end of processing 
batchs -} - -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ -void conv_depthwise_3x3s1p0_bias(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float *zero_ptr = ctx->workspace_data<float>(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float *write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = w_out >> 2; - int remain = w_out % 4; - - unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); - const int remian_idx[4] = {0, 1, 2, 3}; - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * ch_in * size_in_channel; - float *dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - float *dout_ptr = dout_batch + c * size_out_channel; - - const float *din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float *wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - - float *doutr0 = dout_ptr; - float *doutr1 = doutr0 + w_out; - float *doutr2 = doutr1 + w_out; - float *doutr3 = doutr2 + w_out; - - const float *dr0 = din_ch_ptr; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - const float *dr4 = dr3 + w_in; - const float *dr5 = dr4 + w_in; - - const float *din_ptr0 = dr0; - const float *din_ptr1 = dr1; - const float *din_ptr2 = dr2; - const float *din_ptr3 = dr3; - const float *din_ptr4 = dr4; - const float *din_ptr5 = dr5; - - float *ptr_zero = const_cast<float *>(zero); -#ifdef __aarch64__ - for (int i = 0; i < h_out; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 >= h_in) { - switch (i + 5 - h_in) { - case 4: - din_ptr1 = zero_ptr; - case 3: - din_ptr2 = zero_ptr; - case 2: - din_ptr3 = zero_ptr; - case 1: - din_ptr4 = zero_ptr; - case 0: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = tile_w; - if (flag_relu) { - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - MID_COMPUTE_S1 MID_RESULT_S1_RELU - "cmp %w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - } else { - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - MID_COMPUTE_S1 MID_RESULT_S1 - "cmp %w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1 "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - } - dout_ptr = dout_ptr + 4 * w_out; - } -#else - for (int i = 0; i < h_out; i += 2) { - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - - doutr0 = dout_ptr; - doutr1 = dout_ptr + w_out; - - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - //! process bottom pad - if (i + 3 >= h_in) { - switch (i + 3 - h_in) { - case 3: - din_ptr1 = zero_ptr; - case 2: - din_ptr2 = zero_ptr; - case 1: - din_ptr3 = zero_ptr; - case 0: - din_ptr3 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = tile_w; - unsigned int *rmask_ptr = rmask; - unsigned int *vmask_ptr = vmask; - if (flag_relu) { - asm volatile(INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 - MID_RESULT_S1_RELU - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 - MID_RESULT_S1 - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1 "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - dout_ptr += 2 * w_out; - } //! end of processing mid rows -#endif - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p0_bias_s(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! 
for 4x6 convolution window - const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp1 = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); - uint32x4_t vmask_rp2 = - vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * ch_in * size_in_channel; - float *dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float *dout_channel = dout_batch + i * size_out_channel; - const float *din_channel = din_batch + i * size_in_channel; - const float *weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - -#ifdef __aarch64__ - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } -#endif // __aarch64__ - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - float *doutr0 = dout_channel; - float *doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_out; j += 2) { - const float *dr0 = din_channel + j * w_in; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - - doutr0 = dout_channel + j * w_out; - doutr1 = doutr0 + w_out; - - if (j + 3 >= h_in) { - switch (j + 3 - h_in) { - case 3: - dr1 = zero_ptr; - case 2: - dr2 = zero_ptr; - case 1: - dr3 = zero_ptr; - doutr1 = trash_buf; - case 0: - dr3 = zero_ptr; - doutr1 = trash_buf; - default: - break; - } - } -#ifdef __aarch64__ - if (flag_relu) { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - } else { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - } -#else - unsigned int *vmask_ptr = vmask; - float bias_val = flag_bias ? 
bias[i] : 0.f; - if (flag_relu) { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } -#endif - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_3x3s2.cc b/lite/backends/arm/math/conv_depthwise_3x3s2.cc deleted file mode 100644 index ec039af98c..0000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3s2.cc +++ /dev/null @@ -1,1862 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
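// The two files deleted in this patch implement 3x3 depthwise convolution
// (stride 1 and stride 2, pad 0 or 1, optional bias and fused ReLU) as
// hand-scheduled NEON assembly. As a reading aid, here is a minimal scalar
// sketch of the computation those kernels perform, assuming NCHW layout and
// 9 weights per channel as in the deleted kernels' signatures; the helper
// name depthwise_conv3x3_ref is illustrative and not code from this repo.
#include <algorithm>

static void depthwise_conv3x3_ref(float* dout, const float* din,
                                  const float* weights, const float* bias,
                                  bool flag_bias, bool flag_relu, int num,
                                  int ch, int h_in, int w_in, int h_out,
                                  int w_out, int stride, int pad) {
  for (int n = 0; n < num; ++n) {
    for (int c = 0; c < ch; ++c) {
      const float* src = din + (n * ch + c) * h_in * w_in;  // one channel
      const float* wc = weights + c * 9;                    // its 3x3 filter
      float* dst = dout + (n * ch + c) * h_out * w_out;
      float b = flag_bias ? bias[c] : 0.f;
      for (int oh = 0; oh < h_out; ++oh) {
        for (int ow = 0; ow < w_out; ++ow) {
          float sum = b;
          for (int kh = 0; kh < 3; ++kh) {
            for (int kw = 0; kw < 3; ++kw) {
              int ih = oh * stride - pad + kh;  // taps falling outside the
              int iw = ow * stride - pad + kw;  // input are the implicit pad
              if (ih >= 0 && ih < h_in && iw >= 0 && iw < w_in) {
                sum += src[ih * w_in + iw] * wc[kh * 3 + kw];
              }
            }
          }
          dst[oh * w_out + ow] = flag_relu ? std::max(sum, 0.f) : sum;
        }
      }
    }
  }
}
// The assembly reaches the same sums without per-tap bounds checks: rows are
// loaded whole (ld2/vld2 splits even and odd lanes for stride 2), ext/vext
// builds the shifted 1234/2345 windows, and bif/vbif against the precomputed
// vmask/rmask vectors zeroes the lanes that fall in the right padding.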
- -#include "lite/backends/arm/math/conv_depthwise.h" -#include <arm_neon.h> - -namespace paddle { -namespace lite { -namespace arm { -namespace math { -void conv_depthwise_3x3s2p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - if (pad == 0) { - if (w_in > 7) { - conv_depthwise_3x3s2p0_bias(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p0_bias_s(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - if (pad == 1) { - if (w_in > 7) { - conv_depthwise_3x3s2p1_bias(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p1_bias_s(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } -} -#ifdef __aarch64__ -#define INIT_S2 \ - "prfm pldl1keep, [%[inptr0]] \n" \ - "prfm pldl1keep, [%[inptr1]] \n" \ - "prfm pldl1keep, [%[inptr2]] \n" \ - "prfm pldl1keep, [%[inptr3]] \n" \ - "prfm pldl1keep, [%[inptr4]] \n" \ - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ - \ - "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ - "and v17.16b, %[vbias].16b, %[vbias].16b \n" - -#define LEFT_COMPUTE_S2 \ - "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" /* r0 */ \ - "fmul v11.4s, v0.4s, %[w0].s[1] \n" /* {0,2,4,6} * w01 */ \ - "fmul v12.4s, v1.4s, %[w0].s[2] \n" /* {1,3,5,7} * w02 */ \ - "fmla v16.4s, v10.4s, %[w0].s[0] \n" /* {0,1,3,5} * w00*/ \ - \ - "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" /* v10 = {0,1,3,5} */ \ - \ - "sub %[inptr0], %[inptr0], #4 \n" \ - "sub %[inptr1], %[inptr1], #4 \n" /* r1 */ \ - "fmla v11.4s, v2.4s, %[w1].s[1] \n" \ - "fmla v12.4s, v3.4s, %[w1].s[2] \n" \ - "fmla v16.4s, v10.4s, %[w1].s[0] \n" \ - \ - "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" \ - \ - "sub %[inptr2], %[inptr2], #4 \n" \ - "sub %[inptr3], %[inptr3], #4 \n" /* r2 */ \ - "fmul v13.4s, v4.4s, %[w0].s[1] \n" \ - "fmla v11.4s, v4.4s, %[w2].s[1] 
\n" \ - \ - "fmul v14.4s, v5.4s, %[w0].s[2] \n" \ - "fmla v12.4s, v5.4s, %[w2].s[2] \n" \ - \ - "fmla v17.4s, v10.4s, %[w0].s[0] \n" \ - "fmla v16.4s, v10.4s, %[w2].s[0] \n" \ - \ - "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" \ - \ - "sub %[inptr4], %[inptr4], #4 \n" /* r3 */ \ - "fmla v13.4s, v6.4s, %[w1].s[1] \n" \ - "fmla v14.4s, v7.4s, %[w1].s[2] \n" \ - "fmla v17.4s, v10.4s, %[w1].s[0] \n" \ - \ - "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" \ - "fadd v16.4s, v16.4s, v11.4s \n" \ - "fadd v16.4s, v16.4s, v12.4s \n" - -#define LEFT_RESULT_S2 \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ - \ - "st1 {v16.4s}, [%[outptr0]], #16 \n" \ - \ - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ - \ - "fadd v17.4s, v17.4s, v13.4s \n" \ - \ - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ - "ld1 {v15.4s}, [%[inptr0]] \n" \ - "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ - \ - "fadd v17.4s, v17.4s, v14.4s \n" \ - \ - "ld1 {v18.4s}, [%[inptr1]] \n" \ - "ld1 {v19.4s}, [%[inptr2]] \n" \ - \ - "ext v10.16b, v0.16b, v15.16b, #4 \n" \ - \ - "ld1 {v20.4s}, [%[inptr3]] \n" \ - "ld1 {v21.4s}, [%[inptr4]] \n" \ - \ - "st1 {v17.4s}, [%[outptr1]], #16 \n" \ - \ - "cmp %w[cnt], #1 \n" \ - \ - "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ - \ - "blt 1f \n" - -#define MID_COMPUTE_S2 \ - "2: \n" /* r0 */ \ - "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ - "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ - "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ - \ - "ext v10.16b, v2.16b, v18.16b, #4 \n" \ - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" /* r1 */ \ - "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ - "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ - "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ - \ - "ext v10.16b, v4.16b, v19.16b, #4 \n" \ - \ - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" /* r2 */ \ - "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ - "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ - \ - "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ - "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ - \ - "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ - "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ - \ - "ext v10.16b, v6.16b, v20.16b, #4 \n" \ - \ - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" /* r3 */ \ - "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ - "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ - \ - "ext v10.16b, v8.16b, v21.16b, #4 \n" \ - \ - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ - \ - "fadd v16.4s, v16.4s, v11.4s \n" \ - "fadd v16.4s, v16.4s, v12.4s \n" - -#define MID_RESULT_S2 \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ - \ - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ - "ld1 {v15.4s}, [%[inptr0]] \n" \ - "ld1 {v18.4s}, [%[inptr1]] \n" \ - "st1 {v16.4s}, [%[outptr0]], #16 \n" \ - \ - "fadd v17.4s, v17.4s, v13.4s \n" \ - \ - "ld1 {v19.4s}, [%[inptr2]] \n" \ - "ld1 {v20.4s}, [%[inptr3]] \n" \ - "ld1 {v21.4s}, [%[inptr4]] \n" \ - \ - "fadd v17.4s, v17.4s, v14.4s \n" \ - \ - "ext v10.16b, v0.16b, v15.16b, #4 \n" \ - "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ - "subs %w[cnt], %w[cnt], #1 \n" \ - \ - "st1 {v17.4s}, [%[outptr1]], #16 \n" \ - \ - "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ - \ - "bne 2b \n" - -#define RIGHT_COMPUTE_S2 \ - "1: \n" \ - "cmp %w[remain], #1 \n" \ - "blt 4f \n" \ - "3: \n" \ - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" \ - "bif v1.16b, %[vzero].16b, 
%[mask2].16b \n" \ - \ - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" \ - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" \ - \ - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" \ - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" \ - \ - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" \ - \ - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" \ - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" /* r0 */ \ - "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ - "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ - "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ - \ - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" \ - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" \ - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" /* r1 */ \ - "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ - "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ - "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ - \ - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" /* r2 */ \ - "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ - "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ - \ - "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ - "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ - \ - "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ - "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ - \ - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" /* r3 */ \ - "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ - "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ - \ - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" \ - "ld1 {v0.4s}, [%[outptr0]] \n" \ - \ - "fadd v16.4s, v16.4s, v11.4s \n" \ - "fadd v16.4s, v16.4s, v12.4s \n" \ - "ld1 {v1.4s}, [%[outptr1]] \n" - -#define RIGHT_RESULT_S2 \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ - \ - "bif v16.16b, v0.16b, %[wmask].16b \n" \ - \ - "fadd v17.4s, v17.4s, v13.4s \n" \ - \ - "st1 {v16.4s}, [%[outptr0]], #16 \n" \ - \ - "fadd v17.4s, v17.4s, v14.4s \n" \ - \ - "bif v17.16b, v1.16b, %[wmask].16b \n" \ - \ - "st1 {v17.4s}, [%[outptr1]], #16 \n" \ - "4: \n" - -#define LEFT_RESULT_S2_RELU \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ - \ - "fmax v16.4s, v16.4s, %[vzero].4s \n" \ - \ - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ - \ - "fadd v17.4s, v17.4s, v13.4s \n" \ - \ - "st1 {v16.4s}, [%[outptr0]], #16 \n" \ - \ - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ - "ld1 {v15.4s}, [%[inptr0]] \n" \ - \ - "fadd v17.4s, v17.4s, v14.4s \n" \ - \ - "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ - \ - "ld1 {v18.4s}, [%[inptr1]] \n" \ - "ld1 {v19.4s}, [%[inptr2]] \n" \ - \ - "ext v10.16b, v0.16b, v15.16b, #4 \n" \ - \ - "fmax v17.4s, v17.4s, %[vzero].4s \n" \ - \ - "ld1 {v20.4s}, [%[inptr3]] \n" \ - "ld1 {v21.4s}, [%[inptr4]] \n" \ - \ - "st1 {v17.4s}, [%[outptr1]], #16 \n" \ - \ - "cmp %w[cnt], #1 \n" \ - \ - "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ - \ - "blt 1f \n" - -#define MID_RESULT_S2_RELU \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ - \ - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ - "ld1 {v15.4s}, [%[inptr0]] \n" \ - "ld1 {v18.4s}, [%[inptr1]] \n" \ - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ - \ - "fadd v17.4s, v17.4s, v13.4s \n" \ - \ - "ld1 {v19.4s}, [%[inptr2]] \n" \ - "ld1 {v20.4s}, [%[inptr3]] \n" \ - "ld1 {v21.4s}, [%[inptr4]] \n" \ - \ - "st1 {v16.4s}, [%[outptr0]], #16 \n" \ - \ - "fadd v17.4s, v17.4s, v14.4s \n" \ - \ - "ext v10.16b, 
v0.16b, v15.16b, #4 \n" \ - "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ - "subs %w[cnt], %w[cnt], #1 \n" \ - \ - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ - \ - "st1 {v17.4s}, [%[outptr1]], #16 \n" \ - \ - "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ - \ - "bne 2b \n" - -#define RIGHT_RESULT_S2_RELU \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ - \ - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ - \ - "fadd v17.4s, v17.4s, v13.4s \n" \ - \ - "bif v16.16b, v0.16b, %[wmask].16b \n" \ - \ - "fadd v17.4s, v17.4s, v14.4s \n" \ - \ - "st1 {v16.4s}, [%[outptr0]], #16 \n" \ - \ - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ - \ - "bif v17.16b, v1.16b, %[wmask].16b \n" \ - \ - "st1 {v17.4s}, [%[outptr1]], #16 \n" \ - "4: \n" - -#define COMPUTE_S_S2 \ - "movi v9.4s, #0 \n" \ - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ - \ - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" \ - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" \ - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ - \ - "bif v10.16b, v9.16b, v6.16b \n" \ - "bif v11.16b, v9.16b, v7.16b \n" \ - "bif v12.16b, v9.16b, v6.16b \n" \ - "bif v13.16b, v9.16b, v7.16b \n" \ - "bif v14.16b, v9.16b, v6.16b \n" \ - "bif v15.16b, v9.16b, v7.16b \n" \ - \ - "ext v6.16b, v9.16b, v11.16b, #12 \n" \ - "ext v7.16b, v9.16b, v13.16b, #12 \n" \ - "ext v8.16b, v9.16b, v15.16b, #12 \n" \ - \ - "fmul v4.4s, v10.4s, %[wr0].s[1] \n" \ - "fmul v5.4s, v11.4s, %[wr0].s[2] \n" \ - "fmul v6.4s, v6.4s, %[wr0].s[0] \n" \ - \ - "fmla v4.4s, v12.4s, %[wr1].s[1] \n" \ - "fmla v5.4s, v13.4s, %[wr1].s[2] \n" \ - "fmla v6.4s, v7.4s, %[wr1].s[0] \n" \ - \ - "fmla v4.4s, v14.4s, %[wr2].s[1] \n" \ - "fmla v5.4s, v15.4s, %[wr2].s[2] \n" \ - "fmla v6.4s, v8.4s, %[wr2].s[0] \n" \ - \ - "fadd v4.4s, v4.4s, v5.4s \n" \ - "fadd v4.4s, v4.4s, v6.4s \n" - -#define RESULT_S_S2 \ - "fadd v4.4s, v4.4s, %[bias].4s \n" \ - \ - "st1 {v4.4s}, [%[out]] \n" - -#define RESULT_S_S2_RELU \ - "fadd v4.4s, v4.4s, %[bias].4s \n" \ - "fmax v4.4s, v4.4s, v9.4s \n" \ - \ - "st1 {v4.4s}, [%[out]] \n" - -#define COMPUTE_S_S2_P0 \ - "movi v9.4s, #0 \n" \ - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ - \ - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" \ - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" \ - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ - "and v4.16b, %[bias].16b, %[bias].16b \n" \ - \ - "bif v10.16b, v9.16b, v6.16b \n" \ - "bif v11.16b, v9.16b, v7.16b \n" \ - "bif v12.16b, v9.16b, v6.16b \n" \ - "bif v13.16b, v9.16b, v7.16b \n" \ - "bif v14.16b, v9.16b, v6.16b \n" \ - "bif v15.16b, v9.16b, v7.16b \n" \ - \ - "ext v6.16b, v10.16b, v9.16b, #4 \n" \ - "ext v7.16b, v12.16b, v9.16b, #4 \n" \ - "ext v8.16b, v14.16b, v9.16b, #4 \n" \ - \ - "fmla v4.4s, v10.4s, %[wr0].s[0] \n" \ - "fmul v5.4s, v11.4s, %[wr0].s[1] \n" \ - "fmul v16.4s, v6.4s, %[wr0].s[2] \n" \ - \ - "fmla v4.4s, v12.4s, %[wr1].s[0] \n" \ - "fmla v5.4s, v13.4s, %[wr1].s[1] \n" \ - "fmla v16.4s, v7.4s, %[wr1].s[2] \n" \ - \ - "fmla v4.4s, v14.4s, %[wr2].s[0] \n" \ - "fmla v5.4s, v15.4s, %[wr2].s[1] \n" \ - "fmla v16.4s, v8.4s, %[wr2].s[2] \n" \ - \ - "fadd v4.4s, v4.4s, v5.4s \n" \ - "fadd v4.4s, v4.4s, v16.4s \n" - -#define RESULT_S_S2_P0 "st1 {v4.4s}, [%[out]] \n" - -#define RESULT_S_S2_P0_RELU \ - "fmax v4.4s, v4.4s, v9.4s \n" \ - "st1 {v4.4s}, [%[out]] \n" - -#else -#define INIT_S2 \ - "vmov.u32 q9, #0 \n" \ - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" \ - "vld2.32 {d24-d27}, [%[din1_ptr]]! 
@ load din r1\n" \ - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" \ - "pld [%[din0_ptr]] @ preload data\n" \ - "pld [%[din1_ptr]] @ preload data\n" \ - "pld [%[din2_ptr]] @ preload data\n" \ - \ - "vdup.32 q3, %[bias] @ and \n" - -#define LEFT_COMPUTE_S2 \ - "vext.32 q6, q9, q11, #3 @ shift right 1 data\n" \ - "vext.32 q7, q9, q13, #3 @ shift right 1 data\n" \ - "vext.32 q8, q9, q15, #3 @ shift right 1 data\n" \ - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, out0\n" \ - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, out0\n" \ - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, out0\n" \ - \ - "sub %[din0_ptr], #4 @ inpitr0 - 1\n" \ - "sub %[din1_ptr], #4 @ inpitr1 - 1\n" \ - "sub %[din2_ptr], #4 @ inpitr2 - 1\n" \ - \ - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ - \ - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ - \ - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ - \ - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, out1\n" \ - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, out1\n" \ - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, out1\n" \ - \ - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" \ - \ - "vadd.f32 q3, q3, q4 @ add \n" \ - "vadd.f32 q3, q3, q5 @ add \n" - -#define LEFT_RESULT_S2 \ - "vst1.32 {d6-d7}, [%[outptr]]! \n" \ - "cmp %[cnt], #1 \n" \ - "blt 1f \n" - -#define MID_COMPUTE_S2 \ - "2: \n" \ - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" \ - "vdup.32 q3, %[bias] @ and \n" \ - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" \ - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" \ - \ - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ - \ - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" \ - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" \ - \ - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ - \ - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ - \ - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" \ - \ - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ - \ - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ - \ - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ - \ - "vadd.f32 q3, q3, q4 @ add \n" \ - "vadd.f32 q3, q3, q5 @ add \n" - -#define MID_RESULT_S2 \ - "subs %[cnt], #1 \n" \ - \ - "vst1.32 {d6-d7}, [%[outptr]]! \n" \ - "bne 2b \n" - -#define RIGHT_COMPUTE_S2 \ - "1: \n" \ - "cmp %[remain], #1 \n" \ - "blt 3f \n" \ - \ - "vld1.f32 {d12-d15}, [%[mask_ptr]]! 
@ load mask\n" \ - "vdup.32 q3, %[bias] @ and \n" \ - \ - "vbif q10, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q11, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - "vbif q12, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q13, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - "vbif q14, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q15, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - \ - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ - \ - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ - \ - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" \ - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" \ - \ - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ - \ - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" \ - \ - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ - \ - "vadd.f32 q3, q3, q4 @ add \n" \ - "vadd.f32 q3, q3, q5 @ add \n" - -#define RIGHT_RESULT_S2 \ - "vbif.f32 q3, q10, q11 @ write mask\n" \ - \ - "vst1.32 {d6-d7}, [%[outptr]]! \n" \ - "3: \n" - -#define LEFT_RESULT_S2_RELU \ - "vmax.f32 q3, q3, q9 @ relu \n" \ - "vst1.32 {d6-d7}, [%[outptr]]! \n" \ - "cmp %[cnt], #1 \n" \ - "blt 1f \n" - -#define MID_RESULT_S2_RELU \ - "vmax.f32 q3, q3, q9 @ relu \n" \ - "subs %[cnt], #1 \n" \ - \ - "vst1.32 {d6-d7}, [%[outptr]]! \n" \ - "bne 2b \n" - -#define RIGHT_RESULT_S2_RELU \ - "vmax.f32 q3, q3, q9 @ relu \n" \ - "vbif.f32 q3, q10, q11 @ write mask\n" \ - \ - "vst1.32 {d6-d7}, [%[outptr]]! \n" \ - "3: \n" - -#define COMPUTE_S_S2 \ - "vmov.u32 q9, #0 \n" \ - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" \ - "vdup.32 q3, %[bias] @ and \n" \ - \ - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r2\n" \ - \ - "vbif q10, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q11, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - "vbif q12, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q13, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - "vbif q14, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q15, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - \ - "vext.32 q6, q9, q11, #3 @ shift left 1 \n" \ - "vext.32 q7, q9, q13, #3 @ shift left 1 \n" \ - "vext.32 q8, q9, q15, #3 @ shift left 1 \n" \ - \ - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 0, out0\n" \ - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 0, out0\n" \ - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 0, out0\n" \ - \ - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ - \ - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 2, out0\n" \ - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 2, out0\n" \ - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 2, out0\n" \ - \ - "vadd.f32 q3, q3, q4 @ add \n" \ - "vadd.f32 q3, q3, q5 @ add \n" - -#define RESULT_S_S2 "vst1.32 {d6-d7}, [%[out]] \n" - -#define RESULT_S_S2_RELU \ - "vmax.f32 q3, q3, q9 @ relu\n" \ - \ - "vst1.32 {d6-d7}, [%[out]] \n" - -#define COMPUTE_S_S2_P0 \ - "vmov.u32 q9, #0 \n" \ - "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" \ - "vdup.32 q3, %[bias] @ and \n" \ - \ - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ - \ - "vbif q10, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q11, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - "vbif q12, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q13, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - "vbif q14, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q15, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - \ - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ - "vext.32 q8, q14, q9, #1 @ shift left 1 \n" \ - \ - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ - \ - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ - \ - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ - "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, out0\n" \ - \ - "vadd.f32 q3, q3, q4 @ add \n" \ - "vadd.f32 q3, q3, q5 @ add \n" - -#define RESULT_S_S2_P0 "vst1.32 {d6-d7}, [%[out]] \n" - -#define RESULT_S_S2_P0_RELU \ - "vmax.f32 q3, q3, q9 @ relu \n" \ - "vst1.32 {d6-d7}, [%[out]] \n" - -#endif - -/** - * \brief depthwise convolution kernel 3x3, stride 2 - * w_in > 7 - */ -void conv_depthwise_3x3s2p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - int size_pad_bottom = h_out * 2 - h_in; - - int cnt_col = (w_out >> 2) - 2; - int size_right_remain = w_in - (7 + cnt_col * 8); - if 
(size_right_remain >= 9) { - cnt_col++; - size_right_remain -= 8; - } - int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // - - int size_right_pad = w_out * 2 - w_in; - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); -#ifdef __aarch64__ - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } -#else - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[i]; - } -#endif // __aarch64__ - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_in; i += 4) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - din4_ptr = dr3; - dr0 = dr3; - dr1 = dr4; - } else { - dr0 = dr4; - dr1 = dr0 + w_in; - } - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 > h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - default: - break; - } - } - //! 
process output pad - if (i / 2 + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = cnt_col; - if (flag_relu) { - asm volatile( - INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 - MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - } else { - asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 - MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - } - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_in; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - - //! 
process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = cnt_col; - unsigned int* mask_ptr = dmask; - if (flag_relu) { - asm volatile( - INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 - MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 - MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - doutr0 = doutr0 + w_out; - } -#endif - } - } -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 - */ -void conv_depthwise_3x3s2p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - int hs = -1; - int he = 2; - float out_buf[4]; - for (int j = 0; j < h_out; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - if (hs == -1) { - dr0 = zeros; - } - if (he > h_in) { - dr2 = zeros; - } - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - if (flag_relu) { - asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "v4", - "v5", 
- "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - } else { - asm volatile(COMPUTE_S_S2 RESULT_S_S2 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - } -#else - if (flag_relu) { - asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(COMPUTE_S_S2 RESULT_S_S2 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } -#endif - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - hs += 2; - he += 2; - } - } - } -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2 - */ -// w_in > 7 -void conv_depthwise_3x3s2p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - - int tile_w = w_out >> 2; - int cnt_remain = w_out % 4; - - unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3)); - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - -#ifdef __aarch64__ - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } -#else - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[i]; - } -#endif // __aarch64__ - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 
+ w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_out; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - dr0 = dr4; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i * 2 + 5 > h_in) { - switch (i * 2 + 5 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - case 0: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = tile_w; - if (flag_relu) { - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - MID_COMPUTE_S2 MID_RESULT_S2_RELU - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2_RELU - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - } else { - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - MID_COMPUTE_S2 MID_RESULT_S2 - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2 - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - } - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_out; i++) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - //! 
process bottom pad - if (i * 2 + 3 > h_in) { - switch (i * 2 + 3 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = tile_w; - unsigned int* mask_ptr = dmask; - if (flag_relu) { - asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU - RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - doutr0 = doutr0 + w_out; - } -#endif - } - } -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 - */ -void conv_depthwise_3x3s2p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - float out_buf[4]; - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - for (int j = 0; j < h_out; j++) { - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - if (j * 2 + 2 >= h_in) { - switch (j + 2 - h_in) { - case 1: - din1_ptr = zero_ptr; - case 0: - din2_ptr = zero_ptr; - default: - break; - } - } - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - if (flag_relu) { - asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - 
[wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - } else { - asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - } -#else - if (flag_relu) { - asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(dmask) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(dmask) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } -#endif - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - } - } - } -} -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/reduce_prod.cc b/lite/backends/arm/math/reduce_prod.cc old mode 100755 new mode 100644 diff --git a/lite/backends/arm/math/reduce_prod.h b/lite/backends/arm/math/reduce_prod.h old mode 100755 new mode 100644 diff --git a/lite/backends/arm/math/split_merge_lod_tenosr.cc b/lite/backends/arm/math/split_merge_lod_tenosr.cc old mode 100755 new mode 100644 diff --git a/lite/backends/arm/math/split_merge_lod_tenosr.h b/lite/backends/arm/math/split_merge_lod_tenosr.h old mode 100755 new mode 100644 diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp old mode 100755 new mode 100644 diff --git a/lite/backends/fpga/KD/dl_engine.cpp b/lite/backends/fpga/KD/dl_engine.cpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/dl_engine.hpp b/lite/backends/fpga/KD/dl_engine.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.h b/lite/backends/fpga/KD/llapi/zynqmp_api.h old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/crop_pe.cpp b/lite/backends/fpga/KD/pes/crop_pe.cpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp b/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp old mode 100755 new mode 100644 diff --git a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp b/lite/backends/fpga/KD/pes/fully_connected_pe.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/gru_pe.hpp b/lite/backends/fpga/KD/pes/gru_pe.hpp old mode 100755 
new mode 100644 diff --git a/lite/backends/fpga/KD/pes/gru_util.hpp b/lite/backends/fpga/KD/pes/gru_util.hpp old mode 100755 new mode 100644 diff --git a/lite/backends/fpga/KD/pes/output_pe.hpp b/lite/backends/fpga/KD/pes/output_pe.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/pooling_pe.hpp b/lite/backends/fpga/KD/pes/pooling_pe.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp old mode 100755 new mode 100644 diff --git a/lite/backends/fpga/lite_tensor.cc b/lite/backends/fpga/lite_tensor.cc old mode 100644 new mode 100755 diff --git a/lite/backends/npu/builder.cc b/lite/backends/npu/builder.cc deleted file mode 100644 index 954fad8c91..0000000000 --- a/lite/backends/npu/builder.cc +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/npu/builder.h" -#include // NOLINT -#include -#include "lite/backends/npu/runtime.h" - -namespace paddle { -namespace lite { -namespace npu { - -// Build HIAI IR graph to om model, and store om model data into lite tensor -bool BuildModel(std::vector& inputs, // NOLINT - std::vector& outputs, // NOLINT - lite::Tensor* model_data) { - LOG(INFO) << "[NPU] Build model."; - CHECK_GT(inputs.size(), 0); - CHECK_GT(outputs.size(), 0); - CHECK_NE(model_data, 0); - // build IR graph to om model - ge::Graph ir_graph("graph"); - ir_graph.SetInputs(inputs).SetOutputs(outputs); - ge::Model om_model("model", "model"); - om_model.SetGraph(ir_graph); - domi::HiaiIrBuild ir_build; - domi::ModelBufferData om_model_buf; - if (!ir_build.CreateModelBuff(om_model, om_model_buf)) { - LOG(WARNING) << "[NPU] CreateModelBuff failed!"; - return false; - } - if (!ir_build.BuildIRModel(om_model, om_model_buf)) { - LOG(WARNING) << "[NPU] BuildIRModel failed!"; - return false; - } - // store om model into tensor - model_data->Resize({om_model_buf.length}); - memcpy(model_data->mutable_data(), - om_model_buf.data, - om_model_buf.length); - ir_build.ReleaseModelBuff(om_model_buf); - return true; -} - -std::string UniqueName(const std::string& prefix) { - static std::mutex counter_mtx; - static std::unordered_map counter_map; - std::unique_lock counter_lck(counter_mtx); - int counter = 1; - auto it = counter_map.find(prefix); - if (it == counter_map.end()) { - counter_map[prefix] = counter; - } else { - counter = ++(it->second); - } - return prefix + "_" + std::to_string(counter); -} - -ge::DataType CvtPrecisionType(PrecisionType itype) { - ge::DataType otype = ge::DT_FLOAT; - switch (itype) { - case PRECISION(kFloat): - otype = ge::DT_FLOAT; - break; - case PRECISION(kInt8): - otype = ge::DT_INT8; - break; - case PRECISION(kInt32): - otype = ge::DT_INT32; - break; - default: - LOG(FATAL) << "[NPU] Can not convert precision type(" - << PrecisionToStr(itype) << ") from Lite to NPU"; - break; - } - return otype; -} - -ge::Format CvtDataLayoutType(DataLayoutType 
itype) { - ge::Format otype = ge::FORMAT_NCHW; - switch (itype) { - case DATALAYOUT(kNCHW): - otype = ge::FORMAT_NCHW; - break; - // TODO(hong19860320) support more data layout type - default: - LOG(FATAL) << "[NPU] Can not convert data layout type(" - << DataLayoutToStr(itype) << ") from Lite to NPU"; - break; - } - return otype; -} - -ge::TensorPtr CvtTensor(lite::Tensor* in_tensor, - std::vector out_shape, - PrecisionType in_ptype, - DataLayoutType in_ltype) { - uint8_t* in_data = nullptr; - auto in_size = in_tensor->dims().production(); - auto in_shape = in_tensor->dims().Vectorize(); - if (out_shape.empty()) { - out_shape = in_shape; - } - int in_bytes; - if (in_ptype == PRECISION(kFloat)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(float); - } else if (in_ptype == PRECISION(kInt32)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(int32_t); - } else if (in_ptype == PRECISION(kInt8)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(int8_t); - } else { - LOG(FATAL) << "[NPU] Unknow precision type " << PrecisionToStr(in_ptype); - } - ge::DataType out_ptype = CvtPrecisionType(in_ptype); - ge::Format out_ltype = CvtDataLayoutType(in_ltype); - - ge::TensorDesc out_desc(ge::Shape(out_shape), out_ltype, out_ptype); - CHECK_EQ(out_ltype, ge::FORMAT_NCHW); - - auto out_size = out_desc.GetShape().GetShapeSize(); - CHECK_EQ(out_size, in_size); - - ge::TensorPtr out_tensor = std::make_shared(); - out_tensor->SetTensorDesc(out_desc); - out_tensor->SetData(in_data, in_bytes); - return out_tensor; -} - -int CvtActMode(std::string act_type) { - int act_mode = 1; - if (act_type == "sigmoid") { - act_mode = 0; - } else if (act_type == "relu") { - act_mode = 1; - } else if (act_type == "tanh") { - act_mode = 2; - } else if (act_type == "relu_clipped") { - act_mode = 3; - } else if (act_type == "elu") { - act_mode = 4; - } else if (act_type == "leaky_relu") { - act_mode = 5; - } else if (act_type == "abs") { - act_mode = 6; - } else if (act_type == "softsign") { - act_mode = 8; - } else if (act_type == "softplus") { - act_mode = 9; - } else if (act_type == "hard_sigmoid") { - act_mode = 10; - } else { - // TODO(hong19860320) support more activation mode - LOG(FATAL) << "[NPU] Unsupported activation type " << act_type; - } - return act_mode; -} - -bool HasInputArg(const OpInfo* op_info, - const Scope* scope, - const std::string& argname) { - auto iarg_names = op_info->input_argnames(); - if (std::find(iarg_names.begin(), iarg_names.end(), argname) != - iarg_names.end()) { - auto inputs = op_info->Input(argname); - if (inputs.empty()) { - return false; - } - auto var_name = inputs.front(); - auto var = scope->FindVar(var_name); - return var != nullptr; - } else { - return false; - } -} - -} // namespace npu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/npu/builder.h b/lite/backends/npu/builder.h deleted file mode 100644 index 70200354fb..0000000000 --- a/lite/backends/npu/builder.h +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "ai_ddk_lib/include/hiai_ir_build.h" -#include "lite/core/op_lite.h" -#include "lite/core/target_wrapper.h" -#include "lite/core/tensor.h" - -// Extended Ops of HIAI DDK -namespace ge { -/** - * Pads a tensor. - * - * x : the input tensor - * padding : the input tensor must be 2-D - * constant_values : constant values must be a scalar - * - * output : the output tensor - * - * t_paddings : Default DT_INT32 , t_paddings must be the same with - * datatype of the padding - * mode : 0: CONSTANT, 1: REFLECT, 2: SYMMETRIC - * T : datatype of constant_values DT_INT32:3 DT_FLOAT:0 - */ -REG_OP(Pad) - .INPUT(x, TensorType({DT_FLOAT, DT_INT32})) - .INPUT(padding, TensorType({DT_INT32})) - .OPTIONAL_INPUT(constant_values, TensorType({DT_INT32, DT_FLOAT})) - .OUTPUT(output, TensorType({DT_FLOAT, DT_INT32})) - .ATTR(t_paddings, AttrValue::INT{3}) - .ATTR(mode, AttrValue::INT{0}) - .REQUIRED_ATTR(T, AttrValue::INT) - .OP_END(); - -} // namespace ge - -namespace paddle { -namespace lite { -namespace npu { - -class OpList { - public: - static OpList& Global() { - static thread_local OpList x; - return x; - } - void clear() { lists_.clear(); } - void add(std::shared_ptr p) { lists_.push_back(p); } - - private: - std::vector> lists_; -}; - -// Build HIAI IR graph to om model, and store om model data into lite tensor -bool BuildModel(std::vector& inputs, // NOLINT - std::vector& outputs, // NOLINT - lite::Tensor* model_data); - -std::string UniqueName(const std::string& prefix); - -ge::DataType CvtPrecisionType(PrecisionType itype); - -ge::Format CvtDataLayoutType(DataLayoutType itype); - -ge::TensorPtr CvtTensor(Tensor* in_tensor, - std::vector out_shape = {}, - PrecisionType in_ptype = PRECISION(kFloat), - DataLayoutType in_ltype = DATALAYOUT(kNCHW)); - -template -ge::TensorPtr CreateTensorAndFillData(std::vector data, - std::vector shape = {}, - ge::Format format = ge::FORMAT_NCHW) { - const std::type_info& info = typeid(T); - ge::DataType type = ge::DT_FLOAT; - if (info == typeid(float)) { - type = ge::DT_FLOAT; - } else if (info == typeid(int8_t)) { - type = ge::DT_INT8; - } else if (info == typeid(int32_t)) { - type = ge::DT_INT32; - } else { - LOG(FATAL) << "[NPU] Unknow value type " << info.name(); - } - if (shape.empty()) { - shape = {static_cast(data.size())}; - } else { - int size = 1; - for (auto i : shape) { - size *= i; - } - CHECK_EQ(data.size(), size); - } - ge::TensorDesc desc(ge::Shape(shape), format, type); - ge::TensorPtr tensor = std::make_shared(); - tensor->SetTensorDesc(desc); - tensor->SetData(reinterpret_cast(data.data()), - data.size() * sizeof(T)); - return tensor; -} - -template -ge::TensorPtr CreateTensorAndFillData(T value, - std::vector shape = {1}, - ge::Format format = ge::FORMAT_NCHW) { - int64_t size = 1; - 
for (auto i : shape) { - size *= i; - } - std::vector data(size, value); - return CreateTensorAndFillData(data, shape, format); -} - -int CvtActMode(std::string act_type); - -bool HasInputArg(const OpInfo* op_info, - const Scope* scope, - const std::string& argname); - -} // namespace npu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/npu/device.cc b/lite/backends/npu/device.cc old mode 100755 new mode 100644 diff --git a/lite/backends/npu/device.h b/lite/backends/npu/device.h old mode 100755 new mode 100644 index 3eba0b77e4..411600ae0a --- a/lite/backends/npu/device.h +++ b/lite/backends/npu/device.h @@ -18,8 +18,8 @@ #include #include #include -#include "ai_ddk_lib/include/HiAiModelManagerService.h" -#include "ai_ddk_lib/include/hiai_ir_build.h" +#include "HiAiModelManagerService.h" // NOLINT +#include "hiai_ir_build.h" // NOLINT namespace paddle { namespace lite { diff --git a/lite/backends/npu/runtime.cc b/lite/backends/npu/runtime.cc deleted file mode 100644 index 3485f63c7c..0000000000 --- a/lite/backends/npu/runtime.cc +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/npu/runtime.h" -#include -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace npu { - -// Create hiai model manager to load om model from lite tensor, and return the -// manager and an unique model name -bool LoadModel(const lite::Tensor &model_data, - std::shared_ptr *model_client, - std::string *model_name) { - LOG(INFO) << "[NPU] Load model."; - auto model_data_ptr = model_data.data(); - auto model_data_size = model_data.numel() * sizeof(int8_t); - if (model_data_ptr == nullptr || model_data_size == 0) { - return false; - } - *model_client = std::make_shared(); - int ret = (*model_client)->Init(nullptr); - if (ret != hiai::AI_SUCCESS) { - LOG(WARNING) << "[NPU] AiModelMngerClient init failed(" << ret << ")!"; - return false; - } - *model_name = "model.om"; - auto model_desc = std::make_shared( - *model_name, - DeviceInfo::Global().freq_level(), - DeviceInfo::Global().framework_type(), - DeviceInfo::Global().model_type(), - DeviceInfo::Global().device_type()); - model_desc->SetModelBuffer(model_data_ptr, model_data_size); - std::vector> model_descs; - model_descs.push_back(model_desc); - if ((*model_client)->Load(model_descs) != hiai::AI_SUCCESS) { - LOG(WARNING) << "[NPU] AiModelMngerClient load model failed!"; - return false; - } - return true; -} - -} // namespace npu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/npu/runtime.h b/lite/backends/npu/runtime.h deleted file mode 100644 index 8b1ad51518..0000000000 --- a/lite/backends/npu/runtime.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include "ai_ddk_lib/include/HiAiModelManagerService.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace npu { - -class DeviceInfo { - public: - static DeviceInfo &Global() { - static DeviceInfo x; - return x; - } - DeviceInfo() {} - - int freq_level() { return freq_level_; } - int framework_type() { return framework_type_; } - int model_type() { return model_type_; } - int device_type() { return device_type_; } - - private: - int freq_level_{3}; - int framework_type_{0}; - int model_type_{0}; - int device_type_{0}; -}; - -bool LoadModel(const lite::Tensor &model_data, - std::shared_ptr *model_client, - std::string *model_name); -} // namespace npu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl old mode 100755 new mode 100644 diff --git a/lite/backends/opencl/cl_kernel/image/reshape_kernel.cl b/lite/backends/opencl/cl_kernel/image/reshape_kernel.cl old mode 100755 new mode 100644 diff --git a/lite/backends/x86/jit/README.en.md b/lite/backends/x86/jit/README.en.md index cd2aa5c242..dc9eb4cf23 100644 --- a/lite/backends/x86/jit/README.en.md +++ b/lite/backends/x86/jit/README.en.md @@ -89,7 +89,7 @@ All kernels are inlcuded in `lite/backends/x86/jit/kernels.h`, which is automati 3. Add reference function of `your_key`. Note: - this should be run on CPU and do not depend on any third-party. - - Add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used. + - Add `USE_JITKERNEL_REFER_LITE(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used. 4. Add unit test in `test.cc`, and verfiy at least `float` and `double`. Test more data type for some special functions if necessary, for example `int8`. 5. Add functions in `benchmark.cc` to test all function of same `KernelType`. Make sure `GetDefaultBestFunc` always get the best one. diff --git a/lite/backends/x86/jit/README.md b/lite/backends/x86/jit/README.md index 6998c5d867..bc0e27234d 100644 --- a/lite/backends/x86/jit/README.md +++ b/lite/backends/x86/jit/README.md @@ -79,7 +79,7 @@ PaddlePaddle/Paddle/paddle/fluid/ # 如何添加新的算子 1. 在`KernelType` 中添加 `your_key` 。 -2. 实现Reference 的逻辑,这个是必须是在CPU上的实现,并且不能依赖任何第三方库。实现后在`refer/CmakeLists.txt`中添加`USE_JITKERNEL_REFER(your_key)`来使用该kernel。 +2. 实现Reference 的逻辑,这个是必须是在CPU上的实现,并且不能依赖任何第三方库。实现后在`refer/CmakeLists.txt`中添加`USE_JITKERNEL_REFER_LITE(your_key)`来使用该kernel。 3. (optional) 实现更多的算法在`more`目录下,可以依赖mkl,intrinsic或者mkldnn等第三方库。 4. (optional) 实现基于Xbyak的生成code,在`gen`目下。 jitcode需要实现自己的`JitCodeCreator`,并注册在与refer相同的`KernelType`上。 5. 
添加新的`KernelTuple`,需要与`KernelType`一一对应,是所有类型的一个打包,包括数据类型,属性的类型,以及返回的函数类型。可以参考`SeqPoolTuple`,新加的Attr类型需要特例化`JitCodeKey`方法。 diff --git a/lite/backends/x86/jit/gen/CMakeLists.txt b/lite/backends/x86/jit/gen/CMakeLists.txt index 99244ea9bd..6250077528 100644 --- a/lite/backends/x86/jit/gen/CMakeLists.txt +++ b/lite/backends/x86/jit/gen/CMakeLists.txt @@ -4,33 +4,33 @@ file(GLOB jitcode_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") cc_library(jit_kernel_jitcode SRCS ${jitcode_cc_srcs} DEPS jit_kernel_base xbyak) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE) -function(USE_JITKERNEL_GEN TARGET) - file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n") +function(USE_JITKERNEL_GEN_LITE TARGET) + file(APPEND ${jit_file} "USE_JITKERNEL_GEN_LITE(${TARGET});\n") endfunction() # use gen jitcode kernel by name -USE_JITKERNEL_GEN(kMatMul) -USE_JITKERNEL_GEN(kVMul) -USE_JITKERNEL_GEN(kVAdd) -USE_JITKERNEL_GEN(kVSub) -USE_JITKERNEL_GEN(kVAddRelu) -USE_JITKERNEL_GEN(kVScal) -USE_JITKERNEL_GEN(kVAddBias) -USE_JITKERNEL_GEN(kVRelu) -USE_JITKERNEL_GEN(kVSquare) -USE_JITKERNEL_GEN(kVIdentity) -USE_JITKERNEL_GEN(kVExp) -USE_JITKERNEL_GEN(kVSigmoid) -USE_JITKERNEL_GEN(kVTanh) -USE_JITKERNEL_GEN(kLSTMCtHt) -USE_JITKERNEL_GEN(kLSTMC1H1) -USE_JITKERNEL_GEN(kGRUH1) -USE_JITKERNEL_GEN(kGRUHtPart1) -USE_JITKERNEL_GEN(kGRUHtPart2) -USE_JITKERNEL_GEN(kNCHW16CMulNC) -USE_JITKERNEL_GEN(kSeqPool) -USE_JITKERNEL_GEN(kHMax) -USE_JITKERNEL_GEN(kHSum) -USE_JITKERNEL_GEN(kEmbSeqPool) -USE_JITKERNEL_GEN(kSgd) -USE_JITKERNEL_GEN(kVBroadcast) +USE_JITKERNEL_GEN_LITE(kMatMul) +USE_JITKERNEL_GEN_LITE(kVMul) +USE_JITKERNEL_GEN_LITE(kVAdd) +USE_JITKERNEL_GEN_LITE(kVSub) +USE_JITKERNEL_GEN_LITE(kVAddRelu) +USE_JITKERNEL_GEN_LITE(kVScal) +USE_JITKERNEL_GEN_LITE(kVAddBias) +USE_JITKERNEL_GEN_LITE(kVRelu) +USE_JITKERNEL_GEN_LITE(kVSquare) +USE_JITKERNEL_GEN_LITE(kVIdentity) +USE_JITKERNEL_GEN_LITE(kVExp) +USE_JITKERNEL_GEN_LITE(kVSigmoid) +USE_JITKERNEL_GEN_LITE(kVTanh) +USE_JITKERNEL_GEN_LITE(kLSTMCtHt) +USE_JITKERNEL_GEN_LITE(kLSTMC1H1) +USE_JITKERNEL_GEN_LITE(kGRUH1) +USE_JITKERNEL_GEN_LITE(kGRUHtPart1) +USE_JITKERNEL_GEN_LITE(kGRUHtPart2) +USE_JITKERNEL_GEN_LITE(kNCHW16CMulNC) +USE_JITKERNEL_GEN_LITE(kSeqPool) +USE_JITKERNEL_GEN_LITE(kHMax) +USE_JITKERNEL_GEN_LITE(kHSum) +USE_JITKERNEL_GEN_LITE(kEmbSeqPool) +USE_JITKERNEL_GEN_LITE(kSgd) +USE_JITKERNEL_GEN_LITE(kVBroadcast) diff --git a/lite/backends/x86/jit/gen/act.cc b/lite/backends/x86/jit/gen/act.cc index f1f261c199..45f4f7ddcc 100644 --- a/lite/backends/x86/jit/gen/act.cc +++ b/lite/backends/x86/jit/gen/act.cc @@ -156,9 +156,9 @@ size_t VTanhCreator::CodeSize(const int& d) const { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kVRelu, gen::VReluCreator); -REGISTER_JITKERNEL_GEN(kVSquare, gen::VSquareCreator); -REGISTER_JITKERNEL_GEN(kVIdentity, gen::VIdentityCreator); -REGISTER_JITKERNEL_GEN(kVExp, gen::VExpCreator); -REGISTER_JITKERNEL_GEN(kVSigmoid, gen::VSigmoidCreator); -REGISTER_JITKERNEL_GEN(kVTanh, gen::VTanhCreator); +REGISTER_JITKERNEL_GEN_LITE(kVRelu, gen::VReluCreator); +REGISTER_JITKERNEL_GEN_LITE(kVSquare, gen::VSquareCreator); +REGISTER_JITKERNEL_GEN_LITE(kVIdentity, gen::VIdentityCreator); +REGISTER_JITKERNEL_GEN_LITE(kVExp, gen::VExpCreator); +REGISTER_JITKERNEL_GEN_LITE(kVSigmoid, gen::VSigmoidCreator); +REGISTER_JITKERNEL_GEN_LITE(kVTanh, gen::VTanhCreator); diff --git a/lite/backends/x86/jit/gen/blas.cc b/lite/backends/x86/jit/gen/blas.cc index 0bddea6ace..37183e6640 100644 --- 
a/lite/backends/x86/jit/gen/blas.cc +++ b/lite/backends/x86/jit/gen/blas.cc @@ -181,10 +181,10 @@ DECLARE_BLAS_CREATOR(VAddBias); namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kVMul, gen::VMulCreator); -REGISTER_JITKERNEL_GEN(kVAdd, gen::VAddCreator); -REGISTER_JITKERNEL_GEN(kVSub, gen::VSubCreator); -REGISTER_JITKERNEL_GEN(kVAddRelu, gen::VAddReluCreator); -REGISTER_JITKERNEL_GEN(kVScal, gen::VScalCreator); -REGISTER_JITKERNEL_GEN(kVAddBias, gen::VAddBiasCreator); -REGISTER_JITKERNEL_GEN(kNCHW16CMulNC, gen::NCHW16CMulNCCreator); +REGISTER_JITKERNEL_GEN_LITE(kVMul, gen::VMulCreator); +REGISTER_JITKERNEL_GEN_LITE(kVAdd, gen::VAddCreator); +REGISTER_JITKERNEL_GEN_LITE(kVSub, gen::VSubCreator); +REGISTER_JITKERNEL_GEN_LITE(kVAddRelu, gen::VAddReluCreator); +REGISTER_JITKERNEL_GEN_LITE(kVScal, gen::VScalCreator); +REGISTER_JITKERNEL_GEN_LITE(kVAddBias, gen::VAddBiasCreator); +REGISTER_JITKERNEL_GEN_LITE(kNCHW16CMulNC, gen::NCHW16CMulNCCreator); diff --git a/lite/backends/x86/jit/gen/embseqpool.cc b/lite/backends/x86/jit/gen/embseqpool.cc index 2ff6894383..7e697014ed 100644 --- a/lite/backends/x86/jit/gen/embseqpool.cc +++ b/lite/backends/x86/jit/gen/embseqpool.cc @@ -145,4 +145,4 @@ class EmbSeqPoolCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kEmbSeqPool, gen::EmbSeqPoolCreator); +REGISTER_JITKERNEL_GEN_LITE(kEmbSeqPool, gen::EmbSeqPoolCreator); diff --git a/lite/backends/x86/jit/gen/gru.cc b/lite/backends/x86/jit/gen/gru.cc index c5737faf13..4c2c57413e 100644 --- a/lite/backends/x86/jit/gen/gru.cc +++ b/lite/backends/x86/jit/gen/gru.cc @@ -111,6 +111,6 @@ DECLARE_GRU_CREATOR(GRUHtPart2); namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kGRUH1, gen::GRUH1Creator); -REGISTER_JITKERNEL_GEN(kGRUHtPart1, gen::GRUHtPart1Creator); -REGISTER_JITKERNEL_GEN(kGRUHtPart2, gen::GRUHtPart2Creator); +REGISTER_JITKERNEL_GEN_LITE(kGRUH1, gen::GRUH1Creator); +REGISTER_JITKERNEL_GEN_LITE(kGRUHtPart1, gen::GRUHtPart1Creator); +REGISTER_JITKERNEL_GEN_LITE(kGRUHtPart2, gen::GRUHtPart2Creator); diff --git a/lite/backends/x86/jit/gen/hopv.cc b/lite/backends/x86/jit/gen/hopv.cc index 4304dc48c5..0fdd63a740 100644 --- a/lite/backends/x86/jit/gen/hopv.cc +++ b/lite/backends/x86/jit/gen/hopv.cc @@ -99,5 +99,5 @@ DECLARE_HOP_CREATOR(HSum); namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kHMax, gen::HMaxCreator); -REGISTER_JITKERNEL_GEN(kHSum, gen::HSumCreator); +REGISTER_JITKERNEL_GEN_LITE(kHMax, gen::HMaxCreator); +REGISTER_JITKERNEL_GEN_LITE(kHSum, gen::HSumCreator); diff --git a/lite/backends/x86/jit/gen/lstm.cc b/lite/backends/x86/jit/gen/lstm.cc index 44e58d0b75..e441735520 100644 --- a/lite/backends/x86/jit/gen/lstm.cc +++ b/lite/backends/x86/jit/gen/lstm.cc @@ -138,5 +138,5 @@ DECLARE_LSTM_CREATOR(LSTMC1H1); namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kLSTMCtHt, gen::LSTMCtHtCreator); -REGISTER_JITKERNEL_GEN(kLSTMC1H1, gen::LSTMC1H1Creator); +REGISTER_JITKERNEL_GEN_LITE(kLSTMCtHt, gen::LSTMCtHtCreator); +REGISTER_JITKERNEL_GEN_LITE(kLSTMC1H1, gen::LSTMC1H1Creator); diff --git a/lite/backends/x86/jit/gen/matmul.cc b/lite/backends/x86/jit/gen/matmul.cc index 2c75f6dd5d..010c80fac4 100644 --- a/lite/backends/x86/jit/gen/matmul.cc +++ b/lite/backends/x86/jit/gen/matmul.cc @@ -130,4 +130,4 @@ class MatMulCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kMatMul, gen::MatMulCreator); +REGISTER_JITKERNEL_GEN_LITE(kMatMul, gen::MatMulCreator); diff --git 
a/lite/backends/x86/jit/gen/seqpool.cc b/lite/backends/x86/jit/gen/seqpool.cc index e0cf5e5a5a..4c80737aac 100644 --- a/lite/backends/x86/jit/gen/seqpool.cc +++ b/lite/backends/x86/jit/gen/seqpool.cc @@ -82,4 +82,4 @@ class SeqPoolCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kSeqPool, gen::SeqPoolCreator); +REGISTER_JITKERNEL_GEN_LITE(kSeqPool, gen::SeqPoolCreator); diff --git a/lite/backends/x86/jit/gen/sgd.cc b/lite/backends/x86/jit/gen/sgd.cc index 10659f5084..44e0833661 100644 --- a/lite/backends/x86/jit/gen/sgd.cc +++ b/lite/backends/x86/jit/gen/sgd.cc @@ -127,4 +127,4 @@ class SgdCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kSgd, gen::SgdCreator); +REGISTER_JITKERNEL_GEN_LITE(kSgd, gen::SgdCreator); diff --git a/lite/backends/x86/jit/gen/vbroadcast.cc b/lite/backends/x86/jit/gen/vbroadcast.cc index 9e02dca8c4..fb1e71f7b0 100644 --- a/lite/backends/x86/jit/gen/vbroadcast.cc +++ b/lite/backends/x86/jit/gen/vbroadcast.cc @@ -88,4 +88,4 @@ class VBroadcastCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kVBroadcast, gen::VBroadcastCreator); +REGISTER_JITKERNEL_GEN_LITE(kVBroadcast, gen::VBroadcastCreator); diff --git a/lite/backends/x86/jit/more/CMakeLists.txt b/lite/backends/x86/jit/more/CMakeLists.txt index 2ddbbcd16a..5641466d8a 100644 --- a/lite/backends/x86/jit/more/CMakeLists.txt +++ b/lite/backends/x86/jit/more/CMakeLists.txt @@ -1,6 +1,6 @@ -function(USE_JITKERNEL_MORE TARGET TYPE) - file(APPEND ${jit_file} "USE_JITKERNEL_MORE(${TARGET} ${TYPE});\n") +function(USE_JITKERNEL_MORE_LITE TARGET TYPE) + file(APPEND ${jit_file} "USE_JITKERNEL_MORE_LITE(${TARGET} ${TYPE});\n") endfunction() # enable it latter diff --git a/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt b/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt index 468937a4f6..80dabc72fb 100644 --- a/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt +++ b/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt @@ -5,5 +5,5 @@ cc_library(jit_kernel_intrinsic SRCS ${jit_kernel_cc_intrinsic} DEPS jit_kernel_ set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_intrinsic PARENT_SCOPE) # use mkl kernels by name and type -USE_JITKERNEL_MORE(kCRFDecoding, intrinsic) -USE_JITKERNEL_MORE(kLayerNorm, intrinsic) +USE_JITKERNEL_MORE_LITE(kCRFDecoding, intrinsic) +USE_JITKERNEL_MORE_LITE(kLayerNorm, intrinsic) diff --git a/lite/backends/x86/jit/more/mix/CMakeLists.txt b/lite/backends/x86/jit/more/mix/CMakeLists.txt index dd039d2915..5e0238f26f 100644 --- a/lite/backends/x86/jit/more/mix/CMakeLists.txt +++ b/lite/backends/x86/jit/more/mix/CMakeLists.txt @@ -5,11 +5,11 @@ cc_library(jit_kernel_mix SRCS ${jit_kernel_mix_cc} DEPS jit_kernel_base) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_mix PARENT_SCOPE) -USE_JITKERNEL_MORE(kVSigmoid, mix) -USE_JITKERNEL_MORE(kVTanh, mix) -USE_JITKERNEL_MORE(kLSTMCtHt, mix) -USE_JITKERNEL_MORE(kLSTMC1H1, mix) -USE_JITKERNEL_MORE(kGRUH1, mix) -USE_JITKERNEL_MORE(kGRUHtPart1, mix) -USE_JITKERNEL_MORE(kGRUHtPart2, mix) -USE_JITKERNEL_MORE(kSoftmax, mix) +USE_JITKERNEL_MORE_LITE(kVSigmoid, mix) +USE_JITKERNEL_MORE_LITE(kVTanh, mix) +USE_JITKERNEL_MORE_LITE(kLSTMCtHt, mix) +USE_JITKERNEL_MORE_LITE(kLSTMC1H1, mix) +USE_JITKERNEL_MORE_LITE(kGRUH1, mix) +USE_JITKERNEL_MORE_LITE(kGRUHtPart1, mix) +USE_JITKERNEL_MORE_LITE(kGRUHtPart2, mix) +USE_JITKERNEL_MORE_LITE(kSoftmax, mix) diff --git a/lite/backends/x86/jit/more/mkl/CMakeLists.txt 
b/lite/backends/x86/jit/more/mkl/CMakeLists.txt index 56f1a62ad4..3557f531a5 100644 --- a/lite/backends/x86/jit/more/mkl/CMakeLists.txt +++ b/lite/backends/x86/jit/more/mkl/CMakeLists.txt @@ -3,18 +3,18 @@ cc_library(jit_kernel_mkl SRCS mkl.cc DEPS jit_kernel_base dynload_mklml) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE) # use mkl kernels by name and type -USE_JITKERNEL_MORE(kMatMul, mkl) -USE_JITKERNEL_MORE(kVMul, mkl) -USE_JITKERNEL_MORE(kVAdd, mkl) -USE_JITKERNEL_MORE(kVScal, mkl) -USE_JITKERNEL_MORE(kStrideScal, mkl) -USE_JITKERNEL_MORE(kVExp, mkl) -USE_JITKERNEL_MORE(kVSquare, mkl) -USE_JITKERNEL_MORE(kVCopy, mkl) -USE_JITKERNEL_MORE(kVSigmoid, mkl) -USE_JITKERNEL_MORE(kVTanh, mkl) -USE_JITKERNEL_MORE(kSeqPool, mkl) -USE_JITKERNEL_MORE(kSoftmax, mkl) -USE_JITKERNEL_MORE(kEmbSeqPool, mkl) -USE_JITKERNEL_MORE(kSgd, mkl) -USE_JITKERNEL_MORE(kVBroadcast, mkl) +USE_JITKERNEL_MORE_LITE(kMatMul, mkl) +USE_JITKERNEL_MORE_LITE(kVMul, mkl) +USE_JITKERNEL_MORE_LITE(kVAdd, mkl) +USE_JITKERNEL_MORE_LITE(kVScal, mkl) +USE_JITKERNEL_MORE_LITE(kStrideScal, mkl) +USE_JITKERNEL_MORE_LITE(kVExp, mkl) +USE_JITKERNEL_MORE_LITE(kVSquare, mkl) +USE_JITKERNEL_MORE_LITE(kVCopy, mkl) +USE_JITKERNEL_MORE_LITE(kVSigmoid, mkl) +USE_JITKERNEL_MORE_LITE(kVTanh, mkl) +USE_JITKERNEL_MORE_LITE(kSeqPool, mkl) +USE_JITKERNEL_MORE_LITE(kSoftmax, mkl) +USE_JITKERNEL_MORE_LITE(kEmbSeqPool, mkl) +USE_JITKERNEL_MORE_LITE(kSgd, mkl) +USE_JITKERNEL_MORE_LITE(kVBroadcast, mkl) diff --git a/lite/backends/x86/jit/refer/CMakeLists.txt b/lite/backends/x86/jit/refer/CMakeLists.txt index 7133f59662..c52b21ad7d 100644 --- a/lite/backends/x86/jit/refer/CMakeLists.txt +++ b/lite/backends/x86/jit/refer/CMakeLists.txt @@ -2,39 +2,39 @@ cc_library(jit_kernel_refer SRCS refer.cc DEPS jit_kernel_base) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_refer PARENT_SCOPE) -function(USE_JITKERNEL_REFER TARGET) - file(APPEND ${jit_file} "USE_JITKERNEL_REFER(${TARGET});\n") +function(USE_JITKERNEL_REFER_LITE TARGET) + file(APPEND ${jit_file} "USE_JITKERNEL_REFER_LITE(${TARGET});\n") endfunction() # use refer kernel by name -USE_JITKERNEL_REFER(kVMul) -USE_JITKERNEL_REFER(kVAdd) -USE_JITKERNEL_REFER(kVAddRelu) -USE_JITKERNEL_REFER(kVSub) -USE_JITKERNEL_REFER(kVScal) -USE_JITKERNEL_REFER(kStrideScal) -USE_JITKERNEL_REFER(kVAddBias) -USE_JITKERNEL_REFER(kVCopy) -USE_JITKERNEL_REFER(kVRelu) -USE_JITKERNEL_REFER(kVIdentity) -USE_JITKERNEL_REFER(kVExp) -USE_JITKERNEL_REFER(kVSigmoid) -USE_JITKERNEL_REFER(kVTanh) -USE_JITKERNEL_REFER(kLSTMCtHt) -USE_JITKERNEL_REFER(kLSTMC1H1) -USE_JITKERNEL_REFER(kGRUH1) -USE_JITKERNEL_REFER(kGRUHtPart1) -USE_JITKERNEL_REFER(kGRUHtPart2) -USE_JITKERNEL_REFER(kCRFDecoding) -USE_JITKERNEL_REFER(kLayerNorm) -USE_JITKERNEL_REFER(kNCHW16CMulNC) -USE_JITKERNEL_REFER(kSeqPool) -USE_JITKERNEL_REFER(kMatMul) -USE_JITKERNEL_REFER(kVSquare) -USE_JITKERNEL_REFER(kHSum) -USE_JITKERNEL_REFER(kHMax) -USE_JITKERNEL_REFER(kStrideASum) -USE_JITKERNEL_REFER(kSoftmax) -USE_JITKERNEL_REFER(kEmbSeqPool) -USE_JITKERNEL_REFER(kSgd) -USE_JITKERNEL_REFER(kVBroadcast) +USE_JITKERNEL_REFER_LITE(kVMul) +USE_JITKERNEL_REFER_LITE(kVAdd) +USE_JITKERNEL_REFER_LITE(kVAddRelu) +USE_JITKERNEL_REFER_LITE(kVSub) +USE_JITKERNEL_REFER_LITE(kVScal) +USE_JITKERNEL_REFER_LITE(kStrideScal) +USE_JITKERNEL_REFER_LITE(kVAddBias) +USE_JITKERNEL_REFER_LITE(kVCopy) +USE_JITKERNEL_REFER_LITE(kVRelu) +USE_JITKERNEL_REFER_LITE(kVIdentity) +USE_JITKERNEL_REFER_LITE(kVExp) +USE_JITKERNEL_REFER_LITE(kVSigmoid) 
+USE_JITKERNEL_REFER_LITE(kVTanh) +USE_JITKERNEL_REFER_LITE(kLSTMCtHt) +USE_JITKERNEL_REFER_LITE(kLSTMC1H1) +USE_JITKERNEL_REFER_LITE(kGRUH1) +USE_JITKERNEL_REFER_LITE(kGRUHtPart1) +USE_JITKERNEL_REFER_LITE(kGRUHtPart2) +USE_JITKERNEL_REFER_LITE(kCRFDecoding) +USE_JITKERNEL_REFER_LITE(kLayerNorm) +USE_JITKERNEL_REFER_LITE(kNCHW16CMulNC) +USE_JITKERNEL_REFER_LITE(kSeqPool) +USE_JITKERNEL_REFER_LITE(kMatMul) +USE_JITKERNEL_REFER_LITE(kVSquare) +USE_JITKERNEL_REFER_LITE(kHSum) +USE_JITKERNEL_REFER_LITE(kHMax) +USE_JITKERNEL_REFER_LITE(kStrideASum) +USE_JITKERNEL_REFER_LITE(kSoftmax) +USE_JITKERNEL_REFER_LITE(kEmbSeqPool) +USE_JITKERNEL_REFER_LITE(kSgd) +USE_JITKERNEL_REFER_LITE(kVBroadcast) diff --git a/lite/backends/x86/jit/refer/refer.cc b/lite/backends/x86/jit/refer/refer.cc index e1b1240c5d..c47f8216ab 100644 --- a/lite/backends/x86/jit/refer/refer.cc +++ b/lite/backends/x86/jit/refer/refer.cc @@ -18,7 +18,7 @@ namespace refer = paddle::lite::jit::refer; #define REGISTER_REFER_KERNEL(func) \ - REGISTER_JITKERNEL_REFER( \ + REGISTER_JITKERNEL_REFER_LITE( \ k##func, refer::func##Kernel, refer::func##Kernel) REGISTER_REFER_KERNEL(VMul); diff --git a/lite/backends/x86/jit/registry.h b/lite/backends/x86/jit/registry.h index 7613a8dd43..65e3152d70 100644 --- a/lite/backends/x86/jit/registry.h +++ b/lite/backends/x86/jit/registry.h @@ -77,16 +77,16 @@ class JitKernelRegistrar { void Touch() {} }; -#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ +#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE(uniq_name, msg) \ struct __test_global_namespace_##uniq_name##__ {}; \ static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ __test_global_namespace_##uniq_name##__>::value, \ msg) // Refer always on CPUPlace -#define REGISTER_JITKERNEL_REFER(kernel_type, ...) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_refer_CPUPlace, \ +#define REGISTER_JITKERNEL_REFER_LITE(kernel_type, ...) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_refer_CPUPlace, \ "REGISTER_KERNEL_REFER must be called in global namespace"); \ static ::paddle::lite::jit::JitKernelRegistrar< \ ::paddle::lite::jit::ReferKernelPool, \ @@ -94,84 +94,84 @@ class JitKernelRegistrar { __VA_ARGS__> \ __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_( \ ::paddle::lite::jit::KernelType::kernel_type); \ - int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \ + int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \ __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_.Touch(); \ return 0; \ } // kernel_type: should be in paddle::lite::jit::KernelType // place_type: should be one of CPUPlace and GPUPlace in paddle::platform -#define REGISTER_KERNEL_MORE(kernel_type, impl_type, place_type, ...) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_##impl_type##_##place_type, \ - "REGISTER_KERNEL_MORE must be called in global namespace"); \ - extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ +#define REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, place_type, ...) 
\ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_##impl_type##_##place_type, \ + "REGISTER_KERNEL_MORE_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ static int __assert_##kernel_type##_##impl_type##_##place_type##_has_refer_ \ - UNUSED = TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + UNUSED = LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ static ::paddle::lite::jit::JitKernelRegistrar< \ ::paddle::lite::jit::KernelPool, \ ::paddle::lite::fluid::place_type, \ __VA_ARGS__> \ __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_( \ ::paddle::lite::jit::KernelType::kernel_type); \ - int TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() { \ + int LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() { \ __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_ \ .Touch(); \ return 0; \ } #define REGISTER_JITKERNEL_MORE(kernel_type, impl_type, ...) \ - REGISTER_KERNEL_MORE(kernel_type, impl_type, CPUPlace, __VA_ARGS__) - -#define REGISTER_GPUKERNEL_MORE(kernel_type, impl_type, ...) \ - REGISTER_KERNEL_MORE(kernel_type, impl_type, GPUPlace, __VA_ARGS__) - -#define REGISTER_JITKERNEL_GEN(kernel_type, ...) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_gen_##kernel_type##_CPUPlace_, \ - "REGISTER_JITKERNEL_GEN must be called in global namespace"); \ - extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static int __assert_gen_##kernel_type##_has_refer_ UNUSED = \ - TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static ::paddle::lite::jit::JitKernelRegistrar< \ - ::paddle::lite::jit::JitCodeCreatorPool, \ - ::paddle::lite::fluid::CPUPlace, \ - __VA_ARGS__> \ - __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_( \ - ::paddle::lite::jit::KernelType::kernel_type); \ - int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() { \ - __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_.Touch(); \ - return 0; \ + REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, CPUPlace, __VA_ARGS__) + +#define REGISTER_GPUKERNEL_MORE_LITE(kernel_type, impl_type, ...) \ + REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, GPUPlace, __VA_ARGS__) + +#define REGISTER_JITKERNEL_GEN_LITE(kernel_type, ...) 
\ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_gen_##kernel_type##_CPUPlace_, \ + "REGISTER_JITKERNEL_GEN_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static int __assert_gen_##kernel_type##_has_refer_ UNUSED = \ + LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static ::paddle::lite::jit::JitKernelRegistrar< \ + ::paddle::lite::jit::JitCodeCreatorPool, \ + ::paddle::lite::fluid::CPUPlace, \ + __VA_ARGS__> \ + __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_( \ + ::paddle::lite::jit::KernelType::kernel_type); \ + int LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_() { \ + __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_.Touch(); \ + return 0; \ } -#define USE_JITKERNEL_GEN(kernel_type) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_gen_##kernel_type##_CPUPlace_, \ - "USE_JITKERNEL_GEN must be called in global namespace"); \ - extern int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_(); \ - static int use_jitkernel_gen_##kernel_type##_CPUPlace_ UNUSED = \ - TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() - -#define USE_JITKERNEL_REFER(kernel_type) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_refer_CPUPlace_, \ - "USE_JITKERNEL_REFER must be called in global namespace"); \ - extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static int use_jitkernel_##kernel_type##_refer_CPUPlace_ UNUSED = \ - TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() - -#define USE_KERNEL_MORE(kernel_type, impl_type, place_type) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_##impl_type##_##place_type##_, \ - "USE_JITKERNEL_MORE must be called in global namespace"); \ - extern int \ - TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_(); \ - static int use_jitkernel_##kernel_type##_##impl_type##_##place_type##_ \ - UNUSED = \ - TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() - -#define USE_JITKERNEL_MORE(kernel_type, impl_type) \ - USE_KERNEL_MORE(kernel_type, impl_type, CPUPlace) +#define USE_JITKERNEL_GEN_LITE(kernel_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_gen_##kernel_type##_CPUPlace_, \ + "USE_JITKERNEL_GEN_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_(); \ + static int use_litejitkernel_gen_##kernel_type##_CPUPlace_ UNUSED = \ + LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_() + +#define USE_JITKERNEL_REFER_LITE(kernel_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_refer_CPUPlace_, \ + "USE_JITKERNEL_REFER_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static int use_litejitkernel_##kernel_type##_refer_CPUPlace_ UNUSED = \ + LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_() + +#define USE_KERNEL_MORE_LITE(kernel_type, impl_type, place_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_##impl_type##_##place_type##_, \ + "USE_JITKERNEL_MORE_LITE must be called in global namespace"); \ + extern int \ + LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_(); \ + static int use_litejitkernel_##kernel_type##_##impl_type##_##place_type##_ \ + UNUSED = \ + LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() + +#define 
USE_JITKERNEL_MORE_LITE(kernel_type, impl_type) \ + USE_KERNEL_MORE_LITE(kernel_type, impl_type, CPUPlace) } // namespace jit } // namespace lite diff --git a/lite/backends/x86/parallel.h b/lite/backends/x86/parallel.h old mode 100755 new mode 100644 diff --git a/lite/backends/xpu/builder.cc b/lite/backends/xpu/builder.cc deleted file mode 100644 index 796eaf9c46..0000000000 --- a/lite/backends/xpu/builder.cc +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/xpu/builder.h" -#include // NOLINT -#include -#include "lite/backends/xpu/runtime.h" - -namespace paddle { -namespace lite { -namespace xpu { - -bool HasInputArg(const OpInfo* op_info, - const Scope* scope, - const std::string& argname) { - auto iarg_names = op_info->input_argnames(); - if (std::find(iarg_names.begin(), iarg_names.end(), argname) != - iarg_names.end()) { - auto inputs = op_info->Input(argname); - if (inputs.empty()) { - return false; - } - auto var_name = inputs.front(); - auto var = scope->FindVar(var_name); - return var != nullptr; - } else { - return false; - } -} - -std::string UniqueName(const std::string& prefix) { - static std::mutex counter_mtx; - static std::unordered_map counter_map; - std::unique_lock counter_lck(counter_mtx); - int counter = 1; - auto it = counter_map.find(prefix); - if (it == counter_map.end()) { - counter_map[prefix] = counter; - } else { - counter = ++(it->second); - } - return prefix + "_" + std::to_string(counter); -} - -xtcl::DataType CvtPrecisionType(PrecisionType in_type) { - xtcl::DataType out_type = ::xtcl::Float(32); - switch (in_type) { - case PRECISION(kFloat): - out_type = ::xtcl::Float(32); - break; - case PRECISION(kInt8): - out_type = ::xtcl::Int(8); - break; - case PRECISION(kInt32): - out_type = ::xtcl::Int(32); - break; - default: - LOG(FATAL) << "Can not convert precision type(" << PrecisionToStr(in_type) - << ") from Lite to XPU"; - break; - } - return out_type; -} - -DLDataType CvtDataType(PrecisionType in_type) { - DLDataType out_type = {kDLFloat, 32, 1}; - switch (in_type) { - case PRECISION(kFloat): - out_type = {kDLFloat, 32, 1}; - break; - case PRECISION(kInt8): - out_type = {kDLInt, 8, 1}; - break; - case PRECISION(kInt32): - out_type = {kDLInt, 32, 1}; - break; - default: - LOG(FATAL) << "Can not convert data type(" << PrecisionToStr(in_type) - << ") from Lite to XPU"; - break; - } - return out_type; -} - -xtcl::Array CvtShape(const std::vector& in_shape) { - xtcl::Array out_shape; - for (auto dim : in_shape) { - out_shape.push_back(dim); - } - return out_shape; -} - -xtcl::Array CvtShape(const std::vector& in_shape) { - return CvtShape(std::vector(in_shape.begin(), in_shape.end())); -} - -xtcl::Array CvtShape(const DDim& in_dims) { - return CvtShape(in_dims.Vectorize()); -} - -std::shared_ptr CvtTensor(lite::Tensor* in_tensor, - std::vector out_shape, - PrecisionType in_ptype, - DataLayoutType in_ltype) { - uint8_t* in_data = 
nullptr; - auto in_size = in_tensor->dims().production(); - auto in_shape = in_tensor->dims().Vectorize(); - if (out_shape.empty()) { - out_shape = in_shape; - } - int in_bytes; - if (in_ptype == PRECISION(kFloat)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(float); - } else if (in_ptype == PRECISION(kInt32)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(int32_t); - } else if (in_ptype == PRECISION(kInt8)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(int8_t); - } else { - LOG(FATAL) << "Unknow precision type " << PrecisionToStr(in_ptype); - } - auto out_tensor = std::make_shared( - xtcl::xNDArray::Empty(out_shape, CvtDataType(in_ptype), {kDLCPU, 0})); - auto out_data = - reinterpret_cast(out_tensor->ToDLPack()->dl_tensor.data); - std::memcpy(out_data, in_data, in_bytes); - return out_tensor; -} - -// Build the XPU subgraph to the XPU model, store the model data into the -// weight tensor of the graph op, and the model data will be loaded again -// by the graph computing kernel when the graph op is executed for inference. -// Due to the lack of XPU APIs for building and outputing the model data, -// the compiled XPU runtime object will be managed by the global variable -// 'DeviceInfo' and the key name for finding the runtime object will be -// stored in the weight tensor of graph op. -// TODO(hong19860320) Compile the XPU subgraph and output the compiled model -// data to the weight tensor of graph op. -bool BuildModel( - std::shared_ptr builder, - std::shared_ptr params, - std::vector>* outputs, - lite::Tensor* model) { - LOG(INFO) << "[XPU] Build Model."; - CHECK(builder != nullptr); - CHECK(outputs != nullptr); - CHECK_GT(outputs->size(), 0); - CHECK(model != nullptr); - - // build graph and fill all of constant params - xtcl::xNetwork network = builder->FinalizeNetwork(*((*outputs)[0])); - auto target = xtcl::Target::Create("llvm"); - auto compiler = xtcl::network::xTensorCompiler(network, target); - compiler.SetParams(*params); // set the data of constant tensors - compiler.Build(); - - // create and register runtime - auto runtime = std::make_shared( - compiler.CreateRuntimeInstance()); - if (runtime == nullptr) { - LOG(WARNING) << "[XPU] Build Model failed!"; - return false; - } - std::string name = UniqueName("xpu"); - LOG(INFO) << "[XPU] Model Name: " << name; - DeviceInfo::Global().Insert(name, runtime); - model->Resize({static_cast(name.length() + 1)}); - memcpy(model->mutable_data(), - reinterpret_cast(name.c_str()), - name.length() + 1); - return true; -} - -} // namespace xpu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/xpu/builder.h b/lite/backends/xpu/builder.h deleted file mode 100644 index f0ac2b303a..0000000000 --- a/lite/backends/xpu/builder.h +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include -#include "lite/core/op_lite.h" -#include "lite/core/target_wrapper.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace xpu { - -bool HasInputArg(const OpInfo* op_info, - const Scope* scope, - const std::string& argname); - -std::string UniqueName(const std::string& prefix); - -xtcl::DataType CvtPrecisionType(PrecisionType in_type); - -DLDataType CvtDataType(PrecisionType in_type); - -xtcl::Array CvtShape(const std::vector& in_shape); - -xtcl::Array CvtShape(const std::vector& in_shape); - -xtcl::Array CvtShape(const DDim& in_dims); - -std::shared_ptr CvtTensor( - Tensor* in_tensor, - std::vector out_shape = {}, - PrecisionType in_ptype = PRECISION(kFloat), - DataLayoutType in_ltype = DATALAYOUT(kNCHW)); - -bool BuildModel( - std::shared_ptr builder, - std::shared_ptr params, - std::vector>* outputs, - lite::Tensor* model); - -} // namespace xpu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/xpu/device.cc b/lite/backends/xpu/device.cc old mode 100755 new mode 100644 index dbf88ff833..badde878ad --- a/lite/backends/xpu/device.cc +++ b/lite/backends/xpu/device.cc @@ -36,8 +36,11 @@ std::unique_ptr Device::Build( } xtcl::xNetwork network = builder->FinalizeNetwork(xtcl::relay::TupleNode::make(all_outs)); - auto target = xtcl::Target::Create(device_name_); - auto compiler = xtcl::network::xTensorCompiler(network, target); + auto target = xtcl::NullValue(); + if (!target_.empty()) { + target = xtcl::Target::Create(target_); + } + xtcl::network::xTensorCompiler compiler(network, target); compiler.SetParams(*params); // Set the data of constant tensors compiler.Build(); VLOG(3) << "[XPU] Build done"; diff --git a/lite/backends/xpu/device.h b/lite/backends/xpu/device.h old mode 100755 new mode 100644 index bf9a8bf76a..6de18d5466 --- a/lite/backends/xpu/device.h +++ b/lite/backends/xpu/device.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include @@ -30,7 +31,18 @@ class Device { static Device x; return x; } - Device() {} + Device() { + char* name = std::getenv("XPU_DEVICE_NAME"); + if (name) { + name_ = std::string(name); + } + // XPU_DEVICE_TARGET for XPU model building, which supports 'llvm' and 'xpu + // -libs=xdnn' + char* target = std::getenv("XPU_DEVICE_TARGET"); + if (target) { + target_ = std::string(target); + } + } // Build the XPU graph to the XPU runtime, return the XPU runtime which can be // used to run inference. @@ -39,10 +51,12 @@ class Device { xtcl::network::xTensorCompiler::ParamNDArrayMap* params, std::vector* outputs); + const std::string name() const { return name_; } + const std::string target() const { return target_; } + private: - // Keep reserved fields - int device_id_{0}; - std::string device_name_{"llvm"}; + std::string name_{""}; + std::string target_{""}; }; } // namespace xpu diff --git a/lite/backends/xpu/runtime.cc b/lite/backends/xpu/runtime.cc deleted file mode 100644 index a2c34b9575..0000000000 --- a/lite/backends/xpu/runtime.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/xpu/runtime.h" -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace xpu { - -// Extract the model data and recover the XPU model for inference, the function -// is called by the graph computing kernel when the graph op is executed. -// Due to the lack of XPU APIs for loading and recovering the XPU model from -// memory, the key name is obtained from the weight tensor of graph op, to get -// the runtime object for inference from the global variable 'DeviceInfo'. -// TODO(hong19860320) Recover the XPU model from the weight tensor of graph op. -bool LoadModel(const lite::Tensor &model, - std::shared_ptr *runtime) { - LOG(INFO) << "[XPU] Load Model."; - CHECK_GT(model.dims().production(), 0); - std::string name(reinterpret_cast(model.data())); - LOG(INFO) << "[XPU] Model Name: " << name; - CHECK(runtime != nullptr); - *runtime = DeviceInfo::Global().Find(name); - if (*runtime == nullptr) { - LOG(WARNING) << "[XPU] Load Model failed!"; - return false; - } - return true; -} - -} // namespace xpu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/xpu/runtime.h b/lite/backends/xpu/runtime.h deleted file mode 100644 index 4ff8d75bce..0000000000 --- a/lite/backends/xpu/runtime.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace xpu { - -class DeviceInfo { - public: - static DeviceInfo& Global() { - static DeviceInfo x; - return x; - } - DeviceInfo() {} - - void Insert(const std::string& name, - std::shared_ptr runtime) { - if (runtimes_.find(name) != runtimes_.end()) { - LOG(WARNING) << "[XPU] Model " << name << " already exists."; - return; - } - runtimes_.emplace(std::make_pair(name, runtime)); - } - - void Clear() { runtimes_.clear(); } - - std::shared_ptr Find( - const std::string& name) const { - if (runtimes_.find(name) != runtimes_.end()) { - return runtimes_.at(name); - } else { - return nullptr; - } - } - - private: - int device_id_{0}; - std::string device_name_{"default"}; - std::unordered_map> - runtimes_; -}; - -bool LoadModel(const lite::Tensor& model, - std::shared_ptr* runtime); - -} // namespace xpu -} // namespace lite -} // namespace paddle diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index 57f353c0ee..1d0558451f 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -96,7 +96,15 @@ add_custom_command( add_custom_target(op_list_h DEPENDS ops.h) add_custom_target(kernel_list_h DEPENDS kernels.h) add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc) - +# create headfile to restore ops info sorted by suppported platforms +add_custom_command( + COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/record_supported_kernel_op.py + ${kernels_src_list} + ${ops_src_list} + ${CMAKE_BINARY_DIR}/supported_kernel_op_info.h + OUTPUT supported_kernel_op_info.h # not a real path to the output to force it execute every time. + ) + add_custom_target(supported_kernel_op_info_h DEPENDS supported_kernel_op_info.h) #----------------------------------------------- NOT CHANGE ----------------------------------------------- lite_cc_library(kernel SRCS kernel.cc DEPS context type_system target_wrapper any op_params tensor diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt index d379b31b84..1c85353d53 100644 --- a/lite/core/arena/CMakeLists.txt +++ b/lite/core/arena/CMakeLists.txt @@ -6,5 +6,5 @@ endif() lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${npu_kernels} ${bm_kernels} ${xpu_kernels} ${x86_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/core/framework.proto b/lite/core/framework.proto index 5adf2a18b9..84b5502ff7 100644 --- a/lite/core/framework.proto +++ b/lite/core/framework.proto @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ syntax = "proto2"; -option optimize_for = LITE_RUNTIME; package paddle.framework.proto; // Any incompatible changes to ProgramDesc and its dependencies should diff --git a/lite/core/kernel.h b/lite/core/kernel.h index 86193235a2..18a1243c11 100644 --- a/lite/core/kernel.h +++ b/lite/core/kernel.h @@ -83,14 +83,11 @@ class KernelBase { #if defined(LITE_WITH_CUDA) WorkSpace::Global_CUDA().AllocReset(); #endif - #ifdef LITE_WITH_PROFILE - CHECK(profiler_) << "Profiler pointer of kernel can not be nullptr. " - "When LITE_WITH_PROFILE is defined, please set a " - "Profiler for Instruction."; - profiler_->StartTiming(profile_id_, ctx_.get()); + profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get()); + profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); Run(); - profiler_->StopTiming(profile_id_, ctx_.get()); + profiler_->StopTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); #else Run(); #endif diff --git a/lite/core/mir/elimination/elementwise_mul_constant_eliminate_pass.cc b/lite/core/mir/elimination/elementwise_mul_constant_eliminate_pass.cc old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc index 97f6a2657f..8447865bdc 100644 --- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc @@ -35,5 +35,7 @@ void ElementwiseAddActivationFusePass::Apply( REGISTER_MIR_PASS(lite_elementwise_add_activation_fuse_pass, paddle::lite::mir::ElementwiseAddActivationFusePass) .BindTargets({TARGET(kAny)}) - .ExcludeTargets({TARGET(kXPU), TARGET(kBM)}) + .ExcludeTargets({TARGET(kXPU)}) + .ExcludeTargets({TARGET(kBM)}) + .ExcludeTargets({TARGET(kX86)}) .BindKernel("fusion_elementwise_add_activation"); diff --git a/lite/core/mir/fusion/fc_fuse_pass.cc b/lite/core/mir/fusion/fc_fuse_pass.cc index 5b8e8563ba..c85d34cbae 100644 --- a/lite/core/mir/fusion/fc_fuse_pass.cc +++ b/lite/core/mir/fusion/fc_fuse_pass.cc @@ -23,8 +23,13 @@ namespace lite { namespace mir { void FcFusePass::Apply(const std::unique_ptr& graph) { - fusion::FcFuser fuser; +#ifdef LITE_WITH_X86 + fusion::FcFuser fuser(true); fuser(graph.get()); +#endif + + fusion::FcFuser fuser2(false); + fuser2(graph.get()); } } // namespace mir @@ -33,5 +38,7 @@ void FcFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass) .BindTargets({TARGET(kAny)}) - .ExcludeTargets({TARGET(kXPU), TARGET(kBM)}) + .ExcludeTargets({TARGET(kXPU)}) + .ExcludeTargets({TARGET(kBM)}) + .ExcludeTargets({TARGET(kCUDA)}) .BindKernel("fc"); diff --git a/lite/core/mir/fusion/fc_fuse_pass_test.cc b/lite/core/mir/fusion/fc_fuse_pass_test.cc index f7aa4bb5ad..54260732c5 100644 --- a/lite/core/mir/fusion/fc_fuse_pass_test.cc +++ b/lite/core/mir/fusion/fc_fuse_pass_test.cc @@ -88,6 +88,7 @@ USE_LITE_OP(mul); USE_LITE_OP(elementwise_add); USE_LITE_OP(elementwise_sub); USE_LITE_OP(fc); +USE_LITE_OP(relu); USE_LITE_OP(feed); USE_LITE_OP(fetch); USE_LITE_OP(io_copy); diff --git a/lite/core/mir/fusion/fc_fuser.cc b/lite/core/mir/fusion/fc_fuser.cc index 460c0fdf7a..3c99131083 100644 --- a/lite/core/mir/fusion/fc_fuser.cc +++ b/lite/core/mir/fusion/fc_fuser.cc @@ -35,12 +35,23 @@ void FcFuser::BuildPattern() { std::vector mul_inputs{W, x}; std::vector add_inputs{mul_out, b}; mul_inputs >> *mul >> *mul_out; - add_inputs >> *add >> *Out; // Some op specialities. 
mul_out->AsIntermediate(); mul->AsIntermediate(); add->AsIntermediate(); + + if (with_relu_) { + auto* add_out = VarNode("add_out"); + auto* relu = OpNode("relu", "relu"); + std::vector relu_inputs{add_out}; + add_inputs >> *add >> *add_out; + relu_inputs >> *relu >> *Out; + add_out->AsIntermediate(); + relu->AsIntermediate(); + } else { + add_inputs >> *add >> *Out; + } } void FcFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { @@ -71,6 +82,9 @@ cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) { op_desc.SetAttr( "in_num_col_dims", matched.at("mul")->stmt()->op_info()->GetAttr("x_num_col_dims")); + if (with_relu_) { + op_desc.SetAttr("activation_type", std::string{"relu"}); + } return op_desc; } diff --git a/lite/core/mir/fusion/fc_fuser.h b/lite/core/mir/fusion/fc_fuser.h index 7ba0752789..6cb08f4157 100644 --- a/lite/core/mir/fusion/fc_fuser.h +++ b/lite/core/mir/fusion/fc_fuser.h @@ -25,11 +25,13 @@ namespace fusion { class FcFuser : public FuseBase { public: + explicit FcFuser(bool with_relu) : with_relu_(with_relu) {} void BuildPattern() override; void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; private: cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; + bool with_relu_; }; } // namespace fusion diff --git a/lite/core/mir/fusion/sequence_pool_concat_fuse_pass.cc b/lite/core/mir/fusion/sequence_pool_concat_fuse_pass.cc old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/sequence_pool_concat_fuse_pass.h b/lite/core/mir/fusion/sequence_pool_concat_fuse_pass.h old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/sequence_pool_concat_fuser.cc b/lite/core/mir/fusion/sequence_pool_concat_fuser.cc old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/sequence_pool_concat_fuser.h b/lite/core/mir/fusion/sequence_pool_concat_fuser.h old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.cc b/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.cc old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.h b/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.h old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/var_conv_2d_activation_fuser.cc b/lite/core/mir/fusion/var_conv_2d_activation_fuser.cc old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/var_conv_2d_activation_fuser.h b/lite/core/mir/fusion/var_conv_2d_activation_fuser.h old mode 100755 new mode 100644 diff --git a/lite/core/mir/generate_program_pass.cc b/lite/core/mir/generate_program_pass.cc index 9ad69b8152..76c97d2da6 100644 --- a/lite/core/mir/generate_program_pass.cc +++ b/lite/core/mir/generate_program_pass.cc @@ -29,7 +29,6 @@ void GenerateProgramPass::Apply(const std::unique_ptr& graph) { if (item->IsStmt()) { auto& stmt = item->AsStmt(); VLOG(4) << stmt; - LOG(INFO) << stmt; insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); } } diff --git a/lite/core/mir/subgraph/CMakeLists.txt b/lite/core/mir/subgraph/CMakeLists.txt index 1ac4ab346f..f8aa09676c 100644 --- a/lite/core/mir/subgraph/CMakeLists.txt +++ b/lite/core/mir/subgraph/CMakeLists.txt @@ -4,7 +4,7 @@ lite_cc_library(subgraph_detector lite_cc_library(subgraph_pass SRCS subgraph_pass.cc DEPS mir_pass types context ${mir_fusers} subgraph_detector) -if (WITH_TESTING) +if (WITH_TESTING AND NOT LITE_WITH_CUDA) lite_cc_test(test_subgraph_detector SRCS subgraph_detector_test.cc DEPS subgraph_detector mir_passes gflags model_parser cxx_api diff 
--git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc old mode 100755 new mode 100644 index bf04d5c2ef..6d48b053a1 --- a/lite/core/mir/subgraph/subgraph_detector.cc +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -94,7 +94,7 @@ std::string SubgraphVisualizer::operator()() { } auto res = dot.Build(); - //std::cout << "subgraphs: " << subgraphs_.size() << "\n" << res << std::endl; + std::cout << "subgraphs: " << subgraphs_.size() << "\n" << res << std::endl; return res; } diff --git a/lite/core/mir/subgraph/subgraph_detector.h b/lite/core/mir/subgraph/subgraph_detector.h old mode 100755 new mode 100644 diff --git a/lite/core/mir/subgraph/subgraph_detector_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc old mode 100755 new mode 100644 diff --git a/lite/core/mir/subgraph/subgraph_pass.cc b/lite/core/mir/subgraph/subgraph_pass.cc old mode 100755 new mode 100644 index af5bcdee08..116b361681 --- a/lite/core/mir/subgraph/subgraph_pass.cc +++ b/lite/core/mir/subgraph/subgraph_pass.cc @@ -27,7 +27,7 @@ namespace mir { void NPUSubgraphPass::Apply(const std::unique_ptr& graph) { std::unordered_set supported_lists; -#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported_lists.insert(#op_type); +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); #include "lite/kernels/npu/bridges/paddle_use_bridges.h" #undef USE_SUBGRAPH_BRIDGE auto teller = [&](Node* node) { @@ -41,7 +41,7 @@ void NPUSubgraphPass::Apply(const std::unique_ptr& graph) { void XPUSubgraphPass::Apply(const std::unique_ptr& graph) { std::unordered_set supported_lists; -#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported_lists.insert(#op_type); +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); #include "lite/kernels/xpu/bridges/paddle_use_bridges.h" #undef USE_SUBGRAPH_BRIDGE auto teller = [&](Node* node) { @@ -55,7 +55,7 @@ void XPUSubgraphPass::Apply(const std::unique_ptr& graph) { void BMSubgraphPass::Apply(const std::unique_ptr& graph) { std::unordered_set supported_lists; -#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported_lists.insert(#op_type); +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); #include "lite/kernels/bm/bridges/paddle_use_bridges.h" #undef USE_SUBGRAPH_BRIDGE auto teller = [&](Node* node) { diff --git a/lite/core/mir/subgraph/subgraph_pass.h b/lite/core/mir/subgraph/subgraph_pass.h old mode 100755 new mode 100644 diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc old mode 100755 new mode 100644 index 0d5fc7bf5e..a56c364f97 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -92,7 +92,7 @@ void FillInputTensors( #define FILL_TENSOR_WITH_TYPE(type) \ auto input_tensor_data = input_tensor->mutable_data(); \ for (int j = 0; j < input_tensor_size; j++) { \ - input_tensor_data[i] = static_cast(value); \ + input_tensor_data[j] = static_cast(value); \ } for (int i = 0; i < input_tensor_shape.size(); i++) { auto input_tensor = predictor->GetInput(i); diff --git a/lite/core/profile/profiler.cc b/lite/core/profile/profiler.cc index 78317f78ac..f4d0e3c0af 100644 --- a/lite/core/profile/profiler.cc +++ b/lite/core/profile/profiler.cc @@ -28,36 +28,55 @@ auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) { }; } -int Profiler::NewTimer(const OpCharacter& ch) { - StatisUnit unit; - unit.character = ch; +std::map TypeStr{ + {Type::kUnk, "Unknown"}, + 
{Type::kCreate, "Create"}, + {Type::kDispatch, "Dispatch"}, +}; + +StatisUnit::StatisUnit(const OpCharacter& ch) : character(ch) { + create_t.reset(new DeviceTimer()); if (ch.target == TargetType::kCUDA) { #ifdef LITE_WITH_CUDA - unit.timer.reset(new DeviceTimer()); + dispatch_t.reset(new DeviceTimer()); #else LOG(ERROR) << "The timer type specified as cuda is uninitialized, so the " "default x86 timer is used instead."; #endif } else { - unit.timer.reset(new DeviceTimer()); + dispatch_t.reset(new DeviceTimer()); } +} + +lite::profile::Timer* StatisUnit::Timer(Type type) { + if (type == Type::kCreate) { + return create_t.get(); + } else if (type == Type::kDispatch) { + return dispatch_t.get(); + } + LOG(FATAL) << "Timer cannot be returned for unknown platforms."; + return nullptr; +} + +int Profiler::NewTimer(const OpCharacter& ch) { + StatisUnit unit(ch); units_.push_back(std::move(unit)); return units_.size() - 1; } -void Profiler::StartTiming(const int index, KernelContext* ctx) { +void Profiler::StartTiming(Type type, const int index, KernelContext* ctx) { CHECK_LT(index, units_.size()) << "The timer index in the profiler is out of range."; - units_[index].timer->Start(ctx); + units_[index].Timer(type)->Start(ctx); } -float Profiler::StopTiming(const int index, KernelContext* ctx) { +float Profiler::StopTiming(Type type, const int index, KernelContext* ctx) { CHECK_LT(index, units_.size()) << "The timer index in the profiler is out of range."; - return units_[index].timer->Stop(ctx); + return units_[index].Timer(type)->Stop(ctx); } -std::string Profiler::Summary(bool concise, size_t w) { +std::string Profiler::Summary(Type type, bool concise, size_t w) { using std::setw; using std::left; using std::fixed; @@ -65,12 +84,14 @@ std::string Profiler::Summary(bool concise, size_t w) { std::string title; // Title. 
if (concise) { - ss << "Timing cycle = " << units_.front().timer->LapTimes().Size() + ss << "Timing cycle = " << units_.front().Timer(type)->LapTimes().Size() << std::endl; - ss << "===== Concise Profiler Summary: " << name_ << ", Exclude " << w + ss << "===== Concise " << TypeStr.find(type)->second + << " Profiler Summary: " << name_ << ", Exclude " << w << " warm-ups =====" << std::endl; } else { - ss << "===== Detailed Profiler Summary: " << name_ << ", Exclude " << w + ss << "===== Detailed " << TypeStr.find(type)->second + << " Profiler Summary: " << name_ << ", Exclude " << w << " warm-ups =====" << std::endl; } ss << setw(25) << left << "Operator Type" @@ -84,16 +105,16 @@ std::string Profiler::Summary(bool concise, size_t w) { if (concise) { std::map summary(op_comp); for (auto& unit : units_) { - auto ch = summary.find(unit.character); + auto ch = summary.find(unit.Character()); if (ch != summary.end()) { - ch->second.avg += unit.timer->LapTimes().Avg(w); - ch->second.min += unit.timer->LapTimes().Min(w); - ch->second.max += unit.timer->LapTimes().Max(w); + ch->second.avg += unit.Timer(type)->LapTimes().Avg(w); + ch->second.min += unit.Timer(type)->LapTimes().Min(w); + ch->second.max += unit.Timer(type)->LapTimes().Max(w); } else { - TimeInfo info({unit.timer->LapTimes().Avg(w), - unit.timer->LapTimes().Min(w), - unit.timer->LapTimes().Max(w)}); - summary.insert({unit.character, info}); + TimeInfo info({unit.Timer(type)->LapTimes().Avg(w), + unit.Timer(type)->LapTimes().Min(w), + unit.Timer(type)->LapTimes().Max(w)}); + summary.insert({unit.Character(), info}); } } for (const auto& item : summary) { @@ -109,14 +130,15 @@ std::string Profiler::Summary(bool concise, size_t w) { } } else { for (auto& unit : units_) { + const auto& times = unit.Timer(type)->LapTimes(); // clang-format off - ss << setw(25) << left << fixed << unit.character.op_type \ - << " " << setw(40) << left << fixed << unit.character.kernel_name \ - << " " << setw(12) << left << fixed << unit.character.remark \ - << " " << setw(12) << left << fixed << unit.timer->LapTimes().Avg(w) \ - << " " << setw(12) << left << fixed << unit.timer->LapTimes().Min(w) \ - << " " << setw(12) << left << fixed << unit.timer->LapTimes().Max(w) \ - << " " << setw(12) << left << fixed << unit.timer->LapTimes().Last(w) \ + ss << setw(25) << left << fixed << unit.Character().op_type \ + << " " << setw(40) << left << fixed << unit.Character().kernel_name \ + << " " << setw(12) << left << fixed << unit.Character().remark \ + << " " << setw(12) << left << fixed << times.Avg(w) \ + << " " << setw(12) << left << fixed << times.Min(w) \ + << " " << setw(12) << left << fixed << times.Max(w) \ + << " " << setw(12) << left << fixed << times.Last(w) \ << std::endl; // clang-format on } diff --git a/lite/core/profile/profiler.h b/lite/core/profile/profiler.h index 4e9e9ae31c..3933e5ba01 100644 --- a/lite/core/profile/profiler.h +++ b/lite/core/profile/profiler.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once +#include #include #include #include @@ -22,6 +23,14 @@ namespace paddle { namespace lite { namespace profile { +enum class Type { + kUnk = 0, + kCreate, + kDispatch, +}; + +extern std::map TypeStr; + struct TimeInfo { float avg; float min; @@ -35,8 +44,15 @@ struct OpCharacter { std::string remark{std::string("N/A")}; }; -struct StatisUnit { - std::unique_ptr timer; +class StatisUnit final { + public: + explicit StatisUnit(const OpCharacter& ch); + lite::profile::Timer* Timer(Type type); + const OpCharacter& Character() const { return character; } + + protected: + std::unique_ptr create_t; + std::unique_ptr dispatch_t; OpCharacter character; }; @@ -45,9 +61,9 @@ class Profiler final { Profiler() = default; explicit Profiler(const std::string& name) : name_(name) {} int NewTimer(const OpCharacter& ch); - void StartTiming(const int index, KernelContext* ctx); - float StopTiming(const int index, KernelContext* ctx); - std::string Summary(bool concise = true, size_t warm_up = 10); + void StartTiming(Type type, const int index, KernelContext* ctx); + float StopTiming(Type type, const int index, KernelContext* ctx); + std::string Summary(Type type, bool concise = true, size_t warm_up = 10); private: std::string name_{std::string("N/A")}; diff --git a/lite/core/profile/test_timer.cc b/lite/core/profile/test_timer.cc index 6f49698ef4..3841f01518 100644 --- a/lite/core/profile/test_timer.cc +++ b/lite/core/profile/test_timer.cc @@ -69,10 +69,10 @@ TEST(profiler, real_latency) { ch.op_type = "operator/1"; ch.kernel_name = "kernel/1"; int idx = profiler.NewTimer(ch); - profiler.StartTiming(idx, &ctx); + profiler.StartTiming(Type::kDispatch, idx, &ctx); std::this_thread::sleep_for(std::chrono::milliseconds(10)); - profiler.StopTiming(idx, &ctx); - std::cout << profiler.Summary(); + profiler.StopTiming(Type::kDispatch, idx, &ctx); + std::cout << profiler.Summary(Type::kDispatch); } #endif diff --git a/lite/core/program.cc b/lite/core/program.cc index 8dc8fb0ddd..41d178f015 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -137,8 +137,7 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { void RuntimeProgram::Run() { for (auto& inst : instructions_) { - std::string op_type = inst.op()->op_info()->Type(); - if (op_type == "feed" || op_type == "fetch") continue; + if (inst.is_feed_fetch_op()) continue; inst.Run(); #ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE @@ -147,7 +146,7 @@ void RuntimeProgram::Run() { #endif // LITE_WITH_PROFILE } #ifdef LITE_WITH_PROFILE - LOG(INFO) << "\n" << profiler_.Summary(false, 0); + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0); #endif // LITE_WITH_PROFILE } @@ -252,8 +251,16 @@ void Program::PrepareWorkspace(const cpp::ProgramDesc& prog) { } void Instruction::Run() { +#ifdef LITE_WITH_PROFILE + CHECK(profiler_) << "Profiler pointer of kernel can not be nullptr. 
" + "When LITE_WITH_PROFILE is defined, please set a " + "Profiler for Instruction."; + profiler_->StartTiming( + profile::Type::kCreate, profile_id_, kernel_->mutable_context()); +#endif CHECK(op_) << "op null"; CHECK(kernel_) << "kernel null"; + if (first_epoch_) { first_epoch_ = false; CHECK(op_->CheckShape()); @@ -263,10 +270,7 @@ void Instruction::Run() { return; } - // VLOG(4) << "kernel launch"; op_->InferShape(); - // VLOG(4) << ">> Running kernel: " << op_->op_info()->Repr() << " on Target " - // << TargetToStr(kernel_->target()); kernel_->Launch(); has_run_ = true; } diff --git a/lite/core/program.h b/lite/core/program.h index 291252619b..c845a17c52 100644 --- a/lite/core/program.h +++ b/lite/core/program.h @@ -90,7 +90,12 @@ struct Program { struct Instruction { Instruction(const std::shared_ptr& op, std::unique_ptr&& kernel) - : op_(op), kernel_(std::move(kernel)) {} + : op_(op), kernel_(std::move(kernel)) { + std::string op_type = op->Type(); + if (op_type == "feed" || op_type == "fetch") { + is_feed_fetch_op_ = true; + } + } // Run the instruction. void Run(); @@ -101,6 +106,8 @@ struct Instruction { const KernelBase* kernel() const { return kernel_.get(); } KernelBase* mutable_kernel() { return kernel_.get(); } + bool is_feed_fetch_op() const { return is_feed_fetch_op_; } + #ifdef LITE_WITH_PROFILE void set_profiler(profile::Profiler* profiler) { profiler_ = profiler; @@ -118,6 +125,7 @@ struct Instruction { private: std::shared_ptr op_; std::unique_ptr kernel_; + bool is_feed_fetch_op_{false}; bool first_epoch_{true}; bool has_run_{false}; @@ -143,7 +151,8 @@ class LITE_API RuntimeProgram { } ~RuntimeProgram() { #ifdef LITE_WITH_PROFILE - LOG(INFO) << "\n" << profiler_.Summary(); + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kCreate); + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch); #endif // LITE_WITH_PROFILE } diff --git a/lite/core/tensor.h b/lite/core/tensor.h index de08aa82f3..41a2d16f75 100644 --- a/lite/core/tensor.h +++ b/lite/core/tensor.h @@ -139,6 +139,22 @@ class TensorLite { // For other devices, T and R may be the same type. template R *mutable_data() { + auto type_id = typeid(T).hash_code(); + if (type_id == typeid(bool).hash_code()) { // NOLINT + precision_ = PrecisionType::kBool; + } else if (type_id == typeid(float).hash_code()) { // NOLINT + precision_ = PrecisionType::kFloat; + } else if (type_id == typeid(int8_t).hash_code()) { + precision_ = PrecisionType::kInt8; + } else if (type_id == typeid(int16_t).hash_code()) { + precision_ = PrecisionType::kInt16; + } else if (type_id == typeid(int32_t).hash_code()) { + precision_ = PrecisionType::kInt32; + } else if (type_id == typeid(int64_t).hash_code()) { + precision_ = PrecisionType::kInt64; + } else { + precision_ = PrecisionType::kUnk; + } memory_size_ = dims_.production() * sizeof(T); buffer_->ResetLazy(target_, memory_size_); return reinterpret_cast(static_cast(buffer_->data()) + @@ -163,10 +179,7 @@ class TensorLite { template R *mutable_data(TargetType target) { target_ = target; - memory_size_ = dims_.production() * sizeof(T); - buffer_->ResetLazy(target, memory_size()); - return reinterpret_cast(static_cast(buffer_->data()) + - offset_); + return mutable_data(); } void *mutable_data(size_t memory_size); void *mutable_data(TargetType target, size_t memory_size); diff --git a/lite/demo/cxx/README.md b/lite/demo/cxx/README.md index 5e0ec49add..3217a7ed49 100644 --- a/lite/demo/cxx/README.md +++ b/lite/demo/cxx/README.md @@ -1,91 +1,111 @@ # C++ Demo -1. 
使用`lite/tools/Dockerfile.mobile`生成docker镜像 -2. 运行并进入docker镜像环境,执行`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/v2.1.0/inference_lite_lib.android.armv8.tar.gz `下载所需demo环境。(armv7 demo可使用命令`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/v2.1.0/inference_lite_lib.android.armv7.tar.gz` 进行下载)。 -3. 解压下载文件`tar zxvf inference_lite_lib.android.armv8.tar.gz ` -4. 执行以下命令准备模拟器环境 -```shell -# armv8 -adb kill-server -adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done -echo n | avdmanager create avd -f -n paddle-armv8 -k "system-images;android-24;google_apis;arm64-v8a" -echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -port 5554 & -sleep 1m -``` -```shell -# armv7 -adb kill-server -adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done -echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a" -echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -port 5554 & -sleep 1m -``` -5. 准备模型、编译并运行完整api的demo +1. 环境准备 + - 保证Android NDK在/opt目录下 + - 一台armv7或armv8架构的安卓手机 +2. 编译并运行全量api的demo(注:当编译模式为tiny_pubish时将不存在该demo) ```shell cd inference_lite_lib.android.armv8/demo/cxx/mobile_full wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz tar zxvf mobilenet_v1.tar.gz make -adb -s emulator-5554 push mobilenet_v1 /data/local/tmp/ -adb -s emulator-5554 push mobilenetv1_full_api /data/local/tmp/ -adb -s emulator-5554 push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ -adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_full_api -adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +adb push mobilenet_v1 /data/local/tmp/ +adb push mobilenetv1_full_api /data/local/tmp/ +adb push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/mobilenetv1_full_api +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt" ``` 运行成功将在控制台输出预测结果的前10个类别的预测概率 -6. 编译并运行轻量级api的demo +3. 编译并运行轻量级api的demo ```shell cd ../mobile_light make -adb -s emulator-5554 push mobilenetv1_light_api /data/local/tmp/ -adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ -adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_light_api -adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +adb push mobilenetv1_light_api /data/local/tmp/ +adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/mobilenetv1_light_api +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/mobilenetv1_light_api /data/local/tmp/mobilenet_v1.opt" ``` +运行成功将在控制台输出预测结果的前10个类别的预测概率 -7. 编译并运行目标检测的demo +4. 
编译并运行ssd目标检测的demo ```shell -cd ../mobile_detection +cd ../ssd_detection wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-ssd.tar.gz tar zxvf mobilenetv1-ssd.tar.gz make -adb -s emulator-5554 push mobile_detection /data/local/tmp/ -adb -s emulator-5554 push test.jpg /data/local/tmp/ -adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ -adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_detection -adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && -/data/local/tmp/mobile_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg" -adb -s emulator-5554 pull /data/local/tmp/test_detection_result.jpg ./ +adb push ssd_detection /data/local/tmp/ +adb push test.jpg /data/local/tmp/ +adb push mobilenetv1-ssd /data/local/tmp +adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/ssd_detection +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/ssd_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg" +adb pull /data/local/tmp/test_ssd_detection_result.jpg ./ ``` -运行成功将在mobile_detection目录下看到生成的目标检测结果图像: test_detection_result.jpg +运行成功将在ssd_detection目录下看到生成的目标检测结果图像: test_ssd_detection_result.jpg -8. 编译并运行物体分类的demo +5. 编译并运行yolov3目标检测的demo +```shell +cd ../yolov3_detection +wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-yolov3.tar.gz +tar zxvf mobilenetv1-yolov3.tar.gz +make +adb push yolov3_detection /data/local/tmp/ +adb push test.jpg /data/local/tmp/ +adb push mobilenetv1-yolov3 /data/local/tmp +adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/yolov3_detection +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/yolov3_detection /data/local/tmp/mobilenetv1-yolov3 /data/local/tmp/test.jpg" +adb pull /data/local/tmp/test_yolov3_detection_result.jpg ./ +``` +运行成功将在yolov3_detection目录下看到生成的目标检测结果图像: test_yolov3_detection_result.jpg + +6. 
编译并运行物体分类的demo ```shell cd ../mobile_classify wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz tar zxvf mobilenet_v1.tar.gz +./model_optimize_tool optimize model make + adb -s emulator-5554 push mobile_classify /data/local/tmp/ adb -s emulator-5554 push test.jpg /data/local/tmp/ adb -s emulator-5554 push labels.txt /data/local/tmp/ adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_classify adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && -/data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt" +/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt" ``` 运行成功将在控制台输出预测结果的前5个类别的预测概率 - 如若想看前10个类别的预测概率,在运行命令输入topk的值即可 eg: ```shell adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && - /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10" + /data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10" ``` - 如若想看其他模型的分类结果, 在运行命令输入model_dir 及其model的输入大小即可 eg: ```shell adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && - /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224" + /data/local/tmp/mobile_classify /data/local/tmp/mobilenetv2opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224" ``` +9. 编译含CV预处理库模型单测demo +```shell +cd ../test_cv +wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz +tar zxvf mobilenet_v1.tar.gz +./model_optimize_tool optimize model +make +adb -s emulator-5554 push test_model_cv /data/local/tmp/ +adb -s emulator-5554 push test.jpg /data/local/tmp/ +adb -s emulator-5554 push labels.txt /data/local/tmp/ +adb -s emulator-5554 push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb -s emulator-5554 shell chmod +x /data/local/tmp/test_model_cv +adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/test_model_cv /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt" +``` +运行成功将在控制台输出预测结果的前10个类别的预测概率 diff --git a/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7 old mode 100755 new mode 100644 diff --git a/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8 old mode 100755 new mode 100644 diff --git a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 b/lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv7 similarity index 90% rename from lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 rename to lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv7 index 784ad73da4..05f1c2e276 100644 --- a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 +++ b/lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv7 @@ -40,11 +40,11 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SY #CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) -mobile_detection: fetch_opencv mobile_detection.o - $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_detection.o -o mobile_detection 
$(CXX_LIBS) $(LDFLAGS) +ssd_detection: fetch_opencv ssd_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) ssd_detection.o -o ssd_detection $(CXX_LIBS) $(LDFLAGS) -mobile_detection.o: mobile_detection.cc - $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_detection.o -c mobile_detection.cc +ssd_detection.o: ssd_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o ssd_detection.o -c ssd_detection.cc fetch_opencv: @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} @@ -57,5 +57,5 @@ fetch_opencv: .PHONY: clean clean: - rm -f mobile_detection.o - rm -f mobile_detection + rm -f ssd_detection.o + rm -f ssd_detection diff --git a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 b/lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv8 similarity index 89% rename from lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 rename to lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv8 index 2304b38eff..77ff07df95 100644 --- a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 +++ b/lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv8 @@ -40,11 +40,11 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SY #CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) -mobile_detection: fetch_opencv mobile_detection.o - $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_detection.o -o mobile_detection $(CXX_LIBS) $(LDFLAGS) +ssd_detection: fetch_opencv ssd_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) ssd_detection.o -o ssd_detection $(CXX_LIBS) $(LDFLAGS) -mobile_detection.o: mobile_detection.cc - $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_detection.o -c mobile_detection.cc +ssd_detection.o: ssd_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o ssd_detection.o -c ssd_detection.cc fetch_opencv: @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} @@ -57,5 +57,5 @@ fetch_opencv: .PHONY: clean clean: - rm -f mobile_detection.o - rm -f mobile_detection + rm -f ssd_detection.o + rm -f ssd_detection diff --git a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 new file mode 100644 index 0000000000..d659a316cd --- /dev/null +++ b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 @@ -0,0 +1,71 @@ +ARM_ABI = arm7 +LITE_WITH_CV = ON +export ARM_ABI +export LITE_WITH_CV + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \ + 
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +test_model_cv: fetch_opencv test_model_cv.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) + +test_model_cv.o: test_model_cv.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc + +test_img_prepross: fetch_opencv test_img_prepross.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS) + +test_img_prepross.o: test_img_prepross.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f test_model_cv.o + rm -f test_model_cv + rm -f test_img_prepross.o + rm -f test_img_prepross diff --git a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 new file mode 100644 index 0000000000..c80b07d5c0 --- /dev/null +++ b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 @@ -0,0 +1,70 @@ +ARM_ABI = arm8 +LITE_WITH_CV = ON +export ARM_ABI +export LITE_WITH_CV + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = 
-I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +test_model_cv: fetch_opencv test_model_cv.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) + +test_model_cv.o: test_model_cv.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc + +test_img_prepross: fetch_opencv test_img_prepross.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS) + +test_img_prepross.o: test_img_prepross.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f test_model_cv.o + rm -f test_model_cv + rm -f test_img_prepross.o + rm -f test_img_prepross diff --git a/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv7 b/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv7 new file mode 100644 index 0000000000..b584f56235 --- /dev/null +++ b/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv7 @@ -0,0 +1,61 @@ +ARM_ABI = arm7 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include 
+ +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +yolov3_detection: fetch_opencv yolov3_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_detection.o -o yolov3_detection $(CXX_LIBS) $(LDFLAGS) + +yolov3_detection.o: yolov3_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_detection.o -c yolov3_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f yolov3_detection.o + rm -f yolov3_detection diff --git a/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv8 b/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv8 new file mode 100644 index 0000000000..2777981701 --- /dev/null +++ b/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv8 @@ -0,0 +1,61 @@ +ARM_ABI = arm8 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +yolov3_detection: fetch_opencv yolov3_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_detection.o -o yolov3_detection $(CXX_LIBS) $(LDFLAGS) + +yolov3_detection.o: yolov3_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_detection.o -c yolov3_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f yolov3_detection.o + rm -f yolov3_detection diff --git a/lite/demo/cxx/mobile_classify/mobile_classify.cc b/lite/demo/cxx/mobile_classify/mobile_classify.cc old mode 100755 new mode 100644 index c651bf9f4c..d0cf59e185 --- a/lite/demo/cxx/mobile_classify/mobile_classify.cc +++ b/lite/demo/cxx/mobile_classify/mobile_classify.cc @@ -117,7 +117,7 @@ void pre_process(const cv::Mat& img, float* means, float* scales) { cv::Mat rgb_img; - // cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f); cv::Mat imgf; rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); diff --git a/lite/demo/cxx/mobile_detection/test.jpg b/lite/demo/cxx/mobile_detection/test.jpg deleted file mode 100644 index 6bb36e136deec6088c7b75215fc35d6231283673..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 127499 zcmb4qXHZkm7j}S95_*?vfKWmcklv(2LTCv!6sbaJQUVHC=mJth=pB+kLT?Hv3P_EG z5Q7#kbQs#?I!4Q+Icj1B%b2m_d%or{x8oQFr;KtWo;;Qzb)8v*dL0bemSGcia5 z7Hgos2w-3WGPAIwnXYNcU$9wjkk&)@F`oB61jJ#6nOnj#NKqmod4ZmnJ=6|aI zj{j-!GVuZ~0p3#z(i33GJvn8Yk_g9c9H^0Fe6;Gno#vAzNkt~sH@bSuA3n7tqclBO z2(GtJ8|R%<51F%PKg7u>4Hs|$ZFtp-ij3=T5M&fG9TOeC)Se=V?^FUyPb44Z`1!P5 z+^UOTnEVNse{pb0%OiE)QH2du{?%RN3B2I3rI8NE1vV|~?w4AkHEEbNR8xRgrLdY6 zb>1anE=Sn(QC^`Zf;Ffm{nl}oa5YzTQYp7tW0-{OYcR)=(pRdUE(s^D)9Iqf%Mmli~KN|J)4$E&!jtb27yQW!mG9*l!SJH4LI;p9L>M+lOE@x>L?cj{f%RZNm^q@3u*H$#fUgKR!UkB%53Q(mO2dIhQwz^4e&*$@W47RN zpgu-uC(7DOb)Hh+uTVY)FS#Y8NX&Ea5%8$=VwEbw#+XeqftN8`(+rZP>%fS2agps!TSp|ByY$xTMTSH+3!Hp`Iy_%SDFrj z)89TRPVT>yp`QMU{@Y2|S|-zJzP$Vt@>k$);gD5`d4(dm$;v%E-*GC4Ene^rT1+Jg zHayQFC04-!M@rb~MkiB^N-?WLj(g5N&zeDXF3D7vkgv-9KF_HAupP!h#H17O913L! zQ$k&Y$Ti#RIu{Tv3wcP=JiH+DdMOYa-GT~@a->M}6v_hh;L;{$UaYV*vV!dt8#@hf zw*eGgO|CG@n}7Uku(fmH^ky&I~rq27cW*f#S)2IP0?2PV7?|f zxYawb^yYPB@>Vk8OOyg=h=H%zVFEXvf@7Y$5|w)Tc^Ka|XP#8DuGzoXq%?hM?V_)^ zNyqZXy+VDzk5sU&V^N1mpudk5+prc69L~)l+l?Q*^z5W4Yx8w^d%Sq~N(J~OD#NP? 
zCTv5?cQF^pMmX9Wln?QE!UthLpe>`SeaO}e#CPy|3?m~*918XX^fh-0E4+$_J6r99|1J5mN)1(uHZzGz)~>s| z6oO*)ZyaGS&69g>5GAnM((b0n+E-^8G3Iqczy-rR-_;7ToJV5#Dn{&N8FYm#mUZ95 zJgjzKH8LV%xPyWS)`7R)OB!c~A2+c$nQnWYFm)mKJ^h`!NmuS}x)Wy}%4McysK_$} zW|p2C0m1+b=3h*BkdT|<-m*6R57|J&yFxBTdxp(eRc$y*HEF~w;=gnb^1JuVK_Xp} zl!#jmX98F30u&aqM^^4r*t1N($!BEKZk3+JwB$n~l^3?nFN|cl?d~OS^2vP=RM?Z_ zP+IJ(H<5C%&*Fh7A&C$ZHyNeM3VO0ygJygML#QZ9$|{8aH<9|ydPo$?V(PRH>!%C5 z-h7hL03w>P?DD1oe!L@78630lg2l{ zw%&m;71ei%CpEur;Bq+YFU0f^wZ2tM@bkp%)|dalqpFo@R#p$)bf zbr_znVDptfM&<;|p%MO(gr;OvI>EyRm?thqGFr;)jpfEM**VrdVGv7Re9c5ylCP+UG=i8~u zAlbS~+MlW%gQyyL{JI%y+1U`Lx!rK?Pj$CEx>F#b&0-o5KL_h>m@<5SH1i2O_ zSj-J*4@y_Dw>8<>eo>NN)awFM38(d}Myu!ntn%xpCUfVc`+V8Nlpuk^y!`5NXKU5y zZ86Nd*(H9Ffh;$Ly%eSrtv1&nK3~DeJVt-f_Jsvr;9aFf+;j1p0;`V%TlQq{yTP!k zZR&X;Ur|@XtaED7F-D3EUF~RJ5guyN4{e3D%hzD!^*+h~J9B87w_|4agX|6EAqVXv z?@rTs^p(8+>bVa?S6^Xs*El*#j@RXSM=ppl9Q47Csdh}Uq!6>9UX|ASmZp!VxhvI4 z^{l__e!~2qGs1_*0e{H93m?FnLk&l?qeC}#0pP@j7S!AUNhD#vO zGH)^SO$+B-Jx?X0K}>nMLtt)~Bu}Y~9KSH|X?jqhG*7tjrmaR541eKT;6#pRRF#4X zM5O?@7G&?K2Vwz+V=o78%NkU&ppXN&MGG|#BsjC^huFzjRYpl9Od-W-8g!CEV~m*g zb_CSl3=we&4F!#{EWdTvDPd|Yrcp!7#K{*v)fV?nynjX-bbE7UH;qWv?h?<{GXX;! z`P0ru&?Lkp7YcY1Jn^r2_#6;Ytk6DVaSr1bP>z( zuf!8bHtW1M9gB4d;Q~tT@9yO(|4{XF=)2?!BX2-~$^rZ5LhyZgtSAFTb|#vlbd3%Y zWE;HSj>t0s2s~@;JJy;m(`In3pmH*bzSR&c{fI4u7C^|W!&c^}VNW~Wzt|glnRV#N zy@dS-*nm>CK6c^V#~3}EoWwK>=O(LuDhUi3gqlX-ZwMk1=V)d9KQWr8X-wBxy^XEv z6^|z!Q&-WisrqGlM00@n*ph@(2bM)*hw94Mt;#eFD3hLkO2wS8%}QDAo8YY(FmbGv z4=~TZw$bM@;nDc(&Uxx}mfOo%*jf|y=!ei=khbQl`Xe1DM$y|mlCIWW-va*u?kxWU zB)2GH73n&D84Eq_5J+KWQ~3`7lf=BnCHEceDV|E9&!M>23vY3+jCy**uwGL)RE&O? zAdCIGuHx#YRUmIBApEQ5pzlQ`5JrQ>6yY)LM3XFLxw)v)(XjyYdcV!k?kBz;&dJEM)< zYj)6SwuqB{Yd#pT5J`-X=eVN%Iv6H-`Z?;RF^W_tYY%Dfa+zxCuqi_LEJTY!M;k2m}KeAb` zRmVJllwfpU%%h&2^xM4JS2j#uZeT z_~obe?mbEv>b2X6DNS=tBdYc`#9dh+D!5)^8~rpebYW=5OSv*Yb(z8YOh zE026e?SHCYF_ufOvGkT+_aWJ9YEeugL^Hvbw z9SZ7(%kA1Gn|L9x@-U+1-u)kPkd^v!6NDrfvgc^GP&{g~8slemT9j^3Aj|&S(KPn; zRYD=L)#Brx!DhtbgJkEG|N1Vc`+ODBkF5Cyai<=1Fy?(TbY>nW9D5@6Nw4dTLxQA( zoWjX`cr;@*V=7-%rIfBI5o@Iq$A*)BlU#+9ef->VQ2Tu*{fli(x}MNRa#@V$1BGkA zU#X1V6G9(@Odm`b9|PC$>st-4OZT=Jl+)=!;$#AY?z$D?nl6`f(0U<91kE4KpJ0Yy zd4XW`lin7=k+o&eJ-I_>{=u@KXfu0+FI+7hUqeKg?|-keqP?On&&Ea9Fgd={d^44s z%I1Ni4&K>3_8ivkx9(%ArGq2d)w4YsTFK}4dnE|@i}%v5Sx8QBwK|?gKn=@vL^4X; zFdv54;o`5XBXkQB+povYtQEgP-)b57tx^E372lNV;*Rs!e0NxJ1a4HpL^xg&%VM2C zXc5+a1Qx9a;o(EWP4%lyVPB(}MKgvRmX;EJAQKcW`#kiR+G%rM_@w5w!H`Ss7oFti z-$BHhjm%C(MC9Fl-QSl4-m8G6wqN!(j=6F_ zj=g+%1FAH1i~9YB7C?Rwg!OqIygym~WYxMrG4sBMW6k3JZ`K_Fmu~@%({*N1y^>y2 zr8AJzMvK>Fx5s@ZC-?5I%Z0xFRmQ^|WQBaYldCWFN)og$w14n{S0&+Ea<|YKW zyfdb@KC~zaE7OA@?;s4O)J_`c)we=wGI@@EFRXitZ(CRzu=@VgL0z$Uf3OoG?#okl z6!*lpTDEB|*3QkH~tp&}1*0eTuk5!bqh^~WI{gi-EC8>CBqcj;UAgj0Ta zTJ(L#b3gZ*#|gQ#e!;nT+mquL0S{;6_tS&9e0{gAU`6+$n zx~DJ{z-wsHAH!!4vI5{G9YAwQLwb&ATE0b}Nsw1Kdk1>UW5<>`=5C+S;A(6G6FT z8Wxq**-g5TbzjMNlYhAG8L%5yxKf6s_d+yx2Ta^r2G~5DFN4<u!KRWT z`};v^9ZA#@ZO!4I)ikTYgWlnPfSlX6T33eM@?v6~9op;>En)mLR+j_RcRe9RSBXXc zwp?mQ8|YcHAVW`0E#9R?wZ(-3T(|W&A+8H%C^ zD<#IN;Cr*(a0i)kHvI<-&7(njuawMv;K7wolf0s;6@`tzn|-jMNx=`a*)mH$+*Opl z#@F;hD0yyhQf8mjOuzaA{aVG3?>j>TSJio1b7Z>aEgq>0A&G5>M4fYm&5_AG zLQl?6q|Aw3$XG#;%~X5}5x|(Y3b-GiHZ>3O{Rhb4m)ak>W^V{F(%Zxt@A9X=+2D<; z>rNcJA5#08`mNkEh`zLtI%YC7vaFUf&V2oSRZbxvzSkD%B_}1DWMBD-%48;e^v31N zK!l{tsNn&6|mc=9waG}Nc zbEQkVReND)>ns59N?1G*SR`fx>&}uHWUu}tK}9$Wr?g_m?i zuAnf{l@H;Be}FChM(v!4Xv0$O=b}ocU*4%){Lc+HRCDp1ovY2xE0@V;XHV522zP2u zM_+OOi%2y<`qSuG^yOxktEbLv(WA1#5~f6l2kNWWorYReljpub(o;8&G_A88;QfB9 z-R^uRJb(Z5Zr9`A~}fn_X?a;lGrt!R>O3$ES?m@cE4k>_&^VQ#Vwz-n+M6 
z?A;2eKQH3=sN+))cXVoYx7i7PUfO0h=Stxw|HMI-1RZ#uLty8w^;CiLLiRJrGi0D# z*a$8=Jc=ae8NIKDI+1!M_Rz{vSkr|%_&h>JJ{j53YPwuwqE%uVP-zk+&$JY6?%;7J zPGXWHtJ|#r72l9D)DWM=9g+3l0eKSt;C$$DQ`>Q7nC9noPy0v~+l2pou3u*1O2}9` zHtxk_#7tdE5|_d98!A5z)i=jg*1gkg`uo}a#jBc*Ka8?Um&oyzE$i=SgC0lrW=|Jx z55_gM`~yU|@EtCE5A8mgm9%wO)h8aGSf6cz_bsoTs$2+BWNM63Ji~qxcsv%+jQQ;k zhvEa_pAwLGmI*Zi*;YX__c-LlBk=0y&QQ&!9a7k&;#=_~(K42yC81W1HN;Z_{L zPsMne0#@e)YL5lMt5-biU#8GVU#%Jj@3{YBo0OW^-C%)azcf<*i{dOrWm%p!yu|0P~qk7 zi$kIa8g&ctxHWo3My=<~JJ^#IpTMe3lTBHJb^peb!blgnyuPP7!)Dn4W>kqQjlQtt zm;H&Hm3l8@$&;t?s#UZ`)V-S@MK1(qB*yw1hhB4Oc@}I{AgH$I%S@y_tIklMAe(S~ zGzncsVq}&M<44;+8d+M^veR^3tbF=Ez$jmzNr<*3Q#y;=A%O69b2IESC?eFf_%}dY z_BY^D|93VM6E?Kc4<22c-KtkuUC>bOU_C^mU*Gj!1;jMtX{bVEXTF>FqEa6!?uXZd zkD7~+>h>rro7Lt1ZV+rQ=juaZzgZQBrj-iibM*?)F0GtVCtb2I(Q&E zPJnk0(3j%S&gS8UrAhn3csMOBIty{b1#2J@H%(Y;OGVR&^N|jZC#+%H7m+V{6Hu(@ zTv9B=%?fLS)J-J05HWiuuua4I*8npg9=^yU+c<&&{i=!GkZRSnoV$s9dnU+GyTUII znT0k=-*b{KY%*xXq{S_)qAHMu&|))qs*U7cj-2Meh>h}Yi|A4%L$YenRj0X& zNV>}l=6d(@uYz`22clwgQ_U8Z-TumDh27%ik&z893Cnfq^f;wHIw&vFW!!&MI*bwR zb}g-TZlU`*;f=lO146a$eaACJi|whGT^MZoi99=%aEn5-+;KwBy2Q zj@4TJQN70Jt=Q{zr0#kHNdXAn7OM=#OBu;Wb6VRrg5F8i%1USZ zRpr-5!qR?@?LMSy2Q6D$(YT$tl5gcBo|a^TT9@E~fv0&3Q5X4fJV~u~L~n5@=P&6n zxY`B9=Mwx>UQT0etM_~uD#(hdTG{w+n8Up7)TYdMyPbURbMDBU=|hjVP^_WL?4EvH z;yBNk=}bnydHpiPCZ*gv@-CU)faJW#1U{^vPwshSn;2#o6aT*D7_e6QvsHPwZ)L$e za>ya}99YM(SUpax9a=QAKVFm{<_s8JB1hX6Y;SLWjX|X@qQ@N{P=U7bh9+E;>IBBE9t1w^r`R{M=vN z`R&e&iZ=?w+5Z5l{*hS%>Is$^os*uL>xN_pP!Y5+yh?`SE|C#PFvaJxDV#BDEAzrA zmX^_U&$+%rfW0_l)(`@PcQx7oO*oBp%Ejar2Z}~J68t6w?8ejzw-pkqD40Dy0m+bZ z-zKY`my(E*j;x^$y26vjX{~$12uVu+hRjKdjK>L4_DeyWXQKb#nGkyABL%z5A}%N7 z*QxGWL7of7AC>hvT8DIgJC?Ao7Aw~=mpM=TQc*E2_3W8!dpD4p6(`~5GvOLLqleOc z*5X~#s9jX>PzB5lai1uSFl=3X%yAt?&0GpoR2#u5UfPN??H*G}kP1cyHrlwZEME1X z&g0wAA3OT0l|=|CsjbVx{kUfzd+}`-+xza8&{$2i5T2(|pq27y&+k+K-JcyHHs?oU%&`w}Jl$6sn*WZ&rTkL7WR`A>6lyzFmLP6eMo zXxW#0V1upj_!2M!h0x2NFJ&%a@{lp_&L*2r zL?EBTjg3GLCh8zv)zpaQe*-=Dir=u|KbTwt7R^Oi-1MG=E3tLCv!`#olKS9>nXRz}ARbP4*@5IceRu66JFS zU}0cO1c3P*z(Y*b^x%qMrKW|uH)PC^&a?a&=say zb1N3mEVaPN9h=O3iFZTsMcZYY#Vp_oKLGYQq+iQY_-U3Zzi*&-uzss;R)paU%|+DC zT3A20n@w4&aRemmwVGE~(ZD={U^fT{=rDMurpI2_Z8uKhQ+AwqDiU9?tEgCvt zb52qLi;fFT$@{R|W?U4#GCOA6!l(3CMz>fIqwlt43~W8Muifw9nPDoK3rVyI3%mzE z5D0ot_qvf3uzPA-ZK^UA9dp00MkTKx)1aiu+G^QEW$=?)=K?*Y$Zu?NJM2bO5h_2U ziE~Qv52r%7$u~5beDNH*FA69tc=cEa%_IfyOOOuFO+TzH>I*8`O2f*!XCK>!&@{DM z59~kF_rUQeE}obIe{D9a460}idnlV!>`jNJ(b&p~)xqRk#l#x5s8j~a)XSegxada< zNhf(++1I>))s~sfc)On%g2zb52t_uNh1H&39c8OnQRN#*B*@<9eYftM#4a9qZlj~% zas#Cqm+4t3BDYaozX;(o6InI^~3w^d96I8)>J_H(Z$p@EQsVhpOB(`F{9S$ z`r3W#P9~J--=kX}&wEWbzwZyxS_;v$TT3{geiMZ*Xk54@VEYu=Eateoqm-7H)g{blhw)hb~we& z@tC}hBElG5!mw$bNwF*XoP}f*3N>N8*n5wkQAV+yeWdd+SC5fDJ2;}#6aMe8al4moJtLP=3q!Vw|}x-1Tn zfG!Z~!yT)EkVeyLX*N~(x(xzB?s@;&qO zA$FmqLXzN)LVczv&e`3e+9DIJ7nY|R-7R#FH%{?P@8YyMUGh>Uq&g*r*tUVy>(G|A{OSJZ>Oa7CUX$eh zgVFQz@WsoT5Sa*wqHyk(;H^gg6n2}*o|JD6TTRS75~aE0wR{!~pOP7s3nk`pLhX?*OhCj5#}d}9jt;a4~;nr@oiq4!7!T5bz zhNp^RccDN#5AnppTUXd*A+vjUIjo8~EX?9=RIfNkiDPF-)^M>2TQv`Ff^%`18Q-2A zZkm+6FENWhuCs`wsOkCk-b1ilM2guS4!g(85 zYo_FOgxNelZCDXp4LOTBE@@X=d{(9N9>YVm3!Xrs_&2xczJ|Ou(zIU5_zxL!{{VvF z<43|y#UQQNVF$19_%@YSH78tS;WnxGsWZNmI(_l__5Ym1uF&jK(-xW0i3DZn(98x`}&0!FE z@($iW#m-2|Jq5}o6|}ck<{TmdWnRVot+LOPb8dmw3MDMP6J){Hlf2gclmXaW zRgdM}%DExPGio6L)*dobbLv@e4D_Oia)X6Rv@XgzD=PF{f7p~>51eCHh$kO9LrYV~-_%)TtU&Iqe-p1b-5OT3kcXTTyqP zio?@pl`bK@ZBb)%;mr`Yq~W7aTO37cwB_z8@|)DITUA@U5(6k#?VDSAhc38$#`FP` z=n5vqkE>G`kKEs|dE5-sH#KY5&waclgcp<~+xUTRU+X>+vwNmE^wmx)w$C*LG@@2) z%9eUBNX_YL3L9F0X>cSR6abTR?jq3Oa>X=M1(X3{GHMP(T){`yIwDZS1q3RYFH3*{ 
zkzfvko7hO;OlfN9G+dsqG4CBjCOVA;ZA3OlXg;YyXs0 z+1)ls@q56UxCm6?1FrOkqa)>7meE)m9Ux8diqb#Nfg}cKp3QOwHNvf=Ho9 zHa)=riPVSst)?Mxmj6mCLKDoMaDtY)4N0&b!WhT<1_DWQj17fG@CA=@)N@$}65~x_ z2u&_;m=_s=W`@ba3b|p3Koa0<4H(CCU>l9B0C?ip#{_m+!#Cj)a~cQWn3d3ClT9Gm z{)h|Na#ztx4X+|=_4L7W4AE~4#g4H`YGy4`PkwOWie}{t3N@12og?u44zR;CpI7PL zcemC{sQpAat-d|AKsHq+B|YqYZ`Tq>IVabG!7EdYT)l{~czt z7WVwH4aF!@kV^N};OWyHO;aWx~{^ChrLBs&DsW>_glQkAN;R)1U<-VpjR|{)uOXa|FuJjtaT(i%@yqb^vjM+`fJQH_|*hZ(#TKuTZ6yTC%c#Qr{Ev$HgC$kUQ8J_M5uV6xu}dzWH`%Fu9JIF3G<@St z_?WgF>@Ag3BZ{*8u`_Z0^YSC@YU6X`E#DKCVw zwB5fMJ!<*zIY|Bf!;YYJ{dVgzO@5tfnvqVgw>jD|X~@;+Gdq?nycU^J`!ZB%31CI{ z2Pq*6FjThK%R_+DS|Ojbm32??^C)P18hd@yr(Y|OWTe959}jPTyKUal+z~v^ba`!P z^7~X{WsS-yEnq9q0sndd7u=P)#=G*YovVTLSH6CqqI<(K-6B@V>S9gqr{$*6SBC@V zM(?LCEZz&Dw`llU=gcO>hk89Jv@6ZrylmIst)g+i1G{>V_jV5*mTuEs654Q6Cn9dm z&rnhS+Ls{he}FMaRA0O?>5N+Y)H8gqMoKXM1Qm6jOa54~bl|Dr2RblP1YFwlN!x|L zSl2T>S{PX?l$$22+_+l02(W&xU=6Y?R$=5a3v?t%^Ptl4$PkhSt{0cmRV@O$P6}iI znsPKU%LqUT#AMvY7#5jK5QtC|sss9tLt<*E&v~GVZ5nMToSAJN0tgVM{yM*?u%BqB z&oJUlhGhF#BN~7z9aWta^7S~K!SomogzS<*{<~Ll{K%3gdq%CxTJ?IPW73Fu(r5H* zmi>sjGiFqMz**g7^)@?Z{wMibzm}-#B+aM+8P-e#_j_1X)t*Fz%xHVvYFcfF z*O1!3x(^-nw=CU$;4vQ0!sw<;$zbgp8cKZBfNJH;Nb%a$*h2KSE!yaKGKWd#e-tz$kSx$1q!xeaO|l|duEC_to5$9 z4l=YVSEXIMEG}xk#G1@ApD|3wepzAyyAC^8_80Nsmy!U!J%~R}_wQ&MxETWp2HRBllnkd&6hrvn zvdKIk=*%1|;=r|9mZ~DgR0sZpk?qT7u%y;p^toVIpCR*n(Sn-I&pL5SU}Rj7$)-d> zf!(=Q8T6k|^FKiFqgki+=3wVGsF>OXS--TM;X8ky4;@tek*(4D!r6+P29BpU2bb9& zYH9-JnMRbkL;DsVkA2f0nzV%#j=s8D#?m)X#h}rBTAKYQI?evjqnTwVa=T+7XxJ@1 zl-Y5NYbOo*qb=vjA9KmOYd7!hN(eV6Nwq|iVa{Z^st;SS13vPO`Fc)QTI~*`qJ~hW zvuxMkJs#R-C`N=rUp*LEXh6|heUo1lEg43fC0;PRr&!0xackRlp3E-i#==&`bm{j=uc%S;yh(z_EgL_u z{D$7J1~S*$1zRUiJv5kW`{TXhvhMlB9I`&f=G-q#4lh;fX*=Z2EKWyaW$jm?6FKyd z1- zhMR%pmucJG+M9H2Y9`!CMqK47V{Kr}31Z{P0*RQ+L#r8Ry6KXsa}hxhA^1H@2+X-K z9GvJc<;De|csg1+b_ng2OZhbZ7<(4*{%))0n+V)-tinr@k2Ip!!Gr6!t?=6j5qM?V zLsN9)JvXEl>KRouFZM|mm(#t+ryd@wIdea4RGeSx=C(}M7i;vcW9*8$ziQOp#cF8< zicxXvsnJUTF&kwQ;kL66Nn z!mr2_A~vfB&mV!rg0O@&dp*>M-FZH3{*cL+-k}R+g}u8P!1j&;l2$9glCWfT$9~@E z}QnQ0Y+jZc`>)mYc zsE>UVl%V8}khsu*2>XX+y;%c!!}ejJwmq0uCv|K2HKXB5FVF zxdn%a)e~-JMao*^URyHgXZxv2Z$i)5Ug5sVC+3%?CO04_?8V0GZ;VT^X}-$ML28^6 zKNUi9V~DX0_Z*iL(hoZAi-TS;GS4VoLbnQ-8nm6Y@cVyoOVQ+)k&)W@HcwcsOGORy)#18qvYm;4#kLty$mx>^2ge}A#e@+^DWZhOm8%M_rh_ziUiXhDh%^M zEwkuXE8hN!gvS))6-_sceR|cF;t)$zs2o!S{*f4>u4@!6 zJH@G(1C2UnTy3TPwwAu|jyZfl^i45Tnk9-q^>CsO34Yhd05J9y+rvF^nqDt!4P7s} zE;-L*ZC4le^9|eq6~;=`R)=UgK~77Rv?yPkFR$WF7mb)2894llex9c{s`c3n%iEsC zkCpLZL$VQCJU8D}9j-#R?f>`^_u@2U3i+;w~XlKTZc z_m5C%&tzm@$g#IiR{u9jk=+hca^FwV5_{iv@~Zz+;@)ZVw?x&s&yt_F-1WGoPrtkS zER|LsPhaBqV0|8}`tiomx7;U`lcjQoaxkX-^%I6*Y_*-|&Gq<4Q^%q=uTgt40{#}? z(nGVm?05F!yFcVk7x*O`pKtG7{@mHm!(Mtao9y)#wTpSq zpVdijdAd?sq_lg^CGJ$h_<{YY%U=Iy8Rko-2S0~M8Vja<3Vz|gtVxk~TlW<%9`;Yl zBzJF_As9?HOmHgS8=mp2a9&8 z$aZewZ6`r)S+Vm`rI~A=s5ckeKHH&(1gTabuY&h$j();U+YcjlV%nT;q{NxuT&lFs zEP|EJw%qvINPqAS)~atND!66rZbLpV#XuYn=D$^FjCBAJr+o&O* z^vS-+BX=B=$sOkQ#*S5)@DbT71E2ajfy^-!#-iG%tfrl0<;%v@ZxLZW>+RQ;yhy=v z0T_EebIZ42xY*<*V>FiV1hfRnhC>TMP`n8gkKn;Ty`-#}!s|mo_-%cTnd0XEOn&$V zu>ILg=h1~r>qX1SUT~g)gzx3RbF{SsaE<(A9S}Z3{SKnvV`{xZ+XeSz%e9rM>!-qS z2d*grS*x09EDLWY>{_A8)^!3V*}~gEB1e#@QAhuC@+&A(DxUHpaxR8C_&!ArP1o2E z*z!KbuklF<$+<@WJIw}=+)FJn5N%(37>yE zM7W-@Fvl}5pD}~exe=}H9mIEY<_t$v~_VjSGjU^ zVhI;oS;0j$cVIwW?bQ#xu-+Se5IHoQGNzVs%+STZQKxR`v$7glVO1!G5tB$=I73bt z=sULj{O-az)=EFR7MD&N;1KlnhHONr=CC0uviqBaK-EXS+B!Iy?Skxw3C|#*`9Q7i zbLl_rx8#Lc4ibLkq~H6GR@ngEw(x~U%gQkRbfp+M`8~8(ZJ*ceiODU63aI*O5XPc{ z(3tvhTdh? 
zZ80h?2;?D=3^JDN=M%j?>dVDly3&tvyC1(Fn4I+fmhJSERg-mlPo7kFN#(h~&H3+c zeNGpNE%ekesf3Hvp~p^wt1BmQqBpHm+YatzMz-Vh-!p`x7MF8SieMv?S7zG2Q1{afYp^9K)hTG#8EFI@Y3!TzF0om$Gexy@RQ zotmMIkMf)Q&IKpl|Cyp&7o+|GqJGm)?(F$Ib?Z1@RgpcVnO*vx`r-8Lp9b7>PGKFS zcT0<9nm^;$)zy=;1Z(Tc+0M(+YB2@TyWVKbpZI%Za>Ea*KB7sXcF%~$h%y2aG>SA* z(Q|sK70^_Sz@D6aMfjc^oGI8r0Y%`v3)<$Op~qOJVA~ud1y%wo6`iJ_4tLNtBQ%*p zV&sN=7;~I7pNtY%zSsRJ?2RKUR<*8PD~R3g?a$|;+KiwLT=qCxHP;VQsdh`HCLyLK z%eb*nxx4Iu(<|6+e&M&kIU|8II}3MIc!JDA)A99n`+}B`x|n}}zo_1ReQ@9G_?Y1+`+9z5 z%5%9oOIy#mW6}M{Aa4Uz6%n@M#so=r41sN%tFlOo!BDkxjXSt%J}%ZW}~{m54bNj6g`L7*#58Cs#w0L5Q7(S*rtny?5I1*U4K zngv?K5FkL10}r~;iXX~BkhguVp()FjY$t45p$T*(Kv+}>K+iy;=T>qtg5|LNC6k{q z_+Gg=$*b*^e`PG$uUm=(CT=X9x{0y_NvF@1&lj61Ab2#{oBUSyY*Gmxkg7L~vjQ!9 z)X+DcNTg4aLU6|5JO&mLx~!+hiqGw++|==Sv6r7JXET$5A?LKqMM_)8&C$spQUg;6 zahjxKTlPTHhsqy{OHZCWsSIPMOpiDfF}k)(h^!o1re!IXgj!WSWt#*q2Z{gm^Zy61 zz`gX&04ehgO6EN8Pd_`vx+*Pd$oOPtt$9M%r_n-baox8UHuQ67*zSFb()o7kJFeaH zvId@Dl8$Ll-&ADJKu$?LZ@SZ06T^A96oxw&k^nSyP^Y!GbevPZjyQ}77xZGyRS-Yo zYY?JWL zHvIyN0J!I_+gY{RXg>WbY_&P+O8Z>r%%g+ksc-J!y!A4{JHln&QFDFp%adU5?+Pe$Ql#t@;y7}na-y08al1R3t*j4Vg_+v$9PH&b}g8lxl>90Nk{{SKS zDb(*z+;<+tx?8GQXA~dJl0H1d&h8RJ8Q4(|3I=G;=CeA4YT_qE4&BPa)%R+}O=%_l zY#CfmcVp^Cx|Db{${ycy7HcaX6bHU&JNstQXL-AI0+Z7%*~NkO+C2b%X=E)Q@}56| zh_;(^{z@S7nVw(LFoiM^EA%lyJRIR$ObaYzh9LwB{VQk*W`ugK8|c&^HdzB3IX=#t z7qgpc{|`y$8OVnBzWvx+)vi^0ma1K=Hi<2X5u?dB}!G%R>$}8`#*2b`*WZBy3cieZUJ#%ZgB_PhJTLB<3E5aIXLW@G&AibaIuv^ zR)u~7O|@`DPEXz7&D=&o&=+2%p4`G$2AZ%};(vl#pHnl2)@)WX0SURNs;$?|U;hJq zS-?BbgoVNP4Ao}=y`{Wvcov43wC8szp+=OK%kd@Ami5RQ-4$c3X#o!DkYxSG%2}-> zW@YYkH~72H-S-j50H3-ug@k@sw<_diLu*%Hbn z6uI19aHo|=3W9|R_~?*91&OcJFYuQxBfjC-tTL-*YzmlEd&r1lhCbD1k|K!S*T7Fv zBbRT)lbHb>6QEx7n(h*tb($i&UWQYSwO&isDs@RwOd;d2BFnJUOwXQ;!34xNa@&rX z{g$fYDOS)#qWrFj3MTbOpzBQfQxC7U8x(Mo-dj}-2cEKb{IBdXM4`MIvyCK_kCT)$4$%8gMsfJzw{%vdX|W$b<;=e zJIv`0tX&X1@7jdUmkw}rWVHln`*W0afzlj>LNR(Z5YZBy)b_3w2lzb^<9FnyJ*i#I zIU(imK**sy{bT=0quoDEw{TWVSY@* zp8j>poXC;xH*4mW%aJVSUlK?3jrJdVRX7oegq*HuW9KM<7oq-;S6fm7{DE}_gsC6& z=v)6>dY2#B>Ns-UxtM`-c?lf348(fNC7<|+hg?Hqzv9^wkehLHmLTm%E;TO;M&2)+ z7zOGFuc?34dAh{c(>DuU(5TALvrbD)hVc3hefNoR^Oz5P%I7^4FAVe6gc7+qXJ4jX z&K7o~w&b9n8gzVz)-741;lKsV0WcBp9*ew>r2K=-lQsm!RkJr%+ zRy_&!Km(a5%l&(gU<@HopX8+}d}`N!_&kiDcTZ&)qMu+^kGGh1>k)nli3w3!Yr`BBd&~B= zCkJ$eiyu|5X*@o%A2k+jD?^+uM^K=1qR!LKam)5iBj1bygK0=@q<|W&@T%<3wM4gm zk?EP_s$fUlE|GA9XPEQfsSn(_$)CMzOW|#Tc_W{fs`t6zvJ47Y`A4F(-B&KKD-GGZd+-Wd_pV+q;5l zt@1F!XaxMjmojHe8b2p$ z$DoQ6O$!EnMk5%FE+|X_ZOl8%@vZk*q4BRr`rQ5wLUT0G(wD5*Oz4rOniApco#b|S z7i2iY?~uRpH`3wqMAJXjA&<9)&~Be^suBx@_%|3PZ`(ppduUzU6U1PUl zi(tgIP)wkb%bxG$SLWA)942~t+WW(T_JUU! 
z*n@5`nXnl~W1~=&3bt47YkqfyKOL!CFa=X?OUv(a43F?UJLH#oLkc#?2=Bf05V#2x z%~0g{r$|qm!r_Rsmw<;Jx1CjHpLE z1RJlmRQgrvDrFfwJm=5dQ!tme0*7&McakF!2(2F)^Dr{EP}V9R7=S4_t)?|5yFR@r zw5c2ylh?5NnQm-MrD#`3bXG@&F$LD&{M)#V zp9+WYw9f-(K-m+~X_+<~JR=QSazYM_O%`G@e$z?%^Li;!_uTv(ZQjfb>3n(myD5yj z%1M64tq!pd4s(WA6nFPvG)T=>fv)A@0%LpnMj(r!P9IE*s;Wj$YY5e~bWL<>ejN_B zJLQwiya;C*KF7X1aH#up-6QdRPzqxo;S+pdU>6|AVeK{(I;F5VQvWzSOBBLC;Mg6&6rC02ZmdUtSuV~3zHk+@GHdDpF1nROba(Wxj$**`w zFOo0VZ0dO z8hZpr=05&Bx*JGnQw{DAZG_YPwDH$On790@_atz(K&kMuI?L$YlO;ID} z^9IAQAHX2D-YyVu5 z`;Q46A~BZ!Z3Lpe*Po*<-m&O9ed${Ws157(=pa~NmdG`pZ| zHBg0-@20>Vz8&%{25L%XgR|(@cjd12H7BsYRbCml^sYybAG#ZvcYvqGQ(wk>B()`} z_+$0U)NR)Q${5n5;DwCwZ(fhSRqi8QwTbHA4@-U9B{Ju7{K~@P>WU-ji3-bFDkqgi z8a|Ai^!Qm?VZ2yCEc*=ciz;Z@br#zae>xD!G%tgpz>|xq=;Jp3-LKI+UiRbPq1RM) zM)j`E?r8xL@7X@I&q5`2s`9GC0dn!MoBq9|$X4frW>VP{Of1M?De46dE@~nK_7}BF zV+EMS(LcW<5&G}Z{OeG=_(y-w)4zh2#ND+7dpCcnH$ha!rNqnNh2w)qSdjuB97a%k zlI~q}*H9|0`=lnJx$;wixVbVEDL$mR)vrF7i?FmJ4)pV z6{#%{(l__&`q0Jg)?=Sk9Ay>{4j4RG{VY08UXLu-00=vmb=X$b;AuAx6rVQ%bzt}2 zKW8I+yiA921lt*EXV^>#+_!rl(-a+2@PXa^omIkL?+bPa_{&3vpk%qbU)za)99`$! zkbid)NJ(o+`=F23Q+?2C|3H?@*oW_XhmmHxZ=WQ#4^pyDryHdDyF+o}XvDHPQ~oc- z4-Tz0QV%8_MV)sCq6Bxm+f18335Y~U*Odslh2HNsMB(0F2k|}mc3Eq@nYzVGc6Ugh zGF{#<&Gv5Y(SEyYiX8dKN}?KJ2XXOLXWb&f~k?k?@_-vPqj+M)N3iW)Y~ z*2}P5&T*DK%36_ctclIgy7XdV8Rhl>=H>A-q?L*MKb3}=u-YG{zZx6uiB!m5=iA#G zYMO0`1o--A?S9PetoK*ZyCcoBq&Srs$$SFls*0+rI@ki!I(L@TJ6El<9NPoPYTrWe!_!Y?>*A4rXEphWK+m zZMs+PJE;DHIy^6#l=oy?Rjjb2pE;MZ@n$j(oT4hrBBt;7$Q3&AjX~K1b{<^bQXwO| zE!35MvbbxtsrTh-%JrI+svo{$X7=Hbh(R|3H6=!w&*DMmTQnu)tG+}@(Tw^ijPToX zIstqYw;Ck)um`7w9^Oz*hF)~7=za@2J5rd|sBt=VqSQ!)dZ z1vyn1DvSVjd!sOT)4GvK+^s!ZyP%7SdJ7aS6Hi#oVUb*EL*(vvPlPZo9!uiNRxvBB zC)8ZyY14Y6Vj=S46)*4y(1;~leNm~hzBvUKDTpY{3TUwBi7qJyG@qLZscCeE@($O~ zsWKrutn5~KlA8@&6oq(D07G-e$S3|3S$aCle();=(+8pM7HANF&x;qkvcV^qF`CNj zY*`v1dHUUvg-NmsBt_nvpcsQ{YhS52-k@W@8q`D)pfg&C`LeR)QY`+E)1u;=r?_bA zeFmpU9Z(eo4|hLhds^2@QF`*rdfb`%EVQt`Ko$AUvTV}G^{~?9-SCmc zdX+tHlEZQ|k}yy3&>2b0B+;4Xvtm;N!V+H$Fw-dv2fOTHlwZJkE~FK~1@@8<@A=1_ zf0r4gbQOW`I_@@#-&TM5*$8#|fUwRF@2CFIZW#8vU)Z1H1GigYDFXAyl9@{d{@*5|3=xUawFL@C&ly`&xqbu zF9M~v{3qiQpLcld3>T$D_~ORhG|4Kv%i@;HX)`0@Ja%Jli$O9>{sv%(I%Um&EnZw`?DHALz0)VPCa1NH=>Ug+dX~>XV?w zNEJLXAaWD;vEmYiRR~zZd|Nz8Y(!*wW%A}MHQ;~?WuWW{{sSzR z)z((`Kd_x8)ra4W7P(LlOT7Hsk&4kmPbTZ&G3mzg8fBZt97gElPsSG+(&+na;2#gu zSp3-k1K8aB=x|!u@wcA6Z{^Qr>$zDtkJsU)dosh$w6*&kCqyo9KCM{#4iqfr%#%xYW9L&+gk%Lg2aji4kFz z`CgBFpk?4z_dk*;_T$|%6!;}OHr^zR&dPGzOSYlo+4xSIOqIIxZrOrSs$R!n`N;~Y zGhF40XsgB}t_jI7mgiUXLaVIp9y}DQTH}qa?%*)G>F_toytN6pk89H^|M+M;P9*A< zR&S-QW{g<-T7f7)CgXjaFxSZL0(aP*n9oZ!+PWsqnXwOhD&4@(Bzv{>3979L7b?4~ zp|-vcnDkTM(~?VP26&Y|kVa_6Z|6gDg~W4wGQz<3zccaSHE;mSH)o>%)-BIJh^Yhr z0kFm({bV7p@66~60((d;$U8H_UpbveBfOt3U$`FJQ~8)9Q}Ohsvry))YTZzD-fEZF zgDS{rlNrOaFtTfICpg+dzrApB~>{ zCpIG`2*!M#oT=2#em#GXTfzn3QFYB#wsUN#vU#Hgu;N#$cX<3nSI0HeYA!IJJT~{B zjCVzny8O6ntH#j`j23M>?rOo=L~O?BPjVd3iRl#6eR*aZ7TUe_>WV|hbY}cREidJV zz&qRgo81wZ`gZD93PiFWV1#i6%T03tfb3jctZ9AaQ;cfW>cQ|O zd!iug$83kGLPNpj;9kqR9q25z@)PEMi1V|4DI+uLq+1>A4-8r$Dm5~Vy!RZf|4kY2 zA;E9tGdXd-v<#yQ7*@y>NGvcU6wXEnDL-jFxbAqo@l0*z8Bu{xOdnecIWW!ftQs>xa`fE9jG5f?&PV0o2$$h6S%!Qf z*4?pH;j!3^S0F!rK$V!14z};yt*yXS-MGxVvIN+_m|5S%Y}PPZ@fC@03M@&~Tc#3K z;!|Fi8K-svWH@f+6rrSu-DAVf+GE2oDX9UV))&i&QfKz9lQe{sQf~sI6EbHP> zQT3X~o^2|cW|hZCPct2<1VmO`BnJ$yV5FI|EsFW&n3Nn*=~~|B@|sn%3EfmIJAEDj zmgFwT?fwri)H=hn6V$u`PGa|Js-3AqkGw0r=?QeYUL-99pH6NJ^qrr(tCx^6plI#T zh#wGKHsB{+xJ98=KFK0+18Uqkk&w&wZ00lcJxntgFDTa#5fL*8oq%s~$p z+_utx*Hy5&N#tr7r_Wmg79Z%kZTiWDXxb5xK~~eKe(P{!&EVoOGz>2G-IrwKkmsL6 
z2Tn2%5AW%*J@p6A42Y?U8__Q+=9OP(#V`0?xAv206Qsxm&9sYbx%Oi>kH>_H2Z>I} zB3Yaah_uK?Jkqoe$D*RtR9R9BnNyAwGHyY%T_D(F;rjMA%S;PJl@CG5lQufc>mZ6d zQ)8w$$oJClAsBY6|K%kqpfXYDAS!khZ~m3Z-@n0!*@ijL5!Bgf#KM_wk5seU`^n=D=RRXTmEr7hvcfY^fa~&aO+#Z z77*IC@2pc=DLtg*X?0cFxv!~wxrzU#AKSTBNV#I1J88hg2j{UwZ3#!gU7Dm23?>Kz@F(_jA-pRv zL9pVy+M)A*fI0XJ9F3Iq^gdqlkR-0_d~%rg#K<4d1l=l<@mWDri2CwS>3)#9*wCQ|L!VlY!P zA%x)x{Ps^}@xTLw;l)qARHR@@+_}|8Ydv#(5JbMs8r}_U{qK4S@t~n6_GreEl!N^6izLwjrtvhBatav1R_=@W8ko?cHj9 zSS_{VR?Aji6BBD3!95<;VbXGyO#Gv#E9k03zha*@Q054s^6z!l zJeKL+-aCHT+-4en74*aJ%Nq`r7R0aL=H^vhw*o3GTq zT;)`LX<|udvA}!LF?WtZS)n-BsjT#lgM#nv;_j2eFXYMg1OY&KI@gD z?G(LO)y?F!+C`{2E16tiY5r-h?#!(~vl2U2bb5OUPbO}6nzLTo+3Xh2#bK6VQ95*Y z&Y)edLQZJghIzeb+&uy4e5|h?Ss0a{W8YgXZ(;{|zJI-WNj+6g%8yAkqtQ=z%&`}p zB#C!_D|rl28XK9PaT$WNbOT_^I^(Qdsw$F9$yxM7%Pw{beY9Ly8#W9r=`CzTQI%iY zLgiPL>kzmIfcRbSEv$TGC%N=EN{kPxhxf{MB*xvz>&-^ zk+D8j%-Lwlrf0@}rEXl&v5?o+w_X;#mMOD>Nz<*2~!H0>;$M0Y9f4y;jx!Fr% zYh)Yv7%P-A22Bi$b88aO)f{3L@rq7WGUky*jlZpZ@|MZ%YQm(0V{y~UO8aO{sXH8A z?g1VMaiII4*d{{7rt_t&aF@5St@%z#hjUsc`glw`r1PP^!^&a~kTKgV{=W6ipSq)#u4 zs_}g=XIRZ|QG&=pjVI1^eacooyqSWOH!cq=s#k`*T!8O6#pKA011ZWW_f%@32lcL1 zi?_aKrmX=f^9VNf&1A%WoTHKa|Y1hzU5KWYDNO;K>X0-RsxY!V)L@6g-0sStl($*%8jZuOJ<>2b7 zFPFzp5X2;6obGt|maPgH>El#yp(?*18gd}JbH%c2un1rmNMPCNN|Qe4s&{6Fc(xr6G(Q>%00#Xo^in$4%6$KWH(aDf> zIiQLC?c8x&ZRh?k##67M+2?&;M}1l+{i#W2@^|wdaHlm^xjCHb*+`~ckcw@m7ED9q z@c#jNKl(34=z{t4kN71HjjC*l&7QWk;j|}DcMLL zmOrOwQxN+%pTjvNZBV2&0fm`{sXy6nRwb!3^diG;I7fhjzp5swtgBb)lCGCe59F_< zOOOSTdyuHzS{b3kA*uCjh)({tMb3+9uwPPtN&sdCK@=Q4P|akqrJ@+tuWmgqADszF z1;}A|xy^+e)KHDBEsR5(=3gCx`0>cXOg~Ke9gmx@vbGx{anWF|WmsK=@>>v_*3PKqpFsUt5#(Cph z-Lw@Cj9DQL)5lDQd48Q*>rC#AD7N6YjeiW}md#poO|ZGG;|(6yB7gLV-)>HU{y-4Y zHj?>+n_bVjOJ463<_Ts*e3QhAYCMAcPI^j!V8nZ0eD+Rk<)*uXmv@9ps;mxK2dt_q zc%I+<$;GR-bnN_MX_@60{rC1$T4nS>xYn{jEM+Od@iZte^TC+pe*hJ*0$XV9$IK+w zO{;m-uJ_#ipz`G)77ms_3PGJNGIA>6q5-)aVXq(XtLh8EPG-ba}KgFYTklNw|zdudq5H7GCBWz zCf#7?Ylz-Ns!eGvGI5Zi6)s!E>cuJ8Y*%3ss~(8^n?o_j_WvaQ|8e`waCcNpAL8m1 z>zirk4(!Uc5UX9Mro~iYSmiYq_B@Yoej(ILvbUA9=9=U@d@)Rd9x#7jCLj;n7B3pB z9xM#h%m&hZ3>RdN$>h4`iw&(TiLTMG(LB#R-7(R3K?Trp7HJzmxX3ltO6pI(NjaSR zNL>oYG?+wqWHM}|M zsop8xAw#smM7q)is1EF|9agkUXB$qJ#In`XH8#@bj+?^oJmw{FUFgHsu{X}UiD#G$ zcV&<+*v=%`W82K_P1p`xU#nW`?=!V?Ge$psWDg=*7nD|KE+G=t?ZB%H@1Y} z&9H8N_{SAGF*Im$<%|*Kpz4TqH|Yir%hc<6#Ja6_Gx&@dzntnq)R=c*^~tH9W>yEN z+Cau%ozt<5tLcNJV)D_Bc##pWi3luJA1^$Z|jDCqCT{?s54}w z4B)5>$Lt23qfIaD7A|mo`B^#W*3T^!w?icTiQj#kU~yT02`eVhOe1FU2`L3Us;x8#x!;zC3B%a)R9#9T_le_%U>a zZZ_pJE&IfE-TTA>9jR;8lwh{+54*wRwdT_nDFU?)BRu^loeZ9{T9dm+m}g^lkaGSf zb@FK75n-aq>xk`!c71b5T#69ruw`bNqOc_ezc$qhU+OO38hhJ7b_A}qsKR+OD^c8j zCf>@@TioE&#(AwDV4@wRW$IMVG%O=cNM`?Xw@A=bYuj|FsEJmj)}x?D0N3^og3FYX zcBnBx?C9x#03veV&8u1`XyjnbKd*PaGX`oXKT zbqyFeTnlcNcm`hj`8x{UC#&a;^uFq{piciKUri*_Os)5rbcVK(daKsjJn)4!x3~f9 zf}EAE?u*tq9I6Xm-wXU0_S0pW)^>vhx;2fTNVvu{b(D~-}_ZJz5|()C)i_gB(6 zV6X8ZDS2nvKvnHaKImX|Ih(Tvqxp^SvqGj3Wv36?jC;%rnqR{;EniP3;i^aYEHi^^ zTjq1*T=DyY6Fjb>$bjS z@bV=F?RkAlyD$fNs$gF*!RqZ?7kw`Ia$D}4wa{qFf}gi*M<3Ec%V+M%F`Pumd;`6l zoo}&Yc#q^$(Gx`_pTh-3ueL2*!bp5V_1w>4V4>xZ>h4&bYE7qrlRB*gV92H&cmEx| z8X=N+>F*L@E4At|CL=BavOPsP@Xkq>{>)DGLbRDeN&j1mIvsmsrOCspNoD%)N>X<+itj#6EqCcNw_8*)t1M|M*0Il^ z^N`Fmtb4D-kJTDC5shsQ;ywb}!LiCHwiokR91h5HS6JxnhQ<^YU6EZ&9gm z$wZ5f&U2b7?!5u10%DUBQZISmbez&CP`4R$4o#<=g@tJGUfph=k5dH8jY@(N=a!mr zULOJH!BU5oee*mKzXt2d8m|7{+Uf3s*qM8m#y1fp;~rf5kfllR>pQMTY%b>>&xii2 zGF?($8-0&E{sRcv<*c%r{6z}xY+Tmf@ox=->EwR>h32M-G(XCH@*eMq)K5%LB(lsd zE~Fg{dk4b9-?`QN*7|G!@%qfS6_^ly&Ef9F0s{e3d+!X+-0@XlEXE#Gq;^NPY9|6I zBs4FR(jsDzuSjm?*H5PKDT{m4lMCUO24JuVC2i|!eBx+1yB%RVPe4a&^ym{oJ;kiu 
zl*6o*?mKkNVZmw5emkWhiTv25w1W9ZIjN5?ag|!jRHU zl+9_PqbKW%HtX(KeP>WFDM7Xq!nDIFGjXfF+~828eZ$CSeZVCzN3Xr8xml!$Ll#4< zs*0WZY1^NQiCZr`Xmk3Ru?#OL@J<#LP4q_|wAXFSN};Ck_apL-t70~tSJSRKfg%X* z9<~q5Gb?cKe(cbcSvHf@3h@c&nvd7N*7{^)N3Wy*FWQ6sSwdMh8{894JxjTe_LcBq zde4-Hv+#Aa@$rKl_^x)LA4B|gbO_Fcv}RRO`KX{ldK_1GOg!Qo`w2@29CH1Y!pZQ`mL78mPPd z?}8mdNHvw}iz)3XQ_cnTuJww>CG=83G<+y)mIs1+(ToUfdySe}DVg0pDbjsG!2K1E zznoj~-MeEjtdq`WxAIFbKMS8#>C0EyBq}17EHi%IwXrpQlqbYb!-P%?T4wPN4>oU#sp8mRH$_f?o9N-r= z_&Og-iV*M39B^&bdBr48ZvQ+NQ_C|`_ro7^2d!(s)8S0%YZNjGtYN$IBCrHMQ|2n@ zolbqJ-r3j2;zW0Gw?f4!44>j!z~#RvSpmWbmU+p?S2AW4;H{k`rnj0EqBBb^-vu^v zm=yqp$SpRs8RkO*6bl%ciA#X-hyb!Pj)mWcrJ5}03E;uD8q$!PBWzY#!~i;MHK8C1{_C}7AXJ(J6j zNgeFWSfj4ABclOk(q(&vX24ztDw^;Zp@|y&QZ^q(iZixKLARod6}3__Z0IZ%S%6y2 zCbCCWN|NGAb|^Z^1P+bDp(+MQ6`wbui;=iHKOuj$tk1LX@@!g7V0kn2)2W8E=#<;? zq&cS=Lh9CPT{Vx?^a=3-Q!?fKib|Th6WP2=D|hSeV!I7PG3#wxL&tp9qEL29Vu2o`@Owr(gMtKg=nR--WSBsx*{G zNL^Bz8I29SMUG$Rzrr4W*h6e^%?7(51OQeGroc9DZk~E#?1K*`CZ8vdP{l^YAZ2Z_ zH_|bT9-M(%h{fqWTPxSmN8Z=uV(nstVImQx^gEUXK@KB@eNE<*Pw>yylotYRYhTAf zPGtLaqwB-^I0>On1Xxm_TH!~P&gSN1l?nXv?_sagq7G<)CS-BTY^5IGh!0WKHOyGL zdHJ~J9oP9ZJgmaB`(s98zgtkvobcMWXgbo?$@20^=zy`O*hSWznt}qiUUggeY9<#~ zJnl(zax(`fuN|?ldp($ScxMij-e)5l* zX)0m&fLpHI7D!norwsVEm*hvdRGj&)q-F^B_TDP*s}~4ztPFKrmSPHRQaH@T7_=f* z^dP%#+{P-l4s@C$k_Nro-G3impHnJxyNDv?&Y24brwk@rVM&Dq@*Un>>GjIezhkmD zG3r=pLEIx7qeU?@7HT|c(JC+e>t=_PJJvd6q<8K5j@`1PWfP&eKNX$hlGMKG{=b0+;CB}PGluf4ieGcWmUMUBC1`u3Z`4A^67Hh$@u&lH| z6$gvN%0qT?tNX(trM^mhsfwZ7>Ypau^Xh}x>4qhbM7n`r#J}2XA#S;$8Z-3yW3a1@ zTo77M6x*Uped(Isol9onermiP>mS)rbqH00L(iaCkE+-O!AhT+ra+Nf z3z=#?ZjQD#1CK1ZTVa!On(}|n+Z3*3dvEjm8DetR^^@;A-y`6k>Sj9ek$J9Lv=k}H zpL6rD`XMA=6f>(5j$U3jDMiTI^u?Ds@8u!ssb_Q#{%XHTGN9Dhu{Q-G)mjgz71tQh zAiIB!7SCR#GX)k1n8&(iCL?)|I9|O+UjC+aB*E_krE9K(4@Bz0&JstYZp|-%ak2vf z(3;@CCZaiM53N_+mf7iv^7A36xp&4NgJzTKvZny=8TlCd0`WHhy z>)8?$SjDF-T0-ui$1SG;#X3_iT-Ln#IvVL5BDPOjk3IXkAzJVUJbs>B0NIB1RvOao zynhhRTdki9g;ViD&!*{OjQoj^c_>mpGWXW}CATag`FoDCqZf$CW*R{RQNt-d=>Aw& zDPXnfNzACf#JxMK_y&65^o(EJCM+0!d>pjn5hvF*1gi3DpufK9AElNfU&1`fUqGjs z;it1}ie>wQ_S@mnkaux?WncVYjiUy^on*0QY;{~2MD--KA?+j8>hBR>y|lQ~0D5%$ z*??}j#UxnxK95Hd0ivrMny9PKrlYiL*D?C&XO*b$-cJ>gNZd^)0427E8bZzPZ9R+E zE}`GFq zg=f)6H~1nr3Mrw4>G|i4T)2%RQnKKH5wT@GRsi(=irmg5(jgsBExM}u!KO;jhd#1~ zu(+`xs`d+4)}Gqz6dBgdwIDiwCTGtS=aL7+eX-nh8x@2%yyW}SQX-Yzd-`T((8~l~?rKx@*gZoBZ<1?p$kKNpXqq`?eN|>}-CQx&X zz0d6xKGb-WRuEb$#aKx60kxe?Ik^%ne?l+2%G=LI#HuA7k3Q?}v5)0;(H>_DKOO`- zG1$-hbht?3h%jbXhxVuGck8D1v^_{^ncnZ|qm(g;O#^GT^_ng_UfC;MRQ|lm3mnZ9 zm!H1vaSa3P5#A2fDO2b1O=X3dtj!>rhnOm?kf%pGazZ-aWmy zePTx`V+IA&e0R}K%C<^UmdUV=`G=!M0o{b0VxTOYY=xeQEEQ$LMmSGl@-&r)%Z6+> zpKB;H6dG6VsHkf?WvVA@kXJY_8Shy0g8Idm!jN!Z&*G zM{n0F9@4k|o?z0u`QuN|Z7uk*`on1SNv}41=Y=JwWz9&pZ0V5A@)$FE05m>eqM6Us zHLZ2|)jV9869 zbqa|}@!NZ$wP)>;=K*?c?fY9dw3!zoQ;c&HVWAi=23XWa)4Sz9iCt8nr=Nb%*ZV@G z!A5FY#RdGx+da(7bvUKR)V6PIltx+zRUUYErQ=xKH!1mT(JS@N6G^RFpn2o$6MI6~ zC&o4JH)$22Po8yGt)-5L8%5m#7O{L|b2;!@C=`$T%{(@eT(={=Q59=lpE6BWpMv<3 zG3B7jL}^2SGwme5?te;#5hI>#t2LWfy_-7jb<=(C!rZLZqK$8VYzL>B-Je=u{tb`4 ztMhYl!n?kYphZbDk%ojecwCAYreSnHggq9Xu^mUJ zd0$}0#=UJtR<*2!Rq{=sK_moL43p>87a!upX)h1Q;-5dsNexhiBgsU~!J9dR(3~5r zkCAk7HFwBpXW32Bkd)Sp?@s-t)%+8biE}czuQ0d|4;xHTa-x@W=gOE2kq8-LvRe6= zpXNK@dMS{>tfJ7RrSnY|+ZUeug7&teiUecfNc;nyFGpoXOKs#j-qT0g#zigqHMdzd z6aVZ6UIZyJbB~wA(qG%-*!O`)rmCIA-Upud#}rwk;Cj=v zuA1@N3+1>szIck#Ow96o6dv~Q3G?2)`XA_=OMv$KZeA=rcBVT9!c?nH0u`__96UDz zkwxvagn`d*4itC4{O;^-@5Dc@T?bUl0B_$SEpl6_a{l5a$3Uz^?LyxMkE!P|jLJiB zX_`OZQE3}xY1si70e9mO$6}v`U6-0d##|`=un!jR+eLl*X5wQki9Gz!7VXJSrx&n| zUHY*V{OWuD0eOcG(AxGJ<;Gnd(|na>^2Y=M@kzX(TYqU-rRHH~auzwOqDf0-;v4K; 
zDY$Pc^F6?ud4oQyaeGHAgfLeO6`9jSKnbr?A?sAooQc~VQr!=f9VQYx=LS~DYjL?G z+n~d`Vu8Z(t4621ZTDvKr>*XlW=_+zwzZaS?S+UEGCtFIftuthtG^u;J6wrsj6)e( zm7hp=U9;oG6~ z&)P5i;dA4>&TyNBMy8QHumgaD&%Px*j|dZ3p{I2TB>y&HQ|{B|O8wnLyuckqUoOhT zq0UcqhYo|z$J9q9wcz>zB@mz;U5`FWKi_wzQA=gOB2Z`s@*Q);?llmYHPDdk>)TvQ zyM7KTbx7Gx06E#oDNadjt{IYx=JaGGIRk+=*t0KPcd_fUVF${ihZXFVKJi?Tl@5FB z#j0njB)7ERile=2VgPn}>f76Dt1?#qY`3y&6-oBo|_ zV@zIj{Xy;6@ryNv*NKijDO@VoGC#Gc|N7?h;LDr~V#0bl5h-wkrD2E$lMiKEm3LiP zH#ZXWX=vfrXqDZ1{U3myk}j79`7-5HcV##d`q(X!4sw#Xj8J;`Y>0K(qg@H9KiO!L z3aD!df#ad1=yXThRXBB~Xk>`JWYNnb{_D%8fOTr~Yt4Lq1C*Q6 zl5D0N=u5rbvx5|-acDvO3_LEMZlXQ7?IeNb19wRnx?*!3S}3HP|EIRtD7fv0n-j$^ zi{7%()hZ6%U{7hEh$;E}xrSWE*M#b=kj3iwCYX54P*%4N$Ol*I>n3}L4w~c{uXIFw zt5hPLLj)S&BwNr(!R(LhobG}wXx3!`?c2w1JDODQrlP~KTpU*QETaOUMgCAwUcQA$ zG~a%HoqkrneAwGDy8(ycKtRC(PhQ_zQql6JjLu*q3?T{7}uSl znsBqFOVCTRh4+W)-0SvxKR=su{o_Dq>$D%hJ+2HJte^eb28$V?>Dwn4E=RkGHppQT z3V(R3r8OC~AK<>Z6{&Ug_1{bzd=ZM6(|Q%UyI%TTOrYlpk|(yQjttvZwHJsM6H_%* z#$3Af{Trd>euZA!cT{hm5&iRrE;I*Ty2sg@w&yvbpBV}-#a;cwW`$$AyBs~eyLkJk z2v?30DUV>gCV=R@-1u;^xK&%h#I=8Z%77i!+R9UZ$(HlsIci1Zo zzA|x2Y95^VqC)gt(a3yFgt>;E)5p@(GLO?e>1_6GO$VKgd%X69_hy}q{pZ9 zrB~=2AuTP7x2eKq6mV2+qz9*U!XAX!nu;oYU=O136uWdT88IdH1_+e4%`ng&`-J0Z z!s4S&Vr7kH2_D^q{{XBTzIngYDv1DKah<+N)O(bih#!|`$`@>y*E9=P{Nq<=K_T(Cf*Z3-agXH3HS4)>=9g~#h{n0*2hCdo2O4;fpvzfKU9Di_7Sro0yHq($C-umqTK_v4@(b9Oth-6@L~N2 z&eV$PrWS{i&UaZo)4vr|-A45H92n1R+Om|(nGM~SEzN7ZmRr~gnbg)bdP5&O@EkuE zy4PB~z%!g8P&hcpBBYItVf8?C%CkUJM~8Ug}F)>V&2hT z8C?FleiSgS+Zgx5E{N@;wTfEyUzvP^wrcuGR(nLye8^gLL4ty~rUOlt z3+;n=fly;^hyYT$T<#66ZZ5RKg7^Ey6j$cQK7Y;odtcsVe;R!n{HSz3x@VTuvv&b$ z_tJDwtgWh4DOhO}`7yubWtp2Y&`%a4c~o(Z(-3El!f&t_4SP~_7_rczFkv2^s^$ti zv+<|0f8H^TW&hfaiXsT9OC??jI6w- zu4RmLjtS*dk)(BQK_9n6TNn%(J3Of%b}(JTa2pj_Oy6U)A5~w#s>hItRf; zx^#z}p|a&DOGd%;`*`*J@E^|Uq^9?ZGi5Ek+NIZ2qHs{|5WZ}$t$TkNg4yBreyMTie<5w5x_;GicC%XPLX)MzhJD-uYxZbGyoBg^?bD>})9~_0#FNlj`FFAGfIpxdf8yU6YPb2|0Q|nAZESaQ*^e zvzNYDrWT-S#^~4%L3bd!q}IfK(BkG+)t!yz`rhA^->+{bdY+T%yPks|D6dc+{A@H2 z6Wvs$iGL^i9+hRybJ=X_K6Cn7e7@o2LKrHVxsIZTYO4Yb<2-bW8eUXK7wVoMejStWUTC9`sn)A)I- zQwj=GFflq8RVb|sgIb@!vgLYKv%*%hEh0dW2hcj~Z8WIM8uIDM4Jd4ksv9?CvXWp4 zU4EFAwSD;H9tOJvY<+Z^)O?M2A-dw=iMO^+fB8i^!_Zg;>f4x+jlZ2VBE@Jq)OzdR zEpDNh`NT})DoQ9#S$KH{aNw4Mv`2c!<4;2My*^&ItZHQBZlTS|u7jqhXR^ORv?fq& z9R7AwqIpcZX9#}*{nt)psb}+agK}_W?h38!lVFhRfP>gSuNO-*$M-|M8yl7ds#GI6 ziKKh8yEQE9!(ZGUy}~l7d_+5{RvSDE?uEH&Bp({n`O%)$GhRl>N67gGE6EwUaD3`h zO}4i;{{cy8$fcUy&TK+y^e+ccWn1!5E@(2HY!%xSC!HGIP23tW( z8`k9%C6SV0JE=R}_xPg}p;n(GG-@&i)~r6t0qrd3G|x&the062DDXUD$ZiB+2dz>6 zxqayRytr%|C6qNM@h*N9S*-F4q0xoKkj>nc)SJQR+DZFefW^2 z?3afpb#NWt{WG;Vzs2|*4bHy z!`*d8W{EP3>{W?4=PDxOoU-?dNTQ;JWJPwFWhE4e?9uN&-{0Tw^?E;F&*$^?dOlyz z=i}iXM>?T*wYro{cVXu1L~lHDdRi91;s0jDP`Rslb3F!UG#MLdc(BU$?Aer(URPzGE(mJH6by1!xWamX{q-vm09;*;Ykmw zaRc&7c21Kda9Q2sw6OH^gQRB_Rp@UV>Bi`)_y!1tD9Lz0`*6ZZIpLc&UO7j)qJ!~v zw<`cmx)0YdExbo0@MhUesq&Q@amW(7WX)wspmm#8S4J>EILS-fkZBev#c7ns&L$d# zi>BOU9U9~8g2;t)G}uVNCDkoq6X)Nj3w3)s#*MF>{*8p~RdHXN2_vQ2kGdEo2xw$x zn)>tkH4Lo6O9Kbvtv+mX!JK@&kjm;ic_&WKb%eBn2eImk` zz%=oGdQ#Gn;MJtymsm~xNc&{#$39m@eh4L=+JXfpEmbOckHIB53GiN>BzBIZ%#zlI zn(Ix2H_`>3ZzMV`Jl$Vz9HLjz!gA5{V%Cr0L&-Xwcj=*nh@LVkjm~JqVXYZsvN*LV zfqV93iYdH@Z`+(jIg&v)F@KeJQZe<@XSKguFRq8_+pLT-_asKozA4O4*XCkFy}Beq zyP6HtV`xe!4iswnbXlhy;8DyDSVNgPYC0T9Jckan2o=FhMy8~Nu3)Xa&t3jb8bK(R z`cc#0ci&$4K=3W9aPia%gExM%?X5j))3|QkTWW_lv!X8=k)?y6-Ml!Wthd@q52=x0 zY%;=Xr*uQ)hWs;tr#tHl^oclExUcQVhnDh7`U5WEIQyvNSL&`z?;@gHjjePn)mJ+Brxb5Uv9LH+a3 z;lZDxq*MO7sf!Kb&6^yd*;m5=CS_5CNr|7k>FY~kSGVgu69mXVvN-{NCY}rf{I`RMwPHscDM& zySy4S0RA*D0Y;&=#_kn5In;zK)yLleX 
z^I;c>Pn`u_-=2M78qw7h)5f^Gc*gE#ad5E|I)~RZIp_{A9&f~_dOGm7AWn;jE@cg! zgQWr#?WU3U zAkzkCE?nU#RrkBgadj3K&o09l;cKx zxN>fEJ7T|%x6F|D0*4n)-@27Z0+DP&=`%p7^8meDsUk965$7P!V`jt_QFyMdL0waK zhjIGaM~45kAC_VIIXKJNaz9b!S9r`%r3t+HnPUGR4c0%I1sD!qGx(JnC%4v$%aKDs z?q_Fr=zCYH(!W{sd3)*yDt1nC)U zU}~1li$6Cdz9Y<;oNbe`ptCR7+xbS`zZ^3>*Ud=hwuaG+3d(mFCGR~=dx|0ot)qZ^ zKFlu3Q}rHi8W~fd?QqF!-S)4Wv)o+oud?SICBIZnm!JJ|3w(@aQFs1Ar&nlS5S6zz zzBPb_oolzt<+ko#5$!a|;n#dHfTNv#H#uhB zoG7{IDm(bhby;4(j>~()`MB}md(Auto#LXaBa@~MR|X%kp5>oGJQq^1(~(WTT|Jz# z&c9GS;hpL9x%Ok%8Po@>FF}$TUNBro&MZbOVieyi3nbz$;0vJ6huN;mL7yHDaOB&xL96Z3oj*9 zSfS7gdU}Eaf~5Pv@!lT z2#VR>auyPp%S0&@K)|(l@)h=2grQzZ(m66`QAHff>M_^IqsBR5yQX-%}tmE z;Z|^HWqn(tbI}-5+EnlPX+Pz27jJMLuhM;I^|O0Pt(ULfKk(CFqhHXsb9vt_^y=56 zyJMSWB+SpoRHySP%q2_Cs1MC{rKH8cL`t$5ErE0*GKnQEIL|i|NlYsvi&jsreI*UY zSj1lFE-mt3X6`KIghH>Kedu=(D1v5`loq0yHC^QyBpVWN(Y0(D$z!XRrgk2-r;&8F z?;iimX)dDWJ-*VDCv~T^^iq0e)tVr!rgBx7`)eKmrHoG zS1+Zfdg9zt3%kWF8%~~uM~D~aKVSHLGwdYAjsLDC_p&IU@R04sc~UJmCx~`;uUb!arhs%!#A`WAtZTLL zepJR#kfe~aq}<)-9FMAoI1(jf$G#{}ygugXv$!Y#*qJVVtip-ZB$yP9nb+|@C^NQV zvP`U;R#q(m{4rVJ~;2)CR+egAxr zrcLphPU&07ZxuDbI+h5R#9Mqeytw-qyQ zxSGz=<3BM{{38SDXI<#C{=`;~8EEJX7D>5rhS}O|n8$2np>oTUU3I(fmx?@6?X+-` zP*J|aJ0Kox4Ko(#w#wiu3reB8kyr=x+u_B5EL?Kv`n3~xioW%nE<@8vLnoR;YCBueL<1OoC4{B zN?n6B1ufjW6}L$AS1)YW&+aPBCCK^P(E7tI9-(bWA)!o!#x!YmltvvMxAco*Db_|uj3bzg3={aFj5BW7hU+E z9$gjyO$B3nUvx;a4Z4I*&1Wv9ln5ySlC?mvp$-&E%%ttG+j6c|^~mPARL*DO<-&=| z8rVE5Q|+^(DLP|3+Qwva)7`R5j-l=2=3?w_BrGaJj2(2FyQo(es*X)p_4v)}`yjpa zI=hUg@Q40NGrOVCdvCYPn=-Y|{_VbNKFl)f%<;syy5T!*^tQY8(3x{0I=8vYEw<0p z*J=gl+c?)R^|V$EU46K8@;6KR3Z|x1RD0AV6E^<+WBtt)9}#lX`crksG9hLXrKsU% z&8v85$(EK=rYzyY8S}xV87EKkm*;yzJszD~cLpjs=d2A=%A+1VDUWQo_x!==3!%iH zF&$i5@u@-}va6c`zsHRVnA(9#Y5I!_pIeKFZrA8ww^ge;LQc0q=v+rHG&wX~@+7ep zGaxB9a#|m@SP3$<*-ueT4{(8SoTf21QR8q3_)fJ;zkl}G3NGm5T#IMY*V$oVr)x2! z_Tir%*%b|rt$qe}5AB}@X_D!mhaV!s>_%Yd)+?v2NH--AlD%q-vaV7sql_;67bj#U zftTY8bmUGW|7@Ypy<6=9_}416o?S9IzxmRA>$`|(>fBZ0kTlLfh6=-DDQbUYsTv8S zt6VT9n1pNz`WTeY3He=gxc{PeskX6<0+W4>emR`hg~4(5acDP5XBU3Ql0F*MPRRZ0 zt{YLeaw%pp>z269Au=ZW$54|*Mv8vWLPL4uv;pfptDZ7+HWYe4REXt5Y1OzY$L&G- zi8DpdZauLZc{uKAG)}j>)!Ab4y)KO?woVd~r5YObHO=^hj=Q8BN1vb%C_$P&FEQ(t z1V6hNOwxs6f9^Sz1{mK;HT^LqTv2>KHyezRP! zUv15ju=)jEX|HzlGX8lQZiQ1&Q&I216Z`A!Cy((vUyo>F!=AktnYdhbRnUmh+b3)O zV59uRe5pQIqzKcMT8bqid4m{T?VdoA4cgxthaL<3BRe$6GLnVzkIij>-7^nxDav^y zCmhZqaw=Du-p)$|l$3Mhc{B*@>}-tV8nnx{7Fa9%q-21Hf(QzXfW(or%tGmB2Z<;v zd~PE-w~-+2TE}E0_gFv{+LQ#mu0SncpOleO8!E$qh%gO9p?2FW`-gUIt+c9}tX(0m zr12}>3s4D0KK*Vwx6KebHB9FNwVM2%TIWAzU3xv9w@A(*t}d>equT50{u`VsP~Cg( zgsetEr_<1}FAyzf{@cuk6z}KRDZEc_oPJ9^v*%;sa_YC2OjDJlZ;QiWmP(;o-+G#7 zghR34l4{j1{4ed)(Q2vP%bT~5nQFgk)t$wy_S#rC7^N7oZLMa9zxAEI zt;2&6%G>ZCdt<)+TelLuQgYu7y65;w>yEuxw^1#(t2(%&)4IGAnsq9+mh`+=N`s0# zj&Ib=e^qXKvUzX_F&1`Gks(CZN_Z6Jc-3#1ipz8iH8w$32A{Z4&fRReT4?b@jgMvK zVo^A!H^sIWIL3GJFZ8xmxEZ)CoKmrT323eT(b?TM^7>9DF7bX;3D~18uy*rg-BRp) zO_%_kxcXF{Svn6~wNpgX3l$lkLIpF)#D2~_@ZxhjZB)+VS7+3 ze(KuIW3feH5=|)@T_D%_jUHjw*+wnp;{iqKkAnpFPAoN9UJ2D{+2+=FVJ0=%%Ae;% zb6bk9-ToMzZ&3E`P9A1~CtXA+@yYqE*s*GEKW>M*1SY12w-+cpjC4Vs|5~WR>r0(^ zgoL?%!44g8WKGGoI@>zW0>`Gf$LVA7SsZp(&-zUMK{poE)Q9FmkMHM-9~tf|M*agb zs~)kH+&JDCE>1bL%2LW!KDf8Xb|8;4I7~|Z2Otkqk7XAM4xSZ1fNU|*HS?p3{zmSp zjhKJX6hGqoRA?0~v8z-VO&N$`l8Tl71G)`7Ph3nLTE23IV0qMen0j>U=x*_k0i!~d zryZS=M+mEvgrXhseW{2|#ngh5C{YF3-?j4M%}a#jnI%Jk9i!+U6Kgm111koHpoStR z#N`K~R}6~0Lz4<04-NTY*Bz((6<;si6cazS?*2U7T&lda?(2!0SFcqSL>A4k&j_}r zgZ&vL`80?TafeJno;_E{o&rNjpx)q`X2F8<(B zZ#;ifMZLVfI_jZ@Op;=!{Dzg^OJ|-Bi&MO<^?3{* zy6WmY5LbD;*S2hs<5ekqHUhe2(I18sz4%eOLbRGYyZY+j-uwq@>$1A$Rd$gR=Tz1! 
zX=#)4RvVM#qN=ux>c;43Gj9G03a534J-J3OJd-p1#-OSo)7pG<3PB>mM}#Bpg`bR| zhDo<_eydxiF~AOIfU(YfS3%N$+IRe=Mtv(D+Ej~x00IwiVh^9y0ds(K18BHY5B3}6 z6jeU~{{WaSf33jkkDT8IE`DVbUO~HL7z7y;_b6F`?yJO2RN z{{Z42s*11TK4_3o?vpNyE$y#={*MVvD7E)7hy{_01 z5=nqydHO1j{71=N(Ek9w-z2i)FCNk|lovb?gqZ*zpVdLBku#t8dGO&~ss7zhRaaB^ z8TZ1=9^2lTPAA>fE@^0J?-TV{+L!y*TG7e5`K@h#;D3-<)qew*XN}F09i_~W>j4C1 zUFppwjt1lz0L|5EU+n(?@hf^vzsYFEwP*s`EptqPU;r|H{{Tc(T;1-eXeK6N0*;6O z0H&W$ivC4s^qIX&P%G*j%5c_An%mENtjEJX0^1(yT3_xTGMiJMMYcUNton}IF@iDz zuH!y)l{G~Re;2B>m%AkStGa%lRnXu~&B#^h{{Xsu7emv(@NM+`l-hDL^?EAJV1}Dm zk|eC+%FVvh@?A!&Aw+)-KlzaO- z2?bf{w5nC=6e;Z_mx~2!Q!=aepH6?eKJlsA+)VAWe3-HpEE;m~W>zQnA2OkYG5>oQ^+8^ig;J z0GUl!@elb{pB$e=9MUcc2LK2{8X8NQA+Zh^`@iIqKk(zsBY)xl0O9=#05r}tYq^2M zo597rmdy&}UI{WTarbqKKk*cNmcEzpzy5#O1M5kS_ofRhL4%k+YBw~pX`Vz9AVO3# z{{X~OSN{NYsw$l0$fc-zE(=ToctyC2WMAZ)RsR5utM|!BemZ}Aq_{HuvOp)ma#SXv z+*}$<&5N?5sgM3Y)ib}{XZ*zhWv+3tbcY+0Bj~1S+S!pk*pXiKKh{R*fNu~TJ z&-79CKlaa&{SiAXuA)wz?FRSzr@Oew%IPWV=~ki{J~_hk4JZCB{{Zu!&33v>555&= zjETo9T2*vZTz%m!fMiZ*m|0z3o|>f+$L)*)V#DqhgZ}{8{{Z1wn*RX6{{S)hDaW(Q zI6Y>7zo=B9LEGoz5Czs{S`{d~vZG5}Gb>|D{{W|d_`g*bUH<^azsY(MxvXRoU`drD zD>sX3nr2Ilq)qSHE#qKj$NYeRsBn*u)Jb6qp=yUeYPAN%e9 z0GKH9M!2oJOj!7HCQY0o3XoE4g5V2ynFsInQ|hwM`F~W*&&^@Bx(U+LKNE;>zyjzh zX^Vk#PoM|WD&0T!>3N_(&*nc=Z1ke~L9E-3V#Nb#1He28kYgXwTYg8GQ1bNlRv>*Q W?IJ;#7#T@3Gyed^`XsAq)Bo9M@!z!o diff --git a/lite/demo/cxx/mobile_detection/mobile_detection.cc b/lite/demo/cxx/ssd_detection/ssd_detection.cc similarity index 98% rename from lite/demo/cxx/mobile_detection/mobile_detection.cc rename to lite/demo/cxx/ssd_detection/ssd_detection.cc index 9b8f02aeed..011733eb87 100644 --- a/lite/demo/cxx/mobile_detection/mobile_detection.cc +++ b/lite/demo/cxx/ssd_detection/ssd_detection.cc @@ -194,7 +194,7 @@ void RunModel(std::string model_dir, std::string img_path) { } auto rec_out = detect_object(outptr, static_cast(cnt / 6), 0.6f, img); std::string result_name = - img_path.substr(0, img_path.find(".")) + "_detection_result.jpg"; + img_path.substr(0, img_path.find(".")) + "_ssd_detection_result.jpg"; cv::imwrite(result_name, img); } diff --git a/lite/demo/cxx/test_cv/README.md b/lite/demo/cxx/test_cv/README.md new file mode 100644 index 0000000000..36d2985a4f --- /dev/null +++ b/lite/demo/cxx/test_cv/README.md @@ -0,0 +1,131 @@ +# 图像预测库的使用 +1. 下载源码(https://github.com/PaddlePaddle/Paddle-Lite),打开LITE_WITH_CV=ON,编译full_publish模式 +example: +```shell +set BUILD_WITH_CV=ON or LITE_WITH_CV=ON +./lite/tools/build.sh +--arm_os=android +--arm_abi=armv8 +--arm_lang=gcc +--android_stl=c++_static +full_publish +``` + +2. 准备模型和优化模型 +example: +```shell +wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz +tar zxvf mobilenet_v1.tar.gz +./lite/tools/build.sh build_optimize_tool +./build.model_optimize_tool/lite/api/model_optimize_tool +--optimize_out_type=naive_buffer +--optimize_out=model_dir +--model_dir=model_dir +--prefer_int8_kernel=false +``` + +3. 
编译并运行完整test_model_cv demo +example: +```shell +cd inference_lite_lib.android.armv8/demo/cxx/test_cv +``` + +- 修改MakeFile, 注释编译test_img_propress 语句 + ```shell + test_model_cv: fetch_opencv test_model_cv.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) + + test_model_cv.o: test_model_cv.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc + + #test_img_propress: fetch_opencv test_img_propress.o + # $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_propress.o -o test_img_propress $(CXX_LIBS) $(LDFLAGS) + + #test_img_propress.o: test_img_propress.cc + # $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_propress.o -c test_img_propress.cc + + .PHONY: clean + clean: + rm -f test_model_cv.o + rm -f test_model_cv + #rm -f test_img_propress.o + #rm -f test_img_propress + ``` +- 修改../../..//cxx/include/paddle_image_preprocess.h, 修改paddle_api.h头文件的路径 + ```shell + origin: + #include "lite/api/paddle_api.h" + #include "lite/api/paddle_place.h" + now: + #include "paddle_api.h" + #include "paddle_place.h" + ``` +- 测试模型必须是优化后的模型 + +```shell +make + +adb -s device_id push mobilenet_v1 /data/local/tmp/ +adb -s device_id push test_model_cv /data/local/tmp/ +adb -s device_id push test.jpg /data/local/tmp/ +adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb -s device_id shell chmod +x /data/local/tmp/test_model_cv +adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/test_model_cv /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg 1 3 224 224 " +``` +运行成功将在控制台输出部分预测结果 + +4. 编译并运行完整test_img_preprocess demo +example: +```shell +cd inference_lite_lib.android.armv8/demo/cxx/test_cv +``` + +- 修改MakeFile, 注释编译test_model_cv 语句 + ```shell + #test_model_cv: fetch_opencv test_model_cv.o + # $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) + + #test_model_cv.o: test_model_cv.cc + # $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc + + test_img_propress: fetch_opencv test_img_propress.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_propress.o -o test_img_propress $(CXX_LIBS) $(LDFLAGS) + + test_img_propress.o: test_img_propress.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_propress.o -c test_img_propress.cc + + .PHONY: clean + clean: + #rm -f test_model_cv.o + #rm -f test_model_cv + rm -f test_img_propress.o + rm -f test_img_propress + ``` +- 修改../../..//cxx/include/paddle_image_preprocess.h, 修改paddle_api.h头文件的路径 + ```shell + origin: + #include "lite/api/paddle_api.h" + #include "lite/api/paddle_place.h" + now: + #include "paddle_api.h" + #include "paddle_place.h" + ``` +- 测试模型必须是优化后的模型 + +```shell +make + +adb -s device_id push mobilenet_v1 /data/local/tmp/ +adb -s device_id push test_img_propress /data/local/tmp/ +adb -s device_id push test.jpg /data/local/tmp/ +adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb -s device_id shell chmod +x /data/local/tmp/test_model_cv +adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/test_img_propress /data/local/tmp/test.jpg /data/local/tmp/ 3 3 1 3 224 224 /data/local/tmp/mobilenet_v1 " +adb -s device_id pull /data/local/tmp/resize.jpg ./ +adb -s device_id pull /data/local/tmp/convert.jpg ./ +adb -s device_id pull 
/data/local/tmp/flip.jpg ./ +adb -s device_id pull /data/local/tmp/rotate.jpg ./ +``` +运行成功将在控制台输出OpenCV 和 Padlle-lite的耗时;同时,将在test_cv目录下看到生成的图像预处理结果图: 如:resize.jpg、convert.jpg等 diff --git a/lite/demo/cxx/test_cv/test_img_prepross.cc b/lite/demo/cxx/test_cv/test_img_prepross.cc new file mode 100644 index 0000000000..c2cbd66cc0 --- /dev/null +++ b/lite/demo/cxx/test_cv/test_img_prepross.cc @@ -0,0 +1,389 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" // NOLINT +#include "paddle_image_preprocess.h" // NOLINT +#include "time.h" // NOLINT +typedef paddle::lite_api::Tensor Tensor; +typedef paddle::lite::utils::cv::ImageFormat ImageFormat; +typedef paddle::lite::utils::cv::FlipParam FlipParam; +typedef paddle::lite::utils::cv::TransParam TransParam; +typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess; +typedef paddle::lite_api::DataLayoutType LayoutType; +using namespace paddle::lite_api; // NOLINT + +void fill_with_mat(cv::Mat& mat, uint8_t* src) { // NOLINT + for (int i = 0; i < mat.rows; i++) { + for (int j = 0; j < mat.cols; j++) { + int tmp = (i * mat.cols + j) * 3; + cv::Vec3b& rgb = mat.at(i, j); + rgb[0] = src[tmp]; + rgb[1] = src[tmp + 1]; + rgb[2] = src[tmp + 2]; + } + } +} +void test_img(std::vector cluster_id, + std::vector thread_num, + std::string img_path, + std::string dst_path, + ImageFormat srcFormat, + ImageFormat dstFormat, + int width, + int height, + float rotate, + FlipParam flip, + LayoutType layout, + std::string model_dir, + int test_iter = 1) { + // init + // paddle::lite::DeviceInfo::Init(); + // read img and pre-process + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + int srch = img.rows; + int srcw = img.cols; + for (auto& cls : cluster_id) { + for (auto& th : thread_num) { + std::cout << "cluster: " << cls << ", threads: " << th << std::endl; + // 1. Set MobileConfig + MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode((PowerMode)cls); + config.set_threads(th); + std::cout << "model: " << model_dir; + + // 2. Create PaddlePredictor by MobileConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + // 3. 
Prepare input data from image + std::unique_ptr input_tensor(predictor->GetInput(0)); + + /* + imread(img_path, param) + IMREAD_UNCHANGED(<0) 表示加载原图,不做任何改变 + IMREAD_GRAYSCALE ( 0)表示把原图作为灰度图像加载进来 + IMREAD_COLOR (>0) 表示把原图作为RGB图像加载进来 + */ + cv::Mat img; + if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + img = imread(img_path, cv::IMREAD_COLOR); + } else if (srcFormat == ImageFormat::GRAY) { + img = imread(img_path, cv::IMREAD_GRAYSCALE); + } else { + printf("this format %d does not support \n", srcFormat); + return; + } + if (img.empty()) { + std::cout << "opencv read image " << img_path.c_str() << " failed" + << std::endl; + return; + } + int srch = img.rows; + int srcw = img.cols; + int dsth = height; + int dstw = width; + + std::cout << " input tensor size, num= " << 1 << ", channel= " << 1 + << ", height= " << srch << ", width= " << srcw + << ", srcFormat= " << (ImageFormat)srcFormat << std::endl; + // RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, NV12, + if (srcFormat == ImageFormat::GRAY) { + std::cout << "srcFormat: GRAY" << std::endl; + } + if (srcFormat == ImageFormat::BGR) { + std::cout << "srcFormat: BGR" << std::endl; + } + if (srcFormat == ImageFormat::RGB) { + std::cout << "srcFormat: RGB" << std::endl; + } + std::cout << " output tensor size, num=" << 1 << ", channel=" << 1 + << ", height=" << dsth << ", width=" << dstw + << ", dstFormat= " << (ImageFormat)dstFormat << std::endl; + + if (dstFormat == ImageFormat::GRAY) { + std::cout << "dstFormat: GRAY" << std::endl; + } + if (dstFormat == ImageFormat::BGR) { + std::cout << "dstFormat: BGR" << std::endl; + } + if (dstFormat == ImageFormat::RGB) { + std::cout << "dstFormat: RGB" << std::endl; + } + + std::cout << "Rotate = " << rotate << ", Flip = " << flip + << ", Layout = " << static_cast(layout) << std::endl; + if (static_cast(layout) != 1 && static_cast(layout) != 3) { + std::cout << "this layout" << static_cast(layout) + << " is no support" << std::endl; + } + int size = 3 * srch * srcw; + if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + size = 3 * srch * srcw; + } else if (srcFormat == ImageFormat::GRAY) { + size = srch * srcw; + } + uint8_t* src = img.data; + + int out_size = srch * srcw; + int resize = dstw * dsth; + if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { + out_size = 3 * srch * srcw; + resize = 3 * dsth * dstw; + } else if (dstFormat == ImageFormat::GRAY) { + out_size = srch * srcw; + resize = dsth * dstw; + } + // out + uint8_t* lite_dst = new uint8_t[out_size]; + uint8_t* resize_tmp = new uint8_t[resize]; + uint8_t* tv_out_ratote = new uint8_t[out_size]; + uint8_t* tv_out_flip = new uint8_t[out_size]; + std::vector shape_out = {1, 3, srch, srcw}; + + input_tensor->Resize(shape_out); + Tensor dst_tensor = *input_tensor; + std::cout << "opencv compute" << std::endl; + cv::Mat im_convert; + cv::Mat im_resize; + cv::Mat im_rotate; + cv::Mat im_flip; + double to_1 = 0; + double to_2 = 0; + double to_3 = 0; + double to_4 = 0; + double to1 = 0; + for (int i = 0; i < test_iter; i++) { + clock_t start = clock(); + clock_t begin = clock(); + // convert bgr-gray + if (dstFormat == srcFormat) { + im_convert = img; + } else if (dstFormat == ImageFormat::BGR && + srcFormat == ImageFormat::GRAY) { + cv::cvtColor(img, im_convert, cv::COLOR_GRAY2BGR); + } else if (srcFormat == ImageFormat::BGR && + dstFormat == ImageFormat::GRAY) { + cv::cvtColor(img, im_convert, cv::COLOR_BGR2GRAY); + } else if (dstFormat == srcFormat) { + printf("convert format error \n"); + 
return; + } + clock_t end = clock(); + to_1 += (end - begin); + + begin = clock(); + // resize default linear + cv::resize(im_convert, im_resize, cv::Size(dstw, dsth), 0.f, 0.f); + end = clock(); + to_2 += (end - begin); + + begin = clock(); + // rotate 90 + if (rotate == 90) { + cv::flip(im_convert.t(), im_rotate, 1); + } else if (rotate == 180) { + cv::flip(im_convert, im_rotate, -1); + } else if (rotate == 270) { + cv::flip(im_convert.t(), im_rotate, 0); + } + end = clock(); + to_3 += (end - begin); + + begin = clock(); + // flip + cv::flip(im_convert, im_flip, flip); + end = clock(); + to_4 += (end - begin); + clock_t ovet = clock(); + to1 += (ovet - start); + } + + std::cout << "Paddle-lite compute" << std::endl; + double lite_to = 0; + double lite_to_1 = 0; + double lite_to_2 = 0; + double lite_to_3 = 0; + double lite_to_4 = 0; + double lite_to_5 = 0; + TransParam tparam; + tparam.ih = srch; + tparam.iw = srcw; + tparam.oh = dsth; + tparam.ow = dstw; + tparam.flip_param = flip; + tparam.rotate_param = rotate; + + ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); + + for (int i = 0; i < test_iter; ++i) { + clock_t start = clock(); + clock_t begin = clock(); + image_preprocess.imageConvert(src, lite_dst); + clock_t end = clock(); + lite_to_1 += (end - begin); + + begin = clock(); + image_preprocess.imageResize(lite_dst, resize_tmp); + end = clock(); + lite_to_2 += (end - begin); + + begin = clock(); + image_preprocess.imageRotate( + lite_dst, tv_out_ratote, (ImageFormat)dstFormat, srcw, srch, 90); + end = clock(); + lite_to_3 += (end - begin); + + begin = clock(); + image_preprocess.imageFlip( + lite_dst, tv_out_flip, (ImageFormat)dstFormat, srcw, srch, flip); + end = clock(); + lite_to_4 += (end - begin); + + clock_t over = clock(); + lite_to += (over - start); + + begin = clock(); + image_preprocess.image2Tensor(lite_dst, + &dst_tensor, + (ImageFormat)dstFormat, + srcw, + srch, + layout, + means, + scales); + end = clock(); + lite_to_5 += (end - begin); + } + to_1 = 1000 * to_1 / CLOCKS_PER_SEC; + to_2 = 1000 * to_2 / CLOCKS_PER_SEC; + to_3 = 1000 * to_3 / CLOCKS_PER_SEC; + to_4 = 1000 * to_4 / CLOCKS_PER_SEC; + to1 = 1000 * to1 / CLOCKS_PER_SEC; + std::cout << "opencv convert run time: " << to_1 + << "ms, avg: " << to_1 / test_iter << std::endl; + std::cout << "opencv resize run time: " << to_2 + << "ms, avg: " << to_2 / test_iter << std::endl; + std::cout << "opencv rotate run time: " << to_3 + << "ms, avg: " << to_3 / test_iter << std::endl; + std::cout << "opencv flip time: " << to_4 + << "ms, avg: " << to_4 / test_iter << std::endl; + std::cout << "opencv total run time: " << to1 + << "ms, avg: " << to1 / test_iter << std::endl; + std::cout << "------" << std::endl; + + lite_to_1 = 1000 * lite_to_1 / CLOCKS_PER_SEC; + lite_to_2 = 1000 * lite_to_2 / CLOCKS_PER_SEC; + lite_to_3 = 1000 * lite_to_3 / CLOCKS_PER_SEC; + lite_to_4 = 1000 * lite_to_4 / CLOCKS_PER_SEC; + lite_to_5 = 1000 * lite_to_5 / CLOCKS_PER_SEC; + lite_to = 1000 * lite_to / CLOCKS_PER_SEC; + std::cout << "lite convert run time: " << lite_to_1 + << "ms, avg: " << lite_to_1 / test_iter << std::endl; + std::cout << "lite resize run time: " << lite_to_2 + << "ms, avg: " << lite_to_2 / test_iter << std::endl; + std::cout << "lite rotate run time: " << lite_to_3 + << "ms, avg: " << lite_to_3 / test_iter << std::endl; + std::cout << "lite flip time: " << lite_to_4 + << "ms, avg: " << lite_to_4 / test_iter << std::endl; + std::cout << "lite total run time: " << lite_to + << "ms, avg: " << lite_to / test_iter 
<< std::endl; + std::cout << "lite img2tensor time: " << lite_to_5 + << "ms, avg: " << lite_to_5 / test_iter << std::endl; + std::cout << "------" << std::endl; + + double max_ratio = 0; + double max_diff = 0; + const double eps = 1e-6f; + // save_img + std::cout << "write image: " << std::endl; + std::string resize_name = dst_path + "/resize.jpg"; + std::string convert_name = dst_path + "/convert.jpg"; + std::string rotate_name = dst_path + "/rotate.jpg"; + std::string flip_name = dst_path + "/flip.jpg"; + cv::Mat resize_mat(dsth, dstw, CV_8UC3); + cv::Mat convert_mat(srch, srcw, CV_8UC3); + cv::Mat rotate_mat; + if (rotate == 90 || rotate == 270) { + rotate_mat = cv::Mat(srcw, srch, CV_8UC3); + } else { + rotate_mat = cv::Mat(srch, srcw, CV_8UC3); + } + cv::Mat flip_mat(srch, srcw, CV_8UC3); + fill_with_mat(resize_mat, resize_tmp); + fill_with_mat(convert_mat, lite_dst); + fill_with_mat(rotate_mat, tv_out_ratote); + fill_with_mat(flip_mat, tv_out_flip); + cv::imwrite(convert_name, convert_mat); + cv::imwrite(resize_name, resize_mat); + cv::imwrite(rotate_name, rotate_mat); + cv::imwrite(flip_name, flip_mat); + delete[] lite_dst; + delete[] resize_tmp; + delete[] tv_out_ratote; + delete[] tv_out_flip; + } + } +} + +int main(int argc, char** argv) { + if (argc < 7) { + std::cerr << "[ERROR] usage: " << argv[0] + << " image_path dst_apth srcFormat dstFormat width height\n"; + exit(1); + } + std::string image_path = argv[1]; + std::string dst_path = argv[2]; + int srcFormat = atoi(argv[3]); + int dstFormat = atoi(argv[4]); + int width = atoi(argv[5]); + int height = atoi(argv[6]); + int flip = -1; + float rotate = 90; + int layout = 1; + std::string model_dir = "mobilenet_v1"; + if (argc > 7) { + model_dir = argv[7]; + } + if (argc > 8) { + flip = atoi(argv[8]); + } + if (argc > 9) { + rotate = atoi(argv[9]); + } + if (argc > 10) { + layout = atoi(argv[10]); + } + test_img({3}, + {1, 2, 4}, + image_path, + dst_path, + (ImageFormat)srcFormat, + (ImageFormat)dstFormat, + width, + height, + rotate, + (FlipParam)flip, + (LayoutType)layout, + model_dir, + 20); + return 0; +} diff --git a/lite/demo/cxx/test_cv/test_model_cv.cc b/lite/demo/cxx/test_cv/test_model_cv.cc new file mode 100644 index 0000000000..24f408bf4a --- /dev/null +++ b/lite/demo/cxx/test_cv/test_model_cv.cc @@ -0,0 +1,224 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" // NOLINT +#include "paddle_image_preprocess.h" // NOLINT +#include "time.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} +// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up +void neon_mean_scale( + const float* din, float* dout, int size, float* mean, float* scale) { + float32x4_t vmean0 = vdupq_n_f32(mean[0]); + float32x4_t vmean1 = vdupq_n_f32(mean[1]); + float32x4_t vmean2 = vdupq_n_f32(mean[2]); + float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]); + float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]); + float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]); + + float* dout_c0 = dout; + float* dout_c1 = dout + size; + float* dout_c2 = dout + size * 2; + + int i = 0; + for (; i < size - 3; i += 4) { + float32x4x3_t vin3 = vld3q_f32(din); + float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0); + float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1); + float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2); + float32x4_t vs0 = vmulq_f32(vsub0, vscale0); + float32x4_t vs1 = vmulq_f32(vsub1, vscale1); + float32x4_t vs2 = vmulq_f32(vsub2, vscale2); + vst1q_f32(dout_c0, vs0); + vst1q_f32(dout_c1, vs1); + vst1q_f32(dout_c2, vs2); + + din += 12; + dout_c0 += 4; + dout_c1 += 4; + dout_c2 += 4; + } + for (; i < size; i++) { + *(dout_c0++) = (*(din++) - mean[0]) * scale[0]; + *(dout_c0++) = (*(din++) - mean[1]) * scale[1]; + *(dout_c0++) = (*(din++) - mean[2]) * scale[2]; + } +} +void pre_process(const cv::Mat& img, int width, int height, Tensor dstTensor) { +#ifdef LITE_WITH_CV + typedef paddle::lite::utils::cv::ImageFormat ImageFormat; + typedef paddle::lite::utils::cv::FlipParam FlipParam; + typedef paddle::lite::utils::cv::TransParam TransParam; + typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess; + typedef paddle::lite_api::DataLayoutType LayoutType; + // init TransParam + TransParam tp; + tp.iw = img.cols; + tp.ih = img.rows; + tp.ow = width; + tp.oh = height; + ImageFormat srcFormat = ImageFormat::BGR; + ImageFormat dstFormat = ImageFormat::RGB; + // init ImagePreprocess + ImagePreprocess img_process(srcFormat, dstFormat, tp); + // init temp var + const uint8_t* img_ptr = reinterpret_cast(img.data); + uint8_t* rgb_ptr = new uint8_t[img.cols * img.rows * 3]; + uint8_t* resize_ptr = new uint8_t[width * height * 3]; + // do convert bgr--rgb + img_process.imageConvert(img_ptr, rgb_ptr); + // do resize + img_process.imageResize(rgb_ptr, resize_ptr); + // data--tensor and normalize + float means[3] = {103.94f, 116.78f, 123.68f}; + float scales[3] = {0.017f, 0.017f, 0.017f}; + img_process.image2Tensor( + resize_ptr, &dstTensor, LayoutType::kNCHW, means, scales); + float* data = dstTensor.mutable_data(); +#else + cv::Mat rgb_img; + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f); + cv::Mat imgf; + rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + const float* dimg = reinterpret_cast(imgf.data); + float* data = dstTensor.mutable_data(); + neon_mean_scale(dimg, data, width * height, means, scales); +#endif +} + +void RunModel(std::string model_dir, + std::string img_path, + std::vector input_shape, + PowerMode power_mode, + int thread_num, + int 
test_iter, + int warmup = 0) { + // 1. Set MobileConfig + MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode(power_mode); + config.set_threads(thread_num); + + // 2. Create PaddlePredictor by MobileConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + // 3. Prepare input data from image + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize( + {input_shape[0], input_shape[1], input_shape[2], input_shape[3]}); + auto* data = input_tensor->mutable_data(); + // read img and pre-process + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + + pre_process(img, input_shape[3], input_shape[2], *input_tensor); + + // 4. Run predictor + for (int i = 0; i < warmup; ++i) { + predictor->Run(); + } + double lps = 0.f; + double min_time = 1000000.f; + double max_time = 0.f; + for (int i = 0; i < test_iter; ++i) { + clock_t begin = clock(); + predictor->Run(); + clock_t end = clock(); + double t = (end - begin) * 1000; + t = t / CLOCKS_PER_SEC; + lps += t; + if (t < min_time) { + min_time = t; + } + if (t > max_time) { + max_time = t; + } + std::cout << "iter: " << i << ", time: " << t << " ms" << std::endl; + } + std::cout << "================== Speed Report ===================" + << std::endl; + std::cout << "Model: " << model_dir + << ", power_mode: " << static_cast(power_mode) + << ", threads num " << thread_num << ", warmup: " << warmup + << ", repeats: " << test_iter << ", avg time: " << lps / test_iter + << " ms" + << ", min time: " << min_time << " ms" + << ", max time: " << max_time << " ms." << std::endl; + + // 5. Get output and post process + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + auto* outptr = output_tensor->data(); + auto shape_out = output_tensor->shape(); + int output_num = 1; + for (int i = 0; i < shape_out.size(); ++i) { + output_num *= shape_out[i]; + } + std::cout << "output_num: " << output_num << std::endl; + for (int i = 0; i < output_num; i += 100) { + std::cout << "i: " << i << ", out: " << outptr[i] << std::endl; + } +} + +int main(int argc, char** argv) { + if (argc < 7) { + std::cerr << "[ERROR] usage: " << argv[0] + << " model_dir image_path input_shape\n"; + exit(1); + } + std::string model_dir = argv[1]; + std::string img_path = argv[2]; + std::vector input_shape; + input_shape.push_back(atoi(argv[3])); + input_shape.push_back(atoi(argv[4])); + input_shape.push_back(atoi(argv[5])); + input_shape.push_back(atoi(argv[6])); + int power_mode = 3; + int threads = 1; + int test_iter = 100; + int warmup = 10; + if (argc > 7) { + power_mode = atoi(argv[7]); + } + if (argc > 8) { + threads = atoi(argv[8]); + } + if (argc > 9) { + test_iter = atoi(argv[9]); + } + if (argc > 10) { + warmup = atoi(argv[10]); + } + RunModel(model_dir, + img_path, + input_shape, + (PowerMode)power_mode, + threads, + test_iter, + warmup); + return 0; +} diff --git a/lite/demo/cxx/yolov3_detection/yolov3_detection.cc b/lite/demo/cxx/yolov3_detection/yolov3_detection.cc new file mode 100644 index 0000000000..a9beb1ed28 --- /dev/null +++ b/lite/demo/cxx/yolov3_detection/yolov3_detection.cc @@ -0,0 +1,238 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +struct Object { + cv::Rect rec; + int class_id; + float prob; +}; + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} + +const char* class_names[] = {"person", "bicycle", "car", + "motorcycle", "airplane", "bus", + "train", "truck", "boat", + "traffic light", "fire hydrant", "stop sign", + "parking meter", "bench", "bird", + "cat", "dog", "horse", + "sheep", "cow", "elephant", + "bear", "zebra", "giraffe", + "backpack", "umbrella", "handbag", + "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", + "kite", "baseball bat", "baseball glove", + "skateboard", "surfboard", "tennis racket", + "bottle", "wine glass", "cup", + "fork", "knife", "spoon", + "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", + "carrot", "hot dog", "pizza", + "donut", "cake", "chair", + "couch", "potted plant", "bed", + "dining table", "toilet", "tv", + "laptop", "mouse", "remote", + "keyboard", "cell phone", "microwave", + "oven", "toaster", "sink", + "refrigerator", "book", "clock", + "vase", "scissors", "teddy bear", + "hair drier", "toothbrush"}; + +// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up +void neon_mean_scale(const float* din, + float* dout, + int size, + const std::vector mean, + const std::vector scale) { + if (mean.size() != 3 || scale.size() != 3) { + std::cerr << "[ERROR] mean or scale size must equal to 3\n"; + exit(1); + } + float32x4_t vmean0 = vdupq_n_f32(mean[0]); + float32x4_t vmean1 = vdupq_n_f32(mean[1]); + float32x4_t vmean2 = vdupq_n_f32(mean[2]); + float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]); + float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]); + float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]); + + float* dout_c0 = dout; + float* dout_c1 = dout + size; + float* dout_c2 = dout + size * 2; + + int i = 0; + for (; i < size - 3; i += 4) { + float32x4x3_t vin3 = vld3q_f32(din); + float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0); + float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1); + float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2); + float32x4_t vs0 = vmulq_f32(vsub0, vscale0); + float32x4_t vs1 = vmulq_f32(vsub1, vscale1); + float32x4_t vs2 = vmulq_f32(vsub2, vscale2); + vst1q_f32(dout_c0, vs0); + vst1q_f32(dout_c1, vs1); + vst1q_f32(dout_c2, vs2); + + din += 12; + dout_c0 += 4; + dout_c1 += 4; + dout_c2 += 4; + } + for (; i < size; i++) { + *(dout_c0++) = (*(din++) - mean[0]) * scale[0]; + *(dout_c0++) = (*(din++) - mean[1]) * scale[1]; + *(dout_c0++) = (*(din++) - mean[2]) * scale[2]; + } +} + +void pre_process(const cv::Mat& img, int width, int height, float* data) { + cv::Mat rgb_img; + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::resize( + rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f, cv::INTER_CUBIC); + cv::Mat imgf; + rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); + std::vector mean = {0.485f, 0.456f, 0.406f}; + 
std::vector scale = {0.229f, 0.224f, 0.225f}; + const float* dimg = reinterpret_cast(imgf.data); + neon_mean_scale(dimg, data, width * height, mean, scale); +} + +std::vector detect_object(const float* data, + int count, + float thresh, + cv::Mat& image) { // NOLINT + if (data == nullptr) { + std::cerr << "[ERROR] data can not be nullptr\n"; + exit(1); + } + std::vector rect_out; + for (int iw = 0; iw < count; iw++) { + int oriw = image.cols; + int orih = image.rows; + if (data[1] > thresh) { + Object obj; + int x = static_cast(data[2]); + int y = static_cast(data[3]); + int w = static_cast(data[4] - data[2] + 1); + int h = static_cast(data[5] - data[3] + 1); + cv::Rect rec_clip = + cv::Rect(x, y, w, h) & cv::Rect(0, 0, image.cols, image.rows); + obj.class_id = static_cast(data[0]); + obj.prob = data[1]; + obj.rec = rec_clip; + if (w > 0 && h > 0 && obj.prob <= 1) { + rect_out.push_back(obj); + cv::rectangle(image, rec_clip, cv::Scalar(0, 0, 255), 1, cv::LINE_AA); + std::string str_prob = std::to_string(obj.prob); + std::string text = std::string(class_names[obj.class_id]) + ": " + + str_prob.substr(0, str_prob.find(".") + 4); + int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL; + double font_scale = 1.f; + int thickness = 1; + cv::Size text_size = + cv::getTextSize(text, font_face, font_scale, thickness, nullptr); + float new_font_scale = w * 0.5 * font_scale / text_size.width; + text_size = cv::getTextSize( + text, font_face, new_font_scale, thickness, nullptr); + cv::Point origin; + origin.x = x + 3; + origin.y = y + text_size.height + 3; + cv::putText(image, + text, + origin, + font_face, + new_font_scale, + cv::Scalar(0, 255, 255), + thickness, + cv::LINE_AA); + + std::cout << "detection, image size: " << image.cols << ", " + << image.rows + << ", detect object: " << class_names[obj.class_id] + << ", score: " << obj.prob << ", location: x=" << x + << ", y=" << y << ", width=" << w << ", height=" << h + << std::endl; + } + } + data += 6; + } + return rect_out; +} + +void RunModel(std::string model_dir, std::string img_path) { + // 1. Set MobileConfig + MobileConfig config; + config.set_model_dir(model_dir); + + // 2. Create PaddlePredictor by MobileConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + const int in_width = 608; + const int in_height = 608; + + // 3. Prepare input data from image + // input 0 + std::unique_ptr input_tensor0(std::move(predictor->GetInput(0))); + input_tensor0->Resize({1, 3, in_height, in_width}); + auto* data0 = input_tensor0->mutable_data(); + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + pre_process(img, in_width, in_height, data0); + // input1 + std::unique_ptr input_tensor1(std::move(predictor->GetInput(1))); + input_tensor1->Resize({1, 2}); + auto* data1 = input_tensor1->mutable_data(); + data1[0] = img.rows; + data1[1] = img.cols; + + // 4. Run predictor + predictor->Run(); + + // 5. 
Get output and post process + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + auto* outptr = output_tensor->data(); + auto shape_out = output_tensor->shape(); + int64_t cnt = 1; + for (auto& i : shape_out) { + cnt *= i; + } + auto rec_out = detect_object(outptr, static_cast(cnt / 6), 0.5f, img); + std::string result_name = + img_path.substr(0, img_path.find(".")) + "_yolov3_detection_result.jpg"; + cv::imwrite(result_name, img); +} + +int main(int argc, char** argv) { + if (argc < 3) { + std::cerr << "[ERROR] usage: " << argv[0] << " model_dir image_path\n"; + exit(1); + } + std::string model_dir = argv[1]; + std::string img_path = argv[2]; + RunModel(model_dir, img_path); + return 0; +} diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index ce8b8365a8..74b86c519e 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -1,6 +1,6 @@ # NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered # to the model_optimize_tool. -if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)) +if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))) return() endif() diff --git a/lite/kernels/arm/collect_fpn_proposals_compute.cc b/lite/kernels/arm/collect_fpn_proposals_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/collect_fpn_proposals_compute.h b/lite/kernels/arm/collect_fpn_proposals_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/conditional_block_compute.cc b/lite/kernels/arm/conditional_block_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/conditional_block_compute.h b/lite/kernels/arm/conditional_block_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/conv_compute.cc b/lite/kernels/arm/conv_compute.cc index 8c76f243a6..52849a026e 100644 --- a/lite/kernels/arm/conv_compute.cc +++ b/lite/kernels/arm/conv_compute.cc @@ -110,8 +110,7 @@ void ConvCompute::PrepareForRun() { bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh); bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2)); - bool flag_dw_5x5 = pads_all_equal && - ((kw == 5 && sw == 1) || (kw == 5 && sw == 2 && pw == 2)); + bool flag_dw_5x5 = pads_all_equal && (kw == 5 && sw == 1); bool flag_dw = flag_dw_3x3 || flag_dw_5x5; if (param.groups == ic && ic == oc && kps_equal && pads_equal && @@ -156,8 +155,7 @@ void ConvCompute::PrepareForRun() { bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh); bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2)); - bool flag_dw_5x5 = pads_all_equal && - ((kw == 5 && sw == 1) || (kw == 5 && sw == 2 && pw == 2)); + bool flag_dw_5x5 = pads_all_equal && (kw == 5 && sw == 1); bool flag_dw = flag_dw_3x3 || flag_dw_5x5; if (param.groups == ic && ic == oc && kps_equal && pads_equal && diff --git a/lite/kernels/arm/conv_transpose_compute_test.cc b/lite/kernels/arm/conv_transpose_compute_test.cc deleted file mode 100644 index 298c651d9f..0000000000 --- a/lite/kernels/arm/conv_transpose_compute_test.cc +++ /dev/null @@ -1,371 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/conv_transpose_compute.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -template -static void basic_gemm(int m, - int n, - int k, - const type* a, - const type* b, - const type2* bias, - type2* c, - type2 alpha, - type2 beta, - bool trans_a = false, - bool trans_b = false, - bool flag_bias = false, - bool flag_relu = false) { -#pragma omp parallel for - for (int i = 0; i < m; ++i) { - type2 bias_data = (type2)0; - if (flag_bias) { - bias_data = bias[i]; - } - for (int j = 0; j < n; ++j) { - type2 sum = static_cast(0); - for (int l = 0; l < k; ++l) { - type av; - type bv; - if (trans_a) { - av = a[l * m + i]; - } else { - av = a[i * k + l]; - } - if (trans_b) { - bv = b[j * k + l]; - } else { - bv = b[l * n + j]; - } - sum += av * bv; - } - type2 tmp = alpha * sum + beta * c[i * n + j] + bias_data; - if (flag_relu) { - c[i * n + j] = tmp > (type2)0 ? tmp : (type2)0; - } else { - c[i * n + j] = tmp; - } - } - } -} - -//! for float, dtype1 and type2 is float -//! for int8, dytpe1 is char, dtype2 is int -template -bool deconv_basic(const Dtype1* din, - Dtype2* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const Dtype1* weights, - const Dtype2* bias, - int group, - int kernel_w, - int kernel_h, - int stride_w, - int stride_h, - int dila_w, - int dila_h, - int pad_w, - int pad_h, - bool flag_bias, - bool flag_relu) { - int m = chout * kernel_w * kernel_h / group; - int n = hin * win; - int k = chin / group; - - if (chin != chout || group != chin) { - CHECK_OR_FALSE(chin % group == 0); - CHECK_OR_FALSE(chout % group == 0); - } - - lite::Tensor workspace_tensor; - std::vector wt_shape = {1, 1, 1, group * m * n}; - workspace_tensor.Resize(wt_shape); - auto* workspace_ptr = workspace_tensor.mutable_data(); - - int group_size_in = win * hin * chin / group; - int group_size_out = wout * hout * chout / group; - int group_size_coldata = m * n; - int group_size_weights = chin * chout * kernel_w * kernel_h / (group * group); - bool flag_1x1s1p1 = (kernel_w == 1) && (kernel_h == 1) && (stride_h == 1) && - (stride_w == 1) && (pad_w == 1) && (pad_h == 1) && - (dila_w == 1) && (dila_h == 1); - - for (int i = 0; i < num; ++i) { - const Dtype1* din_batch = din + i * chin * hin * win; - Dtype2* dout_batch = dout + i * chout * hout * wout; - - Dtype2* col_data = workspace_ptr; - if (flag_1x1s1p1) { - col_data = dout_batch; - } - memset(col_data, 0, sizeof(Dtype2) * group_size_coldata); - for (int g = 0; g < group; ++g) { - const Dtype1* din_group = din_batch + g * group_size_in; - const Dtype1* weights_group = weights + g * group_size_weights; - Dtype2* coldata_group = col_data + g * group_size_coldata; - basic_gemm(m, - n, - k, - weights_group, - din_group, - nullptr, - coldata_group, - (Dtype2)1, - (Dtype2)0, - true, - false, - false, - (!flag_bias && flag_relu)); - } - if (!flag_1x1s1p1) { - lite::arm::math::col2im(col_data, - 
chout, - hout, - wout, - kernel_h, - kernel_w, - pad_h, - pad_w, - stride_h, - stride_w, - dila_h, - dila_w, - dout_batch); - } - if (flag_bias) { - lite::arm::math::fill_bias_relu( - dout_batch, bias, chout, wout * hout, flag_bias, flag_relu); - } - } - return true; -} - -template -void conv2d_transpose_compute_ref(const operators::ConvParam& param) { - const Dtype1* din = param.x->data(); - Dtype2* dout = param.output->mutable_data(); - - int num = param.x->dims()[0]; - int chout = param.output->dims()[1]; - int hout = param.output->dims()[2]; - int wout = param.output->dims()[3]; - - int chin = param.x->dims()[1]; - int hin = param.x->dims()[2]; - int win = param.x->dims()[3]; - - const Dtype1* weights = param.filter->mutable_data(); - Dtype2* bias = nullptr; - if (param.bias != nullptr) { - bias = param.bias->mutable_data(); - } - - int group = param.groups; - int kernel_h = param.filter->dims()[2]; - int kernel_w = param.filter->dims()[3]; - int stride_h = param.strides[0]; - int stride_w = param.strides[1]; - int dila_h = param.dilations[0]; - int dila_w = param.dilations[1]; - - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; - bool flag_bias = (param.bias != nullptr); - bool flag_relu = param.fuse_relu; - - deconv_basic(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - group, - kernel_w, - kernel_h, - stride_w, - stride_h, - dila_w, - dila_h, - pad_w, - pad_h, - flag_bias, - flag_relu); -} - -TEST(conv2d_transpose_arm, retrive_op) { - auto op = KernelRegistry::Global().Create( - "conv2d_transpose"); - ASSERT_FALSE(op.empty()); - ASSERT_TRUE(op.front()); -} - -TEST(conv2d_transpose_arm, init) { - Conv2DTransposeCompute compute; - ASSERT_EQ(compute.precision(), PRECISION(kFloat)); - ASSERT_EQ(compute.target(), TARGET(kARM)); -} - -TEST(conv2d_transpose_arm, compute) { - DeviceInfo::Init(); - for (auto n : {1, 2}) { - for (auto ic : {1, 3 /*, 128*/}) { - for (auto oc : {1, 3 /*, 128*/}) { - for (auto ih : {2, 8 /*, 56 , 112, 224, 512*/}) { - for (auto iw : {2, 8 /*, 56, 112, 224, 512*/}) { - for (auto flag_bias : {false, true}) { - for (auto flag_relu : {false, true}) { - for (auto dilation : {1, 2}) { - for (auto stride : {1, 2}) { - for (auto padding : {0, 1, 2}) { - for (auto ks : {2, 3, 5}) { - for (auto group : {1, 2}) { - // obtain shape - if (ic % group != 0 || oc % group != 0) { - group = 1; - } - std::vector input_shape = {n, ic, ih, iw}; - std::vector filter_shape = { - oc / group, ic, ks, ks}; - int oh = (ih - 1) * stride - 2 * padding + - dilation * (ks - 1) + 1; - int ow = (iw - 1) * stride - 2 * padding + - dilation * (ks - 1) + 1; - if (oh < 1 || ow < 1) { - break; - } - std::vector output_shape = {n, oc, oh, ow}; - std::vector bias_shape = {1, oc, 1, 1}; - - // define and resize tensor - Tensor input; - Tensor filter; - Tensor filter_copy; - Tensor bias; - Tensor output; - Tensor output_ref; - input.Resize(input_shape); - filter.Resize(filter_shape); - filter_copy.Resize(filter_shape); - output.Resize(output_shape); - output_ref.Resize(output_shape); - auto* input_data = input.mutable_data(); - auto* filter_data = filter.mutable_data(); - auto* filter_copy_data = - filter_copy.mutable_data(); - auto* output_data = output.mutable_data(); - - // initialize tensor - for (int i = 0; i < input.dims().production(); i++) { - float sign = i % 3 == 0 ? 
-1.0f : 1.0f; - input_data[i] = sign * static_cast(i % 128); - } - for (int i = 0; i < filter.dims().production(); i++) { - filter_data[i] = - i / - static_cast(filter.dims().production()); - filter_copy_data[i] = - i / static_cast( - filter_copy.dims().production()); - } - if (flag_bias) { - bias.Resize(bias_shape); - auto* bias_data = bias.mutable_data(); - for (int i = 0; i < bias.dims().production(); i++) { - bias_data[i] = static_cast(i); - } - } - - // prepare kernel params and run - std::unique_ptr ctx(new KernelContext); - ctx->As(); - Conv2DTransposeCompute conv2d_transpose; - conv2d_transpose.SetContext(std::move(ctx)); - operators::ConvParam param; - param.x = &input; - param.filter = &filter; - param.output = &output; - param.bias = nullptr; - if (flag_bias) { - bias.Resize(bias_shape); - auto* bias_data = bias.mutable_data(); - for (int i = 0; i < bias.dims().production(); i++) { - bias_data[i] = static_cast(i); - } - param.bias = &bias; - } - param.fuse_relu = flag_relu; - param.paddings = std::vector({padding, padding}); - param.strides = std::vector({stride, stride}); - param.dilations = - std::vector({dilation, dilation}); - param.groups = group; - conv2d_transpose.SetParam(param); - conv2d_transpose.Launch(); - - // invoking ref implementation and compare results - param.filter = &filter_copy; - param.output = &output_ref; - conv2d_transpose_compute_ref(param); - auto* output_ref_data = - output_ref.mutable_data(); - for (int i = 0; i < output.dims().production(); i++) { - EXPECT_NEAR( - output_data[i], output_ref_data[i], 1e-3); - } - } - } - } - } - } - } - } - } - } - } - } - } -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle -USE_LITE_KERNEL(conv2d_transpose, kARM, kFloat, kNCHW, def); diff --git a/lite/kernels/arm/distribute_fpn_proposals_compute.cc b/lite/kernels/arm/distribute_fpn_proposals_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/distribute_fpn_proposals_compute.h b/lite/kernels/arm/distribute_fpn_proposals_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/grid_sampler_compute.cc b/lite/kernels/arm/grid_sampler_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/grid_sampler_compute.h b/lite/kernels/arm/grid_sampler_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/instance_norm_compute.cc b/lite/kernels/arm/instance_norm_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/instance_norm_compute.h b/lite/kernels/arm/instance_norm_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/merge_lod_tensor_compute.cc b/lite/kernels/arm/merge_lod_tensor_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/merge_lod_tensor_compute.h b/lite/kernels/arm/merge_lod_tensor_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/merge_lod_tensor_compute_test.cc b/lite/kernels/arm/merge_lod_tensor_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/reduce_prod_compute.cc b/lite/kernels/arm/reduce_prod_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/reduce_prod_compute.h b/lite/kernels/arm/reduce_prod_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/split_lod_tensor_compute.cc b/lite/kernels/arm/split_lod_tensor_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/split_lod_tensor_compute.h b/lite/kernels/arm/split_lod_tensor_compute.h old mode 100755 new mode 100644 diff --git 
a/lite/kernels/arm/split_lod_tensor_compute_test.cc b/lite/kernels/arm/split_lod_tensor_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/yolo_box_compute.cc b/lite/kernels/arm/yolo_box_compute.cc index ad8a630b8c..38443bf277 100644 --- a/lite/kernels/arm/yolo_box_compute.cc +++ b/lite/kernels/arm/yolo_box_compute.cc @@ -32,6 +32,8 @@ void YoloBoxCompute::Run() { int class_num = param.class_num; float conf_thresh = param.conf_thresh; int downsample_ratio = param.downsample_ratio; + Boxes->clear(); + Scores->clear(); lite::arm::math::yolobox(X, ImgSize, Boxes, diff --git a/lite/kernels/cuda/CMakeLists.txt b/lite/kernels/cuda/CMakeLists.txt index bf59d02726..2df00f00a4 100644 --- a/lite/kernels/cuda/CMakeLists.txt +++ b/lite/kernels/cuda/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT LITE_WITH_CUDA) +if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_CUDA)) return() endif() diff --git a/lite/kernels/cuda/conv_compute_test.cc b/lite/kernels/cuda/conv_compute_test.cc index 2ebd7e33ba..46b63f2e31 100644 --- a/lite/kernels/cuda/conv_compute_test.cc +++ b/lite/kernels/cuda/conv_compute_test.cc @@ -15,6 +15,7 @@ #include "lite/kernels/cuda/conv_compute.h" #include #include +#include #include #include diff --git a/lite/kernels/cuda/elementwise_add_compute.cu b/lite/kernels/cuda/elementwise_add_compute.cu deleted file mode 100644 index 4bacf532a2..0000000000 --- a/lite/kernels/cuda/elementwise_add_compute.cu +++ /dev/null @@ -1,139 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "lite/backends/cuda/math/elementwise.h" -#include "lite/core/op_registry.h" -#include "lite/kernels/cuda/elementwise_add_compute.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace cuda { - -void ElementwiseAddCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto stream = ctx.exec_stream(); - - const lite::Tensor* x = param.X; - const lite::Tensor* y = param.Y; - lite::Tensor* out = param.Out; - - CHECK(x->dims().production() == y->dims().production()); - - auto* x_data = x->data(); - auto* y_data = y->data(); - auto out_data = out->mutable_data(TARGET(kCUDA)); - - int pixel_num = x->numel(); - lite::cuda::math::elementwise_add( - pixel_num, x_data, y_data, out_data, stream); - - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); -} - -void ElementwiseAddComputeNHWC::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto stream = ctx.exec_stream(); - - const lite::Tensor* x = param.X; - const lite::Tensor* y = param.Y; - lite::Tensor* out = param.Out; - - CHECK(x->dims().production() == y->dims().production()); - - auto* x_data = x->data(); - auto* y_data = y->data(); - auto out_data = out->mutable_data(TARGET(kCUDA)); - - int pixel_num = x->numel(); - lite::cuda::math::elementwise_add( - pixel_num, x_data, y_data, out_data, stream); - - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); -} - -void ElementwiseAddComputeInt8::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto stream = ctx.exec_stream(); - - const lite::Tensor* x = param.X; - const lite::Tensor* y = param.Y; - lite::Tensor* out = param.Out; - - CHECK(x->dims().production() == y->dims().production()); - - const int c = x->dims()[3]; - - auto* x_data = x->data(); - auto* y_data = y->data(); - auto out_data = out->mutable_data(TARGET(kCUDA)); - - int pixel_num = x->numel(); - float output_scale = param.output_scale; - if (c % 4 == 0) { - lite::cuda::math::elementwise_add_nhwc4_int8( - pixel_num / 4, - static_cast(x_data), - static_cast(y_data), - 1. / output_scale, - static_cast(out_data), - stream); - } else { - lite::cuda::math::elementwise_add_int8( - pixel_num, x_data, y_data, 1. 
/ output_scale, out_data, stream); - } - - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); -} - -} // namespace cuda -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(elementwise_add, - kCUDA, - kFloat, - kNCHW, - paddle::lite::kernels::cuda::ElementwiseAddCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) - .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) - .Finalize(); - -REGISTER_LITE_KERNEL(elementwise_add, - kCUDA, - kFloat, - kNHWC, - paddle::lite::kernels::cuda::ElementwiseAddComputeNHWC, - nhwc_format) - .BindInput("X", - {LiteType::GetTensorTy(TARGET(kCUDA), - PRECISION(kFloat), - DATALAYOUT(kNHWC))}) - .BindInput("Y", - {LiteType::GetTensorTy(TARGET(kCUDA), - PRECISION(kFloat), - DATALAYOUT(kNHWC))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kCUDA), - PRECISION(kFloat), - DATALAYOUT(kNHWC))}) - .Finalize(); diff --git a/lite/kernels/cuda/elementwise_add_compute.h b/lite/kernels/cuda/elementwise_add_compute.h deleted file mode 100644 index 5c3fecc5d8..0000000000 --- a/lite/kernels/cuda/elementwise_add_compute.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "lite/core/kernel.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace cuda { - -class ElementwiseAddCompute - : public KernelLite { - public: - using param_t = operators::ElementwiseParam; - - void Run() override; - virtual ~ElementwiseAddCompute() = default; -}; - -class ElementwiseAddComputeNHWC - : public KernelLite { - public: - using param_t = operators::ElementwiseParam; - - void Run() override; - virtual ~ElementwiseAddComputeNHWC() = default; -}; - -class ElementwiseAddComputeInt8 - : public KernelLite { - public: - using param_t = operators::ElementwiseParam; - - void Run() override; - virtual ~ElementwiseAddComputeInt8() = default; -}; - -} // namespace cuda -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/cuda/elementwise_add_compute_test.cc b/lite/kernels/cuda/elementwise_add_compute_test.cc deleted file mode 100644 index cc63f1470b..0000000000 --- a/lite/kernels/cuda/elementwise_add_compute_test.cc +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/cuda/elementwise_add_compute.h" -#include -#include -#include -#include "lite/api/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace cuda { - -using Tensor = lite::Tensor; - -static void ElementwiseAddRef(float* x, float* y, float* out, int num) { - for (int i = 0; i < num; ++i) { - out[i] = x[i] + y[i]; - } -} - -TEST(elementwise_add, normal) { - ElementwiseAddCompute elementwise_add_kernel; - std::unique_ptr ctx(new KernelContext); - auto& context = ctx->As(); - - operators::ElementwiseParam param; - Tensor x, y, out; - Tensor x_cpu, y_cpu, out_cpu; - Tensor x_ref, y_ref, out_ref; - - const int n = 1; - const int c = 3; - const int h = 2000; - const int w = 2000; - - x.Resize({n, c, h, w}); - y.Resize({n, c, h, w}); - out.Resize({n, c, h, w}); - x_cpu.Resize({n, c, h, w}); - y_cpu.Resize({n, c, h, w}); - out_cpu.Resize({n, c, h, w}); - x_ref.Resize({n, c, h, w}); - y_ref.Resize({n, c, h, w}); - out_ref.Resize({n, c, h, w}); - - auto* out_data = out.mutable_data(TARGET(kCUDA)); - - auto* x_cpu_data = x_cpu.mutable_data(); - auto* y_cpu_data = y_cpu.mutable_data(); - auto* out_cpu_data = out_cpu.mutable_data(); - - auto* x_ref_data = x_ref.mutable_data(); - auto* y_ref_data = y_ref.mutable_data(); - auto* out_ref_data = out_ref.mutable_data(); - - for (int i = 0; i < x_cpu.numel(); ++i) { - x_cpu_data[i] = i + 5.0; - x_ref_data[i] = i + 5.0; - } - for (int i = 0; i < y_cpu.numel(); ++i) { - y_cpu_data[i] = i - 5.0; - y_ref_data[i] = i - 5.0; - } - - x.Assign(x_cpu_data, x_cpu.dims()); - y.Assign(y_cpu_data, y_cpu.dims()); - - param.X = &x; - param.Y = &y; - param.Out = &out; - elementwise_add_kernel.SetParam(param); - - cudaStream_t stream; - cudaStreamCreate(&stream); - context.SetExecStream(stream); - - elementwise_add_kernel.SetContext(std::move(ctx)); - elementwise_add_kernel.Launch(); - cudaDeviceSynchronize(); - - CopySync( - out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); - ElementwiseAddRef(x_ref_data, y_ref_data, out_ref_data, out.numel()); - for (int i = 0; i < out.numel(); i++) { - EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); - } -} - -TEST(elementwise_add, int8_out) { - ElementwiseAddComputeInt8 elementwise_add_kernel; - std::unique_ptr ctx(new KernelContext); - auto& context = ctx->As(); - - operators::ElementwiseParam param; - Tensor x, y, out; - Tensor x_cpu, y_cpu, out_cpu; - - const int n = 1; - const int h = 36; - const int w = 36; - const int c = 125; - - x.Resize({n, h, w, c}); - y.Resize({n, h, w, c}); - out.Resize({n, h, w, c}); - x_cpu.Resize({n, h, w, c}); - y_cpu.Resize({n, h, w, c}); - out_cpu.Resize({n, h, w, c}); - - auto* out_data = out.mutable_data(TARGET(kCUDA)); - - auto* x_cpu_data = x_cpu.mutable_data(); - auto* y_cpu_data = y_cpu.mutable_data(); - auto* out_cpu_data = out_cpu.mutable_data(); - - for (int i = 0; i < x_cpu.numel(); ++i) { - x_cpu_data[i] = i + 5.0; - } - for (int i = 0; i < y_cpu.numel(); ++i) { - y_cpu_data[i] = i; - } - - x.Assign(x_cpu_data, x_cpu.dims()); - y.Assign(y_cpu_data, y_cpu.dims()); - - param.X = &x; - param.Y = &y; - param.Out = &out; - param.output_scale = 50 / 127.; - elementwise_add_kernel.SetParam(param); - - cudaStream_t stream; - cudaStreamCreate(&stream); - context.SetExecStream(stream); - - elementwise_add_kernel.SetContext(std::move(ctx)); - auto start = GetCurrentUS(); - for (int i = 0; i < 1000000; i++) { - 
elementwise_add_kernel.Launch(); - } - LOG(INFO) << "time: " << (GetCurrentUS() - start) / 1000000.; - - CopySync( - out_cpu_data, out_data, sizeof(int8_t) * out.numel(), IoDirection::DtoH); - for (int i = 0; i < out.numel(); i++) { - // LOG(INFO) << float(out_cpu_data[i]); - } -} - -} // namespace cuda -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/cuda/mul_compute.h b/lite/kernels/cuda/mul_compute.h index c2fc4364ef..320b562128 100644 --- a/lite/kernels/cuda/mul_compute.h +++ b/lite/kernels/cuda/mul_compute.h @@ -93,7 +93,6 @@ class MulCompute : public KernelLite { .Slice(param.y_num_col_dims, param.y->dims().size()) .production()); CHECK_EQ(x_w, y_h) << "x_w must be equal with y_h"; - LOG(INFO) << x_h << " " << x_w << " " << y_h << " " << y_w; mul_compute(blas, x_data, x_h, x_w, y_data, y_h, y_w, out_data); } diff --git a/lite/kernels/cuda/sequence_pool_concat_compute.cu b/lite/kernels/cuda/sequence_pool_concat_compute.cu old mode 100755 new mode 100644 diff --git a/lite/kernels/cuda/sequence_pool_concat_compute.h b/lite/kernels/cuda/sequence_pool_concat_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/cuda/yolo_box_compute.cu b/lite/kernels/cuda/yolo_box_compute.cu index 0a00c06cbf..6b4b2875f3 100644 --- a/lite/kernels/cuda/yolo_box_compute.cu +++ b/lite/kernels/cuda/yolo_box_compute.cu @@ -233,7 +233,7 @@ REGISTER_LITE_KERNEL(yolo_box, DATALAYOUT(kNCHW))}) .BindInput("ImgSize", {LiteType::GetTensorTy(TARGET(kCUDA), - PRECISION(kFloat), + PRECISION(kInt32), DATALAYOUT(kNCHW))}) .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kCUDA), diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt old mode 100644 new mode 100755 index 7c47e72872..f6c3a39949 --- a/lite/kernels/fpga/CMakeLists.txt +++ b/lite/kernels/fpga/CMakeLists.txt @@ -1,4 +1,4 @@ -if (NOT LITE_WITH_FPGA) +if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_FPGA)) return() endif() diff --git a/lite/kernels/fpga/calib_compute.cc b/lite/kernels/fpga/calib_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/conv_compute.cc b/lite/kernels/fpga/conv_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/conv_compute.h b/lite/kernels/fpga/conv_compute.h old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/dropout_compute.cc b/lite/kernels/fpga/dropout_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/elementwise_compute.cc b/lite/kernels/fpga/elementwise_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/fc_compute.h b/lite/kernels/fpga/fc_compute.h old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/feed_compute.cc b/lite/kernels/fpga/feed_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/feed_compute.h b/lite/kernels/fpga/feed_compute.h old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/fetch_compute.h b/lite/kernels/fpga/fetch_compute.h old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/gru_compute.h b/lite/kernels/fpga/gru_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/im2sequence_compute.cc b/lite/kernels/fpga/im2sequence_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/im2sequence_compute.h b/lite/kernels/fpga/im2sequence_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/mul_compute.h b/lite/kernels/fpga/mul_compute.h old mode 100755 new mode 100644 diff --git 
a/lite/kernels/fpga/multiclass_nms_compute.cc b/lite/kernels/fpga/multiclass_nms_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/norm_compute.cc b/lite/kernels/fpga/norm_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/norm_compute.h b/lite/kernels/fpga/norm_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/pooling_compute_test.cc b/lite/kernels/fpga/pooling_compute_test.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/prior_box_compute.cc b/lite/kernels/fpga/prior_box_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/prior_box_compute.h b/lite/kernels/fpga/prior_box_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/reshape_compute.cc b/lite/kernels/fpga/reshape_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/scale_compute.cc b/lite/kernels/fpga/scale_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/scale_compute.h b/lite/kernels/fpga/scale_compute.h old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/softmax_compute.cc b/lite/kernels/fpga/softmax_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/transpose_compute.cc b/lite/kernels/fpga/transpose_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt index c84e996f4c..2c516e47e4 100644 --- a/lite/kernels/npu/bridges/CMakeLists.txt +++ b/lite/kernels/npu/bridges/CMakeLists.txt @@ -1,11 +1,10 @@ -if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU AND NOT LITE_WITH_BM) +if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU) return() endif() lite_cc_library(subgraph_bridge_registry SRCS registry.cc DEPS op) - lite_cc_library(subgraph_bridge_engine SRCS engine.cc DEPS tensor op scope program) diff --git a/lite/kernels/npu/bridges/act_op.cc b/lite/kernels/npu/bridges/act_op.cc index 62eb649e0e..a4d1009f1b 100644 --- a/lite/kernels/npu/bridges/act_op.cc +++ b/lite/kernels/npu/bridges/act_op.cc @@ -43,33 +43,34 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(out_type->layout() == DATALAYOUT(kNCHW)); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Act node - auto act_node = graph->AddNode(out_name); - act_node->set_input_x(*x_node); + auto act_node = graph->Add(out_name); + auto act_op = act_node->data(); + act_op->set_input_x(*x_node->data()); // TODO(hong19860320) set the coef value for act Ops, such as leaky_relu, // clipped_relu etc. 
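Note on the pattern repeated through these NPU bridge hunks: converters no longer hold typed ge operator pointers; graph->Add()/Get() hand back a generic Node wrapper, and the concrete HiAI IR operator is fetched through a typed data<T>() accessor. Below is a minimal, self-contained sketch of that idiom; Operator and Activation here are simplified stand-ins for illustration, not the real HiAI classes.

#include <memory>
#include <utility>

struct Operator {                 // stand-in for the HiAI base operator type
  virtual ~Operator() = default;
};

struct Activation : Operator {    // stand-in for an activation IR op
  void set_attr_mode(int mode) { mode_ = mode; }
  int mode_{0};
};

class Node {
 public:
  explicit Node(std::shared_ptr<Operator> data) : data_(std::move(data)) {}

  // Typed view of the wrapped operator; callers name the concrete op type
  // exactly once, at the point where they set inputs or attributes.
  template <typename T>
  std::shared_ptr<T> data() {
    return std::static_pointer_cast<T>(data_);
  }

 private:
  std::shared_ptr<Operator> data_;
};

int main() {
  Node act_node(std::make_shared<Activation>());
  act_node.data<Activation>()->set_attr_mode(1);  // same shape as act_op.cc
  return 0;
}

The gain is uniformity: a converter can pass Node handles around for variable, const, and data nodes alike, and only commits to a concrete operator type at the use site.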
- act_node->set_attr_mode(CvtActMode(op_type)); + act_op->set_attr_mode(CvtActMode(op_type)); if (op_type == "relu_clipped") { auto Relu_clipped_coef = op_info->GetAttr("Relu_clipped_coef"); - act_node->set_attr_coef(Relu_clipped_coef); + act_op->set_attr_coef(Relu_clipped_coef); } else if (op_type == "relu6") { float Relu_clipped_coef = 6.f; - act_node->set_attr_coef(Relu_clipped_coef); + act_op->set_attr_coef(Relu_clipped_coef); } else if (op_type == "leaky_relu") { auto alpha = op_info->GetAttr("alpha"); - act_node->set_attr_negative_slope(alpha); + act_op->set_attr_negative_slope(alpha); } else if (op_type == "hard_sigmoid") { auto slope = op_info->GetAttr("slope"); auto offset = op_info->GetAttr("offset"); - act_node->set_attr_negative_slope(slope); - act_node->set_attr_coef(offset); + act_op->set_attr_negative_slope(slope); + act_op->set_attr_coef(offset); } return SUCCESS; } @@ -79,25 +80,27 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - sigmoid, +REGISTER_SUBGRAPH_BRIDGE(sigmoid, + kNPU, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, relu, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, tanh, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - relu_clipped, +REGISTER_SUBGRAPH_BRIDGE(relu, kNPU, paddle::lite::subgraph::npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(tanh, kNPU, paddle::lite::subgraph::npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(relu_clipped, + kNPU, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, relu6, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - leaky_relu, +REGISTER_SUBGRAPH_BRIDGE(relu6, + kNPU, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, abs, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - softsign, +REGISTER_SUBGRAPH_BRIDGE(leaky_relu, + kNPU, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - softplus, +REGISTER_SUBGRAPH_BRIDGE(abs, kNPU, paddle::lite::subgraph::npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(softsign, + kNPU, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - hard_sigmoid, +REGISTER_SUBGRAPH_BRIDGE(softplus, + kNPU, + paddle::lite::subgraph::npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(hard_sigmoid, + kNPU, paddle::lite::subgraph::npu::ActConverter); diff --git a/lite/kernels/npu/bridges/argmax_op.cc b/lite/kernels/npu/bridges/argmax_op.cc old mode 100755 new mode 100644 index 835d4dd1ed..3d397aab9d --- a/lite/kernels/npu/bridges/argmax_op.cc +++ b/lite/kernels/npu/bridges/argmax_op.cc @@ -44,20 +44,21 @@ int ArgmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { int axis = op_info->GetAttr("axis"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Axis node - auto axis_const_node = graph->AddNode(out_name + "/axis", axis); + auto axis_node = graph->Add(out_name + "/axis", axis); // Argmax node - auto argmax_node = graph->AddNode(out_name); - argmax_node->set_input_x1(*x_node); - argmax_node->set_input_x2(*axis_const_node); + auto argmax_node = graph->Add(out_name); + auto argmax_op = argmax_node->data(); + argmax_op->set_input_x1(*x_node->data()); + 
argmax_op->set_input_x2(*axis_node->data()); return SUCCESS; } @@ -66,6 +67,6 @@ int ArgmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - arg_max, +REGISTER_SUBGRAPH_BRIDGE(arg_max, + kNPU, paddle::lite::subgraph::npu::ArgmaxConverter); diff --git a/lite/kernels/npu/bridges/argmax_op_test.cc b/lite/kernels/npu/bridges/argmax_op_test.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/npu/bridges/batch_norm_op.cc b/lite/kernels/npu/bridges/batch_norm_op.cc index 57b52cf745..d151fd8d7b 100644 --- a/lite/kernels/npu/bridges/batch_norm_op.cc +++ b/lite/kernels/npu/bridges/batch_norm_op.cc @@ -67,30 +67,31 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { bool use_global_stats = op_info->GetAttr("use_global_stats"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Scale, Bias, Mean, Variance node - auto scale_const_node = graph->AddNode(scale_name, *scale); - auto bias_const_node = graph->AddNode(bias_name, *bias); - auto mean_const_node = graph->AddNode(mean_name, *mean); - auto variance_const_node = graph->AddNode(variance_name, *variance); + auto scale_node = graph->Add(scale_name, *scale); + auto bias_node = graph->Add(bias_name, *bias); + auto mean_node = graph->Add(mean_name, *mean); + auto variance_node = graph->Add(variance_name, *variance); // Batch Norm node - auto batch_norm_node = graph->AddNode(y_name); - batch_norm_node->set_input_x(*x_node); - batch_norm_node->set_input_scale(*scale_const_node); - batch_norm_node->set_input_offset(*bias_const_node); - batch_norm_node->set_input_mean(*mean_const_node); - batch_norm_node->set_input_variance(*variance_const_node); - batch_norm_node->set_attr_momentum(momentum); - batch_norm_node->set_attr_epsilon(epsilon); - batch_norm_node->set_attr_mode(mode); - batch_norm_node->set_attr_use_global_stats(use_global_stats); + auto batch_norm_node = graph->Add(y_name); + auto batch_norm_op = batch_norm_node->data(); + batch_norm_op->set_input_x(*x_node->data()); + batch_norm_op->set_input_scale(*scale_node->data()); + batch_norm_op->set_input_offset(*bias_node->data()); + batch_norm_op->set_input_mean(*mean_node->data()); + batch_norm_op->set_input_variance(*variance_node->data()); + batch_norm_op->set_attr_momentum(momentum); + batch_norm_op->set_attr_epsilon(epsilon); + batch_norm_op->set_attr_mode(mode); + batch_norm_op->set_attr_use_global_stats(use_global_stats); return SUCCESS; } @@ -99,6 +100,6 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - batch_norm, +REGISTER_SUBGRAPH_BRIDGE(batch_norm, + kNPU, paddle::lite::subgraph::npu::BatchNormConverter); diff --git a/lite/kernels/npu/bridges/batch_norm_op_test.cc b/lite/kernels/npu/bridges/batch_norm_op_test.cc deleted file mode 100644 index 38a876efb7..0000000000 --- a/lite/kernels/npu/bridges/batch_norm_op_test.cc +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/batch_norm_op.h" -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { -namespace bridges { - -template -void batch_norm_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto y = scope->FindVar(op_info->Output("Y").front())->GetMutable(); - auto bias = - scope->FindVar(op_info->Input("Bias").front())->GetMutable(); - auto scale = - scope->FindVar(op_info->Input("Scale").front())->GetMutable(); - auto mean = - scope->FindVar(op_info->Input("Mean").front())->GetMutable(); - auto variance = - scope->FindVar(op_info->Input("Variance").front())->GetMutable(); - - auto x_data = x->data(); - auto y_data = y->mutable_data(); - auto scale_data = scale->mutable_data(); - auto bias_data = bias->mutable_data(); - auto mean_data = mean->mutable_data(); - auto variance_data = variance->mutable_data(); - DDim x_dims = x->dims(); - - float epsilon = op_info->GetAttr("epsilon"); - float momentum = op_info->GetAttr("momentum"); - auto data_layout = op_info->GetAttr("data_layout"); - - bool global_stats = op_info->GetAttr("use_global_stats"); - if (global_stats) { - int64_t outer_size = 0; - int64_t channel_size = 0; - int64_t inner_size = 0; - if (data_layout == "NCHW") { - outer_size = x_dims[0]; - channel_size = x_dims[1]; - inner_size = x_dims.Slice(2, x_dims.size()).production(); - } else { - LOG(FATAL) << "Unknown storage order: " << data_layout; - } - auto x_ptr = x_data; - auto y_ptr = y_data; - for (int o = 0; o < outer_size; o++) { - for (int c = 0; c < channel_size; c++) { - for (int i = 0; i < inner_size; i++) { - dtype norm_x = - (*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon); - *y_ptr = norm_x * scale_data[c] + bias_data[c]; - x_ptr++; - y_ptr++; - } - } - } - } -} - -void test_batch_norm( - int bs, int ic, int ih, int iw, float epsilon, float momentum) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - std::string scale_var_name = "scale"; - std::string bias_var_name = "bias"; - std::string mean_var_name = "mean"; - std::string variance_var_name = "variance"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* scale = scope.Var(scale_var_name)->GetMutable(); - auto* bias = scope.Var(bias_var_name)->GetMutable(); - auto* mean = scope.Var(mean_var_name)->GetMutable(); - auto* variance = scope.Var(variance_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - scale->Resize({ic}); - bias->Resize({ic}); - mean->Resize({ic}); - variance->Resize({ic}); - - // initialize input&output data - FillTensor(x); - FillTensor(scale); - FillTensor(bias); - FillTensor(mean); - // variance > 0 - FillTensor(variance, 1.f, 5.f); 
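The reference implementation in this deleted test is worth keeping on record: it is plain inference-mode batch norm, normalizing each channel with the running statistics and then applying the affine transform. A standalone restatement of the same math (hypothetical helper name; NCHW layout and use_global_stats == true assumed, as in the test):

#include <cmath>

// y = (x - mean[c]) / sqrt(var[c] + eps) * scale[c] + bias[c], per channel c.
// n = batch size, c = channels, hw = spatial size per channel.
void BatchNormRefNCHW(const float* x, float* y,
                      const float* scale, const float* bias,
                      const float* mean, const float* variance,
                      int n, int c, int hw, float epsilon) {
  for (int b = 0; b < n; ++b) {
    for (int ch = 0; ch < c; ++ch) {
      const float inv_std = 1.0f / std::sqrt(variance[ch] + epsilon);
      const int base = (b * c + ch) * hw;
      for (int i = 0; i < hw; ++i) {
        y[base + i] = (x[base + i] - mean[ch]) * inv_std * scale[ch] + bias[ch];
      }
    }
  }
}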
- - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("batch_norm"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetInput("Scale", {scale_var_name}); - opdesc.SetInput("Bias", {bias_var_name}); - opdesc.SetInput("Mean", {mean_var_name}); - opdesc.SetInput("Variance", {variance_var_name}); - opdesc.SetOutput("Y", {out_var_name}); - opdesc.SetAttr("is_test", 1); - opdesc.SetAttr("use_global_stats", true); - opdesc.SetAttr("epsilon", epsilon); - opdesc.SetAttr("momentum", momentum); - opdesc.SetAttr("data_layout", std::string("NCHW")); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - batch_norm_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, batch_norm) { - for (auto bs : {1, 4, 7}) { - for (auto ic : {1, 4, 7}) { - for (auto ih : {1, 4, 7}) { - for (auto iw : {1, 4, 7}) { - for (auto epsilon : {1e-4f, 1e-5f}) { - for (auto momentum : {0.9f, 0.99f}) { - test_batch_norm(bs, ic, ih, iw, epsilon, momentum); - } - } - } - } - } - } -} - -} // namespace bridges -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(batch_norm); -USE_NPU_BRIDGE(batch_norm); diff --git a/lite/kernels/npu/bridges/concat_op.cc b/lite/kernels/npu/bridges/concat_op.cc index 44a2734c89..e40af8703d 100644 --- a/lite/kernels/npu/bridges/concat_op.cc +++ b/lite/kernels/npu/bridges/concat_op.cc @@ -44,21 +44,22 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Traverse all of input nodes which are added into the new created concat // node - auto concat_node = graph->AddNode(out_name); - concat_node->set_attr_axis(axis); - concat_node->set_attr_N(num); - concat_node->create_dynamic_input_x(num); + auto concat_node = graph->Add(out_name); + auto concat_op = concat_node->data(); + concat_op->set_attr_axis(axis); + concat_op->set_attr_N(num); + concat_op->create_dynamic_input_x(num); int idx = 1; for (auto& x_name : x_names) { auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } - concat_node->set_dynamic_input_x(idx, *x_node); + concat_op->set_dynamic_input_x(idx, *x_node->data()); idx++; } return SUCCESS; @@ -69,6 +70,6 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - concat, +REGISTER_SUBGRAPH_BRIDGE(concat, + kNPU, paddle::lite::subgraph::npu::ConcatConverter); diff --git a/lite/kernels/npu/bridges/conv_op.cc b/lite/kernels/npu/bridges/conv_op.cc index 6b34e76880..60877f768b 100644 --- a/lite/kernels/npu/bridges/conv_op.cc +++ b/lite/kernels/npu/bridges/conv_op.cc @@ -67,11 +67,11 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(dilations.size(), 2L); // Input node - std::shared_ptr input_node = nullptr; - if (graph->HasNode(input_name)) { - input_node = graph->GetNode(input_name); + std::shared_ptr input_node = nullptr; + if 
(graph->Has(input_name)) { + input_node = graph->Get(input_name); } else { - input_node = graph->AddNode(input_name, input_dims); + input_node = graph->Add(input_name, *input); } if (paddings.size() == 2L) { @@ -109,104 +109,102 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // Filter node - auto filter_const_node = graph->AddNode(filter_name, *filter); + auto filter_node = graph->Add(filter_name, *filter); // Add bias node if exists bias // Supports the bias nodes with the following dimensions // 0: {oc} // 1: {1, oc, oh, ow} // 2: {n, oc, oh, ow} - std::shared_ptr bias_node = nullptr; + std::shared_ptr bias_node = nullptr; bool is_channel_bias = false; if (HasInputArg(op_info, scope, "Bias")) { auto bias_name = op_info->Input("Bias").front(); - auto bias_type = kernel->GetInputDeclType("Bias"); - CHECK(bias_type->precision() == PRECISION(kFloat)); - CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); - auto bias = scope->FindMutableTensor(bias_name); - auto bias_dims = bias->dims(); - auto bias_data_size = bias_dims.production(); - auto output_data_size = output_dims.production(); - std::vector bias_shape; - if (bias_data_size == oc) { - // 0: {oc} - bias_shape = {1, oc, 1, 1}; - is_channel_bias = true; - } else if (bias_data_size == output_data_size / bs) { - // 1: {1, oc, oh, ow} - bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]}; - } else if (bias_data_size == output_data_size) { - // 2: {n, oc, oh, ow} - bias_shape = output_dims.Vectorize(); + if (graph->Has(bias_name)) { + bias_node = graph->Get(bias_name); } else { - LOG(WARNING) << "[NPU] Bias dimension " << bias_dims - << " isn't supported in conv2d Op when output dimension is " - << output_dims; - return FAILED; - } - if (graph->HasNode(bias_name)) { - // Bias node from input node - bias_node = graph->GetNode(bias_name); - } else { - // Bias node with const data - bias_node = graph->AddNode(bias_name, *bias, bias_shape); + auto bias_type = kernel->GetInputDeclType("Bias"); + CHECK(bias_type->precision() == PRECISION(kFloat)); + CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + auto bias_data_size = bias_dims.production(); + auto output_data_size = output_dims.production(); + std::vector bias_shape; + if (bias_data_size == oc) { + // 0: {oc} + bias_shape = {1, oc, 1, 1}; + is_channel_bias = true; + } else if (bias_data_size == output_data_size / bs) { + // 1: {1, oc, oh, ow} + bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]}; + } else if (bias_data_size == output_data_size) { + // 2: {n, oc, oh, ow} + bias_shape = output_dims.Vectorize(); + } else { + LOG(WARNING) + << "[NPU] Bias dimension " << bias_dims + << " isn't supported in conv2d Op when output dimension is " + << output_dims; + return FAILED; + } + bias_node = graph->Add(bias_name, *bias, bias_shape); } } // Conv node - std::shared_ptr conv_node = nullptr; + std::shared_ptr conv_node = nullptr; if (use_depthwise_conv && is_depthwise_mode) { - auto depthwise_conv_node = - graph->AddNode(output_name); - depthwise_conv_node->set_input_x(*input_node); - depthwise_conv_node->set_input_filter(*filter_const_node); - depthwise_conv_node->set_attr_mode(1); - depthwise_conv_node->set_attr_algo(0); - depthwise_conv_node->set_attr_format(0); // NCHW - depthwise_conv_node->set_attr_pad_mode(5); // VALID - depthwise_conv_node->set_attr_group(groups); - depthwise_conv_node->set_attr_pad(ge::AttrValue::LIST_INT( + conv_node = graph->Add(output_name); + 
auto conv_op = conv_node->data(); + conv_op->set_input_x(*input_node->data()); + conv_op->set_input_filter(*filter_node->data()); + conv_op->set_attr_mode(1); + conv_op->set_attr_algo(0); + conv_op->set_attr_format(0); // NCHW + conv_op->set_attr_pad_mode(5); // VALID + conv_op->set_attr_group(groups); + conv_op->set_attr_pad(ge::AttrValue::LIST_INT( {paddings[0], paddings[1], paddings[2], paddings[3]})); - depthwise_conv_node->set_attr_dilation( + conv_op->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); - depthwise_conv_node->set_attr_stride( - ge::AttrValue::LIST_INT({strides[0], strides[1]})); - depthwise_conv_node->set_attr_kernel( + conv_op->set_attr_stride(ge::AttrValue::LIST_INT({strides[0], strides[1]})); + conv_op->set_attr_kernel( ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); - conv_node = depthwise_conv_node; // ConvolutionDepthwise Op doesn't support bias, so append Add node to // support bias if (bias_node != nullptr) { - auto add_node = graph->AddNode(output_name); - add_node->set_input_x1(*depthwise_conv_node); - add_node->set_input_x2(*bias_node); + auto add_node = graph->Add(output_name); + auto add_op = add_node->data(); + add_op->set_input_x1(*conv_node->data()); + add_op->set_input_x2(*bias_node->data()); conv_node = add_node; } } else { - auto common_conv_node = graph->AddNode(output_name); - common_conv_node->set_input_x(*input_node); - common_conv_node->set_input_w(*filter_const_node); - common_conv_node->set_attr_mode(1); - common_conv_node->set_attr_pad_mode(0); // NOTSET - common_conv_node->set_attr_group(groups); - common_conv_node->set_attr_pad(ge::AttrValue::LIST_INT( + conv_node = graph->Add(output_name); + auto conv_op = conv_node->data(); + conv_op->set_input_x(*input_node->data()); + conv_op->set_input_w(*filter_node->data()); + conv_op->set_attr_mode(1); + conv_op->set_attr_pad_mode(0); // NOTSET + conv_op->set_attr_group(groups); + conv_op->set_attr_pad(ge::AttrValue::LIST_INT( {paddings[0], paddings[0], paddings[2], paddings[2]})); - common_conv_node->set_attr_dilation( + conv_op->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); - common_conv_node->set_attr_stride( - ge::AttrValue::LIST_INT({strides[0], strides[1]})); - common_conv_node->set_attr_kernel( + conv_op->set_attr_stride(ge::AttrValue::LIST_INT({strides[0], strides[1]})); + conv_op->set_attr_kernel( ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); - conv_node = common_conv_node; // Convolution Op only support bias with dimension {1, oc, 1, 1}, // so append Add node if dimension is {1, oc, oh, ow} or (n, oc, oh, ow) if (bias_node != nullptr) { if (is_channel_bias) { - common_conv_node->set_input_b(*bias_node); + conv_op->set_input_b(*bias_node->data()); } else { - auto add_node = graph->AddNode(output_name); - add_node->set_input_x1(*common_conv_node); - add_node->set_input_x2(*bias_node); + auto add_node = graph->Add(output_name); + auto add_op = add_node->data(); + add_op->set_input_x1(*conv_node->data()); + add_op->set_input_x2(*bias_node->data()); conv_node = add_node; } } @@ -215,9 +213,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { if (fuse_relu) { // Append relu node if fuse_relu is true - auto relu_node = graph->AddNode(output_name); - relu_node->set_input_x(*conv_node); - relu_node->set_attr_mode(CvtActMode("relu")); + auto relu_node = graph->Add(output_name); + auto relu_op = relu_node->data(); + relu_op->set_input_x(*conv_node->data()); + relu_op->set_attr_mode(CvtActMode("relu")); } 
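The bias handling just above accepts three bias shapes and chooses between binding the bias directly to the Convolution op (the per-channel case) and appending an explicit Add node (the element-wise cases). The same dispatch, restated as a hypothetical standalone helper so the three cases are easy to see:

#include <cstdint>

enum class BiasKind { kPerChannel, kPerImage, kPerBatch, kUnsupported };

// bias_size is bias->dims().production(); output_size and batch come from the
// conv output dims. The order mirrors the converter: the {oc} case wins when
// sizes coincide, since only it can feed Convolution's b input directly.
BiasKind ClassifyBias(int64_t bias_size, int64_t oc,
                      int64_t output_size, int64_t batch) {
  if (bias_size == oc) return BiasKind::kPerChannel;                 // {oc}
  if (bias_size == output_size / batch) return BiasKind::kPerImage;  // {1, oc, oh, ow}
  if (bias_size == output_size) return BiasKind::kPerBatch;          // {n, oc, oh, ow}
  return BiasKind::kUnsupported;
}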
return REBUILD_WHEN_SHAPE_CHANGED; } @@ -227,9 +226,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - conv2d, +REGISTER_SUBGRAPH_BRIDGE(conv2d, + kNPU, paddle::lite::subgraph::npu::ConvConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - depthwise_conv2d, +REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d, + kNPU, paddle::lite::subgraph::npu::ConvConverter); diff --git a/lite/kernels/npu/bridges/conv_transpose_op.cc b/lite/kernels/npu/bridges/conv_transpose_op.cc index 5ac0723c78..ab31a920ec 100644 --- a/lite/kernels/npu/bridges/conv_transpose_op.cc +++ b/lite/kernels/npu/bridges/conv_transpose_op.cc @@ -58,11 +58,11 @@ int ConvTransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(dilations.size(), 2L); // Input node - std::shared_ptr input_node = nullptr; - if (graph->HasNode(input_name)) { - input_node = graph->GetNode(input_name); + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); } else { - input_node = graph->AddNode(input_name, input_dims); + input_node = graph->Add(input_name, *input); } // Create input sizes node to describe the dimensions of input tensor @@ -83,55 +83,59 @@ int ConvTransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { (input_dims[i + 2] - 1) * strides[i] + kernel_ext - 2 * paddings[i]; input_sizes.push_back(output_size); } - auto input_sizes_const_node = - graph->AddNode(output_name + "/input_sizes", input_sizes); + auto input_sizes_node = graph->Add(output_name + "/input_sizes", input_sizes); // Filter node - auto filter_const_node = graph->AddNode(filter_name, *filter); + auto filter_node = graph->Add(filter_name, *filter); // Deconv node - auto conv_transpose_node = graph->AddNode(output_name); - conv_transpose_node->set_input_input_sizes(*input_sizes_const_node); - conv_transpose_node->set_input_filter(*filter_const_node); - conv_transpose_node->set_input_x(*input_node); + auto conv_transpose_node = graph->Add(output_name); + auto conv_transpose_op = conv_transpose_node->data(); + conv_transpose_op->set_input_input_sizes(*input_sizes_node->data()); + conv_transpose_op->set_input_filter(*filter_node->data()); + conv_transpose_op->set_input_x(*input_node->data()); // Set attributes - conv_transpose_node->set_attr_format(0); // NCHW - conv_transpose_node->set_attr_pad_mode(0); // NOTSET - conv_transpose_node->set_attr_group(groups); - conv_transpose_node->set_attr_pad(ge::AttrValue::LIST_INT( + conv_transpose_op->set_attr_format(0); // NCHW + conv_transpose_op->set_attr_pad_mode(0); // NOTSET + conv_transpose_op->set_attr_group(groups); + conv_transpose_op->set_attr_pad(ge::AttrValue::LIST_INT( {paddings[0], paddings[1], paddings[2], paddings[3]})); - conv_transpose_node->set_attr_dilation( + conv_transpose_op->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); - conv_transpose_node->set_attr_stride( + conv_transpose_op->set_attr_stride( ge::AttrValue::LIST_INT({strides[0], strides[1]})); - conv_transpose_node->set_attr_kernel( + conv_transpose_op->set_attr_kernel( ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); // Append add node to add bias if exists bias - std::shared_ptr output_node = conv_transpose_node; if (HasInputArg(op_info, scope, "Bias")) { - // Create bias node + std::shared_ptr bias_node = nullptr; auto bias_name = op_info->Input("Bias").front(); - auto bias_type = kernel->GetInputDeclType("Bias"); - CHECK(bias_type->precision() == PRECISION(kFloat)); 
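A side note on the input_sizes values computed at the top of this conv_transpose hunk: they follow the standard transposed-convolution output-size formula, out = (in - 1) * stride + kernel_ext - 2 * pad, where kernel_ext is the dilation-aware kernel extent. A standalone sketch (hypothetical helper name; symmetric per-dimension padding assumed, as in the converter):

#include <cstdint>
#include <vector>

// kernel_ext = dilation * (k - 1) + 1
// out        = (in - 1) * stride + kernel_ext - 2 * pad
std::vector<int64_t> DeconvOutputSize(const std::vector<int64_t>& in_hw,
                                      const std::vector<int64_t>& k_hw,
                                      const std::vector<int64_t>& strides,
                                      const std::vector<int64_t>& dilations,
                                      const std::vector<int64_t>& paddings) {
  std::vector<int64_t> out;
  for (size_t i = 0; i < in_hw.size(); ++i) {
    const int64_t kernel_ext = dilations[i] * (k_hw[i] - 1) + 1;
    out.push_back((in_hw[i] - 1) * strides[i] + kernel_ext - 2 * paddings[i]);
  }
  return out;
}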
- CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); - auto bias = scope->FindMutableTensor(bias_name); - auto channel_size = bias->dims().production(); - CHECK_EQ(channel_size, filter_dims[1] * groups); - auto bias_const_node = - graph->AddNode(bias_name, *bias, {1, channel_size, 1, 1}); + if (graph->Has(bias_name)) { + bias_node = graph->Get(bias_name); + } else { + auto bias_type = kernel->GetInputDeclType("Bias"); + CHECK(bias_type->precision() == PRECISION(kFloat)); + CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); + auto bias = scope->FindMutableTensor(bias_name); + auto channel_size = bias->dims().production(); + CHECK_EQ(channel_size, filter_dims[1] * groups); + bias_node = graph->Add(bias_name, *bias, {1, channel_size, 1, 1}); + } // Append add node to add bias node - auto add_node = graph->AddNode(output_name); - add_node->set_input_x1(*conv_transpose_node); - add_node->set_input_x2(*bias_const_node); - output_node = add_node; + auto add_node = graph->Add(output_name); + auto add_op = add_node->data(); + add_op->set_input_x1(*conv_transpose_node->data()); + add_op->set_input_x2(*bias_node->data()); + conv_transpose_node = add_node; } if (fuse_relu) { // Append relu node if fuse_relu is true - auto relu_node = graph->AddNode(output_name); - relu_node->set_input_x(*output_node); - relu_node->set_attr_mode(CvtActMode("relu")); + auto relu_node = graph->Add(output_name); + auto relu_op = relu_node->data(); + relu_op->set_input_x(*conv_transpose_node->data()); + relu_op->set_attr_mode(CvtActMode("relu")); } return REBUILD_WHEN_SHAPE_CHANGED; } @@ -141,6 +145,6 @@ int ConvTransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - conv2d_transpose, +REGISTER_SUBGRAPH_BRIDGE(conv2d_transpose, + kNPU, paddle::lite::subgraph::npu::ConvTransposeConverter); diff --git a/lite/kernels/npu/bridges/elementwise_ops.cc b/lite/kernels/npu/bridges/elementwise_ops.cc index a31a1426dc..69b77b5def 100644 --- a/lite/kernels/npu/bridges/elementwise_ops.cc +++ b/lite/kernels/npu/bridges/elementwise_ops.cc @@ -74,45 +74,45 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto axis = op_info->GetAttr("axis"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Y node - std::shared_ptr y_node = nullptr; - if (graph->HasNode(y_name)) { - y_node = graph->GetNode(y_name); + std::shared_ptr y_node = nullptr; + if (graph->Has(y_name)) { + y_node = graph->Get(y_name); } else { auto y_new_shape = CvtYShape(x_dims, y_dims, axis); - y_node = graph->AddNode(y_name, y_new_shape); + y_node = graph->Add(y_name, *y, y_new_shape); } // Elementwise node - std::shared_ptr elementwise_node = nullptr; + std::shared_ptr elt_node = nullptr; if (op_type == "elementwise_add" || op_type == "fusion_elementwise_add_activation") { - auto elt_node = graph->AddNode(out_name); - elt_node->set_input_x1(*x_node); - elt_node->set_input_x2(*y_node); - elementwise_node = elt_node; + elt_node = graph->Add(out_name); + auto elt_op = elt_node->data(); + elt_op->set_input_x1(*x_node->data()); + elt_op->set_input_x2(*y_node->data()); } else if (op_type == "elementwise_sub") { - auto elt_node = graph->AddNode(out_name); - elt_node->set_input_x1(*x_node); - elt_node->set_input_x2(*y_node); - 
elementwise_node = elt_node; + elt_node = graph->Add(out_name); + auto elt_op = elt_node->data(); + elt_op->set_input_x1(*x_node->data()); + elt_op->set_input_x2(*y_node->data()); } else if (op_type == "elementwise_mul") { - auto elt_node = graph->AddNode(out_name); - elt_node->set_input_x(*x_node); - elt_node->set_input_y(*y_node); - elementwise_node = elt_node; + elt_node = graph->Add(out_name); + auto elt_op = elt_node->data(); + elt_op->set_input_x(*x_node->data()); + elt_op->set_input_y(*y_node->data()); } else if (op_type == "elementwise_div") { - auto elt_node = graph->AddNode(out_name); - elt_node->set_input_x1(*x_node); - elt_node->set_input_x2(*y_node); - elementwise_node = elt_node; + elt_node = graph->Add(out_name); + auto elt_op = elt_node->data(); + elt_op->set_input_x1(*x_node->data()); + elt_op->set_input_x2(*y_node->data()); } else { LOG(WARNING) << "[NPU] Unsupported op type: " << op_type; return FAILED; @@ -121,11 +121,12 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Act node if (op_type == "fusion_elementwise_add_activation") { auto act_type = op_info->GetAttr("act_type"); - auto act_node = graph->AddNode(out_name); - act_node->set_input_x(*elementwise_node); + auto act_node = graph->Add(out_name); + auto act_op = act_node->data(); + act_op->set_input_x(*elt_node->data()); // TODO(hong19860320) set the coef value for act Ops, such as leaky_relu, // clipped_relu etc. - act_node->set_attr_mode(CvtActMode(act_type)); + act_op->set_attr_mode(CvtActMode(act_type)); } return REBUILD_WHEN_SHAPE_CHANGED; } @@ -135,18 +136,18 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - elementwise_add, +REGISTER_SUBGRAPH_BRIDGE(elementwise_add, + kNPU, paddle::lite::subgraph::npu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - fusion_elementwise_add_activation, +REGISTER_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, + kNPU, paddle::lite::subgraph::npu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - elementwise_sub, +REGISTER_SUBGRAPH_BRIDGE(elementwise_sub, + kNPU, paddle::lite::subgraph::npu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - elementwise_mul, +REGISTER_SUBGRAPH_BRIDGE(elementwise_mul, + kNPU, paddle::lite::subgraph::npu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - elementwise_div, +REGISTER_SUBGRAPH_BRIDGE(elementwise_div, + kNPU, paddle::lite::subgraph::npu::ElementwiseConverter); diff --git a/lite/kernels/npu/bridges/engine.cc b/lite/kernels/npu/bridges/engine.cc old mode 100755 new mode 100644 index e7e35831dd..546a235148 --- a/lite/kernels/npu/bridges/engine.cc +++ b/lite/kernels/npu/bridges/engine.cc @@ -57,9 +57,11 @@ int Engine::BuildOriginProgram() { VLOG(3) << "The attr '" << kKernelTypeAttr << "' not found, pick the first kernel for " << op_type; #if defined(LITE_WITH_ARM) - auto kernels = op->CreateKernels({Place{TARGET(kARM)}}); + auto kernels = + op->CreateKernels({Place{TARGET(kARM)}, Place{TARGET(kHost)}}); #elif defined(LITE_WITH_X86) - auto kernels = op->CreateKernels({Place{TARGET(kX86)}}); + auto kernels = + op->CreateKernels({Place{TARGET(kX86)}, Place{TARGET(kHost)}}); #endif CHECK_GT(kernels.size(), 0) << "No kernels found for " << op_type; picked_kernel = std::move(kernels.front()); diff --git a/lite/kernels/npu/bridges/engine.h b/lite/kernels/npu/bridges/engine.h old mode 100755 new mode 100644 diff --git a/lite/kernels/npu/bridges/fc_op.cc b/lite/kernels/npu/bridges/fc_op.cc index 
7b66d54565..3d02817215 100644 --- a/lite/kernels/npu/bridges/fc_op.cc +++ b/lite/kernels/npu/bridges/fc_op.cc @@ -57,22 +57,24 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { << " m: " << m << " k: " << k << " n: " << n; // Create input node and reshape it to (m, k, 1, 1) - std::shared_ptr input_node = nullptr; - if (graph->HasNode(input_name)) { - input_node = graph->GetNode(input_name); + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); } else { - input_node = graph->AddNode(input_name, input_dims); + input_node = graph->Add(input_name, *input); } auto reshaped_input_node = - graph->AddNode(input_name + "/reshape"); - reshaped_input_node->set_input_tensor(*input_node); - reshaped_input_node->set_attr_shape({m, k, 1, 1}); - reshaped_input_node->set_attr_axis(0); + graph->Add(input_name + "/reshape"); + auto reshaped_input_op = reshaped_input_node->data(); + reshaped_input_op->set_input_tensor(*input_node->data()); + reshaped_input_op->set_attr_shape({m, k, 1, 1}); + reshaped_input_op->set_attr_axis(0); // Create w const node, set its shape to (n, k, 1, 1) and fill with // the transposed w tensor Tensor transpose_w; transpose_w.Resize({n, k, 1, 1}); + transpose_w.set_persistable(true); auto transpose_w_data = transpose_w.mutable_data(); auto w_data = w->mutable_data(); for (int i = 0; i < k; i++) { @@ -80,29 +82,36 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { transpose_w_data[j * k + i] = w_data[i * n + j]; } } - auto trans_w_const_node = graph->AddNode(w_name, transpose_w); + auto trans_w_node = graph->Add(w_name, transpose_w); // FC node - auto fc_node = graph->AddNode(out_name + "/fc"); - fc_node->set_input_x(*reshaped_input_node); - fc_node->set_input_w(*trans_w_const_node); + auto fc_node = graph->Add(out_name + "/fc"); + auto fc_op = fc_node->data(); + fc_op->set_input_x(*reshaped_input_node->data()); + fc_op->set_input_w(*trans_w_node->data()); // Add bias node if bias tensor exists if (HasInputArg(op_info, scope, "Bias")) { + std::shared_ptr bias_node = nullptr; auto bias_name = op_info->Input("Bias").front(); - auto bias_type = kernel->GetInputDeclType("Bias"); - CHECK(bias_type->precision() == PRECISION(kFloat)); - CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); - auto bias = scope->FindMutableTensor(bias_name); - auto bias_dims = bias->dims(); - CHECK_EQ(bias_dims.production(), n); - auto bias_const_node = graph->AddNode(bias_name, *bias, {1, n, 1, 1}); - fc_node->set_input_b(*bias_const_node); + if (graph->Has(bias_name)) { + bias_node = graph->Get(bias_name); + } else { + auto bias_type = kernel->GetInputDeclType("Bias"); + CHECK(bias_type->precision() == PRECISION(kFloat)); + CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + CHECK_EQ(bias_dims.production(), n); + bias_node = graph->Add(bias_name, *bias, {1, n, 1, 1}); + } + fc_op->set_input_b(*bias_node->data()); } // Reshape output of FC node from (m, n, 1, 1) to (m, n) - auto reshaped_fc_node = graph->AddNode(out_name); - reshaped_fc_node->set_input_tensor(*fc_node); - reshaped_fc_node->set_attr_shape({m, n}); - reshaped_fc_node->set_attr_axis(0); + auto reshaped_fc_node = graph->Add(out_name); + auto reshaped_fc_op = reshaped_fc_node->data(); + reshaped_fc_op->set_input_tensor(*fc_node->data()); + reshaped_fc_op->set_attr_shape({m, n}); + reshaped_fc_op->set_attr_axis(0); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -111,4 +120,4 @@ int 
FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, fc, paddle::lite::subgraph::npu::FCConverter); +REGISTER_SUBGRAPH_BRIDGE(fc, kNPU, paddle::lite::subgraph::npu::FCConverter); diff --git a/lite/kernels/npu/bridges/graph.cc b/lite/kernels/npu/bridges/graph.cc old mode 100755 new mode 100644 index 48ebfd5673..7d3afd92bf --- a/lite/kernels/npu/bridges/graph.cc +++ b/lite/kernels/npu/bridges/graph.cc @@ -21,26 +21,52 @@ namespace lite { namespace subgraph { namespace npu { -// Const node -std::shared_ptr Graph::AddNode(const std::string& name, - const Tensor& tensor, - std::vector shape, - PrecisionType precision, - DataLayoutType layout) { - auto node = AddNode(name, precision, layout); - node->set_attr_value(CvtTensor(tensor, shape, precision, layout)); +int Graph::Add(const std::string& name, std::shared_ptr node) { + auto it = nodes_.find(name); + if (it != nodes_.end()) { + // Only variable node can be shared with the same name + if (!node->is_var() || !it->second.back()->is_var()) { + LOG(FATAL) << "[NPU] Const or data node " << name << " is redefined."; + return -1; + } + } else { + auto ret = nodes_.insert( + std::make_pair(name, std::vector>())); + CHECK(ret.second); + it = ret.first; + } + it->second.push_back(node); + return it->second.size(); +} + +// Const or data node +std::shared_ptr Graph::Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + DataLayoutType layout) { + std::shared_ptr node = nullptr; + PrecisionType precision = tensor.precision(); + if (tensor.persistable()) { + // Const node + node = Add(name, precision, layout); + node->data()->set_attr_value( + CvtTensor(tensor, shape, layout)); + } else { + // Data node + node = Add(name, shape, precision, layout); + } return node; } // Data node -std::shared_ptr Graph::AddNode(const std::string& name, - std::vector shape, - PrecisionType precision, - DataLayoutType layout) { - auto node = AddNode(name); +std::shared_ptr Graph::Add(const std::string& name, + std::vector shape, + PrecisionType precision, + DataLayoutType layout) { + auto node = Add(name, precision, layout); ge::TensorDesc desc( ge::Shape(shape), CvtDataLayoutType(layout), CvtPrecisionType(precision)); - node->update_input_desc_x(desc); + node->data()->update_input_desc_x(desc); return node; } diff --git a/lite/kernels/npu/bridges/graph.h b/lite/kernels/npu/bridges/graph.h old mode 100755 new mode 100644 index 9b6e49c5e9..cc4a7e2a7c --- a/lite/kernels/npu/bridges/graph.h +++ b/lite/kernels/npu/bridges/graph.h @@ -19,7 +19,7 @@ #include #include #include -#include "ai_ddk_lib/include/graph/op/all_ops.h" +#include "graph/op/all_ops.h" #include "lite/core/op_lite.h" #include "lite/core/tensor.h" @@ -28,105 +28,94 @@ namespace lite { namespace subgraph { namespace npu { -// Type of graph nodes -class Type { +// Graph and node is defined to collect all of converted HiAI IR nodes +class Node { public: - Type(PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW), - bool persistable = false) - : precision_(precision), layout_(layout), persistable_(persistable) {} - + enum class Role { + kVar = 0, + kConst, + kData, + }; + + Node(std::shared_ptr data, + PrecisionType precision, + DataLayoutType layout, + Role role) + : data_(data), precision_(precision), layout_(layout), role_(role) {} + Node(PrecisionType precision, DataLayoutType layout, Role role) + : precision_(precision), layout_(layout), role_(role) {} + + void 
set_data(std::shared_ptr data) { data_ = data; } void set_precision(PrecisionType precision) { precision_ = precision; } void set_layout(DataLayoutType layout) { layout_ = layout; } - bool set_persistable(bool persistable) { persistable_ = persistable; } + void set_role(Role role) { role_ = role; } + template + std::shared_ptr data() { + return std::static_pointer_cast(data_); + } + std::shared_ptr data() { return data_; } PrecisionType precision() const { return precision_; } DataLayoutType layout() const { return layout_; } - bool persistable() const { return persistable_; } + bool is_var() const { return role_ == Role::kVar; } + bool is_const() const { return role_ == Role::kConst; } + bool is_data() const { return role_ == Role::kData; } private: + std::shared_ptr data_{nullptr}; PrecisionType precision_{PRECISION(kFloat)}; DataLayoutType layout_{DATALAYOUT(kNCHW)}; - bool persistable_{false}; + Role role_{Role::kVar}; }; -// Graph to collect all of converted HiAI IR nodes class Graph { public: + int Add(const std::string& name, std::shared_ptr node); + + // Variable, const or data node template - std::shared_ptr AddNode(const std::string& name, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)) { - auto unique_name = [&](const std::string& key) { - int idx = 1; - auto it = counts_.find(key); - if (it == counts_.end()) { - counts_.insert(std::make_pair(key, idx)); - } else { - idx = ++(it->second); - } - return key + "_" + std::to_string(idx); - }; - bool persistable = typeid(T) == typeid(ge::op::Const); - auto it = nodes_.find(name); - if (it != nodes_.end()) { - // Only variable can rebind the name - CHECK(!it->second.second.persistable() && !persistable) - << "[NPU] Node " << name << " redefined."; - // Generate a new unique name as the key to bind the origin node: - // new_name->node - nodes_.insert(std::make_pair(unique_name(name + "_var"), it->second)); - nodes_.erase(it); + std::shared_ptr Add(const std::string& name, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + Node::Role role = Node::Role::kVar; + if (typeid(T) == typeid(ge::op::Const)) { + role = Node::Role::kConst; + } else if (typeid(T) == typeid(ge::op::Data)) { + role = Node::Role::kData; } - // Create a new node and bind with the name: name->new_node - auto node = std::make_shared(unique_name(name + "_op")); - nodes_.insert(std::make_pair( - name, std::make_pair(node, Type(precision, layout, persistable)))); + auto node = std::make_shared(precision, layout, role); + auto idx = Add(name, node); + CHECK_GE(idx, 1); + // Generate a unique name for the created HiAI IR + node->set_data(std::make_shared(name + "__" + std::to_string(idx))); return node; } - // Const node - std::shared_ptr AddNode( - const std::string& name, - const Tensor& tensor, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, tensor, tensor.dims().Vectorize(), precision, layout); + // Const or data node + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, tensor, tensor.dims().Vectorize(), layout); } - std::shared_ptr AddNode( - const std::string& name, - const Tensor& tensor, - std::vector shape, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = 
DATALAYOUT(kNCHW)); - - std::shared_ptr AddNode( - const std::string& name, - const Tensor& tensor, - DDim dims, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, tensor, dims.Vectorize(), precision, layout); + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, tensor, dims.Vectorize(), layout); } + // Const node template - std::shared_ptr AddNode( - const std::string& name, - const std::vector& data, - std::vector shape = {}, - DataLayoutType layout = DATALAYOUT(kNCHW)) { - const std::type_info& info = typeid(T); - PrecisionType precision = PRECISION(kFloat); - if (info == typeid(float)) { - precision = PRECISION(kFloat); - } else if (info == typeid(int8_t)) { - precision = PRECISION(kFloat); - } else if (info == typeid(int32_t)) { - precision = PRECISION(kInt32); - } else { - LOG(FATAL) << "[NPU] Unknow data type " << info.name(); - } + std::shared_ptr Add(const std::string& name, + const std::vector& data, + std::vector shape = {}, + DataLayoutType layout = DATALAYOUT(kNCHW)) { if (shape.empty()) { shape = {static_cast(data.size())}; } else { @@ -138,78 +127,66 @@ class Graph { } Tensor tensor; tensor.Resize(shape); + tensor.set_persistable(true); std::memcpy(reinterpret_cast(tensor.mutable_data()), reinterpret_cast(data.data()), data.size() * sizeof(T)); - return AddNode(name, tensor, precision, layout); + return Add(name, tensor, layout); } template - std::shared_ptr AddNode( - const std::string& name, - const std::vector& data, - DDim dims, - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, data, dims.Vectorize(), layout); + std::shared_ptr Add(const std::string& name, + const std::vector& data, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, data, dims.Vectorize(), layout); } template - std::shared_ptr AddNode( - const std::string& name, - T value, - std::vector shape = {1}, - DataLayoutType layout = DATALAYOUT(kNCHW)) { + std::shared_ptr Add(const std::string& name, + T value, + std::vector shape = {1}, + DataLayoutType layout = DATALAYOUT(kNCHW)) { int64_t size = 1; for (auto i : shape) { size *= i; } std::vector data(size, value); - return AddNode(name, data, shape, layout); + return Add(name, data, shape, layout); } template - std::shared_ptr AddNode( - const std::string& name, - T value, - DDim dims, - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, value, dims.Vectorize(), layout); + std::shared_ptr Add(const std::string& name, + T value, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, value, dims.Vectorize(), layout); } // Data node - std::shared_ptr AddNode( - const std::string& name, - std::vector shape, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)); - - std::shared_ptr AddNode( - const std::string& name, - DDim dims, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, dims.Vectorize(), precision, layout); - } - - std::shared_ptr GetNode(std::string name) { - CHECK(HasNode(name)) << "[NPU] Node " << name << " not found."; - return nodes_.at(name).first; + std::shared_ptr Add(const std::string& name, + std::vector shape, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr Add(const std::string& name, + DDim dims, + PrecisionType precision = 
PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, dims.Vectorize(), precision, layout); } - const Type& GetType(const std::string& name) { - CHECK(HasNode(name)) << "[NPU] Node " << name << " not found."; - return nodes_.at(name).second; + std::shared_ptr Get(std::string name) { + CHECK(Has(name)) << "[NPU] Node " << name << " not found."; + return nodes_.at(name).back(); } - bool HasNode(const std::string& name) { + bool Has(const std::string& name) { return nodes_.find(name) != nodes_.end(); } private: - std::unordered_map, Type>> - nodes_; - std::unordered_map counts_; + std::unordered_map>> nodes_; }; } // namespace npu diff --git a/lite/kernels/npu/bridges/interpolate_op.cc b/lite/kernels/npu/bridges/interpolate_op.cc index f95ebc347a..238200abf3 100644 --- a/lite/kernels/npu/bridges/interpolate_op.cc +++ b/lite/kernels/npu/bridges/interpolate_op.cc @@ -55,11 +55,11 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { "supported in HiAI DDK"; // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Priority: OutSize > scale > out_h/out_w @@ -71,17 +71,18 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // Update out_h and out_w and create out_size node if has OutSize - std::shared_ptr out_size_node = nullptr; + std::shared_ptr out_size_node = nullptr; if (HasInputArg(op_info, scope, "OutSize")) { auto out_size_name = op_info->Input("OutSize").front(); auto out_size_type = kernel->GetInputDeclType("OutSize"); CHECK(out_size_type->precision() == PRECISION(kInt32)); CHECK(out_size_type->layout() == DATALAYOUT(kNCHW)); - if (graph->HasNode(out_size_name)) { - out_size_node = graph->GetNode(out_size_name); + if (graph->Has(out_size_name)) { + out_size_node = graph->Get(out_size_name); } else { auto out_size = scope->FindMutableTensor(out_size_name); CHECK_EQ(out_size->numel(), 2); + CHECK(out_size->persistable()); auto out_size_data = out_size->mutable_data(); // Update out_h and out_w if has OutSize out_h = out_size_data[0]; @@ -97,22 +98,25 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { << " is too large, should not exceed " << largest_multiple << " in HiAI DDK"; } - out_size_node = graph->AddNode(out_name + "/out_size", - std::vector({out_h, out_w})); + out_size_node = + graph->Add(out_name + "/out_size", std::vector({out_h, out_w})); } if (interp_method == "bilinear") { - auto bilinear_interp_node = - graph->AddNode(out_name); - bilinear_interp_node->set_input_x(*x_node); - bilinear_interp_node->set_input_size(*out_size_node); - bilinear_interp_node->set_attr_align_corners(align_corners); + auto bilinear_interp_node = graph->Add(out_name); + auto bilinear_interp_op = + bilinear_interp_node->data(); + bilinear_interp_op->set_input_x(*x_node->data()); + bilinear_interp_op->set_input_size(*out_size_node->data()); + bilinear_interp_op->set_attr_align_corners(align_corners); } else if (interp_method == "nearest") { auto nearest_interp_node = - graph->AddNode(out_name); - nearest_interp_node->set_input_image(*x_node); - nearest_interp_node->set_input_size(*out_size_node); - nearest_interp_node->set_attr_align_corners(align_corners); + graph->Add(out_name); + auto nearest_interp_op = + nearest_interp_node->data(); + 
nearest_interp_op->set_input_image(*x_node->data()); + nearest_interp_op->set_input_size(*out_size_node->data()); + nearest_interp_op->set_attr_align_corners(align_corners); } else { LOG(WARNING) << "[NPU] Unsupported interpolate method: " << interp_method; return FAILED; @@ -125,9 +129,9 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - bilinear_interp, +REGISTER_SUBGRAPH_BRIDGE(bilinear_interp, + kNPU, paddle::lite::subgraph::npu::InterpolateConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - nearest_interp, +REGISTER_SUBGRAPH_BRIDGE(nearest_interp, + kNPU, paddle::lite::subgraph::npu::InterpolateConverter); diff --git a/lite/kernels/npu/bridges/mul_op.cc b/lite/kernels/npu/bridges/mul_op.cc index f63b6826b9..27df458195 100644 --- a/lite/kernels/npu/bridges/mul_op.cc +++ b/lite/kernels/npu/bridges/mul_op.cc @@ -56,45 +56,46 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { << "[NPU] columns of X must be equal with rows of Y"; int n = y_dims.Slice(y_num_col_dims, y_dims.size()).production(); VLOG(3) << "m:" << m << ",n:" << n << ",k:" << k; - VLOG(3) << "x_name:" << x_name << ", is data: " << graph->HasNode(x_name); - VLOG(3) << "y_name:" << y_name << ", is data: " << graph->HasNode(y_name); - CHECK(graph->HasNode(x_name)) + VLOG(3) << "x_name:" << x_name << ", is data: " << graph->Has(x_name); + VLOG(3) << "y_name:" << y_name << ", is data: " << graph->Has(y_name); + CHECK(graph->Has(x_name)) << "[NPU] MatMul in HiAI DDK only support X is data, Y is const yet."; // X node which supports persistable and non-persistable tensor, and // reshape to (m, k) - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); - auto reshaped_x_node = graph->AddNode(x_name + "/reshape"); - reshaped_x_node->set_input_tensor(*x_node); - reshaped_x_node->set_attr_shape({m, k}); - reshaped_x_node->set_attr_axis(0); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + auto reshaped_x_node = graph->Add(x_name + "/reshape"); + auto reshaped_x_op = reshaped_x_node->data(); + reshaped_x_op->set_input_tensor(*x_node->data()); + reshaped_x_op->set_attr_shape({m, k}); + reshaped_x_op->set_attr_axis(0); x_node = reshaped_x_node; } else { - auto x_const_node = graph->AddNode(x_name, *x, {m, k}); - x_node = x_const_node; + x_node = graph->Add(x_name, *x, {m, k}); } // Y node which only supports persistable tensor, and reshape to // (k,n) - std::shared_ptr y_node = nullptr; - if (graph->HasNode(y_name)) { - y_node = graph->GetNode(y_name); - auto reshaped_y_node = graph->AddNode(y_name + "/reshape"); - reshaped_y_node->set_input_tensor(*y_node); - reshaped_y_node->set_attr_shape({k, n}); - reshaped_y_node->set_attr_axis(0); + std::shared_ptr y_node = nullptr; + if (graph->Has(y_name)) { + y_node = graph->Get(y_name); + auto reshaped_y_node = graph->Add(y_name + "/reshape"); + auto reshaped_y_op = reshaped_y_node->data(); + reshaped_y_op->set_input_tensor(*y_node->data()); + reshaped_y_op->set_attr_shape({k, n}); + reshaped_y_op->set_attr_axis(0); y_node = reshaped_y_node; } else { - auto y_const_node = graph->AddNode(y_name, *y, {k, n}); - y_node = y_const_node; + y_node = graph->Add(y_name, *y, {k, n}); } // Matmul node - auto mul_node = graph->AddNode(out_name); - mul_node->set_input_x1(*x_node); - mul_node->set_input_x2(*y_node); + auto mul_node = graph->Add(out_name); + auto mul_op = mul_node->data(); + 
mul_op->set_input_x1(*x_node->data()); + mul_op->set_input_x2(*y_node->data()); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -103,4 +104,4 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, mul, paddle::lite::subgraph::npu::MulConverter); +REGISTER_SUBGRAPH_BRIDGE(mul, kNPU, paddle::lite::subgraph::npu::MulConverter); diff --git a/lite/kernels/npu/bridges/pad2d_op.cc b/lite/kernels/npu/bridges/pad2d_op.cc index 451f48b1df..e6852da787 100644 --- a/lite/kernels/npu/bridges/pad2d_op.cc +++ b/lite/kernels/npu/bridges/pad2d_op.cc @@ -45,35 +45,34 @@ int Pad2dConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(padding.size(), 4); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Padding node int xds = x_dims.size(); padding.insert(padding.begin(), xds * 2 - 4, 0); - auto padding_const_node = - graph->AddNode(out_name + "/padding", padding, {xds, 2}); + auto padding_node = graph->Add(out_name + "/padding", padding, {xds, 2}); // Pad node - auto pad2d_node = graph->AddNode(out_name); - pad2d_node->set_input_x(*x_node); - pad2d_node->set_input_padding(*padding_const_node); + auto pad2d_node = graph->Add(out_name); + auto pad2d_op = pad2d_node->data(); + pad2d_op->set_input_x(*x_node->data()); + pad2d_op->set_input_padding(*padding_node->data()); auto mode = op_info->GetAttr("mode"); if (mode == "constant") { // Pad value node auto pad_value = op_info->GetAttr("pad_value"); - auto pad_value_const_node = - graph->AddNode(out_name + "/pad_value", pad_value); - pad2d_node->set_input_constant_values(*pad_value_const_node); - pad2d_node->set_attr_T(0); // type of pad_value: 0:float 3:int32 - pad2d_node->set_attr_mode(0); + auto pad_value_node = graph->Add(out_name + "/pad_value", pad_value); + pad2d_op->set_input_constant_values(*pad_value_node->data()); + pad2d_op->set_attr_T(0); // type of pad_value: 0:float 3:int32 + pad2d_op->set_attr_mode(0); } else if (mode == "reflect") { LOG(WARNING) << "[NPU] pad mode " << mode << " isn't supported in HiAI DDK"; - pad2d_node->set_attr_mode(1); + pad2d_op->set_attr_mode(1); return FAILED; } else { LOG(WARNING) << "[NPU] pad mode " << mode << " isn't supported in HiAI DDK"; @@ -87,6 +86,6 @@ int Pad2dConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - pad2d, +REGISTER_SUBGRAPH_BRIDGE(pad2d, + kNPU, paddle::lite::subgraph::npu::Pad2dConverter); diff --git a/lite/kernels/npu/bridges/paddle_use_bridges.h b/lite/kernels/npu/bridges/paddle_use_bridges.h old mode 100755 new mode 100644 index d6fc535338..a63a0d889d --- a/lite/kernels/npu/bridges/paddle_use_bridges.h +++ b/lite/kernels/npu/bridges/paddle_use_bridges.h @@ -14,40 +14,42 @@ #pragma once -USE_SUBGRAPH_BRIDGE(NPU, sigmoid); -USE_SUBGRAPH_BRIDGE(NPU, relu); -USE_SUBGRAPH_BRIDGE(NPU, tanh); -USE_SUBGRAPH_BRIDGE(NPU, relu_clipped); -USE_SUBGRAPH_BRIDGE(NPU, leaky_relu); -USE_SUBGRAPH_BRIDGE(NPU, softsign); -USE_SUBGRAPH_BRIDGE(NPU, hard_sigmoid); +USE_SUBGRAPH_BRIDGE(sigmoid, kNPU); +USE_SUBGRAPH_BRIDGE(relu, kNPU); +USE_SUBGRAPH_BRIDGE(tanh, kNPU); +USE_SUBGRAPH_BRIDGE(relu_clipped, kNPU); +USE_SUBGRAPH_BRIDGE(leaky_relu, kNPU); +USE_SUBGRAPH_BRIDGE(softsign, kNPU); 
+USE_SUBGRAPH_BRIDGE(hard_sigmoid, kNPU); -USE_SUBGRAPH_BRIDGE(NPU, batch_norm); -USE_SUBGRAPH_BRIDGE(NPU, concat); -USE_SUBGRAPH_BRIDGE(NPU, conv2d); -USE_SUBGRAPH_BRIDGE(NPU, depthwise_conv2d); -USE_SUBGRAPH_BRIDGE(NPU, conv2d_transpose); +USE_SUBGRAPH_BRIDGE(batch_norm, kNPU); +USE_SUBGRAPH_BRIDGE(concat, kNPU); +USE_SUBGRAPH_BRIDGE(conv2d, kNPU); +USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kNPU); +USE_SUBGRAPH_BRIDGE(conv2d_transpose, kNPU); -USE_SUBGRAPH_BRIDGE(NPU, elementwise_add); -USE_SUBGRAPH_BRIDGE(NPU, fusion_elementwise_add_activation); -USE_SUBGRAPH_BRIDGE(NPU, elementwise_sub); -USE_SUBGRAPH_BRIDGE(NPU, elementwise_mul); -USE_SUBGRAPH_BRIDGE(NPU, elementwise_div); +USE_SUBGRAPH_BRIDGE(elementwise_add, kNPU); +USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kNPU); +USE_SUBGRAPH_BRIDGE(elementwise_sub, kNPU); +USE_SUBGRAPH_BRIDGE(elementwise_mul, kNPU); +USE_SUBGRAPH_BRIDGE(elementwise_div, kNPU); -USE_SUBGRAPH_BRIDGE(NPU, fc); -USE_SUBGRAPH_BRIDGE(NPU, bilinear_interp); -USE_SUBGRAPH_BRIDGE(NPU, nearest_interp); -USE_SUBGRAPH_BRIDGE(NPU, mul); -USE_SUBGRAPH_BRIDGE(NPU, pad2d); -USE_SUBGRAPH_BRIDGE(NPU, pool2d); -USE_SUBGRAPH_BRIDGE(NPU, reduce_mean); -USE_SUBGRAPH_BRIDGE(NPU, reshape); -USE_SUBGRAPH_BRIDGE(NPU, reshape2); -USE_SUBGRAPH_BRIDGE(NPU, scale); -USE_SUBGRAPH_BRIDGE(NPU, shuffle_channel); -USE_SUBGRAPH_BRIDGE(NPU, softmax); -USE_SUBGRAPH_BRIDGE(NPU, split); -USE_SUBGRAPH_BRIDGE(NPU, sqrt); -USE_SUBGRAPH_BRIDGE(NPU, square); -USE_SUBGRAPH_BRIDGE(NPU, transpose); -USE_SUBGRAPH_BRIDGE(NPU, transpose2); +USE_SUBGRAPH_BRIDGE(fc, kNPU); +USE_SUBGRAPH_BRIDGE(bilinear_interp, kNPU); +USE_SUBGRAPH_BRIDGE(nearest_interp, kNPU); +USE_SUBGRAPH_BRIDGE(mul, kNPU); +USE_SUBGRAPH_BRIDGE(pad2d, kNPU); +USE_SUBGRAPH_BRIDGE(pool2d, kNPU); +USE_SUBGRAPH_BRIDGE(reduce_mean, kNPU); +USE_SUBGRAPH_BRIDGE(reshape, kNPU); +USE_SUBGRAPH_BRIDGE(reshape2, kNPU); +USE_SUBGRAPH_BRIDGE(scale, kNPU); +USE_SUBGRAPH_BRIDGE(shuffle_channel, kNPU); +USE_SUBGRAPH_BRIDGE(softmax, kNPU); +USE_SUBGRAPH_BRIDGE(split, kNPU); +USE_SUBGRAPH_BRIDGE(sqrt, kNPU); +USE_SUBGRAPH_BRIDGE(square, kNPU); +USE_SUBGRAPH_BRIDGE(transpose, kNPU); +USE_SUBGRAPH_BRIDGE(transpose2, kNPU); +USE_SUBGRAPH_BRIDGE(unsqueeze, kNPU); +USE_SUBGRAPH_BRIDGE(unsqueeze2, kNPU); diff --git a/lite/kernels/npu/bridges/paddle_use_npu_bridges.h b/lite/kernels/npu/bridges/paddle_use_npu_bridges.h deleted file mode 100644 index 9a432d17e5..0000000000 --- a/lite/kernels/npu/bridges/paddle_use_npu_bridges.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "lite/kernels/npu/bridges/registry.h" - -USE_NPU_BRIDGE(sigmoid); -USE_NPU_BRIDGE(relu); -USE_NPU_BRIDGE(tanh); -USE_NPU_BRIDGE(relu_clipped); -USE_NPU_BRIDGE(leaky_relu); -USE_NPU_BRIDGE(softsign); -USE_NPU_BRIDGE(hard_sigmoid); - -USE_NPU_BRIDGE(batch_norm); -USE_NPU_BRIDGE(concat); -USE_NPU_BRIDGE(conv2d); -USE_NPU_BRIDGE(depthwise_conv2d); -USE_NPU_BRIDGE(conv2d_transpose); - -USE_NPU_BRIDGE(elementwise_add); -USE_NPU_BRIDGE(fusion_elementwise_add_activation); -USE_NPU_BRIDGE(elementwise_sub); -USE_NPU_BRIDGE(elementwise_mul); -USE_NPU_BRIDGE(elementwise_div); - -USE_NPU_BRIDGE(fc); -USE_NPU_BRIDGE(bilinear_interp); -USE_NPU_BRIDGE(nearest_interp); -USE_NPU_BRIDGE(mul); -USE_NPU_BRIDGE(pad2d); -USE_NPU_BRIDGE(pool2d); -USE_NPU_BRIDGE(reduce_mean); -USE_NPU_BRIDGE(reshape); -USE_NPU_BRIDGE(reshape2); -USE_NPU_BRIDGE(scale); -USE_NPU_BRIDGE(shuffle_channel); -USE_NPU_BRIDGE(softmax); -USE_NPU_BRIDGE(split); -USE_NPU_BRIDGE(sqrt); -USE_NPU_BRIDGE(square); -USE_NPU_BRIDGE(transpose); -USE_NPU_BRIDGE(transpose2); diff --git a/lite/kernels/npu/bridges/pool_op.cc b/lite/kernels/npu/bridges/pool_op.cc index 8b108fc4ee..42349d1839 100644 --- a/lite/kernels/npu/bridges/pool_op.cc +++ b/lite/kernels/npu/bridges/pool_op.cc @@ -48,11 +48,11 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto paddings = op_info->GetAttr>("paddings"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // pool mode @@ -109,19 +109,19 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // Pooling node - auto pool_node = graph->AddNode(out_name); - pool_node->set_input_x(*x_node); - pool_node->set_attr_mode(mode); - pool_node->set_attr_pad_mode(pad_mode); - pool_node->set_attr_global_pooling(global_pooling); - pool_node->set_attr_window( - ge::AttrValue::LIST_INT(ksize.begin(), ksize.end())); - pool_node->set_attr_pad(ge::AttrValue::LIST_INT{ + auto pool_node = graph->Add(out_name); + auto pool_op = pool_node->data(); + pool_op->set_input_x(*x_node->data()); + pool_op->set_attr_mode(mode); + pool_op->set_attr_pad_mode(pad_mode); + pool_op->set_attr_global_pooling(global_pooling); + pool_op->set_attr_window(ge::AttrValue::LIST_INT(ksize.begin(), ksize.end())); + pool_op->set_attr_pad(ge::AttrValue::LIST_INT{ paddings[0], paddings[1], paddings[2], paddings[3]}); - pool_node->set_attr_stride( + pool_op->set_attr_stride( ge::AttrValue::LIST_INT(strides.begin(), strides.end())); - pool_node->set_attr_ceil_mode(ceil_mode); - // pool_node->set_attr_data_mode(data_mode); + pool_op->set_attr_ceil_mode(ceil_mode); + // pool_op->set_attr_data_mode(data_mode); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -130,6 +130,6 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - pool2d, +REGISTER_SUBGRAPH_BRIDGE(pool2d, + kNPU, paddle::lite::subgraph::npu::PoolConverter); diff --git a/lite/kernels/npu/bridges/pool_op_test.cc b/lite/kernels/npu/bridges/pool_op_test.cc deleted file mode 100644 index 298e065547..0000000000 --- a/lite/kernels/npu/bridges/pool_op_test.cc +++ /dev/null @@ -1,252 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/pool_op.h" -#include -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { -namespace bridges { - -void pool_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto& in_dims = x->dims(); - auto& out_dims = out->dims(); - - const float* src_ptr = x->data(); - float* dst_ptr = out->mutable_data(); - - std::vector ksize = op_info->GetAttr>("ksize"); - std::vector strides = op_info->GetAttr>("strides"); - std::vector paddings = op_info->GetAttr>("paddings"); - bool exclusive = op_info->GetAttr("exclusive"); - std::string pooling_type = op_info->GetAttr("pooling_type"); - bool global_pooling = op_info->GetAttr("global_pooling"); - - int in_n = in_dims[0]; - int in_c = in_dims[1]; - int in_h = in_dims[2]; - int in_w = in_dims[3]; - int size_in_n = in_c * in_h * in_w; - int size_in_c = in_h * in_w; - - int out_h = out_dims[2]; - int out_w = out_dims[3]; - int size_out_n = in_c * out_h * out_w; - int size_out_c = out_h * out_w; - - int window_h = ksize[0]; - int window_w = ksize[1]; - int stride_h = strides[0]; - int stride_w = strides[1]; - int pad_h = paddings[0]; - int pad_w = paddings[2]; - - if (global_pooling == true) { - for (int n = 0; n < in_n; ++n) { - for (int c = 0; c < in_c; ++c) { - const float* src = src_ptr + n * size_in_n + c * size_in_c; - float res = src[0]; - if (pooling_type == "max") { - for (int i = 1; i < size_in_c; ++i) { - float cur_val = src[i]; - res = cur_val > res ? cur_val : res; - } - } else if (pooling_type == "avg") { - for (int i = 1; i < size_in_c; ++i) { - float cur_val = src[i]; - res += cur_val; - } - res /= size_in_c; - } - dst_ptr[n * size_out_n + c] = res; - } - } - } else { - for (int n = 0; n < in_n; ++n) { - for (int c = 0; c < in_c; ++c) { - for (int h = 0; h < out_h; ++h) { - int sh = h * stride_h; - int eh = sh + window_h; - sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; - eh = (eh - pad_h) > in_h ? in_h : eh - pad_h; - for (int w = 0; w < out_w; ++w) { - int sw = w * stride_w; - int ew = sw + window_w; - sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; - ew = (ew - pad_w) > in_w ? in_w : ew - pad_w; - int pooling_size = (ew - sw) * (eh - sh); - if (pooling_size == 0) continue; - float res = 0.f; - for (int kh = sh; kh < eh; ++kh) { - for (int kw = sw; kw < ew; ++kw) { - int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw; - if (kh == sh && kw == sw) { - res = src_ptr[src_idx]; - } else { - if (pooling_type == "max") { - res = res >= src_ptr[src_idx] ? 
res : src_ptr[src_idx]; - } - if (pooling_type == "avg") { - res += src_ptr[src_idx]; - } - } - } - } - if (pooling_type == "avg") { - if (exclusive) { - res /= pooling_size; - } else { - res /= window_h * window_w; - } - } - dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res; - } - } - } - } - } -} - -void test_pool(int bs, - int ic, - int ih, - int iw, - std::string pooling_type, - bool ceil_mode, - bool global_pooling, - bool exclusive, - int ksize, - int stride, - int padding) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("pool2d"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("pooling_type", pooling_type); - opdesc.SetAttr("ksize", std::vector({ksize, ksize})); - opdesc.SetAttr("global_pooling", global_pooling); - opdesc.SetAttr("exclusive", exclusive); - opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", - std::vector({padding, padding, padding, padding})); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - pool_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, pool) { - for (auto pooling_type : {"max", "avg"}) { - for (auto ceil_mode : {true, false}) { - for (auto global_pooling : {/*true, */ false}) { - for (auto exclusive : {true /*, false*/}) { - for (auto ksize : {2, 3}) { - for (auto stride : {1, 2}) { - for (auto padding : {0, 1}) { - for (auto bs : {1, 3}) { - for (auto ic : {1, 3}) { - for (auto ih : {3, 7}) { - for (auto iw : {3, 7}) { - test_pool(bs, - ic, - ih, - iw, - pooling_type, - ceil_mode, - global_pooling, - exclusive, - ksize, - stride, - padding); - } - } - } - } - } - } - } - } - } - } - } - for (auto pooling_type : {"max", "avg"}) { - for (auto ceil_mode : {true, false}) { - bool global_pooling = true; - bool exclusive = true; - int ksize = 2; - int stride = 1; - int padding = 0; - int bs = 6; - int ic = 6; - int ih = 6; - int iw = 6; - test_pool(bs, - ic, - ih, - iw, - pooling_type, - ceil_mode, - global_pooling, - exclusive, - ksize, - stride, - padding); - } - } -} - -} // namespace bridges -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(pool2d); -USE_NPU_BRIDGE(pool2d); diff --git a/lite/kernels/npu/bridges/reduce_mean_op.cc b/lite/kernels/npu/bridges/reduce_mean_op.cc index 6c7f29fb27..29f065675c 100644 --- a/lite/kernels/npu/bridges/reduce_mean_op.cc +++ b/lite/kernels/npu/bridges/reduce_mean_op.cc @@ -52,29 +52,30 @@ int ReduceMeanConverter(void* ctx, OpLite* op, KernelBase* kernel) { std::sort(dim.begin(), dim.end()); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) 
{ + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Using ReduceSum + Scale to implement ReduceMean // Dim node - auto dim_const_node = graph->AddNode(out_name + "/dim", dim); + auto dim_node = graph->Add(out_name + "/dim", dim); // Reduce Sum node - auto reduce_sum_node = - graph->AddNode(out_name + "/reducesum"); - reduce_sum_node->set_input_x(*x_node); - reduce_sum_node->set_input_w(*dim_const_node); - reduce_sum_node->set_attr_keep_dims(keep_dim); + auto reduce_sum_node = graph->Add(out_name + "/reducesum"); + auto reduce_sum_op = reduce_sum_node->data(); + reduce_sum_op->set_input_x(*x_node->data()); + reduce_sum_op->set_input_w(*dim_node->data()); + reduce_sum_op->set_attr_keep_dims(keep_dim); // Scale node - auto scale_node = graph->AddNode(out_name); - scale_node->set_input_x(*reduce_sum_node); - scale_node->set_attr_axis(1); + auto scale_node = graph->Add(out_name); + auto scale_op = scale_node->data(); + scale_op->set_input_x(*reduce_sum_node->data()); + scale_op->set_attr_axis(1); // Add filter node(fill with scale) float scale = 1; @@ -95,9 +96,8 @@ int ReduceMeanConverter(void* ctx, OpLite* op, KernelBase* kernel) { remove(scale_bias_shape.begin(), scale_bias_shape.end(), kDelFlag), scale_bias_shape.end()); } - auto filter_const_node = - graph->AddNode(out_name + "/filter", scale, scale_bias_shape); - scale_node->set_input_filter(*filter_const_node); + auto filter_node = graph->Add(out_name + "/filter", scale, scale_bias_shape); + scale_op->set_input_filter(*filter_node->data()); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -106,6 +106,6 @@ int ReduceMeanConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - reduce_mean, +REGISTER_SUBGRAPH_BRIDGE(reduce_mean, + kNPU, paddle::lite::subgraph::npu::ReduceMeanConverter); diff --git a/lite/kernels/npu/bridges/registry.cc b/lite/kernels/npu/bridges/registry.cc index 5f89bcb313..39181ccee9 100644 --- a/lite/kernels/npu/bridges/registry.cc +++ b/lite/kernels/npu/bridges/registry.cc @@ -24,27 +24,27 @@ Registry& Registry::Instance() { return x; } -void Registry::Insert(const std::string& dev_type, - const std::string& op_type, +void Registry::Insert(const std::string& op_type, + const std::string& target, const cvt_func_type& cvt_func_name) { - auto it = map_.find(dev_type); + auto it = map_.find(target); if (it == map_.end()) { map_.insert(std::make_pair( - dev_type, std::unordered_map())); + target, std::unordered_map())); } - map_.at(dev_type).insert(std::make_pair(op_type, cvt_func_name)); + map_.at(target).insert(std::make_pair(op_type, cvt_func_name)); } -const cvt_func_type& Registry::Select(const std::string& dev_type, - const std::string& op_type) const { - return map_.at(dev_type).at(op_type); +const cvt_func_type& Registry::Select(const std::string& op_type, + const std::string& target) const { + return map_.at(target).at(op_type); } -bool Registry::Exists(const std::string& dev_type, - const std::string& op_type) const { - bool found = map_.find(dev_type) != map_.end(); +bool Registry::Exists(const std::string& op_type, + const std::string& target) const { + bool found = map_.find(target) != map_.end(); if (found) { - found = map_.at(dev_type).find(op_type) != map_.at(dev_type).end(); + found = map_.at(target).find(op_type) != map_.at(target).end(); } return found; } diff --git a/lite/kernels/npu/bridges/registry.h b/lite/kernels/npu/bridges/registry.h index 5198a3f8f2..615a106864 
100644 --- a/lite/kernels/npu/bridges/registry.h +++ b/lite/kernels/npu/bridges/registry.h @@ -42,12 +42,12 @@ class Registry { public: static Registry& Instance(); - void Insert(const std::string& dev_type, - const std::string& op_type, + void Insert(const std::string& op_type, + const std::string& target, const cvt_func_type& cvt_func_name); - const cvt_func_type& Select(const std::string& dev_type, - const std::string& op_type) const; - bool Exists(const std::string& dev_type, const std::string& op_type) const; + const cvt_func_type& Select(const std::string& op_type, + const std::string& target) const; + bool Exists(const std::string& op_type, const std::string& target) const; Registry() = default; private: @@ -67,24 +67,24 @@ class Registry { #define UNUSED __attribute__((unused)) #endif -#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ +#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE(uniq_name, msg) \ struct __test_global_namespace_##uniq_name##__ {}; \ static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ __test_global_namespace_##uniq_name##__>::value, \ msg) -#define REGISTER_SUBGRAPH_BRIDGE(dev_type, op_type, cvt_func_name) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_subgraph_bridge_##dev_type##_##op_type##__, \ +#define REGISTER_SUBGRAPH_BRIDGE(op_type__, target__, cvt_func_name) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_subgraph_bridge_##op_type__##_##target__##__, \ "REGISTER_SUBGRAPH_BRIDGE must be called in global namespace only " \ "once!"); \ - int __reg_subgraph_bridge_##dev_type##_##op_type##_Insert() { \ + int __reg_subgraph_bridge_##op_type__##_##target__##_Insert() { \ paddle::lite::subgraph::Registry::Instance().Insert( \ - #dev_type, #op_type, cvt_func_name); \ + #op_type__, #target__, cvt_func_name); \ return 0; \ } -#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) \ - extern int __reg_subgraph_bridge_##dev_type##_##op_type##_Insert(); \ - static int __reg_subgraph_bridge_##dev_type##_##op_type##_Insert_return \ - UNUSED = __reg_subgraph_bridge_##dev_type##_##op_type##_Insert(); +#define USE_SUBGRAPH_BRIDGE(op_type__, target__) \ + extern int __reg_subgraph_bridge_##op_type__##_##target__##_Insert(); \ + static int __reg_subgraph_bridge_##op_type__##_##target__##_Insert_return \ + UNUSED = __reg_subgraph_bridge_##op_type__##_##target__##_Insert(); diff --git a/lite/kernels/npu/bridges/reshape_op.cc b/lite/kernels/npu/bridges/reshape_op.cc index d5100dee4a..50c7f9d65a 100644 --- a/lite/kernels/npu/bridges/reshape_op.cc +++ b/lite/kernels/npu/bridges/reshape_op.cc @@ -34,26 +34,25 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Get input and output vars and op attributes auto x_name = op_info->Input("X").front(); auto x_type = kernel->GetInputDeclType("X"); - CHECK(x_type->precision() == PRECISION(kFloat)); - CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); auto out_type = kernel->GetOutputDeclType("Out"); - CHECK(out_type->precision() == PRECISION(kFloat)); - CHECK(out_type->layout() == DATALAYOUT(kNCHW)); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Reshape node - auto reshape_node = graph->AddNode(out_name); 
- reshape_node->set_input_tensor(*x_node); + auto reshape_node = graph->Add( + out_name, x_node->precision(), x_node->layout()); + auto reshape_op = reshape_node->data(); + reshape_op->set_input_tensor(*x_node->data()); // Read shape from "ShapeTensor"(input), or "Shape"(input), or "shape"(attr) if (HasInputArg(op_info, scope, "ShapeTensor")) { @@ -64,9 +63,9 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { // auto actual_shape_type = kernel->GetInputDeclType("Shape"); // CHECK(actual_shape_type->precision() == PRECISION(kInt32)); // CHECK(actual_shape_type->layout() == DATALAYOUT(kNCHW)); - std::shared_ptr actual_shape_node = nullptr; - if (graph->HasNode(actual_shape_name)) { - actual_shape_node = graph->GetNode(actual_shape_name); + std::shared_ptr actual_shape_node = nullptr; + if (graph->Has(actual_shape_name)) { + actual_shape_node = graph->Get(actual_shape_name); } else { auto actual_shape = scope->FindMutableTensor(actual_shape_name); auto actual_shape_dims = actual_shape->dims(); @@ -80,13 +79,13 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, " "but Shape has " << out_shape.size(); + return FAILED; } - auto actual_shape_const_node = - graph->AddNode(actual_shape_name, - std::vector(out_shape.begin(), out_shape.end())); - actual_shape_node = actual_shape_const_node; + actual_shape_node = + graph->Add(actual_shape_name, + std::vector(out_shape.begin(), out_shape.end())); } - reshape_node->set_input_w(*actual_shape_node); + reshape_op->set_input_w(*actual_shape_node->data()); } else { auto shape = op_info->GetAttr>("shape"); auto out_dims = lite::operators::ValidateShape(shape, x_dims); @@ -95,33 +94,12 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, " "but shape has " << out_shape.size(); + return FAILED; } - reshape_node->set_attr_shape( + reshape_op->set_attr_shape( ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end())); } - // XShape node - if (op_type == "reshape2") { - // Append an extra reshape node to calc XShape - std::vector xshape_dims(x_dims.size() + 1, 1); - for (size_t i = 0; i < x_dims.size(); i++) { - xshape_dims[i + 1] = x_dims[i]; - } - if (xshape_dims.size() > 4) { - LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, " - "but XShape has " - << xshape_dims.size(); - return FAILED; - } - auto xshape_name = op_info->Output("XShape").front(); - // auto xshape_type = kernel->GetOutputDeclType("XShape"); - // CHECK(xshape_type->precision() == PRECISION(kFloat)); - // CHECK(xshape_type->layout() == DATALAYOUT(kNCHW)); - auto xshape_node = graph->AddNode(xshape_name); - xshape_node->set_input_tensor(*x_node); - xshape_node->set_attr_shape( - ge::AttrValue::LIST_INT(xshape_dims.begin(), xshape_dims.end())); - } return REBUILD_WHEN_SHAPE_CHANGED; } @@ -130,9 +108,9 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - reshape, +REGISTER_SUBGRAPH_BRIDGE(reshape, + kNPU, paddle::lite::subgraph::npu::ReshapeConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - reshape2, +REGISTER_SUBGRAPH_BRIDGE(reshape2, + kNPU, paddle::lite::subgraph::npu::ReshapeConverter); diff --git a/lite/kernels/npu/bridges/scale_op.cc b/lite/kernels/npu/bridges/scale_op.cc index ca04996faf..d0139a9e2f 100644 --- a/lite/kernels/npu/bridges/scale_op.cc +++ b/lite/kernels/npu/bridges/scale_op.cc @@ 
-37,12 +37,15 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); - CHECK_GE(x_dims.size(), 2); + auto x_rank = x_dims.size(); + CHECK_GE(x_rank, 2); auto out_name = op_info->Output("Out").front(); auto out_type = kernel->GetOutputDeclType("Out"); CHECK(out_type->precision() == PRECISION(kFloat)); CHECK(out_type->layout() == DATALAYOUT(kNCHW)); - std::vector scale_bias_shape = {x_dims[1]}; + // HiAI only support [n, c, 1, 1] for the shape of scale and bias + std::vector scale_bias_shape = { + 1, x_rank < 3 ? 1 : x_dims[x_rank - 3], 1, 1}; float scale = op_info->GetAttr("scale"); float bias = op_info->GetAttr("bias"); bool bias_after_scale = op_info->GetAttr("bias_after_scale"); @@ -51,29 +54,28 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x, CvtShape(x_dims)); } // Scale node - auto scale_node = graph->AddNode(out_name); - scale_node->set_input_x(*x_node); - scale_node->set_attr_axis(1); + auto scale_node = graph->Add(out_name); + auto scale_op = scale_node->data(); + scale_op->set_input_x(*x_node->data()); + scale_op->set_attr_axis(1); // Add filter node(fill with scale) - auto filter_const_node = - graph->AddNode(out_name + "/filter", scale, scale_bias_shape); - scale_node->set_input_filter(*filter_const_node); + auto filter_node = graph->Add(out_name + "/filter", scale, scale_bias_shape); + scale_op->set_input_filter(*filter_node->data()); // Add bias node(fill with bias) if (fabs(bias) > 1e-6f) { - auto bias_const_node = - graph->AddNode(out_name + "/bias", bias, scale_bias_shape); - scale_node->set_input_bias(*bias_const_node); - scale_node->set_attr_has_bias_value(true); + auto bias_node = graph->Add(out_name + "/bias", bias, scale_bias_shape); + scale_op->set_input_bias(*bias_node->data()); + scale_op->set_attr_has_bias_value(true); } return REBUILD_WHEN_SHAPE_CHANGED; } @@ -83,6 +85,6 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - scale, +REGISTER_SUBGRAPH_BRIDGE(scale, + kNPU, paddle::lite::subgraph::npu::ScaleConverter); diff --git a/lite/kernels/npu/bridges/shuffle_channel_op.cc b/lite/kernels/npu/bridges/shuffle_channel_op.cc index 47469e1506..0552bd2382 100644 --- a/lite/kernels/npu/bridges/shuffle_channel_op.cc +++ b/lite/kernels/npu/bridges/shuffle_channel_op.cc @@ -44,17 +44,19 @@ int ShuffleChannelConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto group = op_info->GetAttr("group"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Shuffle Channel node - auto shuffle_channel_node = graph->AddNode(out_name); - shuffle_channel_node->set_input_x(*x_node); - shuffle_channel_node->set_attr_group(group); + auto shuffle_channel_node = graph->Add(out_name); + auto shuffle_channel_op = + shuffle_channel_node->data(); + shuffle_channel_op->set_input_x(*x_node->data()); + 
shuffle_channel_op->set_attr_group(group); return SUCCESS; } @@ -63,6 +65,6 @@ int ShuffleChannelConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - shuffle_channel, +REGISTER_SUBGRAPH_BRIDGE(shuffle_channel, + kNPU, paddle::lite::subgraph::npu::ShuffleChannelConverter); diff --git a/lite/kernels/npu/bridges/shuffle_channel_op_test.cc b/lite/kernels/npu/bridges/shuffle_channel_op_test.cc deleted file mode 100644 index cbf2eac9f3..0000000000 --- a/lite/kernels/npu/bridges/shuffle_channel_op_test.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/shuffle_channel_op.h" -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { -namespace bridges { - -void shuffle_channel_ref( - const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_data = x->mutable_data(); - auto out_data = out->mutable_data(); - int group = op_info->GetAttr("group"); - auto x_dims = x->dims(); - - int n_size = x_dims.production() / x_dims[0]; - int c_size = n_size / x_dims[1]; - for (int n = 0; n < x_dims[0]; n++) { - int g_num = x_dims[1] / group; - auto tmp_out_data = out_data; - for (int g = 0; g < g_num; g++) { - auto tmp_x_data = x_data + g * c_size; - for (int i = 0; i < group; i++) { - std::memcpy(tmp_out_data, - tmp_x_data + i * g_num * c_size, - c_size * sizeof(float)); - tmp_out_data += c_size; - } - } - x_data += n_size; - out_data += n_size; - } -} - -void test_shuffle_channel(int bs, int ic, int ih, int iw, int group) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("shuffle_channel"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("group", group); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - shuffle_channel_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - 
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, softmax) { - for (auto bs : {1, 4}) { - for (auto ic : {1, 24, 35}) { - for (auto ih : {1, 4}) { - for (auto iw : {1, 4}) { - for (auto group : {1, 3, 7, 24, 35}) { - if (ic % group != 0) continue; - test_shuffle_channel(bs, ic, ih, iw, group); - } - } - } - } - } -} - -} // namespace bridges -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(shuffle_channel); -USE_NPU_BRIDGE(shuffle_channel); diff --git a/lite/kernels/npu/bridges/softmax_op.cc b/lite/kernels/npu/bridges/softmax_op.cc index 01d8b0a944..24bbb790e0 100644 --- a/lite/kernels/npu/bridges/softmax_op.cc +++ b/lite/kernels/npu/bridges/softmax_op.cc @@ -37,29 +37,34 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); + auto x_rank = x_dims.size(); auto out_name = op_info->Output("Out").front(); auto out_type = kernel->GetOutputDeclType("Out"); CHECK(out_type->precision() == PRECISION(kFloat)); CHECK(out_type->layout() == DATALAYOUT(kNCHW)); auto axis = op_info->GetAttr("axis"); - if (x_dims.size() > 3) { - CHECK(!(axis == 2 && x_dims[3] > 1)) - << "[NPU] Unsupported softmax params: axis = " << axis - << " :x_w = " << x_dims[3]; + if (axis < 0) { + axis += x_rank; + } + if (axis == 2 && x_rank > 3 && x_dims[3] != 1) { + LOG(WARNING) << "[NPU] Unsupported softmax params: axis = " << axis + << " :x_w = " << x_dims[3]; + return FAILED; } // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Softmax node - auto softmax_node = graph->AddNode(out_name); - softmax_node->set_input_x(*x_node); - softmax_node->set_attr_axis(axis); + auto softmax_node = graph->Add(out_name); + auto softmax_op = softmax_node->data(); + softmax_op->set_input_x(*x_node->data()); + softmax_op->set_attr_axis(axis); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -68,6 +73,6 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - softmax, +REGISTER_SUBGRAPH_BRIDGE(softmax, + kNPU, paddle::lite::subgraph::npu::SoftmaxConverter); diff --git a/lite/kernels/npu/bridges/split_op.cc b/lite/kernels/npu/bridges/split_op.cc index 597de04d5b..2cdf49fd54 100644 --- a/lite/kernels/npu/bridges/split_op.cc +++ b/lite/kernels/npu/bridges/split_op.cc @@ -47,33 +47,34 @@ int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) { int64_t sections_num = static_cast(sections.size()); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Split node - auto split_node = graph->AddNode(op_type + "/" + x_name); - split_node->set_input_x(*x_node); - split_node->set_attr_axis(static_cast(axis)); + auto split_node = graph->Add(op_type + "/" + x_name); + auto split_op = split_node->data(); + split_op->set_input_x(*x_node->data()); + split_op->set_attr_axis(static_cast(axis)); if (num > 0) { - split_node->set_attr_output_num(static_cast(num)); + 
split_op->set_attr_output_num(static_cast(num)); } else { - split_node->set_attr_output_num(sections_num); + split_op->set_attr_output_num(sections_num); auto size_split = ge::AttrValue::LIST_INT(sections.begin(), sections.end()); - split_node->set_attr_size_split(size_split); + split_op->set_attr_size_split(size_split); } - split_node->create_dynamic_output_y(out_names.size()); + split_op->create_dynamic_output_y(out_names.size()); int idx = 1; for (auto& out_name : out_names) { - auto zero_const_node = - graph->AddNode(out_name + "/zero" + std::to_string(idx), 0); - auto add_node = graph->AddNode(out_name); - add_node->set_input_x1(*split_node, "y" + std::to_string(idx)); - add_node->set_input_x2(*zero_const_node); + auto zero_node = graph->Add(out_name + "/zero" + std::to_string(idx), 0); + auto add_node = graph->Add(out_name); + auto add_op = add_node->data(); + add_op->set_input_x1(*split_node->data(), "y" + std::to_string(idx)); + add_op->set_input_x2(*zero_node->data()); idx++; } return REBUILD_WHEN_SHAPE_CHANGED; @@ -84,6 +85,6 @@ int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - split, +REGISTER_SUBGRAPH_BRIDGE(split, + kNPU, paddle::lite::subgraph::npu::SplitConverter); diff --git a/lite/kernels/npu/bridges/sqrt_op.cc b/lite/kernels/npu/bridges/sqrt_op.cc index 2ee58862fb..e8fde2272a 100644 --- a/lite/kernels/npu/bridges/sqrt_op.cc +++ b/lite/kernels/npu/bridges/sqrt_op.cc @@ -43,16 +43,17 @@ int SqrtConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(out_type->layout() == DATALAYOUT(kNCHW)); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Sqrt node - auto sqrt_node = graph->AddNode(out_name); - sqrt_node->set_input_x(*x_node); + auto sqrt_node = graph->Add(out_name); + auto sqrt_op = sqrt_node->data(); + sqrt_op->set_input_x(*x_node->data()); return SUCCESS; } @@ -61,4 +62,6 @@ int SqrtConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, sqrt, paddle::lite::subgraph::npu::SqrtConverter); +REGISTER_SUBGRAPH_BRIDGE(sqrt, + kNPU, + paddle::lite::subgraph::npu::SqrtConverter); diff --git a/lite/kernels/npu/bridges/square_op.cc b/lite/kernels/npu/bridges/square_op.cc index 3f6676c8a8..f03c7690cb 100644 --- a/lite/kernels/npu/bridges/square_op.cc +++ b/lite/kernels/npu/bridges/square_op.cc @@ -43,16 +43,17 @@ int SquareConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(out_type->layout() == DATALAYOUT(kNCHW)); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Square node - auto square_node = graph->AddNode(out_name); - square_node->set_input_x(*x_node); + auto square_node = graph->Add(out_name); + auto square_op = square_node->data(); + square_op->set_input_x(*x_node->data()); return SUCCESS; } @@ -61,6 +62,6 @@ int SquareConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - square, +REGISTER_SUBGRAPH_BRIDGE(square, + kNPU, 
paddle::lite::subgraph::npu::SquareConverter); diff --git a/lite/kernels/npu/bridges/transpose_op.cc b/lite/kernels/npu/bridges/transpose_op.cc index 70449dac7a..bdac84df3c 100644 --- a/lite/kernels/npu/bridges/transpose_op.cc +++ b/lite/kernels/npu/bridges/transpose_op.cc @@ -37,23 +37,24 @@ int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); - auto out_name = op_info->Input("Out").front(); + auto out_name = op_info->Output("Out").front(); auto axis = op_info->GetAttr>("axis"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Transpose node - auto transpose_node = graph->AddNode(out_name); - transpose_node->set_input_x(*x_node); - auto w_const_node = graph->AddNode(out_name + "/w", 1.0f); - transpose_node->set_input_w(*w_const_node); - transpose_node->set_attr_order( + auto transpose_node = graph->Add(out_name); + auto transpose_op = transpose_node->data(); + transpose_op->set_input_x(*x_node->data()); + auto w_node = graph->Add(out_name + "/w", 1.0f); + transpose_op->set_input_w(*w_node->data()); + transpose_op->set_attr_order( ge::AttrValue::LIST_INT(axis.begin(), axis.end())); return SUCCESS; } @@ -63,9 +64,9 @@ int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - transpose, +REGISTER_SUBGRAPH_BRIDGE(transpose, + kNPU, paddle::lite::subgraph::npu::TransposeConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - transpose2, +REGISTER_SUBGRAPH_BRIDGE(transpose2, + kNPU, paddle::lite::subgraph::npu::TransposeConverter); diff --git a/lite/kernels/npu/bridges/transpose_op_test.cc b/lite/kernels/npu/bridges/transpose_op_test.cc deleted file mode 100644 index 9ad2610caa..0000000000 --- a/lite/kernels/npu/bridges/transpose_op_test.cc +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/transpose_op.h" -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { -namespace bridges { - -int data_index(std::vector pos, DDimLite dims) { - int d1 = dims[1]; - int d2 = dims[2]; - int d3 = dims[3]; - return pos[3] + pos[2] * d3 + pos[1] * d3 * d2 + pos[0] * d3 * d2 * d1; -} - -std::vector pos_trans(std::vector in_pos, std::vector axis) { - std::vector out_pos(in_pos.size()); - for (int i = 0; i < axis.size(); i++) { - out_pos[axis[i]] = in_pos[i]; - } - return out_pos; -} - -void transpose_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto input = - scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto output = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_dims = input->dims(); - auto y_dims = output->dims(); - auto axis = op_info->GetAttr>("axis"); - - auto* input_data = input->data(); - auto* output_data = output->mutable_data(); - - int input_n = x_dims[0]; - int input_c = x_dims[1]; - int input_h = x_dims[2]; - int input_w = x_dims[3]; - int output_n = y_dims[0]; - int output_c = y_dims[1]; - int output_h = y_dims[2]; - int output_w = y_dims[3]; - - for (int n = 0; n < input_n; ++n) { - for (int c = 0; c < input_c; ++c) { - for (int h = 0; h < input_h; ++h) { - for (int w = 0; w < input_w; ++w) { - std::vector in_pos{n, c, h, w}; - std::vector out_pos = pos_trans(in_pos, axis); - int in_index = data_index(in_pos, x_dims); - int out_index = data_index(out_pos, y_dims); - output_data[out_index] = input_data[in_index]; - } - } - } - } -} - -void test_transpose(int bs, int ic, int ih, int iw, std::vector axis) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("transpose"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("axis", axis); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - transpose_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, transpose) { -#if 0 - for (auto bs : {1, 4, 7}) { - for (auto ic : {1, 4, 7}) { - for (auto ih : {1, 4, 7}) { - for (auto iw : {1, 4, 7}) { - for (auto axis : {std::vector{0, 1, 2, 3}, - std::vector{0, 1, 3, 2}, - std::vector{0, 3, 1, 2}, - std::vector{1, 2, 3, 0}, - std::vector{3, 2, 1, 0}, - std::vector{2, 3, 1, 0}}) { - test_transpose(bs, ic, ih, iw, axis); - } - } - } - } - } -#endif - test_transpose(2, 3, 4, 5, std::vector{0, 1, 3, 2}); - // test_transpose(2, 3, 4, 5, std::vector{0, 1, 2, 3}); - // test_transpose(2, 2, 2, 2, std::vector{0,1,3,2}); - // test_transpose(1, 1, 2, 2, 
std::vector{0,1,3,2}); - // test_transpose(1, 1, 1, 2, std::vector{0,1,2,3}); -} - -} // namespace bridges -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(transpose); -USE_NPU_BRIDGE(transpose); - -USE_LITE_OP(transpose2); -USE_NPU_BRIDGE(transpose2); diff --git a/lite/kernels/npu/bridges/unsqueeze_op.cc b/lite/kernels/npu/bridges/unsqueeze_op.cc old mode 100755 new mode 100644 index 8ff95d4ed8..bcb3bee83b --- a/lite/kernels/npu/bridges/unsqueeze_op.cc +++ b/lite/kernels/npu/bridges/unsqueeze_op.cc @@ -32,30 +32,30 @@ int UnsqueezeConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto x_name = op_info->Input("X").front(); auto x_type = kernel->GetInputDeclType("X"); - CHECK(x_type->precision() == PRECISION(kFloat)); CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); auto out_type = kernel->GetOutputDeclType("Out"); - CHECK(out_type->precision() == PRECISION(kFloat)); CHECK(out_type->layout() == DATALAYOUT(kNCHW)); auto out_shape = scope->FindTensor(out_name)->dims().Vectorize(); CHECK(op_info->HasAttr("axes")) << "[NPU] unsqueeze not support axes from tensor now"; // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Unsqueeze node - auto unsqueeze_node = graph->AddNode(out_name); - unsqueeze_node->set_input_tensor(*x_node); - unsqueeze_node->set_attr_shape( + auto unsqueeze_node = graph->Add(out_name); + auto unsqueeze_op = unsqueeze_node->data(); + unsqueeze_op->set_input_tensor(*x_node->data()); + unsqueeze_op->set_attr_shape( ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end())); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -65,9 +65,9 @@ int UnsqueezeConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - unsqueeze, +REGISTER_SUBGRAPH_BRIDGE(unsqueeze, + kNPU, paddle::lite::subgraph::npu::UnsqueezeConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - unsqueeze2, +REGISTER_SUBGRAPH_BRIDGE(unsqueeze2, + kNPU, paddle::lite::subgraph::npu::UnsqueezeConverter); diff --git a/lite/kernels/npu/bridges/unsqueeze_op_test.cc b/lite/kernels/npu/bridges/unsqueeze_op_test.cc deleted file mode 100755 index c59843f614..0000000000 --- a/lite/kernels/npu/bridges/unsqueeze_op_test.cc +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/unsqueeze_op.h" -#include -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { -namespace bridges { - -static DDim GetOutputShape(const std::vector& unsqz_dims, - const DDim& in_dims) { - int output_size = in_dims.size() + static_cast(unsqz_dims.size()); - int cur_output_size = in_dims.size(); - std::vector output_shape(output_size, 0); - - // Validate Check: rank range. - CHECK_LE(output_size, 6) << "The output tensor's rank should be less than 6."; - - for (int axis : unsqz_dims) { - int cur = axis < 0 ? axis + cur_output_size + 1 : axis; - // Validate Check: the axis bound - CHECK((cur >= 0) && (cur <= cur_output_size)) - << "The unsqueeze dims must be within range of current rank."; - // Move old axis, and insert new axis - for (int i = cur_output_size; i >= cur; --i) { - if (output_shape[i] == 1) { - // Move axis - output_shape[i + 1] = 1; - output_shape[i] = 0; - } - } - - output_shape[cur] = 1; - // Add the output size. - cur_output_size++; - } - - // Make output shape - for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) { - if (output_shape[out_idx] == 0) { - output_shape[out_idx] = in_dims[in_idx++]; - } - } - - return DDim(output_shape); -} - -template -void unsqueeze_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - - auto x = scope->FindTensor("x"); - auto out = scope->FindMutableTensor("out_ref"); - auto axes = op_info->GetAttr>("axes"); - auto y_dims = GetOutputShape(axes, x->dims()); - out->Resize(y_dims); - - auto x_data = x->data(); - auto out_data = out->mutable_data(); - - memcpy(out_data, x_data, x->numel() * sizeof(float)); -} - -void test_unsqueeze(const std::vector& input_shape, - std::vector axes) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.NewTensor(x_var_name); - auto* out = scope.NewTensor(out_var_name); - auto* out_ref = scope.NewTensor(out_ref_var_name); - x->Resize(input_shape); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("unsqueeze"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("axes", axes); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - - // execute reference implementation and save to output tensor - unsqueeze_ref(op); - - // compare results - CHECK_EQ(out->dims().size(), out_ref->dims().size()); - for (int i = 0; i < out->dims().size(); i++) { - CHECK_EQ(out->dims()[i], out_ref->dims()[i]); - } - - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, unsqueeze) { - test_unsqueeze({2}, {0, 2}); - test_unsqueeze({2, 3}, {1, 3}); - test_unsqueeze({1, 2, 3}, {3}); - test_unsqueeze({5, 6, 7}, {1}); -} - -} // namespace bridges -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(unsqueeze); -USE_NPU_BRIDGE(unsqueeze); diff --git a/lite/kernels/npu/bridges/utility.cc b/lite/kernels/npu/bridges/utility.cc old mode 100755 new 
mode 100644 index f79936c5d7..d9c9ffae92 --- a/lite/kernels/npu/bridges/utility.cc +++ b/lite/kernels/npu/bridges/utility.cc @@ -85,10 +85,26 @@ ge::Format CvtDataLayoutType(DataLayoutType itype) { return otype; } +std::vector CvtShape(const std::vector& in_shape) { + std::vector out_shape; + // Padding the shape to 4-dimensions(NCHW) + for (int i = 0; i < 4 - in_shape.size(); i++) { + out_shape.push_back(1); + } + for (int i = 0; i < in_shape.size(); i++) { + out_shape.push_back(in_shape[i]); + } + return out_shape; +} + +std::vector CvtShape(const DDim& in_dims) { + return CvtShape(in_dims.Vectorize()); +} + ge::TensorPtr CvtTensor(const Tensor& in_tensor, std::vector out_shape, - PrecisionType in_precision, DataLayoutType in_layout) { + PrecisionType in_precision = in_tensor.precision(); auto in_size = in_tensor.dims().production(); auto in_shape = in_tensor.dims().Vectorize(); if (out_shape.empty()) { diff --git a/lite/kernels/npu/bridges/utility.h b/lite/kernels/npu/bridges/utility.h old mode 100755 new mode 100644 index e8300a0472..c4721d55a0 --- a/lite/kernels/npu/bridges/utility.h +++ b/lite/kernels/npu/bridges/utility.h @@ -19,12 +19,12 @@ #include #include #include -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "graph/buffer.h" +#include "graph/graph.h" +#include "graph/model.h" +#include "graph/op/all_ops.h" +#include "graph/operator.h" +#include "graph/operator_reg.h" #include "lite/core/op_lite.h" #include "lite/utils/macros.h" @@ -70,59 +70,15 @@ ge::DataType CvtPrecisionType(PrecisionType itype); ge::Format CvtDataLayoutType(DataLayoutType itype); +// Padding the shape to 4-dimensions(NCHW) for HiAI +std::vector CvtShape(const std::vector& in_shape); + +std::vector CvtShape(const DDim& in_dims); + ge::TensorPtr CvtTensor(const Tensor& in_tensor, std::vector out_shape = {}, - PrecisionType in_precision = PRECISION(kFloat), DataLayoutType in_layout = DATALAYOUT(kNCHW)); -template -ge::TensorPtr CreateTensorAndFillData(const std::vector& data, - std::vector shape = {}, - ge::Format format = ge::FORMAT_NCHW) { - const std::type_info& info = typeid(T); - ge::DataType type = ge::DT_FLOAT; - if (info == typeid(float)) { - type = ge::DT_FLOAT; - } else if (info == typeid(int8_t)) { - type = ge::DT_INT8; - } else if (info == typeid(int16_t)) { - type = ge::DT_INT16; - } else if (info == typeid(int32_t)) { - type = ge::DT_INT32; - } else if (info == typeid(int64_t)) { - type = ge::DT_INT64; - } else { - LOG(FATAL) << "[NPU] Unknow value type " << info.name(); - } - if (shape.empty()) { - shape = {static_cast(data.size())}; - } else { - int size = 1; - for (auto i : shape) { - size *= i; - } - CHECK_EQ(data.size(), size); - } - ge::TensorDesc desc(ge::Shape(shape), format, type); - ge::TensorPtr tensor = std::make_shared(); - tensor->SetTensorDesc(desc); - tensor->SetData(reinterpret_cast(data.data()), - data.size() * sizeof(T)); - return tensor; -} - -template -ge::TensorPtr CreateTensorAndFillData(T value, - std::vector shape = {1}, - ge::Format format = ge::FORMAT_NCHW) { - int64_t size = 1; - for (auto i : shape) { - size *= i; - } - std::vector data(size, value); - return CreateTensorAndFillData(data, shape, format); -} - int CvtActMode(std::string act_type); } // namespace npu diff --git a/lite/kernels/npu/graph_compute.cc 
b/lite/kernels/npu/graph_compute.cc deleted file mode 100644 index 9a05a33062..0000000000 --- a/lite/kernels/npu/graph_compute.cc +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/npu/graph_compute.h" -#include -#include - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { - -void GraphCompute::PrepareForRun() { - auto& ctx = this->ctx_->template As(); - auto& param = this->Param(); - - // Load HiAI model from the weight tensor and release its buffer - // to save memory - CHECK(param.weight); - CHECK(lite::npu::LoadModel(*param.weight, &model_client_, &model_name_)); - // TODO(hong19860320): find an good way to free the model data. - // No interface exists to free the data of tensor, so I resize the dim to 1 - // and change target to force it to realloc a small size memory. - param.weight->Resize({1}); - param.weight->mutable_data(TargetType::kARM); - CHECK(model_client_); - - // Query the dimensions of NPU input and output tensors from HiAI model - std::vector npu_idims; - std::vector npu_odims; - int ret = - model_client_->GetModelIOTensorDim(model_name_, npu_idims, npu_odims); - CHECK_EQ(ret, hiai::AI_SUCCESS) - << "[NPU] Get the dimensions of input and output tensors failed."; - - // Check whether the data sizes of NPU input and output tensors are the - // same as CPU's, then create and initialize NPU input and output tensors. 
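// [Editor's sketch] In the deleted PrepareForRun() below, the data size of
// each NPU-side tensor is the product of the four NCHW extents reported by
// hiai::TensorDimension, which is then checked against the CPU tensor's
// dims().production(). Factored out, assuming the same getters the deleted
// code uses:
int64_t NpuElementCount(const hiai::TensorDimension& dim) {
  return static_cast<int64_t>(dim.GetNumber()) * dim.GetChannel() *
         dim.GetHeight() * dim.GetWidth();
}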
- npu_itensors_.resize(npu_idims.size()); - npu_otensors_.resize(npu_odims.size()); - npu_idatasizes_.resize(npu_idims.size()); - npu_odatasizes_.resize(npu_odims.size()); - for (size_t i = 0; i < npu_idims.size(); ++i) { - auto cpu_itensor = param.inputs[i].second; - CHECK(cpu_itensor); - VLOG(3) << "[NPU] CPU input dims[" << i << "]: " << cpu_itensor->dims(); - VLOG(3) << "[NPU] NPU input dims[" << i << "]: {" - << npu_idims[i].GetNumber() << "," << npu_idims[i].GetChannel() - << "," << npu_idims[i].GetHeight() << "," << npu_idims[i].GetWidth() - << "}"; - npu_idatasizes_[i] = npu_idims[i].GetNumber() * npu_idims[i].GetChannel() * - npu_idims[i].GetHeight() * npu_idims[i].GetWidth(); - CHECK_EQ(cpu_itensor->dims().production(), npu_idatasizes_[i]); - npu_itensors_[i].reset(new hiai::AiTensor); - npu_itensors_[i]->Init(&(npu_idims[i])); - } - for (size_t i = 0; i < npu_odims.size(); ++i) { - auto cpu_otensor = param.outputs[i].second; - CHECK(cpu_otensor); - VLOG(3) << "[NPU] CPU output dims[" << i << "]: " << cpu_otensor->dims(); - VLOG(3) << "[NPU] NPU output dims[" << i << "]: {" - << npu_odims[i].GetNumber() << "," << npu_odims[i].GetChannel() - << "," << npu_odims[i].GetHeight() << "," << npu_odims[i].GetWidth() - << "}"; - npu_odatasizes_[i] = npu_odims[i].GetNumber() * npu_odims[i].GetChannel() * - npu_odims[i].GetHeight() * npu_odims[i].GetWidth(); - if (cpu_otensor->dims().production() != npu_odatasizes_[i]) { - cpu_otensor->Resize({npu_odims[i].GetNumber(), - npu_odims[i].GetChannel(), - npu_odims[i].GetHeight(), - npu_odims[i].GetWidth()}); - } - npu_otensors_[i].reset(new hiai::AiTensor); - npu_otensors_[i]->Init(&(npu_odims[i])); - } -} - -void GraphCompute::Run() { - auto& param = this->Param(); - - // Check whether the data sizes of NPU input tensors are the same as - // CPU's, and copy the data of CPU input tensors to NPU's. - CHECK_EQ(param.inputs.size(), npu_itensors_.size()); - CHECK_EQ(param.outputs.size(), npu_otensors_.size()); - for (size_t i = 0; i < param.inputs.size(); ++i) { - auto cpu_itensor = param.inputs[i].second; - CHECK(cpu_itensor); - CHECK_EQ(cpu_itensor->dims().production(), npu_idatasizes_[i]); - std::memcpy(static_cast(npu_itensors_[i]->GetBuffer()), - cpu_itensor->data(), - sizeof(float) * static_cast(npu_idatasizes_[i])); - } - - // Run HiAI model with model name - std::string key = "model_name"; // Note: key seems must be model_name - model_context_.AddPara(key, model_name_); - auto GetCurrentUS = []() -> double { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; - }; - int istamp; - auto start_time = GetCurrentUS(); - CHECK_EQ(hiai::AI_SUCCESS, - model_client_->Process( - model_context_, npu_itensors_, npu_otensors_, 1000, istamp)); - VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; - - // Check whether the data sizes of NPU output tensors are the same as - // CPU's, and copy the data of NPU output tensors to CPU's. 
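// [Editor's note] Both the input copy above and the output copy below size
// their memcpy as sizeof(float) * element_count, i.e. the deleted kernel
// assumes float32 I/O throughout -- consistent with its kFloat-only
// REGISTER_LITE_KERNEL registration further down.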
- for (size_t i = 0; i < param.outputs.size(); ++i) { - auto cpu_otensor = param.outputs[i].second; - CHECK(cpu_otensor); - CHECK_EQ(cpu_otensor->dims().production(), npu_odatasizes_[i]); - std::memcpy(cpu_otensor->mutable_data(), - static_cast(npu_otensors_[i]->GetBuffer()), - sizeof(float) * static_cast(npu_odatasizes_[i])); - } -} - -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(graph_op, - kNPU, - kFloat, - kNCHW, - paddle::lite::kernels::npu::GraphCompute, - def) - .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindInput("Weight", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))}) - .Finalize(); diff --git a/lite/kernels/npu/graph_compute.h b/lite/kernels/npu/graph_compute.h deleted file mode 100644 index b289b8e42f..0000000000 --- a/lite/kernels/npu/graph_compute.h +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include "ai_ddk_lib/include/HiAiModelManagerService.h" -#include "lite/core/kernel.h" -#include "lite/core/op_registry.h" -#include "lite/core/types.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { - -class GraphCompute : public KernelLite { - public: - using param_t = operators::GraphParam; - - void PrepareForRun() override; - - void Run() override; - - virtual ~GraphCompute() = default; - - private: - std::shared_ptr model_client_; - std::string model_name_; - hiai::AiContext model_context_; - - std::vector npu_idatasizes_; - std::vector npu_odatasizes_; - std::vector> npu_itensors_; - std::vector> npu_otensors_; -}; - -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/npu/subgraph_compute.cc b/lite/kernels/npu/subgraph_compute.cc old mode 100755 new mode 100644 index d9b1919506..6f32099274 --- a/lite/kernels/npu/subgraph_compute.cc +++ b/lite/kernels/npu/subgraph_compute.cc @@ -16,7 +16,7 @@ #include #include #include -#include "ai_ddk_lib/include/hiai_ir_build.h" +#include "hiai_ir_build.h" // NOLINT #include "lite/backends/npu/device.h" #include "lite/core/op_registry.h" #include "lite/kernels/npu/bridges/graph.h" @@ -39,13 +39,13 @@ int SubgraphEngine::BuildDeviceProgram() { op->CheckShape(); op->InferShape(); std::string op_type = op->op_info()->Type(); - if (!bridges.Exists("NPU", op_type)) { + if (!bridges.Exists(op_type, "kNPU")) { return subgraph::FAILED; } auto kernel = inst.kernel(); - status |= bridges.Select("NPU", op_type)(reinterpret_cast(&graph), - const_cast(op), - const_cast(kernel)); + status |= bridges.Select(op_type, "kNPU")(reinterpret_cast(&graph), + const_cast(op), + const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { return subgraph::FAILED; } @@ -57,26 +57,26 @@ int SubgraphEngine::BuildDeviceProgram() { std::vector device_inodes; std::vector device_onodes; for 
(auto& input_name : input_names_) { - if (graph.HasNode(input_name)) { - if (!graph.GetType(input_name).persistable()) { - device_inodes.push_back(*graph.GetNode(input_name)); + if (graph.Has(input_name)) { + if (graph.Get(input_name)->is_data()) { + device_inodes.push_back(*graph.Get(input_name)->data()); device_inames_.push_back(input_name); } else { LOG(WARNING) << "[NPU] Input node " << input_name - << " is skipped because it is a persistable node."; + << " is ignored because it is not a data node."; } } else { LOG(WARNING) << "[NPU] Input node " << input_name - << " is skipped because it does not exist."; + << " is ignored because it does not exist."; } } for (auto& output_name : output_names_) { - if (graph.HasNode(output_name)) { - device_onodes.push_back(*graph.GetNode(output_name)); + if (graph.Has(output_name)) { + device_onodes.push_back(*graph.Get(output_name)->data()); device_onames_.push_back(output_name); } else { LOG(WARNING) << "[NPU] Output node " << output_name - << " is skipped because it does not exist."; + << " is ignored because it does not exist."; } } CHECK(!device_inames_.empty()) @@ -108,14 +108,14 @@ int SubgraphEngine::BuildDeviceProgram() { origin_otensors_.resize(device_onames_.size()); device_otensors_.resize(device_onames_.size()); for (int i = 0; i < device_inames_.size(); i++) { - auto type = graph.GetType(device_inames_[i]); - auto precision = type.precision(); - auto layout = type.layout(); + auto node = graph.Get(device_inames_[i]); + auto precision = node->precision(); + auto layout = node->layout(); origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); CHECK(origin_itensors_[i]); origin_idims_[i] = origin_itensors_[i]->dims(); - VLOG(3) << "[NPU] Inputs[" << i - << "] precision: " << PrecisionToStr(precision) + VLOG(3) << "[NPU] Inputs[" << i << "] name: " << device_inames_[i] + << " precision: " << PrecisionToStr(precision) << " layout: " << DataLayoutToStr(layout) << " dims: {" << device_idims[i].GetNumber() << "," << device_idims[i].GetChannel() << "," @@ -129,14 +129,14 @@ int SubgraphEngine::BuildDeviceProgram() { device_itensors_[i]->Init(&(device_idims[i])); } for (int i = 0; i < device_onames_.size(); i++) { - auto type = graph.GetType(device_onames_[i]); - auto precision = type.precision(); - auto layout = type.layout(); + auto node = graph.Get(device_onames_[i]); + auto precision = node->precision(); + auto layout = node->layout(); origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); CHECK(origin_otensors_[i]); origin_odims_[i] = origin_otensors_[i]->dims(); - VLOG(3) << "[NPU] Outputs[" << i - << "] precision: " << PrecisionToStr(precision) + VLOG(3) << "[NPU] Outputs[" << i << "] name: " << device_onames_[i] + << " precision: " << PrecisionToStr(precision) << " layout: " << DataLayoutToStr(layout) << " dims: {" << device_odims[i].GetNumber() << "," << device_odims[i].GetChannel() << "," diff --git a/lite/kernels/npu/subgraph_compute.h b/lite/kernels/npu/subgraph_compute.h old mode 100755 new mode 100644 index 27b4a36cfe..2cdc4a0e62 --- a/lite/kernels/npu/subgraph_compute.h +++ b/lite/kernels/npu/subgraph_compute.h @@ -17,7 +17,7 @@ #include #include #include -#include "ai_ddk_lib/include/HiAiModelManagerService.h" +#include "HiAiModelManagerService.h" #include "lite/core/kernel.h" #include "lite/kernels/npu/bridges/engine.h" #include "lite/kernels/npu/bridges/registry.h" diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index 3423b1e920..f4d3254a7b 100644 --- 
a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -14,7 +14,7 @@ add_kernel(pool_opencl OPENCL basic SRCS pool_compute.cc DEPS ${cl_kernel_deps}) add_kernel(io_copy_compute_opencl OPENCL basic SRCS io_copy_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps}) add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps}) add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(conv2d_1x1_opencl OPENCL basic SRCS conv2d_1x1_compute.cc DEPS ${cl_kernel_deps}) +#add_kernel(conv2d_1x1_opencl OPENCL basic SRCS conv2d_1x1_compute.cc DEPS ${cl_kernel_deps}) add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps}) add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps}) add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps}) @@ -49,12 +49,14 @@ lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc DEPS depthwise_conv2d_opencl op_registry program context cl_image_converter ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) -lite_cc_test(test_conv2d_1x1_opencl SRCS conv2d_1x1_compute_test.cc - DEPS conv2d_1x1_opencl cl_image_converter op_registry program context - ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) +#lite_cc_test(test_conv2d_1x1_opencl SRCS conv2d_1x1_compute_test.cc +# DEPS conv2d_1x1_opencl cl_image_converter op_registry program context +# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) + lite_cc_test(test_reshape_opencl SRCS reshape_compute_test.cc DEPS reshape_opencl cl_image_converter op_registry program context ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) + lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc DEPS conv_opencl op_registry program context ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) diff --git a/lite/kernels/opencl/conv2d_1x1_compute.cc b/lite/kernels/opencl/conv2d_1x1_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/opencl/conv2d_1x1_compute_test.cc b/lite/kernels/opencl/conv2d_1x1_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/opencl/reshape_compute.cc b/lite/kernels/opencl/reshape_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/opencl/reshape_compute_test.cc b/lite/kernels/opencl/reshape_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/x86/fc_compute_test.cc b/lite/kernels/x86/fc_compute_test.cc deleted file mode 100644 index abc0597457..0000000000 --- a/lite/kernels/x86/fc_compute_test.cc +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
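// [Editor's sketch] A naive reference for what the deleted x86 fc test
// below exercises -- out[m, n] = sum_k x[m, k] * w[k, n] + b[n], with
// x:{2,3}, w:{3,4}, b:{1,4}. Names and the row-major layout are
// illustrative, not taken from the original test:
void fc_ref(const float* x, int m, int k,
            const float* w, int n,
            const float* b, float* out) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = b ? b[j] : 0.f;
      for (int p = 0; p < k; ++p) acc += x[i * k + p] * w[p * n + j];
      out[i * n + j] = acc;
    }
  }
}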
-#include "lite/kernels/x86/fc_compute.h" -#include -#include -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -TEST(fc_x86, retrive_op) { - auto fc = - KernelRegistry::Global().Create("fc"); - ASSERT_FALSE(fc.empty()); - ASSERT_TRUE(fc.front()); -} - -TEST(fc_x86, init) { - FcCompute fc; - ASSERT_EQ(fc.precision(), PRECISION(kFloat)); - ASSERT_EQ(fc.target(), TARGET(kX86)); -} - -TEST(fc_x86, run_test) { - lite::Tensor x, w, b, out; - constexpr int batch_size = 2; - std::vector x_shape{batch_size, 3}; - x.Resize(lite::DDim(x_shape)); - std::vector w_shape{3, 4}; - w.Resize(lite::DDim(w_shape)); - std::vector b_shape{1, 4}; - b.Resize(lite::DDim(b_shape)); - std::vector out_shape{1, 4}; - out.Resize(lite::DDim(out_shape)); - - auto x_data = x.mutable_data(); - auto w_data = w.mutable_data(); - auto b_data = b.mutable_data(); - auto out_data = out.mutable_data(); - - for (int64_t i = 0; i < x.dims().production(); i++) { - x_data[i] = static_cast(i); - } - for (int64_t i = 0; i < w.dims().production(); i++) { - w_data[i] = static_cast(i); - } - for (int64_t i = 0; i < b.dims().production(); i++) { - b_data[i] = static_cast(i); - } - - /* lite::x86::math::fc_compute_eigen(x_data, batch_size, 3, // - w_data, 3, 4, // - b_data, ref_data); */ - - // FcCompute fc; - FcCompute fc; - operators::FcParam param; - - param.in_num_col_dims = 1; - param.input = &x; - param.w = &w; - param.bias = &b; - param.output = &out; - param.in_mat_dims = x.dims(); - - // std::unique_ptr ctx(new KernelContext); - // ctx->As(); - fc.SetParam(param); - // fc.SetContext(std::move(ctx)); - fc.Run(); - - VLOG(3) << "output vs ref"; - for (int i = 0; i < out.dims().production(); i++) { - VLOG(3) << out_data[i]; - } - - /* for (int i = 0; i < out.dims().production(); ++i) { - EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); - }*/ -} - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/layer_norm_compute.h b/lite/kernels/x86/layer_norm_compute.h index bbbdb91deb..ca2ddf60c5 100644 --- a/lite/kernels/x86/layer_norm_compute.h +++ b/lite/kernels/x86/layer_norm_compute.h @@ -78,7 +78,7 @@ class LayerNormCompute : public KernelLite { Scale->data(), Bias->data(), static_cast(left), - static_cast(epsilon), + epsilon, right); } diff --git a/lite/kernels/x86/relu_compute.cc b/lite/kernels/x86/relu_compute.cc deleted file mode 100644 index 684b144254..0000000000 --- a/lite/kernels/x86/relu_compute.cc +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/kernels/x86/relu_compute.h" - -REGISTER_LITE_KERNEL(relu, - kX86, - kFloat, - kNCHW, - paddle::lite::kernels::x86::ReluCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) - .Finalize(); diff --git a/lite/kernels/x86/relu_compute.h b/lite/kernels/x86/relu_compute.h deleted file mode 100644 index b80a99302a..0000000000 --- a/lite/kernels/x86/relu_compute.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once - -#include -#include -#include "lite/core/kernel.h" -#include "lite/core/op_lite.h" -#include "lite/core/op_registry.h" -#include "lite/core/type_system.h" -#include "lite/operators/relu_op.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -template -class ReluCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override { - auto& param = *param_.get_mutable(); - auto n = param.X->dims().production(); - const float* input = param.X->data(); - float* output = param.Out->mutable_data(); - for (int i = 0; i < n; i++) { - output[i] = std::max(0.f, input[i]); - } - } - - virtual ~ReluCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/xpu/bridges/act_op.cc b/lite/kernels/xpu/bridges/act_op.cc index f674af84ca..e3d4588aa2 100644 --- a/lite/kernels/xpu/bridges/act_op.cc +++ b/lite/kernels/xpu/bridges/act_op.cc @@ -43,20 +43,21 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(out_type->layout() == DATALAYOUT(kNCHW)); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Act node if (op_type == "relu") { - graph->AddNode(out_name, graph->builder_.CreateRelu(*x_node)); + graph->Add(out_name, graph->builder_.CreateRelu(*x_node->data())); } else if (op_type == "tanh") { - graph->AddNode(out_name, graph->builder_.CreateUnaryOp("tanh", *x_node)); + graph->Add(out_name, + graph->builder_.CreateUnaryOp("tanh", *x_node->data())); } else if (op_type == "gelu") { - graph->AddNode(out_name, graph->builder_.CreateGelu(*x_node)); + graph->Add(out_name, graph->builder_.CreateGelu(*x_node->data())); } else { // TODO(hong19860320) supports more activation ops LOG(WARNING) << "[XPU] Unsupported activation type " << op_type; @@ -70,6 +71,6 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, relu, paddle::lite::subgraph::xpu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(XPU, tanh, 
paddle::lite::subgraph::xpu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(XPU, gelu, paddle::lite::subgraph::xpu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(relu, kXPU, paddle::lite::subgraph::xpu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(tanh, kXPU, paddle::lite::subgraph::xpu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(gelu, kXPU, paddle::lite::subgraph::xpu::ActConverter); diff --git a/lite/kernels/xpu/bridges/act_op_test.cc b/lite/kernels/xpu/bridges/act_op_test.cc deleted file mode 100644 index 1a3efab46e..0000000000 --- a/lite/kernels/xpu/bridges/act_op_test.cc +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/xpu/bridges/registry.h" -#include "lite/kernels/xpu/bridges/test_helper.h" -#include "lite/operators/activation_ops.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { -namespace bridges { - -void relu_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_data = x->data(); - auto out_data = out->mutable_data(); - DDim x_dims = x->dims(); - DDim out_dims = out->dims(); - CHECK_EQ(x_dims.production(), out_dims.production()); - for (int i = 0; i < out_dims.production(); i++) { - out_data[i] = std::max(0.f, x_data[i]); - } -} - -void test_relu(int bs, int ic, int ih, int iw) { - // prepare input&output variables - Scope scope; - std::string x_var_name("x"); - std::string out_var_name("out"); - std::string out_ref_var_name("out_ref"); - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("relu"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - - // create and convert op to XPU model, and run it on XPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - relu_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } -} - -TEST(NPUBridges, relu) { - for (auto bs : {1, 3}) { - for (auto ic : {3, 4}) { - for (auto ih : {2, 5}) { - for (auto iw : {5, 9}) { - VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih - << " iw: " << iw; - test_relu(bs, ic, ih, iw); - } - } - } - } -} - -} // namespace bridges -} // namespace xpu -} // namespace kernels -} // namespace lite -} // 
namespace paddle - -USE_LITE_OP(relu); -USE_XPU_BRIDGE(relu); diff --git a/lite/kernels/xpu/bridges/batch_norm_op.cc b/lite/kernels/xpu/bridges/batch_norm_op.cc index 980f241660..d84b9cc4f1 100644 --- a/lite/kernels/xpu/bridges/batch_norm_op.cc +++ b/lite/kernels/xpu/bridges/batch_norm_op.cc @@ -37,55 +37,61 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); + auto scale_name = op_info->Input("Scale").front(); auto scale_type = kernel->GetInputDeclType("Scale"); CHECK(scale_type->precision() == PRECISION(kFloat)); CHECK(scale_type->layout() == DATALAYOUT(kNCHW)); auto scale = scope->FindMutableTensor(scale_name); + auto bias_name = op_info->Input("Bias").front(); auto bias_type = kernel->GetInputDeclType("Bias"); CHECK(bias_type->precision() == PRECISION(kFloat)); CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); auto bias = scope->FindMutableTensor(bias_name); + auto mean_name = op_info->Input("Mean").front(); auto mean_type = kernel->GetInputDeclType("Mean"); CHECK(mean_type->precision() == PRECISION(kFloat)); CHECK(mean_type->layout() == DATALAYOUT(kNCHW)); auto mean = scope->FindMutableTensor(mean_name); + auto variance_name = op_info->Input("Variance").front(); auto variance_type = kernel->GetInputDeclType("Variance"); CHECK(variance_type->precision() == PRECISION(kFloat)); CHECK(variance_type->layout() == DATALAYOUT(kNCHW)); auto variance = scope->FindMutableTensor(variance_name); + auto y_name = op_info->Output("Y").front(); auto y_type = kernel->GetOutputDeclType("Y"); CHECK(y_type->precision() == PRECISION(kFloat)); CHECK(y_type->layout() == DATALAYOUT(kNCHW)); + auto epsilon = op_info->GetAttr("epsilon"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Scale, Bias, Mean, Variance node - auto scale_const_node = graph->AddNode(scale_name, *scale); - auto bias_const_node = graph->AddNode(bias_name, *bias); - auto mean_const_node = graph->AddNode(mean_name, *mean); - auto variance_const_node = graph->AddNode(variance_name, *variance); + auto scale_node = graph->Add(scale_name, *scale); + auto bias_node = graph->Add(bias_name, *bias); + auto mean_node = graph->Add(mean_name, *mean); + auto variance_node = graph->Add(variance_name, *variance); // Batch Norm node and extract the first field as the output node - auto batch_norm_node = graph->builder_.CreateBatchNorm(*x_node, - *scale_const_node, - *bias_const_node, - *mean_const_node, - *variance_const_node, + auto batch_norm_data = graph->builder_.CreateBatchNorm(*x_node->data(), + *scale_node->data(), + *bias_node->data(), + *mean_node->data(), + *variance_node->data(), 1, epsilon); - graph->AddNode(y_name, graph->builder_.GetField(batch_norm_node, 0)); + graph->Add(y_name, graph->builder_.GetField(batch_norm_data, 0)); return SUCCESS; } @@ -94,6 +100,6 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - batch_norm, +REGISTER_SUBGRAPH_BRIDGE(batch_norm, + kXPU, paddle::lite::subgraph::xpu::BatchNormConverter); diff --git a/lite/kernels/xpu/bridges/batch_norm_op_test.cc b/lite/kernels/xpu/bridges/batch_norm_op_test.cc deleted file mode 100644 index 
dec475530a..0000000000 --- a/lite/kernels/xpu/bridges/batch_norm_op_test.cc +++ /dev/null @@ -1,164 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/batch_norm_op.h" -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/xpu/bridges/registry.h" -#include "lite/kernels/xpu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { -namespace bridges { - -template -void batch_norm_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto y = scope->FindVar(op_info->Output("Y").front())->GetMutable(); - auto bias = - scope->FindVar(op_info->Input("Bias").front())->GetMutable(); - auto scale = - scope->FindVar(op_info->Input("Scale").front())->GetMutable(); - auto mean = - scope->FindVar(op_info->Input("Mean").front())->GetMutable(); - auto variance = - scope->FindVar(op_info->Input("Variance").front())->GetMutable(); - - auto x_data = x->data(); - auto y_data = y->mutable_data(); - auto scale_data = scale->mutable_data(); - auto bias_data = bias->mutable_data(); - auto mean_data = mean->mutable_data(); - auto variance_data = variance->mutable_data(); - DDim x_dims = x->dims(); - - float epsilon = op_info->GetAttr("epsilon"); - auto data_layout = op_info->GetAttr("data_layout"); - - bool global_stats = op_info->GetAttr("use_global_stats"); - if (global_stats) { - int64_t outer_size = 0; - int64_t channel_size = 0; - int64_t inner_size = 0; - if (data_layout == "NCHW") { - outer_size = x_dims[0]; - channel_size = x_dims[1]; - inner_size = x_dims.Slice(2, x_dims.size()).production(); - } else { - LOG(FATAL) << "Unknown storage order: " << data_layout; - } - auto x_ptr = x_data; - auto y_ptr = y_data; - for (int o = 0; o < outer_size; o++) { - for (int c = 0; c < channel_size; c++) { - for (int i = 0; i < inner_size; i++) { - dtype norm_x = - (*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon); - *y_ptr = norm_x * scale_data[c] + bias_data[c]; - x_ptr++; - y_ptr++; - } - } - } - } -} - -void test_batch_norm(int bs, int ic, int ih, int iw, float epsilon) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - std::string scale_var_name = "scale"; - std::string bias_var_name = "bias"; - std::string mean_var_name = "mean"; - std::string variance_var_name = "variance"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* scale = scope.Var(scale_var_name)->GetMutable(); - auto* bias = scope.Var(bias_var_name)->GetMutable(); - auto* mean = scope.Var(mean_var_name)->GetMutable(); - auto* variance = scope.Var(variance_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - 
scale->Resize({ic}); - bias->Resize({ic}); - mean->Resize({ic}); - variance->Resize({ic}); - - // initialize input&output data - FillTensor(x); - FillTensor(scale); - FillTensor(bias); - FillTensor(mean); - // variance > 0 - FillTensor(variance, 1.f, 5.f); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("batch_norm"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetInput("Scale", {scale_var_name}); - opdesc.SetInput("Bias", {bias_var_name}); - opdesc.SetInput("Mean", {mean_var_name}); - opdesc.SetInput("Variance", {variance_var_name}); - opdesc.SetOutput("Y", {out_var_name}); - opdesc.SetAttr("is_test", 1); - opdesc.SetAttr("use_global_stats", true); - opdesc.SetAttr("epsilon", epsilon); - opdesc.SetAttr("momentum", 0.9f); - opdesc.SetAttr("data_layout", std::string("NCHW")); - - // create and convert op to XPU model, then run it on XPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - batch_norm_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } -} - -TEST(NPUBridges, batch_norm) { - for (auto bs : {1, 3}) { - for (auto ic : {2, 3}) { - for (auto ih : {4}) { - for (auto iw : {5}) { - for (auto epsilon : {1e-5f}) { - test_batch_norm(bs, ic, ih, iw, epsilon); - } - } - } - } - } -} - -} // namespace bridges -} // namespace xpu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(batch_norm); -USE_XPU_BRIDGE(batch_norm); diff --git a/lite/kernels/xpu/bridges/conv_op.cc b/lite/kernels/xpu/bridges/conv_op.cc index 5e9e5448a1..fe9c598847 100644 --- a/lite/kernels/xpu/bridges/conv_op.cc +++ b/lite/kernels/xpu/bridges/conv_op.cc @@ -61,11 +61,11 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(dilations.size(), 2L); // Input node - std::shared_ptr input_node = nullptr; - if (graph->HasNode(input_name)) { - input_node = graph->GetNode(input_name); + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); } else { - input_node = graph->AddNode(input_name, input_dims); + input_node = graph->Add(input_name, *input); } if (paddings.size() == 2L) { @@ -99,7 +99,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { DDim output_dims(output_shape); // Filter node - auto filter_const_node = graph->AddNode(filter_name, *filter); + auto filter_node = graph->Add(filter_name, *filter); // Conv node auto conv_attrs = xtcl::make_node(); @@ -114,9 +114,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { conv_attrs->out_layout = ""; // conv_attrs->out_dtype = ""; auto conv_node = - graph->AddNode(output_name, - graph->builder_.CreateConv2D( - *input_node, *filter_const_node, conv_attrs)); + graph->Add(output_name, + graph->builder_.CreateConv2D( + *input_node->data(), *filter_node->data(), conv_attrs)); // Add bias node if exists bias // supports the bias nodes with the following dimensions @@ -149,30 +149,27 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { << " isn't supported in conv2d Op when output dimension is " << output_dims; } - std::shared_ptr bias_node = nullptr; - if (graph->HasNode(bias_name)) { - // Bias node from input node - bias_node = graph->GetNode(bias_name); + std::shared_ptr bias_node = nullptr; + if 
(graph->Has(bias_name)) { + bias_node = graph->Get(bias_name); } else { - // Bias node with const data - bias_node = graph->AddNode(bias_name, *bias, bias_shape); + bias_node = graph->Add(bias_name, *bias, bias_shape); } - std::shared_ptr add_node = nullptr; if (is_channel_bias) { - add_node = graph->AddNode( - output_name, - graph->builder_.CreateBiasAdd(*conv_node, 1, *bias_node)); + conv_node = graph->Add(output_name, + graph->builder_.CreateBiasAdd( + *conv_node->data(), 1, *bias_node->data())); } else { - add_node = graph->AddNode( - output_name, - graph->builder_.CreateBinaryOp("add", *conv_node, *bias_node)); + conv_node = + graph->Add(output_name, + graph->builder_.CreateBinaryOp( + "add", *conv_node->data(), *bias_node->data())); } - conv_node = add_node; } if (fuse_relu) { // Append relu node if fuse_relu is true - graph->AddNode(output_name, graph->builder_.CreateRelu(*conv_node)); + graph->Add(output_name, graph->builder_.CreateRelu(*conv_node->data())); } return REBUILD_WHEN_SHAPE_CHANGED; } @@ -182,9 +179,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - conv2d, +REGISTER_SUBGRAPH_BRIDGE(conv2d, + kXPU, paddle::lite::subgraph::xpu::ConvConverter); -REGISTER_SUBGRAPH_BRIDGE(XPU, - depthwise_conv2d, +REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d, + kXPU, paddle::lite::subgraph::xpu::ConvConverter); diff --git a/lite/kernels/xpu/bridges/dropout_op.cc b/lite/kernels/xpu/bridges/dropout_op.cc old mode 100755 new mode 100644 index ae81facd53..df869e17ff --- a/lite/kernels/xpu/bridges/dropout_op.cc +++ b/lite/kernels/xpu/bridges/dropout_op.cc @@ -46,21 +46,21 @@ int DropoutConverter(void* ctx, OpLite* op, KernelBase* kernel) { op_info->GetAttr("dropout_implementation"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Dropout node if (dropout_implementation == "downgrade_in_infer") { - graph->AddNode( - out_name, - graph->builder_.CreateScale(*x_node, 1.f - dropout_prob, 0.0f, false)); + graph->Add(out_name, + graph->builder_.CreateScale( + *x_node->data(), 1.f - dropout_prob, 0.0f, false)); } else if (dropout_implementation == "upscale_in_train") { - graph->AddNode(out_name, - graph->builder_.CreateScale(*x_node, 1.0f, 0.0f, false)); + graph->Add(out_name, + graph->builder_.CreateScale(*x_node->data(), 1.0f, 0.0f, false)); } else { LOG(WARNING) << "[XPU] Unsupported dropout_implementation == " << dropout_implementation << " for dropout"; @@ -74,6 +74,6 @@ int DropoutConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - dropout, +REGISTER_SUBGRAPH_BRIDGE(dropout, + kXPU, paddle::lite::subgraph::xpu::DropoutConverter); diff --git a/lite/kernels/xpu/bridges/elementwise_ops.cc b/lite/kernels/xpu/bridges/elementwise_ops.cc index 49a42c55d6..7fcae312b9 100644 --- a/lite/kernels/xpu/bridges/elementwise_ops.cc +++ b/lite/kernels/xpu/bridges/elementwise_ops.cc @@ -50,29 +50,31 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto axis = op_info->GetAttr("axis"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = 
graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Y node - std::shared_ptr y_node = nullptr; - if (graph->HasNode(y_name)) { - y_node = graph->GetNode(y_name); + std::shared_ptr y_node = nullptr; + if (graph->Has(y_name)) { + y_node = graph->Get(y_name); } else { - y_node = graph->AddNode(y_name, y_dims); + y_node = graph->Add(y_name, *y); } // Elementwise node - std::shared_ptr elementwise_node = nullptr; + std::shared_ptr elt_node = nullptr; if (y_dims.size() == 1) { - elementwise_node = graph->AddNode( - out_name, graph->builder_.CreateBiasAdd(*x_node, axis, *y_node)); + elt_node = graph->Add( + out_name, + graph->builder_.CreateBiasAdd(*x_node->data(), axis, *y_node->data())); } else if (x_dims.size() == y_dims.size()) { - elementwise_node = graph->AddNode( - out_name, graph->builder_.CreateBinaryOp("add", *x_node, *y_node)); + elt_node = graph->Add(out_name, + graph->builder_.CreateBinaryOp( + "add", *x_node->data(), *y_node->data())); } else { LOG(WARNING) << "[XPU] elementwise_add only support y of one dimension, or x " @@ -88,6 +90,6 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - elementwise_add, +REGISTER_SUBGRAPH_BRIDGE(elementwise_add, + kXPU, paddle::lite::subgraph::xpu::ElementwiseConverter); diff --git a/lite/kernels/xpu/bridges/gather_op.cc b/lite/kernels/xpu/bridges/gather_op.cc old mode 100755 new mode 100644 index 06d1c67b0d..845bbb8d98 --- a/lite/kernels/xpu/bridges/gather_op.cc +++ b/lite/kernels/xpu/bridges/gather_op.cc @@ -54,38 +54,42 @@ int GatherConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto out_dims = out->dims(); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Index node - std::shared_ptr index_node = nullptr; - if (graph->HasNode(index_name)) { - index_node = graph->GetNode(index_name); + std::shared_ptr index_node = nullptr; + if (graph->Has(index_name)) { + index_node = graph->Get(index_name); } else { - index_node = graph->AddNode( - index_name, index_dims, index_type->precision(), index_type->layout()); + index_node = graph->Add(index_name, *index); } // Flatten index node if (index_dims.size() != 1) { index_node = - graph->AddNode(index_name + "/reshape", - graph->builder_.CreateReshape(*index_node, {-1}), - index_type->precision(), - index_type->layout()); + graph->Add(index_name + "/reshape", + graph->builder_.CreateReshape(*index_node->data(), {-1}), + index_node->precision(), + index_node->layout()); } // Reshape the gather node with the inferred shape as the output node - auto gather_node = graph->AddNode( - out_name, - graph->builder_.CreateGather(*x_node, *index_node, /* axis= */ 0)); + auto gather_node = + graph->Add(out_name, + graph->builder_.CreateGather( + *x_node->data(), *index_node->data(), /* axis= */ 0), + x_node->precision(), + x_node->layout()); if (out_dims.size() != 2) { - graph->AddNode(out_name, - graph->builder_.CreateReshape( - *gather_node, CvtShape(out_dims))); + graph->Add(out_name, + graph->builder_.CreateReshape(*gather_node->data(), + CvtShape(out_dims)), + gather_node->precision(), + gather_node->layout()); } return SUCCESS; } @@ -95,6 +99,6 @@ int GatherConverter(void* ctx, OpLite* op, KernelBase* kernel) 
{ } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - gather, +REGISTER_SUBGRAPH_BRIDGE(gather, + kXPU, paddle::lite::subgraph::xpu::GatherConverter); diff --git a/lite/kernels/xpu/bridges/graph.cc b/lite/kernels/xpu/bridges/graph.cc old mode 100755 new mode 100644 index 1691e4b0c5..43aaad3402 --- a/lite/kernels/xpu/bridges/graph.cc +++ b/lite/kernels/xpu/bridges/graph.cc @@ -21,71 +21,70 @@ namespace lite { namespace subgraph { namespace xpu { -std::shared_ptr Graph::AddNode(const std::string& name, - const xtcl::xExpr& layer, - PrecisionType precision, - DataLayoutType layout) { - auto unique_name = [&](const std::string& key) { - int idx = 1; - auto it = counts_.find(key); - if (it == counts_.end()) { - counts_.insert(std::make_pair(key, idx)); - } else { - idx = ++(it->second); - } - return key + "_" + std::to_string(idx); - }; +int Graph::Add(const std::string& name, std::shared_ptr node) { auto it = nodes_.find(name); if (it != nodes_.end()) { - // Only variable can rebind the name - CHECK(!it->second.second.persistable()) << "[XPU] Node " << name - << " redefined."; - // Generate a new unique name as the key to bind the origin node if the - // origin node isn't a const node: new_name->node - nodes_.insert(std::make_pair(unique_name(name + "_var"), it->second)); - nodes_.erase(it); + // Only variable node can be shared with the same name + if (!node->is_var() || !it->second.back()->is_var()) { + LOG(FATAL) << "[XPU] Const or data node " << name << " is redefined."; + return -1; + } + } else { + auto ret = nodes_.insert( + std::make_pair(name, std::vector>())); + CHECK(ret.second); + it = ret.first; } - // Create a new node and bind with the name: name->new_node - auto node = std::make_shared(layer); - nodes_.insert(std::make_pair( - name, std::make_pair(node, Type(precision, layout, false)))); - builder_.SetLayer(unique_name(name + "_op")); - return node; + it->second.push_back(node); + return it->second.size(); } -// Const node -std::shared_ptr Graph::AddNode(const std::string& name, - const Tensor& tensor, - PrecisionType precision, - DataLayoutType layout) { - return AddNode(name, tensor, tensor.dims().Vectorize(), precision, layout); +// Variable node +std::shared_ptr Graph::Add(const std::string& name, + const xtcl::xExpr& layer, + PrecisionType precision, + DataLayoutType layout) { + auto node = std::make_shared(precision, layout, Node::Role::kVar); + auto idx = Add(name, node); + CHECK_GE(idx, 1); + node->set_data(std::make_shared(layer)); + // Generate a unique name for the current XTCL layer + builder_.SetLayer(name + "__" + std::to_string(idx)); + return node; } -std::shared_ptr Graph::AddNode(const std::string& name, - const Tensor& tensor, - std::vector shape, - PrecisionType precision, - DataLayoutType layout) { - CHECK(!HasNode(name)) << "[NPU] Node " << name << " redefined."; - auto node = std::make_shared(builder_.CreateTensor( - name, CvtShape(shape), CvtPrecisionType(precision))); - nodes_.insert(std::make_pair( - name, std::make_pair(node, Type(precision, layout, true)))); - params_.emplace( - std::make_pair(name, *CvtTensor(tensor, shape, precision, layout))); +// Const or data node +std::shared_ptr Graph::Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + DataLayoutType layout) { + std::shared_ptr node = nullptr; + PrecisionType precision = tensor.precision(); + if (tensor.persistable()) { + // Const node + node = std::make_shared(precision, layout, Node::Role::kConst); + auto idx = Add(name, node); + CHECK_EQ(idx, 
1); + node->set_data(std::make_shared(builder_.CreateTensor( + name, CvtShape(shape), CvtPrecisionType(precision)))); + params_.emplace(std::make_pair(name, *CvtTensor(tensor, shape, layout))); + } else { + // Data node + node = Add(name, shape, precision, layout); + } return node; } // Data node -std::shared_ptr Graph::AddNode(const std::string& name, - std::vector shape, - PrecisionType precision, - DataLayoutType layout) { - CHECK(!HasNode(name)) << "[NPU] Node " << name << " redefined."; - auto node = std::make_shared(builder_.CreateTensor( - name, CvtShape(shape), CvtPrecisionType(precision))); - nodes_.insert(std::make_pair( - name, std::make_pair(node, Type(precision, layout, false)))); +std::shared_ptr Graph::Add(const std::string& name, + std::vector shape, + PrecisionType precision, + DataLayoutType layout) { + auto node = std::make_shared(precision, layout, Node::Role::kData); + auto idx = Add(name, node); + CHECK_EQ(idx, 1); + node->set_data(std::make_shared(builder_.CreateTensor( + name, CvtShape(shape), CvtPrecisionType(precision)))); return node; } diff --git a/lite/kernels/xpu/bridges/graph.h b/lite/kernels/xpu/bridges/graph.h old mode 100755 new mode 100644 index 3107346851..dafd8d8532 --- a/lite/kernels/xpu/bridges/graph.h +++ b/lite/kernels/xpu/bridges/graph.h @@ -28,78 +28,78 @@ namespace lite { namespace subgraph { namespace xpu { -// Type of graph nodes -class Type { +// Graph and node is defined to collect all of converted XTCL IR nodes +class Node { public: - Type(PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW), - bool persistable = false) - : precision_(precision), layout_(layout), persistable_(persistable) {} - + enum class Role { + kVar = 0, + kConst, + kData, + }; + + Node(std::shared_ptr data, + PrecisionType precision, + DataLayoutType layout, + Role role) + : data_(data), precision_(precision), layout_(layout), role_(role) {} + Node(PrecisionType precision, DataLayoutType layout, Role role) + : precision_(precision), layout_(layout), role_(role) {} + + void set_data(std::shared_ptr data) { data_ = data; } void set_precision(PrecisionType precision) { precision_ = precision; } void set_layout(DataLayoutType layout) { layout_ = layout; } - void set_persistable(bool persistable) { persistable_ = persistable; } + void set_role(Role role) { role_ = role; } + std::shared_ptr data() { return data_; } PrecisionType precision() const { return precision_; } DataLayoutType layout() const { return layout_; } - bool persistable() const { return persistable_; } + Role role() const { return role_; } + bool is_var() const { return role_ == Role::kVar; } + bool is_const() const { return role_ == Role::kConst; } + bool is_data() const { return role_ == Role::kData; } private: + std::shared_ptr data_{nullptr}; PrecisionType precision_{PRECISION(kFloat)}; DataLayoutType layout_{DATALAYOUT(kNCHW)}; - bool persistable_{false}; + Role role_{Role::kVar}; }; -// Graph to collect all of converted XPU IR nodes class Graph { public: - // Layer node - std::shared_ptr AddNode( - const std::string& name, - const xtcl::xExpr& layer, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)); + int Add(const std::string& name, std::shared_ptr node); + + // Variable node + std::shared_ptr Add(const std::string& name, + const xtcl::xExpr& layer, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)); + + // Const or data node + std::shared_ptr Add(const std::string& name, + const 
Tensor& tensor, + std::vector shape, + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, tensor, tensor.dims().Vectorize(), layout); + } - // Const node - std::shared_ptr AddNode( - const std::string& name, - const Tensor& tensor, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)); - - std::shared_ptr AddNode( - const std::string& name, - const Tensor& tensor, - std::vector shape, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)); - - std::shared_ptr AddNode( - const std::string& name, - const Tensor& tensor, - DDim dims, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, tensor, dims.Vectorize(), precision, layout); + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, tensor, dims.Vectorize(), layout); } + // Const node template - std::shared_ptr AddNode( - const std::string& name, - const std::vector& data, - std::vector shape = {}, - DataLayoutType layout = DATALAYOUT(kNCHW)) { - const std::type_info& info = typeid(T); - PrecisionType precision = PRECISION(kFloat); - if (info == typeid(float)) { - precision = PRECISION(kFloat); - } else if (info == typeid(int8_t)) { - precision = PRECISION(kFloat); - } else if (info == typeid(int32_t)) { - precision = PRECISION(kInt32); - } else { - LOG(FATAL) << "[XPU] Unknow data type " << info.name(); - } + std::shared_ptr Add(const std::string& name, + const std::vector& data, + std::vector shape = {}, + DataLayoutType layout = DATALAYOUT(kNCHW)) { if (shape.empty()) { shape = {static_cast(data.size())}; } else { @@ -111,70 +111,61 @@ class Graph { } Tensor tensor; tensor.Resize(shape); + tensor.set_persistable(true); std::memcpy(reinterpret_cast(tensor.mutable_data()), reinterpret_cast(data.data()), data.size() * sizeof(T)); - return AddNode(name, tensor, precision, layout); + return Add(name, tensor, layout); } template - std::shared_ptr AddNode( - const std::string& name, - const std::vector& data, - DDim dims, - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, data, dims.Vectorize(), layout); + std::shared_ptr Add(const std::string& name, + const std::vector& data, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, data, dims.Vectorize(), layout); } template - std::shared_ptr AddNode( - const std::string& name, - T value, - std::vector shape = {1}, - DataLayoutType layout = DATALAYOUT(kNCHW)) { + std::shared_ptr Add(const std::string& name, + T value, + std::vector shape = {1}, + DataLayoutType layout = DATALAYOUT(kNCHW)) { int64_t size = 1; for (auto i : shape) { size *= i; } std::vector data(size, value); - return AddNode(name, data, shape, layout); + return Add(name, data, shape, layout); } template - std::shared_ptr AddNode( - const std::string& name, - T value, - DDim dims, - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, value, dims.Vectorize(), layout); + std::shared_ptr Add(const std::string& name, + T value, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, value, dims.Vectorize(), layout); } // Data node - std::shared_ptr AddNode( - const std::string& name, - std::vector shape, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)); 
- - std::shared_ptr AddNode( - const std::string& name, - DDim dims, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, dims.Vectorize(), precision, layout); - } - - std::shared_ptr GetNode(const std::string& name) { - CHECK(HasNode(name)) << "[XPU] Node " << name << " not found."; - return nodes_.at(name).first; + std::shared_ptr Add(const std::string& name, + std::vector shape, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr Add(const std::string& name, + DDim dims, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, dims.Vectorize(), precision, layout); } - const Type& GetType(const std::string& name) { - CHECK(HasNode(name)) << "[XPU] Node " << name << " not found."; - return nodes_.at(name).second; + std::shared_ptr Get(const std::string& name) { + CHECK(Has(name)) << "[XPU] Node " << name << " not found."; + return nodes_.at(name).back(); } - bool HasNode(const std::string& name) { + bool Has(const std::string& name) { return nodes_.find(name) != nodes_.end(); } @@ -184,9 +175,7 @@ class Graph { xtcl::network::xTensorCompiler::ParamNDArrayMap params_; private: - std::unordered_map, Type>> - nodes_; - std::unordered_map counts_; + std::unordered_map>> nodes_; }; } // namespace xpu diff --git a/lite/kernels/xpu/bridges/layer_norm_op.cc b/lite/kernels/xpu/bridges/layer_norm_op.cc old mode 100755 new mode 100644 index 601dd42770..3ad190b73f --- a/lite/kernels/xpu/bridges/layer_norm_op.cc +++ b/lite/kernels/xpu/bridges/layer_norm_op.cc @@ -51,23 +51,23 @@ int LayerNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto x_inner_size = x_dims.Slice(axis, x_rank).production(); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } if (reshape) { auto reshaped_x_dims = x_dims.Slice(0, axis).Vectorize(); reshaped_x_dims.push_back(x_inner_size); - x_node = - graph->AddNode(x_name + "/reshape", - graph->builder_.CreateReshape( - *x_node, CvtShape(reshaped_x_dims))); + x_node = graph->Add( + x_name + "/reshape", + graph->builder_.CreateReshape( + *x_node->data(), CvtShape(reshaped_x_dims))); } // Scale node - std::shared_ptr scale_const_node = nullptr; + std::shared_ptr scale_node = nullptr; if (HasInputArg(op_info, scope, "Scale")) { auto scale_name = op_info->Input("Scale").front(); auto scale_type = kernel->GetInputDeclType("Scale"); @@ -77,14 +77,13 @@ int LayerNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto scale_dims = scale->dims(); CHECK_EQ(scale_dims.size(), 1); CHECK_EQ(scale_dims.production(), x_inner_size); - scale_const_node = graph->AddNode(scale_name, *scale); + scale_node = graph->Add(scale_name, *scale); } else { - scale_const_node = - graph->AddNode(y_name + "/scale_one", 1.0f, {x_inner_size}); + scale_node = graph->Add(y_name + "/scale_one", 1.0f, {x_inner_size}); } // Bias node - std::shared_ptr bias_const_node = nullptr; + std::shared_ptr bias_node = nullptr; if (HasInputArg(op_info, scope, "Bias")) { auto bias_name = op_info->Input("Bias").front(); auto bias_type = kernel->GetInputDeclType("Bias"); @@ -94,26 +93,25 @@ int LayerNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto bias_dims = bias->dims(); 
CHECK_EQ(bias_dims.size(), 1); CHECK_EQ(bias_dims.production(), x_inner_size); - bias_const_node = graph->AddNode(bias_name, *bias); + bias_node = graph->Add(bias_name, *bias); } else { - bias_const_node = - graph->AddNode(y_name + "/bias_zero", 0.0f, {x_inner_size}); + bias_node = graph->Add(y_name + "/bias_zero", 0.0f, {x_inner_size}); } // Layer Norm node auto layer_norm_node = - graph->AddNode(y_name, - graph->builder_.CreateLayerNorm(*x_node, - *scale_const_node, - *bias_const_node, - axis, - epsilon, - true, - true)); + graph->Add(y_name, + graph->builder_.CreateLayerNorm(*x_node->data(), + *scale_node->data(), + *bias_node->data(), + axis, + epsilon, + true, + true)); if (reshape) { - graph->AddNode(y_name, - graph->builder_.CreateReshape( - *layer_norm_node, CvtShape(y_dims))); + graph->Add(y_name, + graph->builder_.CreateReshape(*layer_norm_node->data(), + CvtShape(y_dims))); } return REBUILD_WHEN_SHAPE_CHANGED; } @@ -123,6 +121,6 @@ int LayerNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - layer_norm, +REGISTER_SUBGRAPH_BRIDGE(layer_norm, + kXPU, paddle::lite::subgraph::xpu::LayerNormConverter); diff --git a/lite/kernels/xpu/bridges/lookup_table_op.cc b/lite/kernels/xpu/bridges/lookup_table_op.cc old mode 100755 new mode 100644 index a03e0c2d24..eecf50b5bd --- a/lite/kernels/xpu/bridges/lookup_table_op.cc +++ b/lite/kernels/xpu/bridges/lookup_table_op.cc @@ -57,30 +57,37 @@ int LookupTableConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // Ids node - std::shared_ptr ids_node = nullptr; - if (graph->HasNode(ids_name)) { - ids_node = graph->GetNode(ids_name); + std::shared_ptr ids_node = nullptr; + if (graph->Has(ids_name)) { + ids_node = graph->Get(ids_name); } else { - ids_node = graph->AddNode( - ids_name, ids_dims, ids_type->precision(), ids_type->layout()); + ids_node = graph->Add(ids_name, *ids); } // Flatten Ids node if (ids_dims.size() != 1) { - ids_node = graph->AddNode(ids_name + "/reshape", - graph->builder_.CreateReshape(*ids_node, {-1}), - ids_type->precision(), - ids_type->layout()); + ids_node = + graph->Add(ids_name + "/reshape", + graph->builder_.CreateReshape(*ids_node->data(), {-1}), + ids_node->precision(), + ids_node->layout()); } - auto w_const_node = graph->AddNode(w_name, *w); + + // W node + auto w_node = graph->Add(w_name, *w); // Reshape the gather node with the inferred shape as the output node - auto gather_node = graph->AddNode( - out_name, - graph->builder_.CreateGather(*w_const_node, *ids_node, /* axis= */ 0)); + auto gather_node = + graph->Add(out_name, + graph->builder_.CreateGather( + *w_node->data(), *ids_node->data(), /* axis= */ 0), + w_node->precision(), + w_node->layout()); if (out_dims.size() != 2) { - graph->AddNode(out_name, - graph->builder_.CreateReshape( - *gather_node, CvtShape(out_dims))); + graph->Add(out_name, + graph->builder_.CreateReshape(*gather_node->data(), + CvtShape(out_dims)), + gather_node->precision(), + gather_node->layout()); } return SUCCESS; } @@ -90,6 +97,6 @@ int LookupTableConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - lookup_table, +REGISTER_SUBGRAPH_BRIDGE(lookup_table, + kXPU, paddle::lite::subgraph::xpu::LookupTableConverter); diff --git a/lite/kernels/xpu/bridges/matmul_op.cc b/lite/kernels/xpu/bridges/matmul_op.cc old mode 100755 new mode 100644 index 330b336840..c17ba8423c --- a/lite/kernels/xpu/bridges/matmul_op.cc +++ 
b/lite/kernels/xpu/bridges/matmul_op.cc @@ -57,19 +57,19 @@ int MatmulConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto alpha = op_info->GetAttr("alpha"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Y node - std::shared_ptr y_node = nullptr; - if (graph->HasNode(y_name)) { - y_node = graph->GetNode(y_name); + std::shared_ptr y_node = nullptr; + if (graph->Has(y_name)) { + y_node = graph->Get(y_name); } else { - y_node = graph->AddNode(y_name, y_dims); + y_node = graph->Add(y_name, *y); } // Matmul node @@ -80,52 +80,55 @@ int MatmulConverter(void* ctx, OpLite* op, KernelBase* kernel) { if (x_dims.size() != 3) { auto m = static_cast(x_dims[x_dims.size() - 2]); auto k = static_cast(x_dims[x_dims.size() - 1]); - x_node = - graph->AddNode(x_name + "/reshape", - graph->builder_.CreateReshape(*x_node, {-1, m, k})); + x_node = graph->Add( + x_name + "/reshape", + graph->builder_.CreateReshape(*x_node->data(), {-1, m, k})); if (transpose_x) { - x_node = - graph->AddNode(x_name + "/reshape/transpose", - graph->builder_.CreateTranspose(*x_node, {0, 2, 1})); + x_node = graph->Add( + x_name + "/reshape/transpose", + graph->builder_.CreateTranspose(*x_node->data(), {0, 2, 1})); } } // Reshape and transposed Y node if (y_dims.size() != 3) { auto k = static_cast(y_dims[y_dims.size() - 2]); auto n = static_cast(y_dims[y_dims.size() - 1]); - y_node = - graph->AddNode(y_name + "/reshape", - graph->builder_.CreateReshape(*y_node, {-1, k, n})); + y_node = graph->Add( + y_name + "/reshape", + graph->builder_.CreateReshape(*y_node->data(), {-1, k, n})); if (!transpose_y) { - y_node = - graph->AddNode(y_name + "/reshape/transpose", - graph->builder_.CreateTranspose(*y_node, {0, 2, 1})); + y_node = graph->Add( + y_name + "/reshape/transpose", + graph->builder_.CreateTranspose(*y_node->data(), {0, 2, 1})); } } // Matmul node - auto matmul_node = graph->AddNode( - out_name, graph->builder_.CreateBatchMatmul(*x_node, *y_node)); + auto matmul_node = graph->Add( + out_name, + graph->builder_.CreateBatchMatmul(*x_node->data(), *y_node->data())); if (fabs(alpha - 1) > 1e-6f) { - matmul_node = graph->AddNode( - out_name, graph->builder_.CreateScale(*matmul_node, alpha)); + matmul_node = graph->Add( + out_name, graph->builder_.CreateScale(*matmul_node->data(), alpha)); } if (out_dims.size() != 3) { - graph->AddNode(out_name, - graph->builder_.CreateReshape( - *matmul_node, CvtShape(out_dims))); + graph->Add(out_name, + graph->builder_.CreateReshape( + *matmul_node->data(), CvtShape(out_dims))); } } else if (x_dims.size() == 2 && y_dims.size() == 2) { // x: [M, K], y: [K, N], out: [M, N] if (transpose_x) { - x_node = graph->AddNode(x_name + "/transpose", - graph->builder_.CreateTranspose(*x_node, {1, 0})); + x_node = + graph->Add(x_name + "/transpose", + graph->builder_.CreateTranspose(*x_node->data(), {1, 0})); } - auto matmul_node = graph->AddNode( - out_name, - graph->builder_.CreateMatmul2D(*x_node, *y_node, transpose_y)); + auto matmul_node = + graph->Add(out_name, + graph->builder_.CreateMatmul2D( + *x_node->data(), *y_node->data(), transpose_y)); if (fabs(alpha - 1) > 1e-6f) { - matmul_node = graph->AddNode( - out_name, graph->builder_.CreateScale(*matmul_node, alpha)); + matmul_node = graph->Add( + out_name, 
graph->builder_.CreateScale(*matmul_node->data(), alpha)); } } else if (x_dims.size() == 1 && y_dims.size() == 1) { // x: [K], y: [K], out: [1] @@ -141,6 +144,6 @@ int MatmulConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - matmul, +REGISTER_SUBGRAPH_BRIDGE(matmul, + kXPU, paddle::lite::subgraph::xpu::MatmulConverter); diff --git a/lite/kernels/xpu/bridges/mul_op.cc b/lite/kernels/xpu/bridges/mul_op.cc index 4078055745..e12f767d13 100644 --- a/lite/kernels/xpu/bridges/mul_op.cc +++ b/lite/kernels/xpu/bridges/mul_op.cc @@ -56,49 +56,50 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(x_matrix_dims[1], y_matrix_dims[0]); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Flatten X node if (x_dims.size() != 2) { - x_node = - graph->AddNode(x_name + "/reshape", - graph->builder_.CreateReshape( - *x_node, {-1, static_cast(x_matrix_dims[1])})); + x_node = graph->Add( + x_name + "/reshape", + graph->builder_.CreateReshape( + *x_node->data(), {-1, static_cast(x_matrix_dims[1])})); } // Y node - std::shared_ptr y_node = nullptr; - if (graph->HasNode(y_name)) { - y_node = graph->GetNode(y_name); + std::shared_ptr y_node = nullptr; + if (graph->Has(y_name)) { + y_node = graph->Get(y_name); } else { - y_node = graph->AddNode(y_name, y_dims); + y_node = graph->Add(y_name, *y); } // Flatten Y node if (y_dims.size() != 2) { - y_node = - graph->AddNode(y_name + "/reshape", - graph->builder_.CreateReshape( - *y_node, {static_cast(y_matrix_dims[0]), -1})); + y_node = graph->Add( + y_name + "/reshape", + graph->builder_.CreateReshape( + *y_node->data(), {static_cast(y_matrix_dims[0]), -1})); } // Reshape the matmul node with the inferred shape as the output node - auto matmul_node = graph->AddNode( - out_name, graph->builder_.CreateMatmul2D(*x_node, *y_node, false)); + auto matmul_node = graph->Add( + out_name, + graph->builder_.CreateMatmul2D(*x_node->data(), *y_node->data(), false)); if (out_dims.size() != 2) { - graph->AddNode(out_name, - graph->builder_.CreateReshape( - *matmul_node, CvtShape(out_dims))); + graph->Add(out_name, + graph->builder_.CreateReshape( + *matmul_node->data(), CvtShape(out_dims))); } return REBUILD_WHEN_SHAPE_CHANGED; -} +} // namespace xpu } // namespace xpu } // namespace subgraph } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, mul, paddle::lite::subgraph::xpu::MulConverter); +REGISTER_SUBGRAPH_BRIDGE(mul, kXPU, paddle::lite::subgraph::xpu::MulConverter); diff --git a/lite/kernels/xpu/bridges/paddle_use_bridges.h b/lite/kernels/xpu/bridges/paddle_use_bridges.h old mode 100755 new mode 100644 index 588fcdd6e4..bed88034ae --- a/lite/kernels/xpu/bridges/paddle_use_bridges.h +++ b/lite/kernels/xpu/bridges/paddle_use_bridges.h @@ -14,25 +14,25 @@ #pragma once -USE_SUBGRAPH_BRIDGE(XPU, relu); -USE_SUBGRAPH_BRIDGE(XPU, tanh); -USE_SUBGRAPH_BRIDGE(XPU, conv2d); -USE_SUBGRAPH_BRIDGE(XPU, depthwise_conv2d); -USE_SUBGRAPH_BRIDGE(XPU, elementwise_add); -USE_SUBGRAPH_BRIDGE(XPU, pool2d); -USE_SUBGRAPH_BRIDGE(XPU, softmax); -USE_SUBGRAPH_BRIDGE(XPU, mul); -USE_SUBGRAPH_BRIDGE(XPU, batch_norm); -USE_SUBGRAPH_BRIDGE(XPU, stack); -USE_SUBGRAPH_BRIDGE(XPU, gather); -USE_SUBGRAPH_BRIDGE(XPU, scale); -USE_SUBGRAPH_BRIDGE(XPU, 
lookup_table); -USE_SUBGRAPH_BRIDGE(XPU, slice); -USE_SUBGRAPH_BRIDGE(XPU, transpose); -USE_SUBGRAPH_BRIDGE(XPU, transpose2); -USE_SUBGRAPH_BRIDGE(XPU, reshape); -USE_SUBGRAPH_BRIDGE(XPU, reshape2); -USE_SUBGRAPH_BRIDGE(XPU, layer_norm); -USE_SUBGRAPH_BRIDGE(XPU, gelu); -USE_SUBGRAPH_BRIDGE(XPU, dropout); -USE_SUBGRAPH_BRIDGE(XPU, matmul); +USE_SUBGRAPH_BRIDGE(relu, kXPU); +USE_SUBGRAPH_BRIDGE(tanh, kXPU); +USE_SUBGRAPH_BRIDGE(conv2d, kXPU); +USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kXPU); +USE_SUBGRAPH_BRIDGE(elementwise_add, kXPU); +USE_SUBGRAPH_BRIDGE(pool2d, kXPU); +USE_SUBGRAPH_BRIDGE(softmax, kXPU); +USE_SUBGRAPH_BRIDGE(mul, kXPU); +USE_SUBGRAPH_BRIDGE(batch_norm, kXPU); +USE_SUBGRAPH_BRIDGE(stack, kXPU); +USE_SUBGRAPH_BRIDGE(gather, kXPU); +USE_SUBGRAPH_BRIDGE(scale, kXPU); +USE_SUBGRAPH_BRIDGE(lookup_table, kXPU); +USE_SUBGRAPH_BRIDGE(slice, kXPU); +USE_SUBGRAPH_BRIDGE(transpose, kXPU); +USE_SUBGRAPH_BRIDGE(transpose2, kXPU); +USE_SUBGRAPH_BRIDGE(reshape, kXPU); +USE_SUBGRAPH_BRIDGE(reshape2, kXPU); +USE_SUBGRAPH_BRIDGE(layer_norm, kXPU); +USE_SUBGRAPH_BRIDGE(gelu, kXPU); +USE_SUBGRAPH_BRIDGE(dropout, kXPU); +USE_SUBGRAPH_BRIDGE(matmul, kXPU); diff --git a/lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h b/lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h deleted file mode 100644 index 3c76e0e8b5..0000000000 --- a/lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "lite/kernels/xpu/bridges/registry.h" - -USE_XPU_BRIDGE(relu); -USE_XPU_BRIDGE(conv2d); -USE_XPU_BRIDGE(depthwise_conv2d); -USE_XPU_BRIDGE(elementwise_add); -USE_XPU_BRIDGE(pool2d); -USE_XPU_BRIDGE(softmax); -USE_XPU_BRIDGE(mul); -USE_XPU_BRIDGE(batch_norm); diff --git a/lite/kernels/xpu/bridges/pool_op.cc b/lite/kernels/xpu/bridges/pool_op.cc index 60787a3429..90653edcce 100644 --- a/lite/kernels/xpu/bridges/pool_op.cc +++ b/lite/kernels/xpu/bridges/pool_op.cc @@ -50,21 +50,22 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto exclusive = op_info->GetAttr("exclusive"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Pool node if (pooling_type == "max") { if (global_pooling) { - graph->AddNode(out_name, graph->builder_.CreateGlobalMaxPool2D(*x_node)); + graph->Add(out_name, + graph->builder_.CreateGlobalMaxPool2D(*x_node->data())); } else { - graph->AddNode( + graph->Add( out_name, - graph->builder_.CreateMaxPool2D(*x_node, + graph->builder_.CreateMaxPool2D(*x_node->data(), CvtShape(ksize), CvtShape(strides), CvtShape(paddings), @@ -73,12 +74,13 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { } } else if (pooling_type == "avg") { if (global_pooling) { - graph->AddNode(out_name, graph->builder_.CreateGlobalAvgPool2D(*x_node)); + graph->Add(out_name, + graph->builder_.CreateGlobalAvgPool2D(*x_node->data())); } else { // !exclusive ---> count_include_pad - graph->AddNode( + graph->Add( out_name, - graph->builder_.CreateAvgPool2D(*x_node, + graph->builder_.CreateAvgPool2D(*x_node->data(), CvtShape(ksize), CvtShape(strides), CvtShape(paddings), @@ -98,6 +100,6 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - pool2d, +REGISTER_SUBGRAPH_BRIDGE(pool2d, + kXPU, paddle::lite::subgraph::xpu::PoolConverter); diff --git a/lite/kernels/xpu/bridges/registry.cc b/lite/kernels/xpu/bridges/registry.cc deleted file mode 100644 index 4ab1b69a25..0000000000 --- a/lite/kernels/xpu/bridges/registry.cc +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/kernels/xpu/bridges/registry.h" -#include - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { -namespace bridges { - -Factory& Factory::Instance() { - static Factory g_xpu_bridge; - return g_xpu_bridge; -} - -bool Factory::HasType(const std::string& op_type) const { - return map_.count(op_type); -} - -void Factory::Insert(const std::string& op_type, const func_type& func_name) { - map_.insert(std::make_pair(op_type, func_name)); -} - -} // namespace bridges -} // namespace xpu -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/xpu/bridges/registry.h b/lite/kernels/xpu/bridges/registry.h deleted file mode 100644 index c990399c1c..0000000000 --- a/lite/kernels/xpu/bridges/registry.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include "lite/core/op_lite.h" -#include "lite/utils/macros.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { -namespace bridges { - -// xpu network builder and constant tensors -class graph_ctx_type { - public: - std::shared_ptr builder; - std::shared_ptr params; -}; - -// var_name, xpu node pointer -using node_map_type = - std::unordered_map>; - -using func_type = std::function, graph_ctx_type*, const node_map_type&)>; -using cvt_map_type = std::unordered_map; -class Factory { - public: - static Factory& Instance(); - - const cvt_map_type& AllFunctions() const { return map_; } - bool HasType(const std::string& op_type) const; - void Insert(const std::string& op_type, const func_type& func_name); - Factory() = default; - - private: - cvt_map_type map_; - DISALLOW_COPY_AND_ASSIGN(Factory); -}; - -} // namespace bridges -} // namespace xpu -} // namespace kernels -} // namespace lite -} // namespace paddle - -// some platform-independent defintion -#if defined(_WIN32) -#define UNUSED -#define __builtin_expect(EXP, C) (EXP) -#else -#define UNUSED __attribute__((unused)) -#endif - -#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ - struct __test_global_namespace_##uniq_name##__ {}; \ - static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ - __test_global_namespace_##uniq_name##__>::value, \ - msg) - -#define REGISTER_XPU_BRIDGE(op_type, cvt_func_name) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_xpu_bridge_##op_type##__, \ - "REGISTER_XPU_BRIDGE must be called in global namespace only once!"); \ - int __reg_xpu_bridge_##op_type##_Insert() { \ - paddle::lite::kernels::xpu::bridges::Factory::Instance().Insert( \ - #op_type, cvt_func_name); \ - return 0; \ - } - -#define USE_XPU_BRIDGE(op_type) \ - extern int __reg_xpu_bridge_##op_type##_Insert(); \ - static int __reg_xpu_bridge_##op_type##_Insert_return UNUSED = \ - __reg_xpu_bridge_##op_type##_Insert(); diff --git a/lite/kernels/xpu/bridges/reshape_op.cc 
b/lite/kernels/xpu/bridges/reshape_op.cc old mode 100755 new mode 100644 index eeee6c7244..5e9a37d18e --- a/lite/kernels/xpu/bridges/reshape_op.cc +++ b/lite/kernels/xpu/bridges/reshape_op.cc @@ -33,22 +33,16 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Get input and output vars and op attributes auto x_name = op_info->Input("X").front(); - auto x_type = kernel->GetInputDeclType("X"); - CHECK(x_type->precision() == PRECISION(kFloat)); - CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); auto out_name = op_info->Output("Out").front(); - auto out_type = kernel->GetOutputDeclType("Out"); - CHECK(out_type->precision() == PRECISION(kFloat)); - CHECK(out_type->layout() == DATALAYOUT(kNCHW)); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } std::vector shape; @@ -59,6 +53,7 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { // CHECK(shape_tensor_type->layout() == DATALAYOUT(kNCHW)); for (auto shape_tensor_name : shape_tensor_names) { auto shape_tensor = scope->FindMutableTensor(shape_tensor_name); + CHECK(shape_tensor->persistable()); auto shape_tensor_data = shape_tensor->mutable_data(); shape.emplace_back(shape_tensor_data[0]); } @@ -73,6 +68,7 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { // CHECK(actual_shape_type->precision() == PRECISION(kInt32)); // CHECK(actual_shape_type->layout() == DATALAYOUT(kNCHW)); auto actual_shape = scope->FindMutableTensor(actual_shape_name); + CHECK(actual_shape->persistable()); auto actual_shape_dims = actual_shape->dims(); auto actual_shape_data = actual_shape->mutable_data(); auto shape = std::vector( @@ -86,9 +82,11 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto out_dims = operators::ValidateShape(shape, x_dims); // Reshape node - graph->AddNode(out_name, - graph->builder_.CreateReshape( - *x_node, CvtShape(out_dims))); + graph->Add(out_name, + graph->builder_.CreateReshape(*x_node->data(), + CvtShape(out_dims)), + x_node->precision(), + x_node->layout()); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -97,9 +95,9 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - reshape2, +REGISTER_SUBGRAPH_BRIDGE(reshape2, + kXPU, paddle::lite::subgraph::xpu::ReshapeConverter); -REGISTER_SUBGRAPH_BRIDGE(XPU, - reshape, +REGISTER_SUBGRAPH_BRIDGE(reshape, + kXPU, paddle::lite::subgraph::xpu::ReshapeConverter); diff --git a/lite/kernels/xpu/bridges/scale_op.cc b/lite/kernels/xpu/bridges/scale_op.cc old mode 100755 new mode 100644 index a3423d290c..e6871390ac --- a/lite/kernels/xpu/bridges/scale_op.cc +++ b/lite/kernels/xpu/bridges/scale_op.cc @@ -46,17 +46,17 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { float bias = op_info->GetAttr("bias"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Scale node - graph->AddNode( - out_name, - graph->builder_.CreateScale(*x_node, scale, bias, bias_after_scale)); + graph->Add(out_name, + 
graph->builder_.CreateScale( + *x_node->data(), scale, bias, bias_after_scale)); return SUCCESS; } @@ -65,6 +65,6 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - scale, +REGISTER_SUBGRAPH_BRIDGE(scale, + kXPU, paddle::lite::subgraph::xpu::ScaleConverter); diff --git a/lite/kernels/xpu/bridges/slice_op.cc b/lite/kernels/xpu/bridges/slice_op.cc old mode 100755 new mode 100644 index 90c91d3b59..3e4592d454 --- a/lite/kernels/xpu/bridges/slice_op.cc +++ b/lite/kernels/xpu/bridges/slice_op.cc @@ -46,11 +46,11 @@ int SliceConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto ends = op_info->GetAttr>("ends"); // Input node - std::shared_ptr input_node = nullptr; - if (graph->HasNode(input_name)) { - input_node = graph->GetNode(input_name); + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); } else { - input_node = graph->AddNode(input_name, input_dims); + input_node = graph->Add(input_name, *input); } // Calculate the begin and end of the slice in all of @@ -74,9 +74,9 @@ int SliceConverter(void* ctx, OpLite* op, KernelBase* kernel) { strides.push_back(1); } } - graph->AddNode( - out_name, - graph->builder_.CreateStridedSlice(*input_node, begin, end, strides)); + graph->Add(out_name, + graph->builder_.CreateStridedSlice( + *input_node->data(), begin, end, strides)); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -85,6 +85,6 @@ int SliceConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - slice, +REGISTER_SUBGRAPH_BRIDGE(slice, + kXPU, paddle::lite::subgraph::xpu::SliceConverter); diff --git a/lite/kernels/xpu/bridges/softmax_op.cc b/lite/kernels/xpu/bridges/softmax_op.cc index 6deb536ef1..d964f29a86 100644 --- a/lite/kernels/xpu/bridges/softmax_op.cc +++ b/lite/kernels/xpu/bridges/softmax_op.cc @@ -44,15 +44,15 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto axis = op_info->GetAttr("axis"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Softmax node - graph->AddNode(out_name, graph->builder_.CreateSoftmax(*x_node, axis)); + graph->Add(out_name, graph->builder_.CreateSoftmax(*x_node->data(), axis)); return SUCCESS; } @@ -61,6 +61,6 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - softmax, +REGISTER_SUBGRAPH_BRIDGE(softmax, + kXPU, paddle::lite::subgraph::xpu::SoftmaxConverter); diff --git a/lite/kernels/xpu/bridges/stack_op.cc b/lite/kernels/xpu/bridges/stack_op.cc old mode 100755 new mode 100644 index eb7d6d7b79..69673aaeba --- a/lite/kernels/xpu/bridges/stack_op.cc +++ b/lite/kernels/xpu/bridges/stack_op.cc @@ -46,19 +46,19 @@ int StackConverter(void* ctx, OpLite* op, KernelBase* kernel) { for (auto& x_name : x_names) { auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } - x_nodes.push_back(*x_node); + 
x_nodes.push_back(*x_node->data()); } // Stack node - graph->AddNode(y_name, - graph->builder_.CreateStack( - xtcl::network::TupleNode::make(x_nodes), axis)); + graph->Add(y_name, + graph->builder_.CreateStack( + xtcl::network::TupleNode::make(x_nodes), axis)); return SUCCESS; } @@ -67,6 +67,6 @@ int StackConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - stack, +REGISTER_SUBGRAPH_BRIDGE(stack, + kXPU, paddle::lite::subgraph::xpu::StackConverter); diff --git a/lite/kernels/xpu/bridges/transpose_op.cc b/lite/kernels/xpu/bridges/transpose_op.cc old mode 100755 new mode 100644 index b6823dd6a8..4217fe0119 --- a/lite/kernels/xpu/bridges/transpose_op.cc +++ b/lite/kernels/xpu/bridges/transpose_op.cc @@ -44,19 +44,19 @@ int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto axis = op_info->GetAttr>("axis"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Transpose node - graph->AddNode(out_name, - graph->builder_.CreateTranspose( - *x_node, - CvtShape( - std::vector(axis.begin(), axis.end())))); + graph->Add(out_name, + graph->builder_.CreateTranspose( + *x_node->data(), + CvtShape( + std::vector(axis.begin(), axis.end())))); return SUCCESS; } @@ -66,9 +66,9 @@ int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - transpose, +REGISTER_SUBGRAPH_BRIDGE(transpose, + kXPU, paddle::lite::subgraph::xpu::TransposeConverter); -REGISTER_SUBGRAPH_BRIDGE(XPU, - transpose2, +REGISTER_SUBGRAPH_BRIDGE(transpose2, + kXPU, paddle::lite::subgraph::xpu::TransposeConverter); diff --git a/lite/kernels/xpu/bridges/utility.cc b/lite/kernels/xpu/bridges/utility.cc old mode 100755 new mode 100644 index 79fad7c8b4..ce28f38019 --- a/lite/kernels/xpu/bridges/utility.cc +++ b/lite/kernels/xpu/bridges/utility.cc @@ -103,7 +103,7 @@ DLDeviceType CvtDLDeviceType(TargetType in_type) { out_type = kDLGPU; break; case TARGET(kXPU): - out_type = kDLCPU; + out_type = static_cast(kDLXPU); break; default: LOG(FATAL) << "[XPU] Can not convert target type(" << TargetToStr(in_type) @@ -115,8 +115,8 @@ DLDeviceType CvtDLDeviceType(TargetType in_type) { std::shared_ptr CvtTensor(const Tensor& in_tensor, std::vector out_shape, - PrecisionType in_precision, DataLayoutType in_layout) { + PrecisionType in_precision = in_tensor.precision(); auto in_shape = in_tensor.dims().Vectorize(); if (out_shape.empty()) { out_shape = in_shape; diff --git a/lite/kernels/xpu/bridges/utility.h b/lite/kernels/xpu/bridges/utility.h old mode 100755 new mode 100644 index a02a5ddff0..7769558545 --- a/lite/kernels/xpu/bridges/utility.h +++ b/lite/kernels/xpu/bridges/utility.h @@ -58,7 +58,6 @@ xtcl::Array CvtShape(const DDim& in_dims) { std::shared_ptr CvtTensor( const Tensor& in_tensor, std::vector out_shape = {}, - PrecisionType in_precision = PRECISION(kFloat), DataLayoutType in_layout = DATALAYOUT(kNCHW)); } // namespace xpu diff --git a/lite/kernels/xpu/graph_compute.cc b/lite/kernels/xpu/graph_compute.cc deleted file mode 100644 index b9e5be1a1d..0000000000 --- a/lite/kernels/xpu/graph_compute.cc +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/xpu/graph_compute.h" -#include -#include -#include -#include -#include "lite/backends/xpu/runtime.h" -#include "lite/core/op_registry.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { - -void GraphCompute::PrepareForRun() { - // auto& ctx = this->ctx_->template As(); - auto& param = this->Param(); - CHECK(param.weight); - CHECK(lite::xpu::LoadModel(*param.weight, &runtime_)); - CHECK(runtime_ != nullptr); -} - -void GraphCompute::Run() { - auto& param = this->Param(); - auto GetCurrentUS = []() -> double { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; - }; - auto start_time = GetCurrentUS(); - for (int i = 0; i < param.inputs.size(); i++) { - auto input_var_name = param.inputs[i].first; - auto input_tensor = param.inputs[i].second; - LOG(INFO) << "input dims[" << i << ":" << input_var_name - << "]: " << input_tensor->dims(); - auto input_tensor_data = input_tensor->data(); - for (int j = 0; j < input_tensor->dims().production(); j++) { - VLOG(3) << input_tensor_data[j]; - } - auto input_ndarray = xtcl::xNDArray::Empty( - input_tensor->dims().Vectorize(), {kDLFloat, 32, 1}, {kDLCPU, 0}); - auto input_ndarray_data = - static_cast(input_ndarray.ToDLPack()->dl_tensor.data); - std::memcpy(input_ndarray_data, - input_tensor_data, - sizeof(float) * input_tensor->dims().production()); - runtime_->SetInputZeroCopy(input_var_name, - &input_ndarray.ToDLPack()->dl_tensor); - } - runtime_->Run(); - for (int i = 0; i < param.outputs.size(); i++) { - auto output_ndarray = runtime_->GetOutput(i); - auto output_var_name = param.outputs[i].first; - auto output_tensor = param.outputs[i].second; - output_tensor->Resize(output_ndarray.Shape()); - LOG(INFO) << "output dims[" << i << ":" << output_var_name - << "]: " << output_tensor->dims(); - auto output_ndarray_data = - static_cast(output_ndarray.ToDLPack()->dl_tensor.data); - auto output_tensor_data = output_tensor->mutable_data(); - std::memcpy(output_tensor_data, - output_ndarray_data, - sizeof(float) * output_tensor->dims().production()); - for (int j = 0; j < output_tensor->dims().production(); j++) { - VLOG(3) << output_tensor_data[j]; - } - } - LOG(INFO) << "[XPU] Process cost " << GetCurrentUS() - start_time << " us"; -} - -} // namespace xpu -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(graph_op, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::GraphCompute, - def) - .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindInput("Weight", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))}) - .Finalize(); diff --git a/lite/kernels/xpu/graph_compute.h b/lite/kernels/xpu/graph_compute.h deleted file mode 100644 index 5406daa8a1..0000000000 --- a/lite/kernels/xpu/graph_compute.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2019 
PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include "lite/core/kernel.h" -#include "lite/core/op_registry.h" -#include "lite/core/types.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { - -class GraphCompute : public KernelLite { - public: - using param_t = operators::GraphParam; - - void PrepareForRun() override; - - void Run() override; - - virtual ~GraphCompute() = default; - - private: - std::shared_ptr runtime_{nullptr}; -}; - -} // namespace xpu -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/xpu/subgraph_compute.cc b/lite/kernels/xpu/subgraph_compute.cc old mode 100755 new mode 100644 index 07a74b0454..15df4f80ca --- a/lite/kernels/xpu/subgraph_compute.cc +++ b/lite/kernels/xpu/subgraph_compute.cc @@ -39,13 +39,13 @@ int SubgraphEngine::BuildDeviceProgram() { op->CheckShape(); op->InferShape(); std::string op_type = op->op_info()->Type(); - if (!bridges.Exists("XPU", op_type)) { + if (!bridges.Exists(op_type, "kXPU")) { return subgraph::FAILED; } auto kernel = inst.kernel(); - status |= bridges.Select("XPU", op_type)(reinterpret_cast(&graph), - const_cast(op), - const_cast(kernel)); + status |= bridges.Select(op_type, "kXPU")(reinterpret_cast(&graph), + const_cast(op), + const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { return subgraph::FAILED; } @@ -57,26 +57,26 @@ int SubgraphEngine::BuildDeviceProgram() { std::vector device_inodes; std::vector device_onodes; for (auto& input_name : input_names_) { - if (graph.HasNode(input_name)) { - if (!graph.GetType(input_name).persistable()) { - device_inodes.push_back(graph.GetNode(input_name).get()); + if (graph.Has(input_name)) { + if (graph.Get(input_name)->is_data()) { + device_inodes.push_back(graph.Get(input_name)->data().get()); device_inames_.push_back(input_name); } else { LOG(WARNING) << "[XPU] Input node " << input_name - << " is skipped because it is a persistable node."; + << " is ignored because it is not a data node."; } } else { LOG(WARNING) << "[XPU] Input node " << input_name - << " is skipped because it does not exist."; + << " is ignored because it does not exist."; } } for (auto& output_name : output_names_) { - if (graph.HasNode(output_name)) { - device_onodes.push_back(graph.GetNode(output_name).get()); + if (graph.Has(output_name)) { + device_onodes.push_back(graph.Get(output_name)->data().get()); device_onames_.push_back(output_name); } else { LOG(WARNING) << "[XPU] Output node " << output_name - << " is skipped because it does not exist."; + << " is ignored because it does not exist."; } } CHECK(!device_inames_.empty()) @@ -98,14 +98,14 @@ int SubgraphEngine::BuildDeviceProgram() { origin_otensors_.resize(device_onames_.size()); device_otensors_.resize(device_onames_.size()); for (int i = 0; i < device_inames_.size(); i++) { - auto type = graph.GetType(device_inames_[i]); - auto precision = type.precision(); - auto 
layout = type.layout(); + auto node = graph.Get(device_inames_[i]); + auto precision = node->precision(); + auto layout = node->layout(); origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); CHECK(origin_itensors_[i]); origin_idims_[i] = origin_itensors_[i]->dims(); - VLOG(3) << "[XPU] Inputs[" << i - << "] precision: " << PrecisionToStr(precision) + VLOG(3) << "[XPU] Inputs[" << i << "] name: " << device_inames_[i] + << " precision: " << PrecisionToStr(precision) << " layout: " << DataLayoutToStr(layout) << " dims: " << origin_idims_[i]; // Prepare the device input tensors which share data with the origin input @@ -122,14 +122,14 @@ int SubgraphEngine::BuildDeviceProgram() { device_itensors_[i].byte_offset = 0; } for (int i = 0; i < device_onames_.size(); i++) { - auto type = graph.GetType(device_onames_[i]); - auto precision = type.precision(); - auto layout = type.layout(); + auto node = graph.Get(device_onames_[i]); + auto precision = node->precision(); + auto layout = node->layout(); origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); CHECK(origin_otensors_[i]); origin_odims_[i] = origin_otensors_[i]->dims(); - VLOG(3) << "[XPU] Outputs[" << i - << "] precision: " << PrecisionToStr(precision) + VLOG(3) << "[XPU] Outputs[" << i << "] name: " << device_onames_[i] + << " precision: " << PrecisionToStr(precision) << " layout: " << DataLayoutToStr(layout) << " dims: " << origin_odims_[i]; // Prepare the device output tensors which share data with the origin output @@ -175,7 +175,7 @@ int SubgraphEngine::LaunchDeviceProgram() { // Update the data pointer of DLTensor to track the origin input tensors device_itensors_[i].data = const_cast(origin_itensors_[i]->raw_data()); - device_program_->SetInputZeroCopy(device_inames_[i], &device_itensors_[i]); + device_program_->SetInput(device_inames_[i], &device_itensors_[i]); } // Run the XPU model auto GetCurrentUS = []() -> double { diff --git a/lite/kernels/xpu/subgraph_compute.h b/lite/kernels/xpu/subgraph_compute.h old mode 100755 new mode 100644 diff --git a/lite/model_parser/naive_buffer/naive_buffer.h b/lite/model_parser/naive_buffer/naive_buffer.h index 717dd3c5a6..9be2be9543 100644 --- a/lite/model_parser/naive_buffer/naive_buffer.h +++ b/lite/model_parser/naive_buffer/naive_buffer.h @@ -128,19 +128,23 @@ using Float64Builder = PrimaryBuilder; template class PrimaryListBuilder : public FieldBuilder { - std::vector data_; + const Primary* data_{nullptr}; + int size_{0}; public: using value_type = Primary; explicit PrimaryListBuilder(BinaryTable* table) : FieldBuilder(table) {} - PrimaryListBuilder(BinaryTable* table, const std::vector& val) - : FieldBuilder(table), data_(val) {} + PrimaryListBuilder(BinaryTable* table, const Primary* val, int size) + : FieldBuilder(table), data_(val), size_(size) {} /// Set data. - void set(const std::vector& x) { data_ = x; } + void set(const Primary* x, int size) { + data_ = x; + size_ = size; + } - const std::vector& data() const { return data_; } + const Primary* data() const { return data_; } /// Save information to the corresponding BinaryTable. void Save() override; @@ -149,14 +153,12 @@ class PrimaryListBuilder : public FieldBuilder { void Load() override; /// Number of elements. 
- size_t size() const { return data_.size(); } + size_t size() const { return size_; } - Type type() const override { - return core::StdTypeToRepr>(); - } + Type type() const override { return core::StdTypeToRepr(); } /// clear builder - void Clear() { data_.clear(); } + void Clear() { size_ = 0; } ~PrimaryListBuilder() = default; }; @@ -381,17 +383,14 @@ void PrimaryBuilder::Load() { template void PrimaryListBuilder::Load() { - CHECK(data_.empty()) << "Duplicate load"; + CHECK(data_ == nullptr) << "Duplicate load"; // Load number of elements first. uint64_t num_elems{}; memcpy(&num_elems, table()->cursor(), sizeof(uint64_t)); table()->Consume(sizeof(uint64_t)); - data_.resize(num_elems); - for (uint64_t i = 0; i < num_elems; i++) { - memcpy(&data_[i], table()->cursor(), sizeof(value_type)); - table()->Consume(sizeof(value_type)); - } + set(reinterpret_cast(table()->cursor()), num_elems); + table()->Consume(num_elems * sizeof(value_type)); } template @@ -404,7 +403,7 @@ void PrimaryListBuilder::Save() { table()->Require(num_elems * sizeof(value_type)); memcpy(table()->cursor(), - reinterpret_cast(&data_[0]), + reinterpret_cast(data_), num_elems * sizeof(value_type)); table()->Consume(num_elems * sizeof(value_type)); } diff --git a/lite/model_parser/naive_buffer/param_desc.cc b/lite/model_parser/naive_buffer/param_desc.cc index 4397b3c413..cc97b02716 100644 --- a/lite/model_parser/naive_buffer/param_desc.cc +++ b/lite/model_parser/naive_buffer/param_desc.cc @@ -150,9 +150,9 @@ void ParamDesc::SetDim(const std::vector& dim) { << "Data Type mismatch"; \ std::vector res; \ auto& data_builder = desc_->GetField>("data"); \ - auto& data = data_builder.data(); \ - size_t size = data.size() / sizeof(T); \ - auto* data_ptr = reinterpret_cast(&data[0]); \ + auto data = data_builder.data(); \ + size_t size = data_builder.size() / sizeof(T); \ + auto* data_ptr = reinterpret_cast(data); \ for (size_t i = 0; i < size; ++i) { \ res.push_back(data_ptr[i]); \ } \ @@ -178,8 +178,7 @@ GET_DATA_IMPL(double, FP64); data_builder->Clear(); \ size_t size = size__ * sizeof(T); \ auto* data_ptr = reinterpret_cast(data_ptr__); \ - std::vector data_vec(data_ptr, data_ptr + size); \ - data_builder->set(data_vec); + data_builder->set(data_ptr, size); #define SET_DATA_IMPL(T, type__) \ template <> \ diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index 190cf7194c..f307cb66ac 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -50,6 +50,7 @@ add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS}) add_operator(instance_norm_op basic SRCS instance_norm_op.cc DEPS ${op_DEPS}) add_operator(subgraph_op basic SRCS subgraph_op.cc DEPS ${op_DEPS}) add_operator(grid_sampler_op basic SRCS grid_sampler_op.cc DEPS ${op_DEPS}) +add_operator(flatten_op basic SRCS flatten_op.cc DEPS ${op_DEPS}) # 2.basic ops not used in basic models add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS}) @@ -78,11 +79,9 @@ add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEP add_operator(generate_proposals_op extra SRCS generate_proposals_op.cc DEPS ${op_DEPS}) add_operator(roi_align_op extra SRCS roi_align_op.cc DEPS ${op_DEPS}) add_operator(box_clip_op extra SRCS box_clip_op.cc DEPS ${op_DEPS}) -add_operator(flatten_op extra SRCS flatten_op.cc DEPS ${op_DEPS}) add_operator(fake_quantize_range_abs_max_op extra SRCS fake_quantize_range_abs_max.cc DEPS ${op_DEPS}) add_operator(sequence_expand_as_op_lite extra SRCS sequence_expand_as_op.cc DEPS 
${op_DEPS}) add_operator(assign_value_op extra SRCS assign_value_op.cc DEPS ${op_DEPS}) - add_operator(fake_quantize_dequantize_moving_avg_abs_max_op extra SRCS fake_quantize_dequantize_moving_avg_max_abs.cc DEPS ${op_DEPS}) add_operator(fake_channel_wise_dequantize_max_abs_op extra SRCS fake_channel_wise_dequantize_max_abs.cc DEPS ${op_DEPS}) add_operator(split_lod_tensor_op_lite extra SRCS split_lod_tensor_op.cc DEPS ${op_DEPS}) diff --git a/lite/operators/collect_fpn_proposals_op.cc b/lite/operators/collect_fpn_proposals_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/collect_fpn_proposals_op.h b/lite/operators/collect_fpn_proposals_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/compare_op.cc b/lite/operators/compare_op.cc index 3210520cd5..aa500ba35c 100644 --- a/lite/operators/compare_op.cc +++ b/lite/operators/compare_op.cc @@ -54,7 +54,7 @@ bool CompareOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { } // namespace paddle REGISTER_LITE_OP(equal, paddle::lite::operators::CompareOp); -REGISTER_LITE_OP(notequal, paddle::lite::operators::CompareOp); +REGISTER_LITE_OP(not_equal, paddle::lite::operators::CompareOp); REGISTER_LITE_OP(less_than, paddle::lite::operators::CompareOp); REGISTER_LITE_OP(less_equal, paddle::lite::operators::CompareOp); REGISTER_LITE_OP(greater_than, paddle::lite::operators::CompareOp); diff --git a/lite/operators/conditional_block_op.cc b/lite/operators/conditional_block_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/conditional_block_op.h b/lite/operators/conditional_block_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/distribute_fpn_proposals_op.cc b/lite/operators/distribute_fpn_proposals_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/distribute_fpn_proposals_op.h b/lite/operators/distribute_fpn_proposals_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/dropout_op.cc b/lite/operators/dropout_op.cc index bef0891847..03047de3b3 100644 --- a/lite/operators/dropout_op.cc +++ b/lite/operators/dropout_op.cc @@ -33,7 +33,7 @@ bool DropoutOp::InferShape() const { param_.mask->Resize(x_dims); } // share LoD - // param_.output->set_lod(param_.input->lod()); + param_.output->set_lod(param_.x->lod()); return true; } diff --git a/lite/operators/fc_op.cc b/lite/operators/fc_op.cc index ad3fcf79a3..702950ae18 100644 --- a/lite/operators/fc_op.cc +++ b/lite/operators/fc_op.cc @@ -61,7 +61,7 @@ bool FcOpLite::InferShape() const { param_.output->Resize(lite::DDim(output_dims)); // share LoD - // param_.output->set_lod(param_.input->lod()); + param_.output->set_lod(param_.input->lod()); return true; } diff --git a/lite/operators/graph_op.cc b/lite/operators/graph_op.cc deleted file mode 100644 index 018ce264e2..0000000000 --- a/lite/operators/graph_op.cc +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/graph_op.h" -#include -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace operators { - -bool GraphOpLite::CheckShape() const { - CHECK_GE_OR_FALSE(param_.inputs.size(), 1UL); - CHECK_GE_OR_FALSE(param_.outputs.size(), 1UL); - return true; -} - -bool GraphOpLite::InferShape() const { return CheckShape(); /* enrich me */ } - -bool GraphOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { - auto inputs = op_desc.Input("Inputs"); - auto weight = op_desc.Input("Weight"); - auto outputs = op_desc.Output("Outputs"); - - for (auto var : inputs) { - CHECK(scope->FindVar(var)); - param_.inputs.push_back( - std::make_pair(var, scope->FindVar(var)->GetMutable())); - } - - param_.weight = scope->FindVar(weight.front())->GetMutable(); - CHECK(param_.weight); - - for (auto var : outputs) { - CHECK(scope->FindVar(var)); - param_.outputs.push_back( - std::make_pair(var, scope->FindVar(var)->GetMutable())); - } - - return true; -} - -} // namespace operators -} // namespace lite -} // namespace paddle - -REGISTER_LITE_OP(graph_op, paddle::lite::operators::GraphOpLite); diff --git a/lite/operators/graph_op.h b/lite/operators/graph_op.h deleted file mode 100644 index 20a7cd9b8d..0000000000 --- a/lite/operators/graph_op.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include "lite/core/kernel.h" -#include "lite/core/op_lite.h" -#include "lite/core/scope.h" -#include "lite/core/tensor.h" -#include "lite/operators/op_params.h" -#include "lite/utils/all.h" - -namespace paddle { -namespace lite { -namespace operators { - -class GraphOpLite : public OpLite { - public: - GraphOpLite() {} - - explicit GraphOpLite(const std::string &type) : OpLite(type) {} - - bool CheckShape() const override; - - bool InferShape() const override; - - bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; - - void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } - - std::string DebugString() const override { return "graph_op"; } - - private: - mutable GraphParam param_; -}; - -} // namespace operators -} // namespace lite -} // namespace paddle diff --git a/lite/operators/grid_sampler_op.cc b/lite/operators/grid_sampler_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/grid_sampler_op.h b/lite/operators/grid_sampler_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/instance_norm_op.cc b/lite/operators/instance_norm_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/instance_norm_op.h b/lite/operators/instance_norm_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/merge_lod_tensor_op.cc b/lite/operators/merge_lod_tensor_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/merge_lod_tensor_op.h b/lite/operators/merge_lod_tensor_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/reduce_prod_op.cc b/lite/operators/reduce_prod_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/reduce_prod_op.h b/lite/operators/reduce_prod_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/sequence_pool_concat_op.cc b/lite/operators/sequence_pool_concat_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/sequence_pool_concat_op.h b/lite/operators/sequence_pool_concat_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/split_lod_tensor_op.cc b/lite/operators/split_lod_tensor_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/split_lod_tensor_op.h b/lite/operators/split_lod_tensor_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/subgraph_op.cc b/lite/operators/subgraph_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/subgraph_op.h b/lite/operators/subgraph_op.h old mode 100755 new mode 100644 diff --git a/lite/tests/cv/CMakeLists.txt b/lite/tests/cv/CMakeLists.txt index 05fcc06b10..697c9874ef 100644 --- a/lite/tests/cv/CMakeLists.txt +++ b/lite/tests/cv/CMakeLists.txt @@ -1,3 +1,3 @@ if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM) - lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm paddle_api_light ${lite_cv_deps} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm) endif() diff --git a/lite/tests/cv/cv_basic.h b/lite/tests/cv/cv_basic.h index 728d316714..92f68543bb 100644 --- a/lite/tests/cv/cv_basic.h +++ b/lite/tests/cv/cv_basic.h @@ -192,7 +192,6 @@ void nv21_bgra_basic(const uint8_t* in_data, nv2bgra(in_data, out_data, srcw, srch, 0, 1); } -/* /* 采用CV_BGR2GRAY,转换公式Gray = 0.1140*B + 0.5870*G + 0.2989*R 采用CV_RGB2GRAY,转换公式Gray = 0.1140*R + 0.5870*G + 0.2989*B @@ -217,6 +216,21 @@ void bgr_gray_basic(const uint8_t* in_data, } } } +void bgra_gray_basic(const uint8_t* in_data, + uint8_t* out_data, + 
int srcw, + int srch) { + for (int i = 0; i < srch; i++) { + const uint8_t* din_ptr = in_data + i * 4 * srcw; + uint8_t* dout_ptr = out_data + i * srcw; + for (int j = 0; j < srcw; j++) { + int sum = din_ptr[0] * 15 + din_ptr[1] * 75 + din_ptr[2] * 38; + sum = sum >> 7; + *dout_ptr++ = sum; + din_ptr += 4; + } + } +} void gray_bgr_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { for (int i = 0; i < srch; i++) { @@ -228,6 +242,17 @@ void gray_bgr_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { } } } +void gray_bgra_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = *src; + *dst++ = *src; + *dst++ = *src; + *dst++ = 255; + src++; + } + } +} // bgr2bgra, rgb2rgba void hwc3_to_hwc4_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { for (int i = 0; i < srch; i++) { @@ -340,6 +365,16 @@ void image_convert_basic(const uint8_t* in_data, (srcFormat == ImageFormat::GRAY && dstFormat == ImageFormat::BGR)) { gray_bgr_basic(in_data, out_data, srcw, srch); + } else if ((srcFormat == ImageFormat::RGBA && + dstFormat == ImageFormat::GRAY) || + (srcFormat == ImageFormat::BGRA && + dstFormat == ImageFormat::GRAY)) { + bgra_gray_basic(in_data, out_data, srcw, srch); + } else if ((srcFormat == ImageFormat::GRAY && + dstFormat == ImageFormat::RGBA) || + (srcFormat == ImageFormat::GRAY && + dstFormat == ImageFormat::BGRA)) { + gray_bgra_basic(in_data, out_data, srcw, srch); } else if ((srcFormat == ImageFormat::RGBA && dstFormat == ImageFormat::RGB) || (srcFormat == ImageFormat::BGRA && @@ -525,6 +560,7 @@ void image_resize_basic(const uint8_t* in_data, int y_flag = 0; // only one line if (y_in_start < 0) { y_flag = 1; + y_in_end = 0; } float b0 = ibeta[dy * 2]; float b1 = ibeta[dy * 2 + 1]; @@ -750,6 +786,26 @@ void image_flip_basic(const uint8_t* in_data, flipxy_basic(in_data, srch, srcw, out_data, num); } } +void gray_to_tensor_basic(const uint8_t* bgr, + float* output, + int width, + int height, + float* means, + float* scales, + int num) { + int size = width * height; + float mean_val = means[0]; + float scale_val = scales[0]; + + for (int h = 0; h < height; h++) { + const uint8_t* ptr_bgr = bgr + h * width * num; + float* ptr_h = output + h * width; + for (int i = 0; i < width; i++) { + *ptr_h++ = (ptr_bgr[0] - mean_val) * scale_val; + ptr_bgr += num; + } + } +} void bgr_to_tensor_chw_basic(const uint8_t* bgr, float* output, @@ -828,5 +884,8 @@ void image_to_tensor_basic(const uint8_t* in_data, } else if (layout == LayoutType::kNHWC && (srcFormat == ImageFormat::BGRA || srcFormat == ImageFormat::RGBA)) { bgr_to_tensor_hwc_basic(in_data, output, srcw, srch, means, scales, 4); + } else if (srcFormat == ImageFormat::GRAY && + (layout == LayoutType::kNHWC || layout == LayoutType::kNCHW)) { + gray_to_tensor_basic(in_data, output, srcw, srch, means, scales, 1); } } diff --git a/lite/tests/cv/image_convert_test.cc b/lite/tests/cv/image_convert_test.cc index eefd30f74f..e22e327e8b 100644 --- a/lite/tests/cv/image_convert_test.cc +++ b/lite/tests/cv/image_convert_test.cc @@ -20,6 +20,7 @@ #include "lite/core/profile/timer.h" #include "lite/tests/cv/cv_basic.h" #include "lite/utils/cv/paddle_image_preprocess.h" +#include "time.h" // NOLINT DEFINE_int32(cluster, 3, "cluster id"); DEFINE_int32(threads, 1, "threads num"); @@ -28,15 +29,15 @@ DEFINE_int32(repeats, 1, "repeats times"); DEFINE_bool(basic_test, false, "do all tests"); DEFINE_bool(check_result, true, "check the result"); 
-DEFINE_int32(srcFormat, 0, "input image format"); -DEFINE_int32(dstFormat, 1, "output image format"); +DEFINE_int32(srcFormat, 0, "input image format RGBA"); +DEFINE_int32(dstFormat, 2, "output image format RGB"); DEFINE_int32(srch, 1920, "input height"); DEFINE_int32(srcw, 1080, "input width"); DEFINE_int32(dsth, 960, "output height"); DEFINE_int32(dstw, 540, "output width"); DEFINE_int32(angle, 90, "rotate angel"); DEFINE_int32(flip_num, 0, "flip x"); -DEFINE_int32(layout, 0, "layout nchw"); +DEFINE_int32(layout, 1, "layout nchw"); typedef paddle::lite::utils::cv::ImageFormat ImageFormat; typedef paddle::lite::utils::cv::FlipParam FlipParam; @@ -99,7 +100,7 @@ void test_img(const std::vector& cluster_id, float rotate, FlipParam flip, LayoutType layout, - int test_iter = 1) { + int test_iter = 10) { #ifdef LITE_WITH_ARM paddle::lite::DeviceInfo::Init(); #endif @@ -221,7 +222,7 @@ void test_img(const std::vector& cluster_id, float scales[3] = {1 / 127.5f, 1 / 127.5f, 1 / 127.5f}; if (FLAGS_check_result) { - LOG(INFO) << "image convert basic compute"; + // LOG(INFO) << "image convert basic compute"; image_convert_basic(src, basic_dst, (ImageFormat)srcFormat, @@ -230,7 +231,7 @@ void test_img(const std::vector& cluster_id, srch, out_size); - LOG(INFO) << "image resize basic compute"; + // LOG(INFO) << "image resize basic compute"; image_resize_basic(basic_dst, resize_basic, (ImageFormat)dstFormat, @@ -239,7 +240,7 @@ void test_img(const std::vector& cluster_id, dstw, dsth); - LOG(INFO) << "image rotate basic compute"; + // LOG(INFO) << "image rotate basic compute"; image_rotate_basic(resize_basic, tv_out_ratote_basic, (ImageFormat)dstFormat, @@ -247,7 +248,7 @@ void test_img(const std::vector& cluster_id, dsth, rotate); - LOG(INFO) << "image flip basic compute"; + // LOG(INFO) << "image flip basic compute"; image_flip_basic(resize_basic, tv_out_flip_basic, (ImageFormat)dstFormat, @@ -255,7 +256,7 @@ void test_img(const std::vector& cluster_id, dsth, flip); - LOG(INFO) << "image to tensor basic compute"; + // LOG(INFO) << "image to tensor basic compute"; image_to_tensor_basic(resize_basic, &tensor_basic, (ImageFormat)dstFormat, @@ -267,10 +268,13 @@ void test_img(const std::vector& cluster_id, } Timer t1; + Timer t_convert; + Timer t_resize; + Timer t_flip; + Timer t_rotate; + Timer t_tensor; LOG(INFO) << "saber cv compute"; - double to = 0; - double min_time = 100000; TransParam tparam; tparam.ih = srch; tparam.iw = srcw; @@ -285,15 +289,17 @@ void test_img(const std::vector& cluster_id, ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); for (int i = 0; i < test_iter; ++i) { - t1.Reset(); t1.Start(); - LOG(INFO) << "image convert saber compute"; + // LOG(INFO) << "image convert saber compute"; + t_convert.Start(); // 方法一: image_preprocess.imageCovert(src, lite_dst); - image_preprocess.imageCovert( + image_preprocess.imageConvert( src, lite_dst, (ImageFormat)srcFormat, (ImageFormat)dstFormat); + t_convert.Stop(); - LOG(INFO) << "image resize saber compute"; + // LOG(INFO) << "image resize saber compute"; + t_resize.Start(); // 方法一:image_preprocess.imageResize(lite_dst, resize_tmp); image_preprocess.imageResize(lite_dst, resize_tmp, @@ -302,8 +308,10 @@ void test_img(const std::vector& cluster_id, srch, dstw, dsth); + t_resize.Stop(); - LOG(INFO) << "image rotate saber compute"; + // LOG(INFO) << "image rotate saber compute"; + t_rotate.Start(); // 方法一: image_preprocess.imageRotate(resize_tmp, tv_out_ratote); image_preprocess.imageRotate(resize_tmp, tv_out_ratote, @@ -311,13 
+319,17 @@ void test_img(const std::vector& cluster_id, dstw, dsth, rotate); + t_rotate.Stop(); - LOG(INFO) << "image flip saber compute"; + // LOG(INFO) << "image flip saber compute"; + t_flip.Start(); // 方法一: image_preprocess.imageFlip(resize_tmp, tv_out_flip); image_preprocess.imageFlip( resize_tmp, tv_out_flip, (ImageFormat)dstFormat, dstw, dsth, flip); + t_flip.Stop(); - LOG(INFO) << "image to tensor compute"; + // LOG(INFO) << "image to tensor compute"; + t_tensor.Start(); // 方法一: image_preprocess.image2Tensor( // resize_tmp, &dst_tensor, layout, means, scales); image_preprocess.image2Tensor(resize_tmp, @@ -328,16 +340,27 @@ void test_img(const std::vector& cluster_id, layout, means, scales); - + t_tensor.Stop(); t1.Stop(); - double tdiff = t1.LapTimes().Avg(); - to += tdiff; - if (tdiff < min_time) { - min_time = tdiff; - } } - LOG(INFO) << "image trans total time : " << to - << ", avg time : " << to / test_iter; + LOG(INFO) << "image convert avg time : " << t_convert.LapTimes().Avg() + << ", min time: " << t_convert.LapTimes().Min() + << ", max time: " << t_convert.LapTimes().Max(); + LOG(INFO) << "image resize avg time : " << t_resize.LapTimes().Avg() + << ", min time: " << t_resize.LapTimes().Min() + << ", max time: " << t_resize.LapTimes().Max(); + LOG(INFO) << "image rotate avg time : " << t_rotate.LapTimes().Avg() + << ", min time: " << t_rotate.LapTimes().Min() + << ", max time: " << t_rotate.LapTimes().Max(); + LOG(INFO) << "image flip avg time : " << t_flip.LapTimes().Avg() + << ", min time: " << t_flip.LapTimes().Min() + << ", max time: " << t_flip.LapTimes().Max(); + LOG(INFO) << "image tensor avg time : " << t_tensor.LapTimes().Avg() + << ", min time: " << t_tensor.LapTimes().Min() + << ", max time: " << t_tensor.LapTimes().Max(); + LOG(INFO) << "image trans total avg time : " << t1.LapTimes().Avg() + << ", min time: " << t1.LapTimes().Min() + << ", max time: " << t1.LapTimes().Max(); double max_ratio = 0; double max_diff = 0; @@ -536,7 +559,7 @@ void test_img(const std::vector& cluster_id, } } -#if 1 +#if 0 TEST(TestImageConvertRand, test_func_image_convert_preprocess) { if (FLAGS_basic_test) { for (auto w : {1, 4, 8, 16, 112, 224, 1092}) { @@ -546,19 +569,16 @@ TEST(TestImageConvertRand, test_func_image_convert_preprocess) { for (auto rotate : {180}) { for (auto flip : {0}) { for (auto srcFormat : {0, 1, 2, 3, 4, 11, 12}) { - for (auto dstFormat : {0, 1, 2, 3}) { + for (auto dstFormat : {0, 1, 2, 3, 4}) { for (auto layout : {1}) { - if ((dstFormat == ImageFormat::GRAY && - (srcFormat == ImageFormat::RGBA || - srcFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::GRAY && - (dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::NV12 || + if ((srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) && - (dstFormat == ImageFormat::GRAY || - dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) { + (dstFormat == ImageFormat::GRAY)) { + continue; + } + if ((dstFormat == ImageFormat::NV12 || + dstFormat == ImageFormat::NV21) && + (srcFormat == ImageFormat::GRAY)) { continue; } if (srcFormat == ImageFormat::NV12 || @@ -591,7 +611,7 @@ TEST(TestImageConvertRand, test_func_image_convert_preprocess) { } } #endif -#if 1 +#if 0 TEST(TestImageConvertRand, test_func_image_resize_preprocess) { if (FLAGS_basic_test) { for (auto w : {1, 4, 8, 16, 112, 224, 1092}) { @@ -601,21 +621,13 @@ TEST(TestImageConvertRand, test_func_image_resize_preprocess) { for (auto rotate : {180}) { for (auto flip : {0}) { for 
(auto srcFormat : {0, 1, 2, 3, 4, 11, 12}) { - for (auto dstFormat : {0, 1, 2, 3}) { + for (auto dstFormat : {0, 1, 2, 3, 4, 11}) { for (auto layout : {1}) { if (dstFormat == ImageFormat::NV12 || - dstFormat == ImageFormat::NV21 || - (dstFormat == ImageFormat::GRAY && - (srcFormat == ImageFormat::RGBA || - srcFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::GRAY && - (dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) || + dstFormat == ImageFormat::NV21 || (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) && - (dstFormat == ImageFormat::GRAY || - dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) { + dstFormat == ImageFormat::GRAY) { continue; } if (srcFormat == ImageFormat::NV12 || @@ -656,25 +668,10 @@ TEST(TestImageConvertRand, test_func_image_trans_preprocess) { for (auto ww : {32, 112}) { for (auto hh : {112}) { for (auto rotate : {90, 180, 270}) { - for (auto flip : {0, 1, 2}) { - for (auto srcFormat : {11}) { - for (auto dstFormat : {3}) { + for (auto flip : {-1, 0, 1}) { + for (auto srcFormat : {0}) { + for (auto dstFormat : {0, 1, 2, 3, 4}) { for (auto layout : {1, 3}) { - if (dstFormat == ImageFormat::NV12 || - dstFormat == ImageFormat::NV21 || - (dstFormat == ImageFormat::GRAY && - (srcFormat == ImageFormat::RGBA || - srcFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::GRAY && - (dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::NV12 || - srcFormat == ImageFormat::NV21) && - (dstFormat == ImageFormat::GRAY || - dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) { - continue; - } if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) { if (w % 2) { // is not ou shu, two line y == one line @@ -717,7 +714,8 @@ TEST(TestImageConvertCustom, test_func_image_preprocess_custom) { (ImageFormat)FLAGS_dstFormat, FLAGS_angle, (FlipParam)FLAGS_flip_num, - (LayoutType)FLAGS_layout); + (LayoutType)FLAGS_layout, + 20); } #endif #endif diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index 4622376742..a7ae414573 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -1,68 +1,70 @@ -if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_yolo_box_compute SRCS yolo_box_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_fc_compute SRCS fc_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_elementwise_compute SRCS elementwise_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_lrn_compute SRCS lrn_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - 
lite_cc_test(test_kernel_decode_bboxes_compute SRCS decode_bboxes_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_box_coder_compute SRCS box_coder_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_axpy_compute SRCS axpy_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_conv2d_transpose_compute SRCS conv2d_transpose_compute_test.cc DEPS arena_framework ${bm_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_instance_norm_compute SRCS instance_norm_compute_test.cc DEPS arena_framework ${bm_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_grid_sampler_compute SRCS grid_sampler_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_sequence_softmax_compute SRCS sequence_softmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_im2sequence_compute SRCS im2sequence_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_logical_xor_compute SRCS logical_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_topk_compute SRCS topk_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_increment_compute SRCS increment_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} 
${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) +if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) + lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_yolo_box_compute SRCS yolo_box_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_fc_compute SRCS fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_elementwise_compute SRCS elementwise_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_lrn_compute SRCS lrn_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_decode_bboxes_compute SRCS decode_bboxes_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_box_coder_compute SRCS box_coder_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_axpy_compute SRCS axpy_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_conv2d_transpose_compute SRCS conv2d_transpose_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_instance_norm_compute SRCS instance_norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + 
lite_cc_test(test_kernel_grid_sampler_compute SRCS grid_sampler_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_sequence_softmax_compute SRCS sequence_softmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_im2sequence_compute SRCS im2sequence_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_logical_xor_compute SRCS logical_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_topk_compute SRCS topk_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_increment_compute SRCS increment_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_batch_norm_compute SRCS batch_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_pool_compute SRCS pool_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) if(LITE_BUILD_EXTRA) - lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS arena_framework 
${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} 
${host_kernels}) + lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() - lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_bilinear_interp_compute SRCS bilinear_interp_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} 
${host_kernels}) - lite_cc_test(test_kernel_nearest_interp_compute SRCS nearest_interp_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_slice_compute SRCS slice_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_bilinear_interp_compute SRCS bilinear_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_nearest_interp_compute SRCS nearest_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_slice_compute SRCS slice_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} 
${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/tests/kernels/batch_norm_compute_test.cc b/lite/tests/kernels/batch_norm_compute_test.cc new file mode 100644 index 0000000000..ae65e0e3c3 --- /dev/null +++ b/lite/tests/kernels/batch_norm_compute_test.cc @@ -0,0 +1,181 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" + +namespace paddle { +namespace lite { + +class BatchNormComputeTest : public arena::TestCase { + protected: + // common attributes for this op. + std::string op_type_ = "batch_norm"; + std::string input_ = "x"; + std::string scale_ = "scale"; + std::string bias_ = "bias"; + std::string mean_ = "mean"; + std::string variance_ = "variance"; + std::string output_ = "y"; + std::string mean_out_ = "mean_out"; + std::string saved_mean_ = "saved_mean"; + std::string variance_out_ = "variance_out"; + std::string saved_variance_ = "saved_variance"; + DDim dims_{{1, 2, 3, 4}}; + bool use_global_stats_ = false; + float momentum_ = 0.9; + float epsilon_ = 1e-5f; + std::string data_layout_ = "NCHW"; + int is_test_ = 1; + + public: + BatchNormComputeTest(const Place& place, + const std::string& alias, + DDim dims, + float epsilon) + : TestCase(place, alias), dims_(dims), epsilon_(epsilon) {} + + void RunBaseline(Scope* scope) override { + auto x = scope->FindTensor(input_); + auto scale = scope->FindTensor(scale_); + auto bias = scope->FindTensor(bias_); + auto mean = scope->FindTensor(mean_); + auto variance = scope->FindTensor(variance_); + + auto y = scope->NewTensor(output_); + auto mean_out = scope->NewTensor(mean_out_); + auto variance_out = scope->NewTensor(variance_out_); + auto saved_mean = scope->NewTensor(saved_mean_); + auto saved_variance = scope->NewTensor(saved_variance_); + CHECK(y); + CHECK(mean_out); + CHECK(variance_out); + CHECK(saved_mean); + CHECK(saved_variance); + y->Resize(dims_); + + int64_t channel_size = 0; + if (data_layout_ == "NCHW") { + channel_size = dims_[1]; + } else { + LOG(FATAL) << "Unknown storage order: " << data_layout_; + } + mean_out->Resize({channel_size}); + variance_out->Resize({channel_size}); + saved_mean->Resize({channel_size}); + saved_variance->Resize({channel_size}); + + auto x_data = x->data(); + auto y_data = y->mutable_data(); + auto scale_data = scale->data(); + auto bias_data = bias->data(); + auto mean_data = mean->data(); + auto variance_data = variance->data(); + + 
int64_t outer_size = 0; + int64_t inner_size = 0; + if (data_layout_ == "NCHW") { + outer_size = dims_[0]; + inner_size = dims_.Slice(2, dims_.size()).production(); + } else { + LOG(FATAL) << "Unknown storage order: " << data_layout_; + } + auto x_ptr = x_data; + auto y_ptr = y_data; + for (int o = 0; o < outer_size; o++) { + for (int c = 0; c < channel_size; c++) { + for (int i = 0; i < inner_size; i++) { + float norm_x = + (*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon_); + *y_ptr = norm_x * scale_data[c] + bias_data[c]; + x_ptr++; + y_ptr++; + } + } + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType(op_type_); + op_desc->SetInput("X", {input_}); + op_desc->SetInput("Bias", {bias_}); + op_desc->SetInput("Scale", {scale_}); + op_desc->SetInput("Mean", {mean_}); + op_desc->SetInput("Variance", {variance_}); + op_desc->SetOutput("Y", {output_}); + op_desc->SetOutput("MeanOut", {mean_out_}); + op_desc->SetOutput("VarianceOut", {variance_out_}); + op_desc->SetOutput("SavedMean", {saved_mean_}); + op_desc->SetOutput("SavedVariance", {saved_variance_}); + op_desc->SetAttr("epsilon", epsilon_); + op_desc->SetAttr("momentum", momentum_); + op_desc->SetAttr("use_global_stats", use_global_stats_); + op_desc->SetAttr("data_layout", data_layout_); + op_desc->SetAttr("is_test", is_test_); + } + + void PrepareData() override { + std::vector din(dims_.production()); + fill_data_rand(din.data(), -1.f, 1.f, dims_.production()); + + DDim scale_dim({dims_[1]}); + std::vector scale(scale_dim.production()); + fill_data_rand(scale.data(), -1.f, 1.f, scale_dim.production()); + + std::vector bias(scale_dim.production()); + fill_data_rand(bias.data(), -1.f, 1.f, scale_dim.production()); + + std::vector mean(scale_dim.production()); + fill_data_rand(mean.data(), -1.f, 1.f, scale_dim.production()); + + std::vector variance(scale_dim.production()); + fill_data_rand(variance.data(), 0.f, 1.f, scale_dim.production()); + + SetCommonTensor(input_, dims_, din.data()); + SetCommonTensor(scale_, scale_dim, scale.data()); + SetCommonTensor(bias_, scale_dim, bias.data()); + SetCommonTensor(mean_, scale_dim, mean.data()); + SetCommonTensor(variance_, scale_dim, variance.data()); + } +}; + +TEST(BatchNorm, precision) { + LOG(INFO) << "test BatchNorm op"; + float abs_error = 2e-5; + Place place; +#if defined(LITE_WITH_XPU) + place = TARGET(kXPU); +#elif defined(LITE_WITH_NPU) + place = TARGET(kNPU); +#else + return; +#endif + + for (auto dims : + std::vector>{{1, 2, 3, 4}, {5, 6, 7, 8}}) { + for (auto epsilon : {1e-5f}) { + std::unique_ptr tester( + new BatchNormComputeTest(place, "def", DDim(dims), epsilon)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision( + {"mean_out", "saved_mean", "variance_out", "saved_variance"}); + } + } +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/dropout_compute_test.cc b/lite/tests/kernels/dropout_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/gather_compute_test.cc b/lite/tests/kernels/gather_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/grid_sampler_compute_test.cc b/lite/tests/kernels/grid_sampler_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/instance_norm_compute_test.cc b/lite/tests/kernels/instance_norm_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/layer_norm_compute_test.cc b/lite/tests/kernels/layer_norm_compute_test.cc old mode 100755 new mode 100644 
diff --git a/lite/tests/kernels/lookup_table_compute_test.cc b/lite/tests/kernels/lookup_table_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/mul_compute_test.cc b/lite/tests/kernels/mul_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/pool_compute_test.cc b/lite/tests/kernels/pool_compute_test.cc new file mode 100644 index 0000000000..d94c2e5154 --- /dev/null +++ b/lite/tests/kernels/pool_compute_test.cc @@ -0,0 +1,367 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" + +namespace paddle { +namespace lite { + +class PoolComputeTest : public arena::TestCase { + protected: + // common attributes for this op. + std::string op_type_ = "pool2d"; + std::string x_ = "x"; + std::string out_ = "out"; + DDim dims_{{1, 2, 3, 4}}; + std::string pooling_type_ = "max"; + bool global_pooling_ = false; + std::vector strides_{1, 1}; + std::vector paddings_{0, 0}; + std::vector ksize_{2, 2}; + bool exclusive_ = true; + bool ceil_mode_ = false; + bool adaptive_ = false; + std::string padding_algorithm_; + + public: + PoolComputeTest(const Place& place, + const std::string& alias, + DDim dims, + std::string pooling_type, + bool global_pooling, + std::vector strides = {1, 1}, + std::vector paddings = {0, 0}, + std::vector ksize = {2, 2}, + bool exclusive = true, + bool ceil_mode = false, + bool adaptive = false, + std::string padding_algorithm = "") + : TestCase(place, alias), + dims_(dims), + pooling_type_(pooling_type), + global_pooling_(global_pooling), + strides_(strides), + paddings_(paddings), + ksize_(ksize), + exclusive_(exclusive), + ceil_mode_(ceil_mode), + adaptive_(adaptive) {} + + void RunBaseline(Scope* scope) override { + std::vector paddings_new{paddings_}; + if (paddings_new.size() == 1L) { + paddings_new = std::vector(4, paddings_new[0]); + } else if (paddings_new.size() == 2L) { + paddings_new.insert(paddings_new.begin(), paddings_new[0]); + paddings_new.insert(paddings_new.begin() + 2, paddings_new[2]); + } + CHECK_EQ(paddings_new.size(), 4L); + if (padding_algorithm_ == "SAME") { + for (int i = 0; i < strides_.size(); ++i) { + int out_size = (dims_[i + 2] + strides_[i] - 1) / strides_[i]; + int pad_sum = + std::max((out_size - 1) * strides_[i] + ksize_[i] - dims_[i + 2], + (int64_t)0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + *(paddings_new.begin() + i * 2) = pad_0; + *(paddings_new.begin() + i * 2 + 1) = pad_1; + } + } + if (padding_algorithm_ == "VALID" || global_pooling_ || adaptive_) { + for (size_t i = 0; i < paddings_new.size(); i++) { + paddings_new[i] = 0; + } + } + + std::vector ksize_new{ksize_}; + if (global_pooling_) { + ksize_new.clear(); + ksize_new.push_back(dims_[2]); + ksize_new.push_back(dims_[3]); + } + + std::vector out_shape{dims_[0], 
dims_[1]}; + if (adaptive_) { + out_shape.insert(out_shape.end(), ksize_new.begin(), ksize_new.end()); + } else { + for (size_t i = 0; i < ksize_new.size(); ++i) { + int out_size; + if (!ceil_mode_) { + out_size = (dims_[i + 2] - ksize_new[i] + paddings_new[2 * i] + + paddings_new[2 * i + 1]) / + strides_[i] + + 1; + } else { + out_size = (dims_[i + 2] - ksize_new[i] + paddings_new[2 * i] + + paddings_new[2 * i + 1] + strides_[i] - 1) / + strides_[i] + + 1; + } + out_shape.push_back(out_size); + } + } + + auto out = scope->NewTensor(out_); + CHECK(out); + out->Resize(DDim(out_shape)); + auto out_dims = out->dims(); + auto dst_ptr = out->mutable_data(); + + auto x = scope->FindTensor(x_); + auto src_ptr = x->data(); + + int in_n = dims_[0]; + int in_c = dims_[1]; + int in_h = dims_[2]; + int in_w = dims_[3]; + int size_in_n = in_c * in_h * in_w; + int size_in_c = in_h * in_w; + + int out_h = out_dims[2]; + int out_w = out_dims[3]; + int size_out_n = in_c * out_h * out_w; + int size_out_c = out_h * out_w; + + int window_h = ksize_new[0]; + int window_w = ksize_new[1]; + int stride_h = strides_[0]; + int stride_w = strides_[1]; + int pad_t = paddings_new[0]; + int pad_l = paddings_new[2]; + + if (global_pooling_) { + for (int n = 0; n < in_n; ++n) { + for (int c = 0; c < in_c; ++c) { + const float* src = src_ptr + n * size_in_n + c * size_in_c; + float res = src[0]; + if (pooling_type_ == "max") { + for (int i = 1; i < size_in_c; ++i) { + float cur_val = src[i]; + res = cur_val > res ? cur_val : res; + } + } else if (pooling_type_ == "avg") { + for (int i = 1; i < size_in_c; ++i) { + float cur_val = src[i]; + res += cur_val; + } + res /= size_in_c; + } + dst_ptr[n * size_out_n + c] = res; + } + } + } else { + for (int n = 0; n < in_n; ++n) { + for (int c = 0; c < in_c; ++c) { + for (int h = 0; h < out_h; ++h) { + int sh = h * stride_h; + int eh = sh + window_h; + sh = (sh - pad_t) < 0 ? 0 : sh - pad_t; + eh = (eh - pad_t) > in_h ? in_h : eh - pad_t; + for (int w = 0; w < out_w; ++w) { + int sw = w * stride_w; + int ew = sw + window_w; + sw = (sw - pad_l) < 0 ? 0 : sw - pad_l; + ew = (ew - pad_l) > in_w ? in_w : ew - pad_l; + int pooling_size = (ew - sw) * (eh - sh); + if (pooling_size == 0) continue; + float res = 0.f; + for (int kh = sh; kh < eh; ++kh) { + for (int kw = sw; kw < ew; ++kw) { + int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw; + if (kh == sh && kw == sw) { + res = src_ptr[src_idx]; + } else { + if (pooling_type_ == "max") { + res = res >= src_ptr[src_idx] ? 
res : src_ptr[src_idx]; + } + if (pooling_type_ == "avg") { + res += src_ptr[src_idx]; + } + } + } + } + if (pooling_type_ == "avg") { + if (exclusive_) { + res /= pooling_size; + } else { + res /= window_h * window_w; + } + } + dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res; + } + } + } + } + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType(op_type_); + op_desc->SetInput("X", {x_}); + op_desc->SetOutput("Out", {out_}); + op_desc->SetAttr("pooling_type", pooling_type_); + op_desc->SetAttr("global_pooling", global_pooling_); + op_desc->SetAttr("strides", strides_); + op_desc->SetAttr("paddings", paddings_); + op_desc->SetAttr("ksize", ksize_); + op_desc->SetAttr("exclusive", exclusive_); + op_desc->SetAttr("ceil_mode", ceil_mode_); + op_desc->SetAttr("adaptive", adaptive_); + if (!padding_algorithm_.empty()) { + op_desc->SetAttr("padding_algorithm", padding_algorithm_); + } + } + + void PrepareData() override { + std::vector din(dims_.production()); + fill_data_rand(din.data(), -1.f, 1.f, dims_.production()); + SetCommonTensor(x_, dims_, din.data()); + } +}; + +void TestPoolGlobal(Place place, float abs_error = 2e-5) { + for (auto dims : std::vector>{{2, 3, 4, 5}}) { + for (std::string pooling_type : {"max", "avg"}) { + std::unique_ptr tester( + new PoolComputeTest(place, "def", DDim(dims), pooling_type, true)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } + } +} + +void TestPoolAlgorithm(Place place, float abs_error = 2e-5) { + for (auto dims : std::vector>{{2, 3, 4, 5}}) { + for (auto pooling_type : {"max", "avg"}) { + for (auto padding_algorithm : {"SAME", "VALID"}) { + std::unique_ptr tester( + new PoolComputeTest(place, + "def", + DDim(dims), + pooling_type, + false, + {2, 2}, + {0, 0}, + {2, 2}, + true, + false, + false, + padding_algorithm)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } + } + } +} + +void TestPoolHelper(Place place, + float abs_error, + std::vector dims, + std::string pooling_type, + std::vector strides, + std::vector paddings, + std::vector ksize) { + std::unique_ptr tester(new PoolComputeTest( + place, "def", DDim(dims), pooling_type, false, strides, paddings, ksize)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); +} + +void TestPoolStrides(Place place, float abs_error = 2e-5) { + for (auto pooling_type : {"max", "avg"}) { + TestPoolHelper( + place, abs_error, {2, 3, 6, 7}, pooling_type, {1, 1}, {0, 0}, {2, 2}); + TestPoolHelper( + place, abs_error, {2, 3, 6, 7}, pooling_type, {1, 2}, {0, 0}, {2, 2}); + TestPoolHelper( + place, abs_error, {2, 3, 6, 7}, pooling_type, {2, 2}, {0, 0}, {2, 2}); + } +} + +void TestPoolPaddings(Place place, float abs_error = 2e-5) { + for (auto pooling_type : {"max", "avg"}) { + TestPoolHelper( + place, abs_error, {2, 3, 6, 7}, pooling_type, {1, 1}, {0, 0}, {2, 2}); + TestPoolHelper( + place, abs_error, {2, 3, 6, 7}, pooling_type, {1, 1}, {1, 1}, {2, 2}); + TestPoolHelper(place, + abs_error, + {2, 3, 6, 7}, + pooling_type, + {1, 1}, + {0, 0, 1, 1}, + {2, 2}); + TestPoolHelper(place, + abs_error, + {2, 3, 6, 7}, + pooling_type, + {1, 1}, + {1, 0, 1, 0}, + {2, 2}); + TestPoolHelper(place, + abs_error, + {2, 3, 6, 7}, + pooling_type, + {1, 1}, + {1, 0, 0, 1}, + {2, 2}); + } +} + +void TestPoolKsize(Place place, float abs_error = 2e-5) { + for (auto pooling_type : {"max", "avg"}) { + for (auto ksize : {2, 3}) { + TestPoolHelper(place, + abs_error, + {2, 3, 6, 7}, + pooling_type, + 
{1, 1}, + {0, 0}, + {ksize, ksize}); + TestPoolHelper(place, + abs_error, + {2, 3, 6, 7}, + pooling_type, + {2, 2}, + {1, 1}, + {ksize, ksize}); + } + } +} + +TEST(Pool, precision) { + LOG(INFO) << "test pool op"; + float abs_error = 2e-5; + Place place; +#if defined(LITE_WITH_NPU) + place = TARGET(kNPU); + abs_error = 1e-2; // Using fp16 in NPU +#else + return; +#endif + + TestPoolGlobal(place, abs_error); + TestPoolAlgorithm(place, abs_error); + TestPoolStrides(place, abs_error); + TestPoolPaddings(place, abs_error); + TestPoolKsize(place, abs_error); +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/reduce_prod_compute_test.cc b/lite/tests/kernels/reduce_prod_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/reshape_compute_test.cc b/lite/tests/kernels/reshape_compute_test.cc old mode 100755 new mode 100644 index 85cd724148..b82c291a41 --- a/lite/tests/kernels/reshape_compute_test.cc +++ b/lite/tests/kernels/reshape_compute_test.cc @@ -16,6 +16,7 @@ #include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" namespace paddle { namespace lite { @@ -29,19 +30,19 @@ class ReshapeComputeTester : public arena::TestCase { std::string xshape_ = "xshape"; std::vector shape_tensor_vct_; std::string shape_tensor_; - DDim x_dims_; + DDim dims_; std::vector shape_; bool inplace_ = false; public: ReshapeComputeTester(const Place& place, const std::string& alias, - DDim x_dims, + DDim dims, std::vector shape, bool is_shape_tensor_vct = false, bool is_shape_tensor = false, bool is_shape = true) - : TestCase(place, alias), x_dims_(x_dims) { + : TestCase(place, alias), dims_(dims) { if (is_shape_tensor_vct) { for (size_t i = 0; i < shape.size(); i++) { shape_tensor_vct_.emplace_back(op_type_ + "/shape" + std::to_string(i)); @@ -60,7 +61,6 @@ class ReshapeComputeTester : public arena::TestCase { CHECK(out); auto* x = scope->FindTensor(input_); - auto x_dims = x->dims(); std::vector out_shape; if (shape_tensor_vct_.size() > 0) { @@ -86,8 +86,8 @@ class ReshapeComputeTester : public arena::TestCase { CHECK_EQ(unk_dim_idx, -1); unk_dim_idx = i; } else if (out_shape[i] == 0) { - CHECK_LE(i, x_dims.size()); - final_out_shape[i] = x_dims[i]; + CHECK_LE(i, dims_.size()); + final_out_shape[i] = dims_[i]; } else if (out_shape[i] > 0) { final_out_shape[i] = out_shape[i]; } else { @@ -97,18 +97,18 @@ class ReshapeComputeTester : public arena::TestCase { } if (unk_dim_idx > -1) { - final_out_shape[unk_dim_idx] = x_dims.production() / cap; + final_out_shape[unk_dim_idx] = dims_.production() / cap; } out->Resize(final_out_shape); auto x_data = x->data(); auto out_data = out->mutable_data(); - memcpy(out_data, x_data, sizeof(float) * x_dims.production()); + memcpy(out_data, x_data, sizeof(float) * dims_.production()); if (op_type_ == "reshape2") { auto* xshape = scope->NewTensor(xshape_); - auto xshape_dims = x_dims.Vectorize(); + auto xshape_dims = dims_.Vectorize(); xshape_dims.insert(xshape_dims.begin(), 0); xshape->Resize(xshape_dims); } @@ -134,11 +134,9 @@ class ReshapeComputeTester : public arena::TestCase { } void PrepareData() override { - std::vector data(x_dims_.production()); - for (int i = 0; i < x_dims_.production(); i++) { - data[i] = i * 1.1; - } - SetCommonTensor(input_, x_dims_, data.data()); + std::vector din(dims_.production()); + fill_data_rand(din.data(), -1.f, 1.f, dims_.production()); + SetCommonTensor(input_, dims_, din.data()); if 
(shape_tensor_vct_.size() > 0) { for (size_t i = 0; i < shape_.size(); i++) { @@ -161,13 +159,16 @@ TEST(Reshape, precision) { LOG(INFO) << "test Reshape op"; float abs_error = 2e-5; Place place; -#ifdef LITE_WITH_XPU +#if defined(LITE_WITH_NPU) + place = TARGET(kNPU); + abs_error = 1e-2; // Using fp16 in NPU +#elif defined(LITE_WITH_XPU) place = TARGET(kXPU); #else return; #endif - DDim x_dims{{2, 3, 4, 5}}; + DDim dims{{2, 3, 4, 5}}; std::vector> shapes{{5, 4, 3, 2}, {2, 3, 20}, {2, 60}, @@ -176,8 +177,11 @@ TEST(Reshape, precision) { {0, 0, 20}, {0, 0, -1}}; for (auto shape : shapes) { +#ifdef LITE_WITH_NPU + if (dims.size() > 4 || shape.size() > 4) continue; +#endif std::unique_ptr tester( - new ReshapeComputeTester(place, "def", x_dims, shape)); + new ReshapeComputeTester(place, "def", dims, shape)); arena::Arena arena(std::move(tester), place, abs_error); arena.TestPrecision({"xshape"}); } diff --git a/lite/tests/kernels/scale_compute_test.cc b/lite/tests/kernels/scale_compute_test.cc index 706936d2b1..1ededcd52d 100644 --- a/lite/tests/kernels/scale_compute_test.cc +++ b/lite/tests/kernels/scale_compute_test.cc @@ -16,6 +16,7 @@ #include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" namespace paddle { namespace lite { @@ -23,31 +24,33 @@ namespace lite { class ScaleComputeTester : public arena::TestCase { protected: // common attributes for this op. - std::string input_ = "x"; - std::string output_ = "out"; + std::string x_ = "x"; + std::string out_ = "out"; + DDim x_dims_{{100, 20}}; float scale_ = 0.; float bias_ = 0.; - DDim dims_{{100, 20}}; bool bias_after_scale_; public: ScaleComputeTester(const Place& place, const std::string& alias, + const DDim& x_dims, float scale, float bias, bool bias_after_scale) : TestCase(place, alias), + x_dims_(x_dims), scale_(scale), bias_(bias), bias_after_scale_(bias_after_scale) {} void RunBaseline(Scope* scope) override { - auto* out = scope->NewTensor(output_); + auto* out = scope->NewTensor(out_); CHECK(out); - out->Resize(dims_); + out->Resize(x_dims_); auto* out_data = out->mutable_data(); - auto* x = scope->FindTensor(input_); + auto* x = scope->FindTensor(x_); const auto* x_data = x->data(); float bias = bias_; @@ -56,35 +59,34 @@ class ScaleComputeTester : public arena::TestCase { bias *= scale_; } - for (int i = 0; i < dims_.production(); i++) { + for (int i = 0; i < x_dims_.production(); i++) { out_data[i] = x_data[i] * scale_ + bias; } } void PrepareOpDesc(cpp::OpDesc* op_desc) { op_desc->SetType("scale"); - op_desc->SetInput("X", {input_}); - op_desc->SetOutput("Out", {output_}); + op_desc->SetInput("X", {x_}); + op_desc->SetOutput("Out", {out_}); op_desc->SetAttr("scale", scale_); op_desc->SetAttr("bias", bias_); op_desc->SetAttr("bias_after_scale", bias_after_scale_); } void PrepareData() override { - std::vector data(dims_.production()); - - for (int i = 0; i < dims_.production(); i++) { - data[i] = i * 1.1; - } - - SetCommonTensor(input_, dims_, data.data()); + std::vector x(x_dims_.production()); + fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production()); + SetCommonTensor(x_, x_dims_, x.data()); } }; TEST(Scale, precision) { Place place; float abs_error = 2e-5; -#if defined(LITE_WITH_ARM) +#if defined(LITE_WITH_NPU) + place = TARGET(kNPU); + abs_error = 4e-3; // Using fp16 in NPU +#elif defined(LITE_WITH_ARM) place = TARGET(kARM); #elif defined(LITE_WITH_XPU) place = TARGET(kXPU); @@ -95,13 +97,16 @@ TEST(Scale, precision) { 
return; #endif - for (float scale : {0.123, 2., -1.2}) { - for (float bias : {1., 0., -1.2331}) { - for (bool bias_before : {true, false}) { - std::unique_ptr tester( - new ScaleComputeTester(place, "def", scale, bias, bias_before)); - arena::Arena arena(std::move(tester), place, abs_error); - arena.TestPrecision(); + for (auto x_dims : + std::vector>{{5, 2, 3, 4}, {8, 3, 5}, {12, 3}}) { + for (float scale : {0.123, 2., -1.2}) { + for (float bias : {1., 0., -1.2331}) { + for (bool bias_after_scale : {true, false}) { + std::unique_ptr tester(new ScaleComputeTester( + place, "def", DDim(x_dims), scale, bias, bias_after_scale)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } } } } @@ -117,8 +122,8 @@ TEST(Scale, performance) { return; #endif - std::unique_ptr tester( - new ScaleComputeTester(place, "def", 1.2, 1.1, true)); + std::unique_ptr tester(new ScaleComputeTester( + place, "def", DDim(std::vector{5, 2, 3, 4}), 1.2, 1.1, true)); // To modify the arm context, one can retrive the context as follows. // #ifdef LITE_WITH_ARM diff --git a/lite/tests/kernels/shuffle_channel_compute_test.cc b/lite/tests/kernels/shuffle_channel_compute_test.cc index 66123625fa..66dd7bbe37 100644 --- a/lite/tests/kernels/shuffle_channel_compute_test.cc +++ b/lite/tests/kernels/shuffle_channel_compute_test.cc @@ -12,12 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -// TODO(FrostML): shaffle_channel cannot pass on CI, but ok in local machine. -// Open this. -/*#include +#include #include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" namespace paddle { namespace lite { @@ -40,28 +39,29 @@ class ShuffleChannelComputeTester : public arena::TestCase { auto* out = scope->NewTensor(output_); CHECK(out); out->Resize(dims_); - auto* outputs = out->mutable_data(); + auto* out_data = out->mutable_data(); + auto* x = scope->FindTensor(input_); - const auto* inputs = x->data(); - DDim x_dims = x->dims(); - int num = x->dims()[0]; - int channel = x->dims()[1]; - int height = x->dims()[2]; - int width = x->dims()[3]; - int fea_size = channel * height * width; + const auto* in_data = x->data(); + + int num = dims_[0]; + int channel = dims_[1]; + int height = dims_[2]; + int width = dims_[3]; + int feather_size = channel * height * width; int spatial_size = height * width; - int group_row = group_; - int group_col = channel / group_; - for (int k = 0; k < num; ++k) { - inputs += k * fea_size; - outputs += k * fea_size; - for (int i = 0; i < group_row; ++i) { - for (int j = 0; j < group_col; ++j) { - const float* p_i = inputs + (i * group_col + j) * spatial_size; - float* p_o = outputs + (j * group_row + i) * spatial_size; + int group_num = group_; + int group_size = channel / group_; + for (int n = 0; n < num; n++) { + for (int i = 0; i < group_num; ++i) { + for (int j = 0; j < group_size; ++j) { + const float* p_i = in_data + (i * group_size + j) * spatial_size; + float* p_o = out_data + (j * group_num + i) * spatial_size; memcpy(p_o, p_i, spatial_size * sizeof(float)); } } + in_data += feather_size; + out_data += feather_size; } } @@ -73,35 +73,33 @@ class ShuffleChannelComputeTester : public arena::TestCase { } void PrepareData() override { - std::vector data(dims_.production()); - - for (int i = 0; i < dims_.production(); i++) { - data[i] = i * 1.1; - } - - SetCommonTensor(input_, dims_, data.data()); + std::vector 
din(dims_.production()); + fill_data_rand(din.data(), -1.f, 1.f, dims_.production()); + SetCommonTensor(input_, dims_, din.data()); } }; -void test_shuffle_channel(Place place) { - for (int group : {4}) { +void test_shuffle_channel(Place place, float abs_error = 2e-5) { + for (int group : {2, 4, 8}) { std::unique_ptr tester( new ShuffleChannelComputeTester(place, "def", group)); - arena::Arena arena(std::move(tester), place, 2e-5); + arena::Arena arena(std::move(tester), place, abs_error); arena.TestPrecision(); } } TEST(ShuffleChannel, precision) { -// #ifdef LITE_WITH_X86 -// Place place(TARGET(kX86)); -// #endif -#ifdef LITE_WITH_ARM - Place place(TARGET(kARM)); - test_shuffle_channel(place); + Place place; + float abs_error = 2e-5; +#ifdef LITE_WITH_NPU + place = TARGET(kNPU); + abs_error = 1e-2; // Using fp16 in NPU +#else + return; #endif + + test_shuffle_channel(place, abs_error); } } // namespace lite } // namespace paddle -*/ diff --git a/lite/tests/kernels/softmax_compute_test.cc b/lite/tests/kernels/softmax_compute_test.cc old mode 100755 new mode 100644 index 94100da2b1..a91f6534ff --- a/lite/tests/kernels/softmax_compute_test.cc +++ b/lite/tests/kernels/softmax_compute_test.cc @@ -25,33 +25,33 @@ class SoftmaxComputeTest : public arena::TestCase { protected: // common attributes for this op. std::string op_type_ = "softmax"; - std::string input_ = "x"; - std::string output_ = "out"; - DDim dims_{{1, 2, 3, 4}}; + DDim x_dims_{{1, 2, 3, 4}}; + std::string x_ = "x"; + std::string out_ = "out"; int axis_ = 1; public: SoftmaxComputeTest(const Place& place, const std::string& alias, - DDim dims, + DDim x_dims, int axis) - : TestCase(place, alias), dims_(dims), axis_(axis) {} + : TestCase(place, alias), x_dims_(x_dims), axis_(axis) {} void RunBaseline(Scope* scope) override { - auto x = scope->FindTensor(input_); - auto out = scope->NewTensor(output_); + auto x = scope->FindTensor(x_); + auto out = scope->NewTensor(out_); CHECK(out); - out->Resize(dims_); + out->Resize(x_dims_); auto x_data = x->data(); auto out_data = out->mutable_data(); - auto x_rank = dims_.size(); + auto x_rank = x_dims_.size(); if (axis_ < 0) { axis_ += x_rank; } - int axis_size = dims_[axis_]; - int outer_num = dims_.Slice(0, axis_).production(); - int inner_num = dims_.Slice(axis_ + 1, x_rank).production(); + int axis_size = x_dims_[axis_]; + int outer_num = x_dims_.Slice(0, axis_).production(); + int inner_num = x_dims_.Slice(axis_ + 1, x_rank).production(); int compute_size = outer_num * inner_num; for (int i = 0; i < compute_size; i++) { int idx_inner = i % inner_num; @@ -84,15 +84,15 @@ class SoftmaxComputeTest : public arena::TestCase { void PrepareOpDesc(cpp::OpDesc* op_desc) { op_desc->SetType(op_type_); - op_desc->SetInput("X", {input_}); - op_desc->SetOutput("Out", {output_}); + op_desc->SetInput("X", {x_}); + op_desc->SetOutput("Out", {out_}); op_desc->SetAttr("axis", axis_); } void PrepareData() override { - std::vector din(dims_.production()); - fill_data_rand(din.data(), -1.f, 1.f, dims_.production()); - SetCommonTensor(input_, dims_, din.data()); + std::vector x(x_dims_.production()); + fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production()); + SetCommonTensor(x_, x_dims_, x.data()); } }; @@ -100,18 +100,21 @@ TEST(Softmax, precision) { LOG(INFO) << "test softmax op"; float abs_error = 2e-5; Place place; -#if defined(LITE_WITH_XPU) +#if defined(LITE_WITH_NPU) + place = TARGET(kNPU); + abs_error = 4e-3; // Using fp16 in NPU +#elif defined(LITE_WITH_XPU) place = TARGET(kXPU); #else return; 
#endif - std::vector> dims{{1, 2, 3, 4}, {2, 3, 4}, {3, 4}}; - for (auto dim_in : dims) { + for (auto x_dims : + std::vector>{{1, 2, 3, 4}, {2, 3, 4}, {3, 4}}) { for (auto axis : {-1, 0, 1, 2, 3}) { - if (axis >= dim_in.size()) continue; + if (axis >= x_dims.size()) continue; std::unique_ptr tester( - new SoftmaxComputeTest(place, "def", DDim(dim_in), axis)); + new SoftmaxComputeTest(place, "def", DDim(x_dims), axis)); arena::Arena arena(std::move(tester), place, abs_error); arena.TestPrecision(); } diff --git a/lite/tests/kernels/transpose_compute_test.cc b/lite/tests/kernels/transpose_compute_test.cc old mode 100755 new mode 100644 index 62e0fc8e41..b4407bb569 --- a/lite/tests/kernels/transpose_compute_test.cc +++ b/lite/tests/kernels/transpose_compute_test.cc @@ -16,6 +16,7 @@ #include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" namespace paddle { namespace lite { @@ -24,13 +25,13 @@ int data_index(std::vector pos, DDimLite dims) { int d1 = dims[1]; int d2 = dims[2]; int d3 = dims[3]; - return pos[3] + pos[2] * d3 + pos[1] * d3 * d2 + pos[0] * d3 * d2 * d1; + return pos[0] * d1 * d2 * d3 + pos[1] * d2 * d3 + pos[2] * d3 + pos[3]; } std::vector pos_trans(std::vector in_pos, std::vector axis) { std::vector out_pos(in_pos.size()); for (int i = 0; i < axis.size(); i++) { - out_pos[axis[i]] = in_pos[i]; + out_pos[i] = in_pos[axis[i]]; } return out_pos; } @@ -42,35 +43,34 @@ class TransposeComputeTester : public arena::TestCase { std::string input_ = "x"; std::string output_ = "out"; std::string xshape_ = "xshape"; - DDim x_dims_; + DDim dims_; std::vector axis_; public: TransposeComputeTester(const Place& place, const std::string& alias, - DDim x_dims, + DDim dims, std::vector axis) - : TestCase(place, alias), x_dims_(x_dims), axis_(axis) {} + : TestCase(place, alias), dims_(dims), axis_(axis) {} void RunBaseline(Scope* scope) override { auto* out = scope->NewTensor(output_); CHECK(out); auto* x = scope->FindTensor(input_); - auto x_dims = x->dims(); - std::vector out_shape(x_dims.size(), 0); - for (size_t i = 0; i < x_dims.size(); i++) { - out_shape[i] = x_dims[axis_[i]]; + std::vector out_shape(dims_.size(), 0); + for (size_t i = 0; i < dims_.size(); i++) { + out_shape[i] = dims_[axis_[i]]; } out->Resize(out_shape); auto y_dims = out->dims(); - int input_n = x_dims[0]; - int input_c = x_dims[1]; - int input_h = x_dims[2]; - int input_w = x_dims[3]; + int input_n = dims_[0]; + int input_c = dims_[1]; + int input_h = dims_[2]; + int input_w = dims_[3]; auto input_data = x->data(); auto output_data = out->mutable_data(); @@ -81,7 +81,7 @@ class TransposeComputeTester : public arena::TestCase { for (int w = 0; w < input_w; ++w) { std::vector in_pos{n, c, h, w}; std::vector out_pos = pos_trans(in_pos, axis_); - int in_index = data_index(in_pos, x_dims); + int in_index = data_index(in_pos, dims_); int out_index = data_index(out_pos, y_dims); output_data[out_index] = input_data[in_index]; } @@ -91,7 +91,7 @@ class TransposeComputeTester : public arena::TestCase { if (op_type_ == "transpose2") { auto* xshape = scope->NewTensor(xshape_); - auto xshape_dims = x_dims.Vectorize(); + auto xshape_dims = dims_.Vectorize(); xshape_dims.insert(xshape_dims.begin(), 0); xshape->Resize(xshape_dims); } @@ -108,11 +108,9 @@ class TransposeComputeTester : public arena::TestCase { } void PrepareData() override { - std::vector data(x_dims_.production()); - for (int i = 0; i < x_dims_.production(); i++) { - 
data[i] = i * 1.1; - } - SetCommonTensor(input_, x_dims_, data.data()); + std::vector din(dims_.production()); + fill_data_rand(din.data(), -1.f, 1.f, dims_.production()); + SetCommonTensor(input_, dims_, din.data()); } }; @@ -122,14 +120,16 @@ TEST(Transpose, precision) { Place place; #ifdef LITE_WITH_XPU place = TARGET(kXPU); +#elif defined(LITE_WITH_NPU) + place = TARGET(kNPU); + abs_error = 1e-2; // Using fp16 in NPU #else return; #endif DDim x_dims{{2, 3, 4, 5}}; - // [XPU]: {3, 1, 0, 2} is unsupported std::vector> axes{ - {0, 1, 2, 3}, {0, 1, 3, 2}, {0, 2, 1, 3}, {3, 1, 2, 0}}; + {0, 1, 2, 3}, {0, 1, 3, 2}, {0, 2, 1, 3}, {3, 1, 2, 0}, {3, 1, 0, 2}}; for (auto axis : axes) { std::unique_ptr tester( new TransposeComputeTester(place, "def", x_dims, axis)); diff --git a/lite/tests/kernels/unsqueeze_compute_test.cc b/lite/tests/kernels/unsqueeze_compute_test.cc index 590d3fd29c..d8ec2b01f7 100644 --- a/lite/tests/kernels/unsqueeze_compute_test.cc +++ b/lite/tests/kernels/unsqueeze_compute_test.cc @@ -223,67 +223,73 @@ class Unsqueeze2ComputeTester : public arena::TestCase { } }; -void test_unsqueeze(Place place) { +void test_unsqueeze(Place place, float abs_error = 2e-5) { for (std::vector axes : {std::vector({1}), std::vector({0, 2}), std::vector({0, -2})}) { - for (int N : {1}) { - for (int C : {3}) { - for (int H : {1}) { - for (int W : {5}) { - for (int input_axes_flag : {1, 2, 3}) { - LOG(INFO) << N << " " << C << " " << H << " " << W << " " - << input_axes_flag; - std::unique_ptr tester( - new UnsqueezeComputeTester( - place, "def", axes, DDim({N, C, H, W}), input_axes_flag)); - arena::Arena arena(std::move(tester), place, 2e-5); - arena.TestPrecision(); - } - } - } + for (auto dims : std::vector>{{3}, {3, 5}, {3, 5, 7}}) + for (int input_axes_flag : {1, 2, 3}) { +#ifdef LITE_WITH_NPU + if (input_axes_flag != 1) continue; + if (dims.size() + axes.size() > 4) continue; +#endif + std::unique_ptr tester(new UnsqueezeComputeTester( + place, "def", axes, DDim(dims), input_axes_flag)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); } - } } } -void test_unsqueeze2(Place place) { +void test_unsqueeze2(Place place, + float abs_error = 2e-5, + std::vector ignored_outs = {}) { for (std::vector axes : {std::vector({0}), std::vector({0, 2}), std::vector({0, -2})}) { - for (int N : {1}) { - for (int C : {3}) { - for (int H : {1}) { - for (int W : {5}) { - std::unique_ptr tester(new Unsqueeze2ComputeTester( - place, "def", axes, DDim({N, C, H, W}))); - arena::Arena arena(std::move(tester), place, 2e-5); - arena.TestPrecision(); - } - } - } + for (auto dims : + std::vector>{{3}, {3, 5}, {3, 5, 7}}) { +#ifdef LITE_WITH_NPU + if (dims.size() + axes.size() > 4) continue; +#endif + std::unique_ptr tester( + new Unsqueeze2ComputeTester(place, "def", axes, DDim(dims))); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(ignored_outs); } } } TEST(squeeze, precision) { -#ifdef LITE_WITH_X86 - Place place(TARGET(kX86)); -#endif -#ifdef LITE_WITH_ARM - Place place(TARGET(kARM)); - test_unsqueeze(place); + Place place; + float abs_error = 2e-5; +#ifdef LITE_WITH_NPU + place = TARGET(kNPU); + abs_error = 1e-2; // Using fp16 in NPU +#elif defined(LITE_WITH_ARM) + place = TARGET(kARM); +#else + return; #endif + + test_unsqueeze(place, abs_error); } TEST(squeeze2, precision) { -#ifdef LITE_WITH_X86 - Place place(TARGET(kX86)); -#endif -#ifdef LITE_WITH_ARM - Place place(TARGET(kARM)); - test_unsqueeze2(place); + Place place; + float abs_error = 
2e-5; + std::vector ignored_outs = {}; +#ifdef LITE_WITH_NPU + place = TARGET(kNPU); + abs_error = 1e-2; // Using fp16 in NPU + ignored_outs.push_back("XShape"); // not supported out in NPU +#elif defined(LITE_WITH_ARM) + place = TARGET(kARM); +#else + return; #endif + + test_unsqueeze2(place, abs_error, ignored_outs); } } // namespace lite diff --git a/lite/tests/utils/timer.h b/lite/tests/utils/timer.h deleted file mode 100644 index 095f32046e..0000000000 --- a/lite/tests/utils/timer.h +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include // NOLINT -#include - -namespace paddle { -namespace lite { - -class Timer final { - public: - Timer() {} - - ~Timer() {} - - void clear() { ms_time_.clear(); } - - void start() { tstart_ = std::chrono::system_clock::now(); } - - void end() { - tend_ = std::chrono::system_clock::now(); - auto ts = - std::chrono::duration_cast(tend_ - tstart_); - latest_time_ = 1000.f * static_cast(ts.count()) * - std::chrono::microseconds::period::num / - std::chrono::microseconds::period::den; - ms_time_.push_back(latest_time_); - } - - float latest_time() const { return latest_time_; } - - float get_average_ms() { - if (ms_time_.size() == 0) { - return 0.f; - } - float sum = 0.f; - for (auto i : ms_time_) { - sum += i; - } - return sum / ms_time_.size(); - } - - float get_sum_ms() { - if (ms_time_.size() == 0) { - return 0.f; - } - float sum = 0.f; - for (auto i : ms_time_) { - sum += i; - } - return sum; - } - - // return tile (0-99) time. 
- float get_tile_time(float tile) { - if (tile < 0 || tile > 100) { - return -1.f; - } - int total_items = static_cast(ms_time_.size()); - if (total_items <= 0) { - return -2.f; - } - ms_time_.sort(); - int pos = static_cast(tile * total_items / 100); - auto it = ms_time_.begin(); - for (int i = 0; i < pos; ++i) { - ++it; - } - return *it; - } - - std::list get_time_stat() { return ms_time_; } - - float get_min_time() { - ms_time_.sort(); - return *ms_time_.begin(); - } - - float get_max_time() { - ms_time_.sort([](int a, int b) { return a > b; }); - return *ms_time_.begin(); - } - - private: - std::chrono::time_point tstart_; - std::chrono::time_point tend_; - std::list ms_time_; - float latest_time_; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/tools/build_bm.sh b/lite/tools/build_bm.sh deleted file mode 100755 index f4cfee5ec6..0000000000 --- a/lite/tools/build_bm.sh +++ /dev/null @@ -1,112 +0,0 @@ -#!/bin/bash -set -ex - -# global variables with default value -BM_SDK_ROOT="$(pwd)/../BM_SDK" # BM SDK -TARGET_NAME="BM1682" # default target -BUILD_EXTRA=OFF # ON(with sequence ops)/OFF -WITH_TESTING=ON # ON/OFF - -function print_usage { - echo -e "\nUSAGE:" - echo - echo "----------------------------------------" - echo -e "--bm_sdk_root=" - echo -e "--target_name=" - echo "----------------------------------------" - echo -} - -# readonly variables with default value -readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \ - -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \ - -DWITH_PYTHON=OFF \ - -DLITE_WITH_ARM=OFF" - -readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THRLITE_BUILD_THREADSEADS:-1} - -readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz -readonly workspace=$(pwd) - -function prepare_thirdparty { - if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then - rm -rf $workspace/third-party - - if [ ! -f $workspace/third-party-05b862.tar.gz ]; then - wget $THIRDPARTY_TAR - fi - tar xzf third-party-05b862.tar.gz - else - git submodule update --init --recursive - fi -} - -# for code gen, a source file is generated after a test, but is dependended by some targets in cmake. -# here we fake an empty file to make cmake works. -function prepare_workspace { - # in build directory - # 1. Prepare gen_code file - GEN_CODE_PATH_PREFIX=lite/gen_code - mkdir -p ./${GEN_CODE_PATH_PREFIX} - touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc - - # 2.Prepare debug tool - DEBUG_TOOL_PATH_PREFIX=lite/tools/debug - mkdir -p ./${DEBUG_TOOL_PATH_PREFIX} - cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/ - - # clone submodule - # git submodule update --init --recursive - prepare_thirdparty -} - -function build_bm { - build_dir=${workspace}/build.lite.bm - mkdir -p $build_dir - cd $build_dir - - prepare_workspace - cmake .. \ - ${CMAKE_COMMON_OPTIONS} \ - -DWITH_GPU=OFF \ - -DWITH_MKLDNN=OFF \ - -DLITE_WITH_X86=ON \ - -DWITH_MKL=ON \ - -DLITE_BUILD_EXTRA=ON \ - -DLITE_WITH_XPU=OFF \ - -DLITE_WITH_BM=ON \ - -DWITH_TESTING=${WITH_TESTING} \ - -DBM_SDK_ROOT=${BM_SDK_ROOT} - - make -j$NUM_CORES_FOR_COMPILE - - cd - - echo "Done" -} - -function main { - # Parse command line. 
- for i in "$@"; do - case $i in - --target_name=*) - TARGET_NAME="${i#*=}" - shift - ;; - --bm_sdk_root=*) - BM_SDK_ROOT="${i#*=}" - shift - ;; - bm) - build_bm - shift - ;; - *) - # unknown option - print_usage - exit 1 - ;; - esac - done -} - -main $@ diff --git a/lite/tools/build_xpu.sh b/lite/tools/build_xpu.sh index 9f28274471..fdf287501e 100755 --- a/lite/tools/build_xpu.sh +++ b/lite/tools/build_xpu.sh @@ -104,6 +104,11 @@ function main { build_xpu shift ;; + full_publish) + TARGET_NAME=publish_inference + build_xpu + shift + ;; *) # unknown option print_usage diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh index 91afc5039c..a0273efe13 100755 --- a/lite/tools/ci_build.sh +++ b/lite/tools/ci_build.sh @@ -610,6 +610,44 @@ function build_arm { } +# $1: ARM_TARGET_OS in "ios", "ios64" +# $2: ARM_TARGET_ARCH_ABI in "armv7", "armv8" +function build_ios { + local os=$1 + local abi=$2 + build_dir=build.ios.${os}.${abi} + echo "building ios target into $build_dir" + echo "target os: $os" + echo "target abi: $abi" + mkdir -p ${build_dir} + cd ${build_dir} + GEN_CODE_PATH_PREFIX=lite/gen_code + mkdir -p ./${GEN_CODE_PATH_PREFIX} + touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc + + cmake .. \ + -DWITH_GPU=OFF \ + -DWITH_MKL=OFF \ + -DWITH_LITE=ON \ + -DLITE_WITH_CUDA=OFF \ + -DLITE_WITH_X86=OFF \ + -DLITE_WITH_ARM=ON \ + -DWITH_TESTING=OFF \ + -DLITE_WITH_JAVA=OFF \ + -DLITE_SHUTDOWN_LOG=ON \ + -DLITE_ON_TINY_PUBLISH=ON \ + -DLITE_WITH_OPENMP=OFF \ + -DWITH_ARM_DOTPROD=OFF \ + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ + -DARM_TARGET_ARCH_ABI=$abi \ + -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ + -DLITE_WITH_CV=$BUILD_CV \ + -DARM_TARGET_OS=$os + + make -j4 publish_inference + cd - +} + # $1: ARM_TARGET_OS in "android" # $2: ARM_TARGET_ARCH_ABI in "armv8", "armv7" # $3: ARM_TARGET_LANG in "gcc" "clang" @@ -773,6 +811,21 @@ function build_test_arm_subtask_armlinux { echo "Done" } +# sub-task3 +# this task will test IOS compiling, which requires cmake_version>=3.15 +function build_test_arm_subtask_ios { + cur=$PWD + # job 8 + build_ios "ios" "armv7" + cd $cur + + # job 9 + build_ios "ios64" "armv8" + cd $cur + + echo "Done" +} + # this method need to invoke `build_test_arm_subtask_android` first. function build_test_arm_subtask_model { # We just test following single one environment to limit the CI time. @@ -1042,6 +1095,10 @@ function main { build_test_arm_subtask_armlinux shift ;; + build_test_arm_subtask_ios) + build_test_arm_subtask_ios + shift + ;; check_style) check_style shift diff --git a/lite/tools/cmake_tools/create_fake_kernel_registry.py b/lite/tools/cmake_tools/create_fake_kernel_registry.py index 140d773207..35012d5b16 100644 --- a/lite/tools/cmake_tools/create_fake_kernel_registry.py +++ b/lite/tools/cmake_tools/create_fake_kernel_registry.py @@ -18,6 +18,9 @@ import logging from ast import RegisterLiteKernelParser from utils import * +if len(sys.argv) != 4: + print("Error: create_fake_kernel_registry.py requires three inputs!") + exit(1) ops_list_path = sys.argv[1] dest_path = sys.argv[2] kernelmap_path = sys.argv[3] diff --git a/lite/tools/cmake_tools/parse_kernel_registry.py b/lite/tools/cmake_tools/parse_kernel_registry.py index f4f0b95483..6c020ec438 100644 --- a/lite/tools/cmake_tools/parse_kernel_registry.py +++ b/lite/tools/cmake_tools/parse_kernel_registry.py @@ -12,10 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function import sys import logging from ast import RegisterLiteKernelParser +if len(sys.argv) != 5: + print("Error: parse_kernel_registry.py requires four inputs!") + exit(1) ops_list_path = sys.argv[1] dest_path = sys.argv[2] minkernels_list_path = sys.argv[3] diff --git a/lite/tools/cmake_tools/parse_op_registry.py b/lite/tools/cmake_tools/parse_op_registry.py index db58c455a9..7eb3337ed8 100644 --- a/lite/tools/cmake_tools/parse_op_registry.py +++ b/lite/tools/cmake_tools/parse_op_registry.py @@ -13,10 +13,14 @@ # limitations under the License. ''' Collect op registry information. ''' +from __future__ import print_function import sys import logging from ast import RegisterLiteOpParser +if len(sys.argv) != 5: + print("Error: parse_op_registry.py requires four inputs!") + exit(1) ops_list_path = sys.argv[1] dest_path = sys.argv[2] minops_list_path = sys.argv[3] diff --git a/lite/tools/cmake_tools/record_supported_kernel_op.py b/lite/tools/cmake_tools/record_supported_kernel_op.py new file mode 100644 index 0000000000..f6a3af6bd3 --- /dev/null +++ b/lite/tools/cmake_tools/record_supported_kernel_op.py @@ -0,0 +1,129 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import sys +import logging +from ast import RegisterLiteKernelParser +from ast import RegisterLiteOpParser + +if len(sys.argv) != 4: + print("Error: record_supported_kernel_op.py requires three inputs!") + exit(1) +kernels_list_path = sys.argv[1] +ops_list_path = sys.argv[2] +kernel_op_map_dest_path = sys.argv[3] + + +out_lines = [ +''' +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include + +const std::vector> supported_ops_target = { +''' +] + +ops_lines=[] + +# valid targets and valid_ops +valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU"] +valid_ops = [[],[],[],[],[],[],[],[],[],[]] +class TargetType: + kUnk = 0 + kHost = 1 + kX86 = 2 + kCUDA = 3 + kARM = 4 + kOpenCL = 5 + kFPGA = 7 + kNPU = 8 + kXPU = 9 + kAny = 6 # any target + +# record op_info of valid kernels into `valid_ops` according to different target type +with open(kernels_list_path) as f: + paths = set([path for path in f]) + for path in paths: + with open(path.strip()) as g: + c = g.read() + kernel_parser = RegisterLiteKernelParser(c) + kernel_parser.parse() + for k in kernel_parser.kernels: + if hasattr(TargetType, k.target): + index=getattr(TargetType, k.target) + valid_ops[index].append(k.op_type) + +# clear the repeated ops +for target in valid_targets: + index = getattr(TargetType, target) + valid_ops[index] = list(set(valid_ops[index])) + +paths = set() +with open(ops_list_path) as f: + paths = set([path for path in f]) + for path in paths: + str_info = open(path.strip()).read() + op_parser = RegisterLiteOpParser(str_info) + ops = op_parser.parse() + for op in ops: + if "_grad" in op: + continue + out = ' {"%s", { "' % op + op_targets = [] + for target in valid_targets: + if op in valid_ops[getattr(TargetType, target)]: + op_targets.append(target) + if len(op_targets) > 0: + out = out +'", "'.join(op_targets)+ '" }}' + else: + # unknow type op: kUnk = 0 + valid_ops[0].append(op) + out = out +'kUnk" }}' + ops_lines.append(out) + +with open(kernel_op_map_dest_path, 'w') as f: + logging.info("write kernel list to %s" % kernel_op_map_dest_path) + f.write('\n'.join(out_lines)) + # write kernels into head file + for target in valid_targets: + if len(valid_ops[getattr(TargetType, target)]) == 0 : + f.write("\n // %s_OPS: " %target) + f.write('\n {},') + else: + f.write("\n // %s_OPS: " %target) + f.write('\n {"') + f.write('","'.join(valid_ops[getattr(TargetType, target)])) + f.write('"},\n') + f.write('};') + # write op info into head file + f.write('\nconst std::map> supported_ops={\n') + f.write(',\n'.join(ops_lines)) + f.write('\n};') diff --git a/lite/utils/cv/CMakeLists.txt b/lite/utils/cv/CMakeLists.txt index 0edcb2ef24..6c88e70de1 100644 --- a/lite/utils/cv/CMakeLists.txt +++ b/lite/utils/cv/CMakeLists.txt @@ -1,5 +1,4 @@ if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM) - set(lite_cv_deps) lite_cc_library(paddle_cv_arm SRCS image_convert.cc paddle_image_preprocess.cc @@ -7,5 +6,5 @@ if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ image_flip.cc image_rotate.cc image_resize.cc - DEPS ${lite_cv_deps} paddle_api place) + DEPS paddle_api place) endif() diff --git a/lite/utils/cv/image2tensor.cc b/lite/utils/cv/image2tensor.cc index b51a82da1d..3a09039a0f 100644 --- a/lite/utils/cv/image2tensor.cc +++ b/lite/utils/cv/image2tensor.cc @@ -18,6 +18,13 @@ namespace paddle { namespace lite { namespace utils { namespace cv { +void gray_to_tensor(const uint8_t* src, + float* output, + int width, + int height, + float* means, + float* scales); + void bgr_to_tensor_chw(const uint8_t* src, float* output, int width, @@ -52,7 +59,7 @@ void bgra_to_tensor_hwc(const uint8_t* src, * NCHW * param src: input image data * param dstTensor: output tensor data - * param srcFormat: input image format, support BGR(GRB) and BGRA(RGBA) + * param srcFormat: input image 
format, support GRAY, BGR(GRB) and BGRA(RGBA) * param srcw: input image width * param srch: input image height * param layout: output tensor layout,support NHWC and NCHW @@ -79,6 +86,9 @@ void Image2Tensor::choose(const uint8_t* src, } else if (layout == LayoutType::kNHWC && (srcFormat == BGRA || srcFormat == RGBA)) { impl_ = bgra_to_tensor_hwc; + } else if ((layout == LayoutType::kNHWC || layout == LayoutType::kNCHW) && + (srcFormat == GRAY)) { + impl_ = gray_to_tensor; } else { printf("this layout: %d or image format: %d not support \n", static_cast(layout), @@ -87,6 +97,147 @@ void Image2Tensor::choose(const uint8_t* src, } impl_(src, output, srcw, srch, means, scales); } + +void gray_to_tensor(const uint8_t* src, + float* output, + int width, + int height, + float* means, + float* scales) { + int size = width * height; + float mean_val = means[0]; + float scale_val = scales[0]; + + int dim16 = width >> 16; + int remain = width % 16; + + float32x4_t vmean = vdupq_n_f32(mean_val); + float32x4_t vscale = vdupq_n_f32(scale_val); +#pragma omp parallel for + for (int i = 0; i < height; i += 1) { + const uint8_t* din_ptr = src + i * width; + float* ptr_h = output + i * width; + int cnt = dim16; + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr0], #64] \n" + "prfm pldl1keep, [%[inptr0], #128] \n" + "prfm pldl1keep, [%[inptr0], #192] \n" + "1: \n" + "ld1 {v0.8b}, [%[inptr0]], #8 \n" // d8 = y0y1y2.." + "ld1 {v1.8b}, [%[inptr0]], #8 \n" // d8 = y0y1y2.." + // 8->16 + "ushll v3.8h, v0.8b, #0 \n" + "ushll v4.8h, v0.8b, #0 \n" + // 16->32 + "ushll v6.4s, v3.4h, #0 \n" + "ushll2 v7.4s, v3.8h, #0 \n" + "ushll v8.4s, v4.4h, #0 \n" + "ushll2 v9.4s, v4.8h, #0 \n" + // int32->fp32 + "ucvtf v12.4s, v6.4s \n" + "ucvtf v13.4s, v7.4s \n" + "ucvtf v14.4s, v8.4s \n" + "ucvtf v15.4s, v9.4s \n" + // sub -mean + "fsub v12.4s, v12.4s, %w[vmean].4s \n" + "fsub v13.4s, v13.4s, %w[vmean].4s \n" + "fsub v14.4s, v14.4s, %w[vmean].4s \n" + "fsub v15.4s, v15.4s, %w[vmean].4s \n" + // mul * scale + "fmul v6.4s, v12.4s, %w[vscale].4s \n" + "fmul v7.4s, v13.4s, %w[vscale].4s \n" + "fmul v8.4s, v14.4s, %w[vscale].4s \n" + "fmul v9.4s, v15.4s, %w[vscale].4s \n" + // store + "st1 {v6.4s}, [%[outr0]], #16 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "st1 {v7.4s}, [%[outr0]], #16 \n" + "st1 {v8.4s}, [%[outr0]], #16 \n" + "st1 {v9.4s}, [%[outr0]], #16 \n" + "bne 1b \n" + : [inptr0] "+r"(din_ptr), [outr0] "+r"(ptr_h), [cnt] "+r"(cnt) + : [vmean] "w"(vmean), [vscale] "w"(vscale) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile( + "pld [%[inptr0]] @ preload a, 64byte\n" + "pld [%[inptr0], #64] @ preload a, 64byte\n" + "pld [%[inptr0], #128] @ preload a, 64byte\n" + "pld [%[inptr0], #192] @ preload a, 64byte\n" + "1: \n" + "vld1.8 {d12, d13}, [%[inptr0]]! 
\n" + // 8->16 + "vmovl.u8 q8, d12 \n" + "vmovl.u8 q9, d13 \n" + // 16->32 + "vmovl.u16 q11, d16 \n" + "vmovl.u16 q12, d17 \n" + "vmovl.u16 q13, d18 \n" + "vmovl.u16 q14, d19 \n" + // int32->fp32 + "vcvt.f32.u32 q7, q11 \n" + "vcvt.f32.u32 q8, q12 \n" + "vcvt.f32.u32 q9, q13 \n" + "vcvt.f32.u32 q10, q14 \n" + // sub -mean + "vsub.f32 q7, q7, %q[vmean] \n" + "vsub.f32 q8, q8, %q[vmean] \n" + "vsub.f32 q9, q9, %q[vmean] \n" + "vsub.f32 q10, q10, %q[vmean] \n" + // mul *scale + "vmul.f32 q11, q7, %q[vscale] \n" + "vmul.f32 q12, q8, %q[vscale] \n" + "vmul.f32 q13, q9, %q[vscale] \n" + "vmul.f32 q14, q10, %q[vscale] \n" + // store + "vst1.32 {d22 - d23}, [%[outr0]]! \n" + "subs %[cnt], #1 \n" + "vst1.32 {d24 - d25}, [%[outr0]]! \n" + "vst1.32 {d26 - d27}, [%[outr0]]! \n" + "vst1.32 {d28 - d29}, [%[outr0]]! \n" + "bne 1b" + : [inptr0] "+r"(din_ptr), [outr0] "+r"(ptr_h), [cnt] "+r"(cnt) + : [vmean] "w"(vmean), [vscale] "w"(vscale) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14"); +#endif + } + for (int j = 0; j < remain; j++) { + *ptr_h++ = (*din_ptr - mean_val) * scale_val; + din_ptr++; + } + } +} + void bgr_to_tensor_chw(const uint8_t* src, float* output, int width, @@ -390,6 +541,7 @@ void bgra_to_tensor_chw(const uint8_t* src, } } } + void bgr_to_tensor_hwc(const uint8_t* src, float* output, int width, diff --git a/lite/utils/cv/image_convert.cc b/lite/utils/cv/image_convert.cc index 24b6db70dd..385f56d233 100644 --- a/lite/utils/cv/image_convert.cc +++ b/lite/utils/cv/image_convert.cc @@ -30,10 +30,14 @@ void nv21_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch); void nv21_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch); void nv12_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch); void nv12_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch); +// bgra rgba to gray +void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch); // bgr rgb to gray void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch); // gray to bgr rgb void hwc1_to_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch); +// gray to bgra rgba +void hwc1_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch); // bgr to bgra or rgb to rgba void hwc3_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch); // bgra to bgr or rgba to rgb @@ -112,6 +116,12 @@ void ImageConvert::choose(const uint8_t* src, } else if ((srcFormat == RGB && dstFormat == BGRA) || (srcFormat == BGR && dstFormat == RGBA)) { impl_ = hwc3_trans_hwc4; + } else if ((srcFormat == GRAY && dstFormat == RGBA) || + (srcFormat == GRAY && dstFormat == BGRA)) { + impl_ = hwc1_to_hwc4; + } else if ((srcFormat == RGBA && dstFormat == GRAY) || + (srcFormat == BGRA && dstFormat == GRAY)) { + impl_ = hwc4_to_hwc1; } else { printf("srcFormat: %d, dstFormat: %d does not support! 
\n", srcFormat, @@ -989,7 +999,7 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { "vshrn.u32 d24, q6, #7 \n" "vshrn.u32 d25, q7, #7 \n" "vshrn.u32 d26, q8, #7 \n" - "vshrn.u32 d27, q8, #7 \n" + "vshrn.u32 d27, q9, #7 \n" // 16->8 "vmovn.u16 d4, q10 \n" "vmovn.u16 d5, q11 \n" @@ -1077,6 +1087,280 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { } } /* +采用CV_BGR2GRAY,转换公式Gray = 0.1140*B + 0.5870*G + 0.2989*R +采用CV_RGB2GRAY,转换公式Gray = 0.1140*R + 0.5870*G + 0.2989*B +b = 0.114 *128 = 14.529 = 15 +g = 0.587 * 128 = 75.136 = 75 +r = 0.2989 * 127 = 38.2592 = 38 +Gray = (15*B + 75*G + 38*R)/128 +bgra2gray, rgba2gray +*/ +void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + uint8_t b = 15; + uint8_t g = 75; + uint8_t r = 38; + + uint8x8_t vb = vdup_n_u8(b); + uint8x8_t vg = vdup_n_u8(g); + uint8x8_t vr = vdup_n_u8(r); +#ifdef __aarch64__ +#else + uint8_t vb_array[8] = {b, b, b, b, b, b, b, b}; + uint8_t vg_array[8] = {g, g, g, g, g, g, g, g}; + uint8_t vr_array[8] = {r, r, r, r, r, r, r, r}; +#endif + int cnt_pro = srcw >> 3; + int remain_pro = srcw % 8; + int win = srcw * 4; + int i = 0; +#pragma omp parallel for + for (i = 0; i < srch - 3; i += 4) { + int j = 0; + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + uint8_t* outr0 = dst + i * srcw; + uint8_t* outr1 = outr0 + srcw; + uint8_t* outr2 = outr1 + srcw; + uint8_t* outr3 = outr2 + srcw; + + int cnt = cnt_pro; + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr0], #128] \n" + "prfm pldl1keep, [%[inptr1]] \n" + "prfm pldl1keep, [%[inptr1], #128] \n" + "prfm pldl1keep, [%[inptr2]] \n" + "prfm pldl1keep, [%[inptr2], #128] \n" + "prfm pldl1keep, [%[inptr3]] \n" + "prfm pldl1keep, [%[inptr3], #128] \n" + "1: \n" + "ld4 {v0.8b - v3.8b}, [%[inptr0]], #32 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... + "ld4 {v4.8b - v7.8b}, [%[inptr1]], #32 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... + "ld4 {v8.8b - v11.8b}, [%[inptr2]], #32 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... + "ld4 {v12.8b - v15.8b}, [%[inptr3]], #32 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... 
+ // mul b + "umull v13.8h, v0.8b, %w[vb].8b \n" // v0 * vb + "umull v14.8h, v4.8b, %w[vb].8b \n" // v0 * vb + "umull v15.8h, v8.8b, %w[vb].8b \n" // v0 * vb + "umull v16.8h, v12.8b, %w[vb].8b \n" // v0 * vb + // mul g + "umull v17.8h, v1.8b, %w[vg].8b \n" // v0 * vb + "umull v18.8h, v5.8b, %w[vg].8b \n" // v0 * vb + "umull v19.8h, v9.8b, %w[vg].8b \n" // v0 * vb + "umull v20.8h, v13.8b, %w[vg].8b \n" // v0 * vb + // mul r + "umlal v13.8h, v2.8b, %w[vr].8b \n" // v0 * vb + "umlal v14.8h, v6.8b, %w[vr].8b \n" // v0 * vb + "umlal v15.8h, v10.8b, %w[vr].8b \n" // v0 * vb + "umlal v16.8h, v14.8b, %w[vr].8b \n" // v0 * vb + // 16->32 + "uaddl v0.4s, v17.4h, v13.4h \n" + "uaddl2 v1.4s, v17.8h, v13.8h \n" + "uaddl v2.4s, v18.4h, v14.4h \n" + "uaddl2 v3.4s, v18.8h, v14.8h \n" + "uaddl v4.4s, v19.4h, v15.4h \n" + "uaddl2 v5.4s, v19.8h, v15.8h \n" + "uaddl v6.4s, v20.4h, v16.4h \n" + "uaddl2 v7.4s, v20.8h, v16.8h \n" + // 32->16 v0 >> 7 + "shrn v12.4h, v0.4s, #7 \n" + "shrn2 v12.8h, v1.4s, #7 \n" + "shrn v13.4h, v2.4s, #7 \n" + "shrn2 v13.8h, v3.4s, #7 \n" + "shrn v14.4h, v4.4s, #7 \n" + "shrn2 v14.8h, v5.4s, #7 \n" + "shrn v15.4h, v6.4s, #7 \n" + "shrn2 v15.8h, v7.4s, #7 \n" + // 16->8 + "xtn v0.8b, v12.8h \n" + "xtn v1.8b, v13.8h \n" + "xtn v2.8b, v14.8h \n" + "xtn v3.8b, v15.8h \n" + "subs %w[cnt], %w[cnt], #1 \n" + "st1 {v0.8b}, [%[outr0]], #8 \n" + "st1 {v1.8b}, [%[outr1]], #8 \n" + "st1 {v2.8b}, [%[outr2]], #8 \n" + "st1 {v3.8b}, [%[outr3]], #8 \n" + "bne 1b \n" + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outr0] "+r"(outr0), + [outr1] "+r"(outr1), + [outr2] "+r"(outr2), + [outr3] "+r"(outr3), + [cnt] "+r"(cnt) + : [vb] "w"(vb), [vg] "w"(vg), [vr] "w"(vr) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); +#else + asm volatile( + "pld [%[inptr0]] @ preload a, 64byte\n" + "pld [%[inptr0], #128] @ preload a, 64byte\n" + "pld [%[inptr1]] @ preload a, 64byte\n" + "pld [%[inptr1], #128] @ preload a, 64byte\n" + "pld [%[inptr2]] @ preload a, 64byte\n" + "pld [%[inptr2], #128] @ preload a, 64byte\n" + "pld [%[inptr3]] @ preload a, 64byte\n" + "pld [%[inptr3], #128] @ preload a, 64byte\n" + "vld1.8 d0, [%[vb]] \n" + "vld1.8 d1, [%[vg]] \n" + "vld1.8 d2, [%[vr]] \n" + "1: \n" + "vld4.8 {d3, d4, d5, d6}, [%[inptr0]]! \n" + "vld4.8 {d7, d8, d9, d10}, [%[inptr1]]! \n" + "vld4.8 {d11, d12, d13, d14}, [%[inptr2]]! \n" + "vld4.8 {d15, d16, d17, d18}, [%[inptr3]]! 
\n" + // vb + "vmull.u8 q10, d3, d0 \n" + "vmull.u8 q11, d7, d0 \n" + "vmull.u8 q12, d11, d0 \n" + "vmull.u8 q13, d15, d0 \n" + // vg + "vmull.u8 q14, d4, d1 \n" + "vmull.u8 q15, d8, d1 \n" + "vmull.u8 q5, d12, d1 \n" + "vmull.u8 q7, d16, d1 \n" + // vr + "vmlal.u8 q10, d5, d2 \n" + "vmlal.u8 q11, d9, d2 \n" + "vmlal.u8 q12, d13, d2 \n" + "vmlal.u8 q13, d17, d2 \n" + // 16->32 + "vaddl.u16 q2, d28, d20 \n" + "vaddl.u16 q3, d29, d21 \n" + "vaddl.u16 q4, d30, d22 \n" + "vaddl.u16 q10, d31, d23 \n" + "vaddl.u16 q6, d10, d24 \n" + "vaddl.u16 q11, d11, d25 \n" + "vaddl.u16 q8, d14, d26 \n" + "vaddl.u16 q9, d15, d27 \n" + // 32->16 q2 >> 7 + "vshrn.u32 d10, q2, #7 \n" + "vshrn.u32 d11, q3, #7 \n" + "vshrn.u32 d14, q4, #7 \n" + "vshrn.u32 d15, q10, #7 \n" + "vshrn.u32 d24, q6, #7 \n" + "vshrn.u32 d25, q11, #7 \n" + "vshrn.u32 d26, q8, #7 \n" + "vshrn.u32 d27, q9, #7 \n" + // 16->8 + "vmovn.u16 d4, q5 \n" + "vmovn.u16 d5, q7 \n" + "vmovn.u16 d6, q12 \n" + "vmovn.u16 d7, q13 \n" + "subs %[cnt], #1 \n" + // store + "vst1.8 d4, [%[outr0]]! \n" + "vst1.8 d5, [%[outr1]]! \n" + "vst1.8 d6, [%[outr2]]! \n" + "vst1.8 d7, [%[outr3]]! \n" + "bne 1b \n" + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outr0] "+r"(outr0), + [outr1] "+r"(outr1), + [outr2] "+r"(outr2), + [outr3] "+r"(outr3), + [cnt] "+r"(cnt) + : [vb] "r"(vb_array), [vg] "r"(vg_array), [vr] "r"(vr_array) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + } + for (; j < remain_pro; j++) { + *outr0++ = (inptr0[0] * b + inptr0[1] * g + inptr0[2] * r) >> 7; + *outr1++ = (inptr1[0] * b + inptr1[1] * g + inptr1[2] * r) >> 7; + *outr2++ = (inptr2[0] * b + inptr2[1] * g + inptr2[2] * r) >> 7; + *outr3++ = (inptr3[0] * b + inptr3[1] * g + inptr3[2] * r) >> 7; + inptr0 += 4; + inptr1 += 4; + inptr2 += 4; + inptr3 += 4; + } + } + for (; i < srch; i++) { + int j = 0; + const uint8_t* inptr0 = src + i * win; + uint8_t* outr0 = dst + i * srcw; + for (j = 0; j < cnt_pro; j++) { + uint8x8x4_t y0 = vld4_u8(inptr0); // d8 = y0y3y6y9.. 
d9 = y1y4y7...y + uint16x8_t val0 = vmull_u8(y0.val[0], vb); + + uint16x8_t val0_1 = vmull_u8(y0.val[1], vg); + + val0 = vmlal_u8(val0, y0.val[2], vr); + + uint32x4_t v0_sum0 = vaddl_u16(vget_low_u16(val0_1), vget_low_u16(val0)); + uint32x4_t v0_sum1 = + vaddl_u16(vget_high_u16(val0_1), vget_high_u16(val0)); + + uint16x4_t v0_sum0_16 = vshrn_n_u32(v0_sum0, 7); + uint16x4_t v0_sum1_16 = vshrn_n_u32(v0_sum1, 7); + + uint16x8_t v0_sum = vcombine_u16(v0_sum0_16, v0_sum1_16); + + uint8x8_t vout0 = vmovn_u16(v0_sum); + + inptr0 += 32; + vst1_u8(outr0, vout0); + outr0 += 8; + } + for (; j < srcw; j++) { + *outr0++ = (inptr0[0] * b + inptr0[1] * g + inptr0[2] * r) >> 7; + inptr0 += 4; + } + } +} +/* 采用CV_GRAY2BGR,转换公式B = G = R = Gray 采用CV_GRAY2RGB,转换公式R = G = B = Gray gray2bgr, gray2rgb @@ -1091,6 +1375,22 @@ void hwc1_to_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch) { } } } +/* +采用CV_GRAY2BGRA,转换公式B = G = R = Gray A=255 +采用CV_GRAY2RGBA,转换公式R = G = B = Gray A=255 +gray2bgra, gray2rgba +*/ +void hwc1_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = *src; + *dst++ = *src; + *dst++ = *src; + *dst++ = 255; + src++; + } + } +} // bgr2bgra, rgb2rgba void hwc3_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch) { for (int i = 0; i < srch; i++) { diff --git a/lite/utils/cv/image_flip.cc b/lite/utils/cv/image_flip.cc index fd84691a2d..f535c858e4 100644 --- a/lite/utils/cv/image_flip.cc +++ b/lite/utils/cv/image_flip.cc @@ -19,6 +19,23 @@ namespace paddle { namespace lite { namespace utils { namespace cv { +void ImageFlip::choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + FlipParam flip_param) { + if (srcFormat == GRAY) { + flip_hwc1(src, dst, srcw, srch, flip_param); + } else if (srcFormat == BGR || srcFormat == RGB) { + flip_hwc3(src, dst, srcw, srch, flip_param); + } else if (srcFormat == BGRA || srcFormat == RGBA) { + flip_hwc4(src, dst, srcw, srch, flip_param); + } else { + printf("this srcFormat: %d does not support! 
\n", srcFormat); + return; + } +} // gray void flip_hwc1_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in); void flip_hwc1_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in); @@ -43,6 +60,9 @@ void flip_hwc1(const uint8_t* src, flip_hwc1_y(src, dst, srcw, srch); } else if (flip_param == XY) { flip_hwc1_xy(src, dst, srcw, srch); + } else { + printf("its doesn't support Flip: %d \n", static_cast(flip_param)); + return; } } @@ -57,6 +77,9 @@ void flip_hwc3(const uint8_t* src, flip_hwc3_y(src, dst, srcw, srch); } else if (flip_param == XY) { flip_hwc3_xy(src, dst, srcw, srch); + } else { + printf("its doesn't support Flip: %d \n", static_cast(flip_param)); + return; } } @@ -71,6 +94,9 @@ void flip_hwc4(const uint8_t* src, flip_hwc4_y(src, dst, srcw, srch); } else if (flip_param == XY) { flip_hwc4_xy(src, dst, srcw, srch); + } else { + printf("its doesn't support Flip: %d \n", static_cast(flip_param)); + return; } } /* diff --git a/lite/utils/cv/image_flip.h b/lite/utils/cv/image_flip.h index 5e513324a1..7215b9494a 100644 --- a/lite/utils/cv/image_flip.h +++ b/lite/utils/cv/image_flip.h @@ -21,6 +21,15 @@ namespace paddle { namespace lite { namespace utils { namespace cv { +class ImageFlip { + public: + void choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + FlipParam flip_param); +}; void flip_hwc1( const uint8_t* src, uint8_t* dst, int srcw, int srch, FlipParam flip_param); void flip_hwc3( diff --git a/lite/utils/cv/image_resize.cc b/lite/utils/cv/image_resize.cc index 8b0b8aa17d..cd02a2cf4b 100644 --- a/lite/utils/cv/image_resize.cc +++ b/lite/utils/cv/image_resize.cc @@ -38,6 +38,15 @@ namespace paddle { namespace lite { namespace utils { namespace cv { +void ImageResize::choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + int dstw, + int dsth) { + resize(src, dst, srcFormat, srcw, srch, dstw, dsth); +} void compute_xy(int srcw, int srch, int dstw, diff --git a/lite/utils/cv/image_resize.h b/lite/utils/cv/image_resize.h index e2e399f542..f11f7b5d93 100644 --- a/lite/utils/cv/image_resize.h +++ b/lite/utils/cv/image_resize.h @@ -39,6 +39,16 @@ namespace paddle { namespace lite { namespace utils { namespace cv { +class ImageResize { + public: + void choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + int dstw, + int dsth); +}; void resize(const uint8_t* src, uint8_t* dst, ImageFormat srcFormat, diff --git a/lite/utils/cv/image_rotate.cc b/lite/utils/cv/image_rotate.cc index 04ba840766..98e61fb444 100644 --- a/lite/utils/cv/image_rotate.cc +++ b/lite/utils/cv/image_rotate.cc @@ -19,6 +19,26 @@ namespace paddle { namespace lite { namespace utils { namespace cv { +void ImageRotate::choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + float degree) { + if (degree != 90 && degree != 180 && degree != 270) { + printf("this degree: %f not support \n", degree); + } + if (srcFormat == GRAY) { + rotate_hwc1(src, dst, srcw, srch, degree); + } else if (srcFormat == BGR || srcFormat == RGB) { + rotate_hwc3(src, dst, srcw, srch, degree); + } else if (srcFormat == BGRA || srcFormat == RGBA) { + rotate_hwc4(src, dst, srcw, srch, degree); + } else { + printf("this srcFormat: %d does not support! 
\n", srcFormat); + return; + } +} // gray void rotate_hwc1_90( const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out); @@ -50,6 +70,9 @@ void rotate_hwc1( rotate_hwc1_180(src, dst, srcw, srch, srcw, srch); } else if (degree == 270) { rotate_hwc1_270(src, dst, srcw, srch, srch, srcw); + } else { + printf("this degree: %f does not support! \n", degree); + return; } } @@ -61,6 +84,9 @@ void rotate_hwc3( rotate_hwc3_180(src, dst, srcw, srch, srcw, srch); } else if (degree == 270) { rotate_hwc3_270(src, dst, srcw, srch, srch, srcw); + } else { + printf("this degree: %f does not support! \n", degree); + return; } } @@ -72,6 +98,9 @@ void rotate_hwc4( rotate_hwc4_180(src, dst, srcw, srch, srcw, srch); } else if (degree == 270) { rotate_hwc4_270(src, dst, srcw, srch, srch, srcw); + } else { + printf("this degree: %f does not support! \n", degree); + return; } } #ifdef __aarch64__ @@ -578,6 +607,7 @@ void rotate_hwc1_90(const uint8_t* src, int stride_h = 4 * w_in; int stride_h_w = 4 * w_in - 8; int stride_out = 4 * w_out; + int ww = w_out - 8; #pragma omp parallel for for (i = 0; i < h_in - 7; i += 8) { const uint8_t* inptr0 = src + i * w_in; @@ -586,7 +616,7 @@ void rotate_hwc1_90(const uint8_t* src, const uint8_t* inptr3 = inptr2 + w_in; int j = 0; for (; j < w_in - 7; j += 8) { - uint8_t* outptr0 = dst + j * w_out + i; + uint8_t* outptr0 = dst + j * w_out + (ww - i); uint8_t* outptr1 = outptr0 + w_out; uint8_t* outptr2 = outptr1 + w_out; uint8_t* outptr3 = outptr2 + w_out; @@ -648,7 +678,7 @@ void rotate_hwc1_90(const uint8_t* src, const uint8_t* inptr6 = inptr5 + w_in; const uint8_t* inptr7 = inptr6 + w_in; for (; j < w_in; j++) { - uint8_t* outptr = dst + j * w_out + i; + uint8_t* outptr = dst + j * w_out + ww - i; *outptr++ = *inptr0++; *outptr++ = *inptr1++; *outptr++ = *inptr2++; @@ -659,10 +689,11 @@ void rotate_hwc1_90(const uint8_t* src, *outptr++ = *inptr7++; } } + ww = w_out - 1; for (; i < h_in; i++) { const uint8_t* inptr0 = src + i * w_in; for (int j = 0; j < w_in; j++) { - uint8_t* outptr0 = dst + j * w_out + i; + uint8_t* outptr0 = dst + j * w_out + ww - i; *outptr0 = *inptr0++; } } @@ -693,9 +724,9 @@ void rotate_hwc1_180(const uint8_t* src, const uint8_t* inptr3 = inptr2 + w_in; uint8_t* outptr0 = dst + (h_in - i) * w_out - stride_w; // last - uint8_t* outptr1 = outptr0 + w_out; - uint8_t* outptr2 = outptr1 + w_out; - uint8_t* outptr3 = outptr2 + w_out; + uint8_t* outptr1 = outptr0 - w_out; + uint8_t* outptr2 = outptr1 - w_out; + uint8_t* outptr3 = outptr2 - w_out; if (i + 3 >= h_in) { uint8_t* ptr = zerobuff + w_in - stride_w; diff --git a/lite/utils/cv/image_rotate.h b/lite/utils/cv/image_rotate.h index 8335fca280..8e04a3f524 100644 --- a/lite/utils/cv/image_rotate.h +++ b/lite/utils/cv/image_rotate.h @@ -16,10 +16,20 @@ #include #include +#include "lite/utils/cv/paddle_image_preprocess.h" namespace paddle { namespace lite { namespace utils { namespace cv { +class ImageRotate { + public: + void choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + float degree); +}; void rotate_hwc1( const uint8_t* src, uint8_t* dst, int srcw, int srch, float degree); void rotate_hwc3( diff --git a/lite/utils/cv/paddle_image_preprocess.cc b/lite/utils/cv/paddle_image_preprocess.cc index f180475568..c46811a046 100644 --- a/lite/utils/cv/paddle_image_preprocess.cc +++ b/lite/utils/cv/paddle_image_preprocess.cc @@ -25,7 +25,6 @@ namespace paddle { namespace lite { namespace utils { namespace cv { - #define PI 3.14159265f #define 
Degrees2Radians(degrees) ((degrees) * (SK_ScalarPI / 180)) #define Radians2Degrees(radians) ((radians) * (180 / SK_ScalarPI)) @@ -38,7 +37,7 @@ ImagePreprocess::ImagePreprocess(ImageFormat srcFormat, this->dstFormat_ = dstFormat; this->transParam_ = param; } -void ImagePreprocess::imageCovert(const uint8_t* src, uint8_t* dst) { +void ImagePreprocess::imageConvert(const uint8_t* src, uint8_t* dst) { ImageConvert img_convert; img_convert.choose(src, dst, @@ -48,10 +47,10 @@ void ImagePreprocess::imageCovert(const uint8_t* src, uint8_t* dst) { this->transParam_.ih); } -void ImagePreprocess::imageCovert(const uint8_t* src, - uint8_t* dst, - ImageFormat srcFormat, - ImageFormat dstFormat) { +void ImagePreprocess::imageConvert(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + ImageFormat dstFormat) { ImageConvert img_convert; img_convert.choose(src, dst, @@ -68,7 +67,8 @@ void ImagePreprocess::imageResize(const uint8_t* src, int srch, int dstw, int dsth) { - resize(src, dst, srcFormat, srcw, srch, dstw, dsth); + ImageResize img_resize; + img_resize.choose(src, dst, srcFormat, srcw, srch, dstw, dsth); } void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst) { @@ -77,7 +77,8 @@ void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst) { int dstw = this->transParam_.ow; int dsth = this->transParam_.oh; auto srcFormat = this->dstFormat_; - resize(src, dst, srcFormat, srcw, srch, dstw, dsth); + ImageResize img_resize; + img_resize.choose(src, dst, srcFormat, srcw, srch, dstw, dsth); } void ImagePreprocess::imageRotate(const uint8_t* src, @@ -86,19 +87,8 @@ void ImagePreprocess::imageRotate(const uint8_t* src, int srcw, int srch, float degree) { - if (degree != 90 && degree != 180 && degree != 270) { - printf("this degree: %f not support \n", degree); - } - if (srcFormat == GRAY) { - rotate_hwc1(src, dst, srcw, srch, degree); - } else if (srcFormat == BGR || srcFormat == RGB) { - rotate_hwc3(src, dst, srcw, srch, degree); - } else if (srcFormat == BGRA || srcFormat == RGBA) { - rotate_hwc4(src, dst, srcw, srch, degree); - } else { - printf("this srcFormat: %d does not support! \n", srcFormat); - return; - } + ImageRotate img_rotate; + img_rotate.choose(src, dst, srcFormat, srcw, srch, degree); } void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst) { @@ -106,10 +96,8 @@ void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst) { auto srch = this->transParam_.oh; auto srcFormat = this->dstFormat_; auto degree = this->transParam_.rotate_param; - if (degree != 90 && degree != 180 && degree != 270) { - printf("this degree: %f not support \n", degree); - } - ImagePreprocess::imageRotate(src, dst, srcFormat, srcw, srch, degree); + ImageRotate img_rotate; + img_rotate.choose(src, dst, srcFormat, srcw, srch, degree); } void ImagePreprocess::imageFlip(const uint8_t* src, @@ -118,16 +106,8 @@ void ImagePreprocess::imageFlip(const uint8_t* src, int srcw, int srch, FlipParam flip_param) { - if (srcFormat == GRAY) { - flip_hwc1(src, dst, srcw, srch, flip_param); - } else if (srcFormat == BGR || srcFormat == RGB) { - flip_hwc3(src, dst, srcw, srch, flip_param); - } else if (srcFormat == BGRA || srcFormat == RGBA) { - flip_hwc4(src, dst, srcw, srch, flip_param); - } else { - printf("this srcFormat: %d does not support! 
\n", srcFormat); - return; - } + ImageFlip img_flip; + img_flip.choose(src, dst, srcFormat, srcw, srch, flip_param); } void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst) { @@ -135,7 +115,8 @@ void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst) { auto srch = this->transParam_.oh; auto srcFormat = this->dstFormat_; auto flip_param = this->transParam_.flip_param; - ImagePreprocess::imageFlip(src, dst, srcFormat, srcw, srch, flip_param); + ImageFlip img_flip; + img_flip.choose(src, dst, srcFormat, srcw, srch, flip_param); } void ImagePreprocess::image2Tensor(const uint8_t* src, diff --git a/lite/utils/cv/paddle_image_preprocess.h b/lite/utils/cv/paddle_image_preprocess.h index 5a46a9e48e..a12c0d11f0 100644 --- a/lite/utils/cv/paddle_image_preprocess.h +++ b/lite/utils/cv/paddle_image_preprocess.h @@ -19,6 +19,7 @@ #include #include "lite/api/paddle_api.h" #include "lite/api/paddle_place.h" + namespace paddle { namespace lite { namespace utils { @@ -37,9 +38,9 @@ enum ImageFormat { }; // flip enum enum FlipParam { - X = 0, // flip along the X axis - Y, // flip along the Y axis - XY // flip along the XY axis + XY = -1, // flip along the XY axis + X = 0, // flip along the X axis + Y // flip along the Y axis }; // transform param typedef struct { @@ -69,11 +70,12 @@ class ImagePreprocess { * BGR(RGB)and BGRA(RGBA) transform, * BGR(RGB)and RGB(BGR) transform, * BGR(RGB)and RGBA(BGRA) transform, - * BGR(RGB)and GRAY transform, + * BGR(RGB) and GRAY transform, + * BGRA(RGBA) and GRAY transform, * param src: input image data * param dst: output image data */ - void imageCovert(const uint8_t* src, uint8_t* dst); + void imageConvert(const uint8_t* src, uint8_t* dst); /* * image color convert * support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA), @@ -81,6 +83,7 @@ class ImagePreprocess { * BGR(RGB)and RGB(BGR) transform, * BGR(RGB)and RGBA(BGRA) transform, * BGR(RGB)and GRAY transform, + * BGRA(RGBA) and GRAY transform, * param src: input image data * param dst: output image data * param srcFormat: input image image format support: GRAY, NV12(NV21), @@ -88,10 +91,10 @@ class ImagePreprocess { * param dstFormat: output image image format, support GRAY, BGR(RGB) and * BGRA(RGBA) */ - void imageCovert(const uint8_t* src, - uint8_t* dst, - ImageFormat srcFormat, - ImageFormat dstFormat); + void imageConvert(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + ImageFormat dstFormat); /* * image resize, use bilinear method * support image format: 1-channel image (egs: GRAY, 2-channel image (egs: @@ -171,7 +174,8 @@ class ImagePreprocess { FlipParam flip_param); /* * change image data to tensor data - * support image format is BGR(RGB) and BGRA(RGBA), Data layout is NHWC and + * support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC + * and * NCHW * param src: input image data * param dstTensor: output tensor data @@ -186,7 +190,8 @@ class ImagePreprocess { float* scales); /* * change image data to tensor data - * support image format is BGR(RGB) and BGRA(RGBA), Data layout is NHWC and + * support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC + * and * NCHW * param src: input image data * param dstTensor: output tensor data diff --git a/lite/utils/env.h b/lite/utils/env.h old mode 100755 new mode 100644 diff --git a/mobile/src/common/log.h b/mobile/src/common/log.h index 69654c505d..3b42188b62 100644 --- a/mobile/src/common/log.h +++ b/mobile/src/common/log.h @@ -80,7 +80,6 @@ static const char *ANDROID_LOG_TAG = #endif enum 
LogLevel { - kNO_LOG, kLOG_ERROR, kLOG_WARNING, kLOG_INFO, @@ -89,15 +88,16 @@ enum LogLevel { kLOG_DEBUG1, kLOG_DEBUG2, kLOG_DEBUG3, - kLOG_DEBUG4 + kLOG_DEBUG4, + kNO_LOG, }; // log level static LogLevel log_level = kLOG_DEBUG4; -static std::vector logs{"NO ", "ERROR ", "WARNING", "INFO ", - "VERBOSE", "DEBUG ", "DEBUG1 ", "DEBUG2 ", - "DEBUG3 ", "DEBUG4 "}; +static std::vector logs{"ERROR ", "WARNING", "INFO ", "VERBOSE", + "DEBUG ", "DEBUG1 ", "DEBUG2 ", "DEBUG3 ", + "DEBUG4 ", "NO "}; struct ToLog; struct Print; @@ -217,7 +217,6 @@ struct ToLog { #define ANDROIDLOGV(...) enum LogLevel { - kNO_LOG, kLOG_ERROR, kLOG_WARNING, kLOG_INFO, @@ -226,7 +225,8 @@ enum LogLevel { kLOG_DEBUG1, kLOG_DEBUG2, kLOG_DEBUG3, - kLOG_DEBUG4 + kLOG_DEBUG4, + kNO_LOG }; struct ToLog; diff --git a/mobile/src/fpga/V2/image.cpp b/mobile/src/fpga/V2/image.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/fpga/V2/pe.cpp b/mobile/src/fpga/V2/pe.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/framework/cl/cl_deleter.h b/mobile/src/framework/cl/cl_deleter.h index 55af631174..731e5de663 100644 --- a/mobile/src/framework/cl/cl_deleter.h +++ b/mobile/src/framework/cl/cl_deleter.h @@ -15,45 +15,51 @@ limitations under the License. */ #pragma once #include "CL/cl.h" - +#include "common/log.h" struct CLKernelDeleter { template void operator()(T *clKernelObj) { - clReleaseKernel(clKernelObj); + const cl_int status = clReleaseKernel(clKernelObj); + LOG(paddle_mobile::kNO_LOG) << "clReleaseKernel status: " << status; } }; struct CLMemDeleter { template void operator()(T *clMemObj) { - clReleaseMemObject(clMemObj); + const cl_int status = clReleaseMemObject(clMemObj); + LOG(paddle_mobile::kNO_LOG) << "CLMemDeleter status: " << status; } }; struct CLEventDeleter { template void operator()(T *clEventObj) { - clReleaseEvent(clEventObj); + const cl_int status = clReleaseEvent(clEventObj); + LOG(paddle_mobile::kNO_LOG) << "CLEventDeleter status: " << status; } }; struct CLCommQueueDeleter { template void operator()(T *clQueueObj) { - clReleaseCommandQueue(clQueueObj); + const cl_int status = clReleaseCommandQueue(clQueueObj); + LOG(paddle_mobile::kNO_LOG) << "CLCommQueueDeleter status: " << status; } }; struct CLContextDeleter { template void operator()(T *clContextObj) { - clReleaseContext(clContextObj); + const cl_int status = clReleaseContext(clContextObj); + LOG(paddle_mobile::kNO_LOG) << "CLContextDeleter status: " << status; } }; struct CLProgramDeleter { template void operator()(T *clProgramObj) { - clReleaseProgram(clProgramObj); + const cl_int status = clReleaseProgram(clProgramObj); + LOG(paddle_mobile::kNO_LOG) << "CLProgramDeleter status: " << status; } }; diff --git a/mobile/src/framework/cl/cl_engine.cpp b/mobile/src/framework/cl/cl_engine.cpp index c39ae00b00..e8a8361eac 100644 --- a/mobile/src/framework/cl/cl_engine.cpp +++ b/mobile/src/framework/cl/cl_engine.cpp @@ -23,9 +23,11 @@ namespace paddle_mobile { namespace framework { bool CLEngine::Init() { + LOG(paddle_mobile::kNO_LOG) << "CLEngine::Init()"; if (initialized_) { return true; } + LOG(paddle_mobile::kNO_LOG) << "CLEngine::Init() ..."; cl_int status; bool is_setplatform_success = SetPlatform(); bool is_setcldeviceid_success = SetClDeviceId(); @@ -53,12 +55,14 @@ bool CLEngine::SetPlatform() { return false; } /**For clarity, choose the first available platform. 
*/ + LOG(paddle_mobile::kNO_LOG) << "numPlatforms: " << numPlatforms; if (numPlatforms > 0) { cl_platform_id *platforms = reinterpret_cast( malloc(numPlatforms * sizeof(cl_platform_id))); status = clGetPlatformIDs(numPlatforms, platforms, NULL); platform_ = platforms[0]; free(platforms); + LOG(paddle_mobile::kNO_LOG) << "platform: " << platform_; return status == CL_SUCCESS; } @@ -67,70 +71,21 @@ bool CLEngine::SetPlatform() { bool CLEngine::SetClDeviceId() { cl_uint numDevices = 0; - devices_ = NULL; + LOG(paddle_mobile::kNO_LOG) << "platform: " << platform_; cl_int status = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); if (status != CL_SUCCESS) { return false; } + LOG(paddle_mobile::kNO_LOG) << "numDevices: " << numDevices; + if (numDevices > 0) { - devices_ = reinterpret_cast( - malloc(numDevices * sizeof(cl_device_id))); status = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, numDevices, devices_, NULL); + LOG(paddle_mobile::kNO_LOG) << "devices_[0]" << devices_[0]; return status == CL_SUCCESS; } return false; } - -// std::unique_ptr<_cl_kernel, clKernel_deleter> CLEngine::GSetKernel( -// const std::string &kernel_name) { -// std::unique_ptr<_cl_kernel, clKernel_deleter> kernel( -// clCreateKernel(program_.get(), kernel_name.c_str(), NULL)); -// return std::move(kernel); -//} -// -// bool CLEngine::SetClCommandQueue() { -// cl_int status; -// command_queue_.reset( -// clCreateCommandQueue(context_.get(), devices_[0], 0, &status)); -// return true; -//} - -// bool CLEngine::SetClContext() { -// context_.reset(clCreateContext(NULL, 1, devices_, NULL, NULL, NULL)); -// return true; -//} - -// bool CLEngine::LoadKernelFromFile(const char *kernel_file) { -// size_t size; -// char *str; -// std::fstream f(kernel_file, (std::fstream::in | std::fstream::binary)); -// -// if (!f.is_open()) { -// return false; -// } -// -// size_t fileSize; -// f.seekg(0, std::fstream::end); -// size = fileSize = (size_t)f.tellg(); -// f.seekg(0, std::fstream::beg); -// str = new char[size + 1]; -// if (!str) { -// f.close(); -// return 0; -// } -// -// f.read(str, fileSize); -// f.close(); -// str[size] = '\0'; -// const char *source = str; -// size_t sourceSize[] = {strlen(source)}; -// program_.reset( -// clCreateProgramWithSource(context_.get(), 1, &source, sourceSize, -// NULL)); -// return true; -//} - } // namespace framework } // namespace paddle_mobile diff --git a/mobile/src/framework/cl/cl_engine.h b/mobile/src/framework/cl/cl_engine.h index 2e21dd9e39..2a6362ebc0 100644 --- a/mobile/src/framework/cl/cl_engine.h +++ b/mobile/src/framework/cl/cl_engine.h @@ -57,19 +57,27 @@ class CLLocalWorkSizeInfo { // max number of work-items in local_work_size in dim 2 size_t max_work_item_size2; }; - +inline void ctx_info(const char *errinfo, const void *private_info, size_t cb, + void *user_data) { + fprintf(stderr, "OpenCL Error (via pfn_notify): %s\n", errinfo); +} class CLEngine { public: static CLEngine *Instance(); bool Init(); bool isInitSuccess(); - std::unique_ptr<_cl_context, CLContextDeleter> CreateContext() { + + std::shared_ptr<_cl_context> CreateContext() { + DLOG << "CreateContext ---"; + DLOG << "platform: " << platform_; + DLOG << "devices_[0]: " << devices_[0]; + cl_int status; - cl_context c = clCreateContext(NULL, 1, devices_, NULL, NULL, &status); - std::unique_ptr<_cl_context, CLContextDeleter> context_ptr(c); + cl_context c = clCreateContext(NULL, 1, devices_, &ctx_info, NULL, &status); + std::shared_ptr<_cl_context> context(c, CLContextDeleter()); 
CL_CHECK_ERRORS(status); - return std::move(context_ptr); + return std::move(context); } std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> CreateClCommandQueue( @@ -84,14 +92,14 @@ class CLEngine { } cl_context getContext() { - if (context_ == nullptr) { + if (context_.get() == nullptr) { context_ = CreateContext(); } return context_.get(); } cl_command_queue getClCommandQueue() { - if (command_queue_ == nullptr) { + if (command_queue_.get() == nullptr) { command_queue_ = CreateClCommandQueue(getContext()); } return command_queue_.get(); @@ -124,9 +132,9 @@ class CLEngine { if (status != CL_SUCCESS || ret_size / sizeof(size_t) < 3) { return CLLocalWorkSizeInfo(0, 0, 0, 0); } - DLOG << max_work_item_sizes[0]; - DLOG << max_work_item_sizes[1]; - DLOG << max_work_item_sizes[2]; + DLOG << " max_work_item_sizes {" << max_work_item_sizes[0] << ", " + << max_work_item_sizes[1] << ", " << max_work_item_sizes[2] << "}"; + localWorkSizeInfo_ = CLLocalWorkSizeInfo(max_work_group_size, max_work_item_sizes[0], max_work_item_sizes[1], max_work_item_sizes[2]); @@ -182,8 +190,8 @@ class CLEngine { cl_program p = clCreateProgramWithSource(context, 1, &source, sourceSize, &status_); - DLOG << " cl kernel from source"; - DLOG << " source size: " << sourceSize[0]; + LOG(kLOG_DEBUG4) << " cl kernel from source"; + LOG(kLOG_DEBUG4) << " source size: " << sourceSize[0]; CL_CHECK_ERRORS(status_); std::unique_ptr<_cl_program, CLProgramDeleter> program_ptr(p); @@ -216,11 +224,7 @@ class CLEngine { DLOG << " program build error: " << log; } - if (status == CL_SUCCESS) { - return true; - } else { - return false; - } + return status == CL_SUCCESS; } cl_device_id DeviceID(int index = 0) { return devices_[index]; } @@ -239,28 +243,13 @@ class CLEngine { CLLocalWorkSizeInfo localWorkSizeInfo_; - cl_platform_id platform_; - - cl_device_id *devices_; - cl_int status_; - std::string cl_path_; - std::unique_ptr<_cl_program, CLProgramDeleter> program_; - - std::unique_ptr<_cl_context, CLContextDeleter> context_ = nullptr; - - std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_ = - nullptr; - - // bool SetClContext(); - - // bool SetClCommandQueue(); - - // bool LoadKernelFromFile(const char *kernel_file); - - // bool BuildProgram(); bool is_init_success_ = false; + std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_; + std::shared_ptr<_cl_context> context_; + cl_device_id devices_[10]; + cl_platform_id platform_; }; } // namespace framework diff --git a/mobile/src/framework/cl/cl_helper.h b/mobile/src/framework/cl/cl_helper.h index 893456211d..db9aa37ae2 100644 --- a/mobile/src/framework/cl/cl_helper.h +++ b/mobile/src/framework/cl/cl_helper.h @@ -36,9 +36,9 @@ class CLHelper { void AddKernel(const std::string &kernel_name, const std::string &file_name, const std::string &options = "") { - DLOG << " begin add kernel "; + LOG(kLOG_DEBUG1) << " begin add kernel "; auto kernel = scope_->GetKernel(kernel_name, file_name, options); - DLOG << " add kernel ing "; + LOG(kLOG_DEBUG1) << " begin add kernel "; kernels.emplace_back(std::move(kernel)); } diff --git a/mobile/src/framework/cl/cl_image.h b/mobile/src/framework/cl/cl_image.h index d3d48cda8b..57656c3c6d 100644 --- a/mobile/src/framework/cl/cl_image.h +++ b/mobile/src/framework/cl/cl_image.h @@ -87,14 +87,14 @@ class CLImage { PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr, " need call SetTensorData first"); - DLOG << " begin init cl image "; + LOG(kNO_LOG) << " begin init cl image "; image_dims_ = 
converter->InitImageDimInfoWith(tensor_dims_); half_t *image_data = new half_t[product(image_dims_) * 4]; - DLOG << " convert to image"; + LOG(kNO_LOG) << " convert to image"; converter->NCHWToImage(tensor_data_, image_data, tensor_dims_); - DLOG << " end convert to image"; + LOG(kNO_LOG) << " end convert to image"; InitCLImage(context, image_dims_[0], image_dims_[1], image_data); @@ -105,7 +105,7 @@ class CLImage { tensor_data_ = nullptr; image_converter_ = converter; initialized_ = true; - DLOG << " end init cl image"; + LOG(kNO_LOG) << " end init cl image"; } void InitNImage(cl_context context, cl_command_queue command_queue) { @@ -137,9 +137,9 @@ class CLImage { // CLImageConverterFolder(); CLImageConverterNormal *normal_converter = new CLImageConverterNormal(); PADDLE_MOBILE_ENFORCE(!shared_mem_, "do not init mem after shared .") - DLOG << " to get image dims "; + // LOG(kNO_LOG) << " to get image dims "; image_dims_ = normal_converter->InitImageDimInfoWith(dim); - DLOG << " end get image dims " << image_dims_; + // LOG(kNO_LOG) << " end get image dims " << image_dims_; InitCLImage(context, image_dims_[0], image_dims_[1], nullptr); @@ -148,7 +148,7 @@ class CLImage { image_converter_ = normal_converter; cl_event_ = CLEngine::Instance()->CreateEvent(context); initialized_ = true; - DLOG << " end init cl image"; + // LOG(kNO_LOG) << " end init cl image"; } /** * create fake size cl_mem for mem share @@ -169,9 +169,9 @@ class CLImage { InitCLImage(context, real_image_dims_[0], real_image_dims_[1], nullptr); // cheat cl_image they got what they wanted image_dims_ = normal_converter->InitImageDimInfoWith(need_dims); - DLOG << "InitFakeSizeImage ... "; - DLOG << "real_image_dims: " << real_image_dims_; - DLOG << "image_dims_: " << image_dims_; + LOG(kNO_LOG) << "InitFakeSizeImage ... "; + LOG(kNO_LOG) << "real_image_dims: " << real_image_dims_; + LOG(kNO_LOG) << "image_dims_: " << image_dims_; PADDLE_MOBILE_ENFORCE(real_image_dims_[0] >= image_dims_[0] && real_image_dims_[1] >= image_dims_[1], "real image is not enough"); @@ -182,7 +182,7 @@ class CLImage { initialized_ = true; shared_mem_ = true; - DLOG << " end init FakeSizeImage"; + LOG(kNO_LOG) << " end init FakeSizeImage"; } /** * init cl mem with a exist cl mem @@ -197,21 +197,21 @@ class CLImage { real_image_dims_ = src.real_image_dims_; image_dims_ = normal_converter->InitImageDimInfoWith(need_dims); - DLOG << "InitWithExistMem ... "; - DLOG << "real_image_dims: " << real_image_dims_; - DLOG << "image_dims_: " << image_dims_; + LOG(kNO_LOG) << "InitWithExistMem ... 
"; + LOG(kNO_LOG) << "real_image_dims: " << real_image_dims_; + LOG(kNO_LOG) << "image_dims_: " << image_dims_; if (real_image_dims_[0] < image_dims_[0] || real_image_dims_[1] < image_dims_[1]) { - DLOG << "real image is not enough!"; - DLOG << "real_image_dims: " << real_image_dims_; - DLOG << "image_dims_: " << image_dims_; + LOG(kNO_LOG) << "real image is not enough!"; + LOG(kNO_LOG) << "real_image_dims: " << real_image_dims_; + LOG(kNO_LOG) << "image_dims_: " << image_dims_; } PADDLE_MOBILE_ENFORCE(real_image_dims_[0] >= image_dims_[0] && real_image_dims_[1] >= image_dims_[1], "real image is not enough!"); if (cl_image_ != src.cl_image_) { - cl_image_.reset(src.cl_image_.get(), CLMemDeleter()); + cl_image_ = src.cl_image_; } tensor_dims_ = need_dims; @@ -221,7 +221,7 @@ class CLImage { initialized_ = true; shared_mem_ = true; - DLOG << " end init WithExistMem"; + LOG(kNO_LOG) << " end init WithExistMem"; } void InitConv2dTransposeFilterCLImage(cl_context context, @@ -233,18 +233,6 @@ class CLImage { InitCLImage(context, command_queue, converter); } - /*! The internal of two tensors share the same memory block. */ - inline CLImage &ShareHolderWith(const CLImage &src) { - PADDLE_MOBILE_ENFORCE( - src.cl_image_ != nullptr, - "Tensor holds no memory. Call Tensor::mutable_data first.") - - if (cl_image_ != src.cl_image_) { - cl_image_.reset(src.cl_image_.get(), CLMemDeleter()); - } - return *this; - } - cl_mem GetCLImage() const { return cl_image_.get(); } const DDim &ImageDims() const { return image_dims_; } diff --git a/mobile/src/framework/cl/cl_scope.h b/mobile/src/framework/cl/cl_scope.h index 643ce32b57..49e705e5a0 100644 --- a/mobile/src/framework/cl/cl_scope.h +++ b/mobile/src/framework/cl/cl_scope.h @@ -35,30 +35,27 @@ namespace framework { class CLScope { public: - CLScope() { - CLEngine *engine = CLEngine::Instance(); - context_ = engine->getContext(); - command_queue_ = engine->getClCommandQueue(); - localWorkSizeInfo_ = engine->getLocalWorkSizeInfo(); - } + CLScope() {} - cl_command_queue CommandQueue() { return command_queue_; } + cl_command_queue CommandQueue() { + return CLEngine::Instance()->getClCommandQueue(); + } std::unique_ptr<_cl_kernel, CLKernelDeleter> GetKernel( const std::string &kernel_name, const std::string &file_name, const std::string &options) { - DLOG << " to get program " << file_name; + LOG(kLOG_DEBUG2) << " to get program " << file_name; auto program = Program(file_name, kernel_name, options); - DLOG << " end get program ~ "; - DLOG << " to create kernel: " << kernel_name; + LOG(kLOG_DEBUG2) << " end get program ~ "; + LOG(kLOG_DEBUG2) << " to create kernel: " << kernel_name; std::unique_ptr<_cl_kernel, CLKernelDeleter> kernel( clCreateKernel(program, kernel_name.c_str(), &status_)); CL_CHECK_ERRORS(status_); - DLOG << " end create kernel ~ "; + LOG(kLOG_DEBUG2) << " end create kernel ~ "; return std::move(kernel); } - cl_context Context() { return context_; } + cl_context Context() { return CLEngine::Instance()->getContext(); } cl_program Program(const std::string &file_name, const std::string &kernel_name, @@ -79,11 +76,13 @@ class CLScope { std::string header(header_it->second.begin(), header_it->second.end()); source = header + "\n" + source; auto program = CLEngine::Instance()->CreateProgramWithSource( - context_, source.c_str()); + CLEngine::Instance()->getContext(), source.c_str()); - DLOG << " --- begin build program -> " << program_key << " --- "; + LOG(kLOG_DEBUG3) << " --- begin build program -> " << program_key + << " --- "; 
CLEngine::Instance()->BuildProgram(program.get(), options); - DLOG << " --- end build program -> " << program_key << " --- "; + LOG(kLOG_DEBUG3) << " --- end build program -> " << program_key + << " --- "; programs_[program_key] = std::move(program); return programs_[program_key].get(); @@ -97,19 +96,23 @@ class CLScope { return it->second.get(); } auto program = CLEngine::Instance()->CreateProgramWith( - context_, + CLEngine::Instance()->getContext(), CLEngine::Instance()->GetCLPath() + "/cl_kernel/" + file_name); - DLOG << " --- begin build program -> " << program_key << " --- "; + LOG(kLOG_DEBUG3) << " --- begin build program ele-> " << program_key + << " --- "; CLEngine::Instance()->BuildProgram(program.get(), options); - DLOG << " --- end build program -> " << program_key << " --- "; + LOG(kLOG_DEBUG3) << " --- end build program ele-> " << program_key + << " --- "; programs_[program_key] = std::move(program); return programs_[program_key].get(); } } - CLLocalWorkSizeInfo LocalWorkSizeInfo() { return localWorkSizeInfo_; } + CLLocalWorkSizeInfo LocalWorkSizeInfo() { + return CLEngine::Instance()->getLocalWorkSizeInfo(); + } size_t KernelWorkSize(cl_kernel kernel) { size_t kernel_work_size = CLEngine::Instance()->GetKernelWorkSize(kernel); return kernel_work_size; @@ -117,12 +120,9 @@ class CLScope { private: cl_int status_; - cl_context context_; - cl_command_queue command_queue_; std::unordered_map> programs_; - CLLocalWorkSizeInfo localWorkSizeInfo_; }; } // namespace framework diff --git a/mobile/src/framework/context.h b/mobile/src/framework/context.h index 944d54cc49..18e40311bc 100644 --- a/mobile/src/framework/context.h +++ b/mobile/src/framework/context.h @@ -44,15 +44,13 @@ namespace framework { struct CPUContext { private: CPUContext(); - virtual ~CPUContext() {} public: + ~CPUContext() {} + static CPUContext* Context() { - static CPUContext* ctx = nullptr; - if (ctx == nullptr) { - ctx = new CPUContext(); - } - return ctx; + static CPUContext ctx; + return &ctx; } void set_thread_num(int thread_num, diff --git a/mobile/src/framework/executor.cpp b/mobile/src/framework/executor.cpp index d03cefe59a..cda5c5522c 100644 --- a/mobile/src/framework/executor.cpp +++ b/mobile/src/framework/executor.cpp @@ -80,7 +80,7 @@ Executor::Executor(const Program &program, std::vector> ops = block_desc->Ops(); for (int j = 0; j < ops.size(); ++j) { std::shared_ptr op_desc = ops[j]; - DLOG << "create op: " << op_desc->Type(); + LOG(kLOG_INFO) << "create op[" << j << "]: " << op_desc->Type(); auto op_handler = OpRegistry::CreateOp( op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(), @@ -111,7 +111,8 @@ Executor::Executor(const Program &program, clock_gettime(CLOCK_MONOTONIC, &ts); profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; #endif - DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type(); + LOG(kLOG_INFO) << "Initialize op[" << count++ + << "]: " << op_handler->Type(); if (op_handler->Type() == "feed" || op_handler->Type() == "fetch") { op_handler->setPrePostType(config_.pre_post_type); } @@ -1015,7 +1016,7 @@ void Executor::InitMemory() { const TensorDesc &desc = var_desc->Tensor_desc(); // DDim ddim = make_ddim(desc.Dims()); DDim ddim = cl_image->dims(); - DLOG << var_desc->Name(); + LOG(kLOG_DEBUG1) << "init image of " << var_desc->Name(); cl_image->InitEmptyImage(context, command_queue, ddim); } } diff --git a/mobile/src/framework/loader.cpp b/mobile/src/framework/loader.cpp index 34cf6253cb..31274743f8 100644 --- a/mobile/src/framework/loader.cpp 
+++ b/mobile/src/framework/loader.cpp @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "framework/loader.h" +#include #include "framework/lod_tensor.h" #include "framework/program/program-optimize/program_optimize.h" @@ -173,7 +174,7 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) { rewind(fp); DLOG << "model size: " << size; - + PADDLE_MOBILE_ENFORCE(size > 0, "model size should > 0") *out = reinterpret_cast(malloc(size)); size_t cur_len = 0; diff --git a/mobile/src/framework/operator.cpp b/mobile/src/framework/operator.cpp index 402512c723..a091a49b35 100644 --- a/mobile/src/framework/operator.cpp +++ b/mobile/src/framework/operator.cpp @@ -62,31 +62,39 @@ void OperatorBase::Run() { DLOG << "-------------" << type_ << "----------------------------"; vector input_keys = GetInputKeys(); for (const auto key : input_keys) { - auto var_vec_in = inputs_.at(key); - for (int i = 0; i < var_vec_in.size(); ++i) { - auto var = this->scope_->FindVar(var_vec_in[i]); - if (var->IsInitialized() && - var->template IsType()) { - const Tensor *tensor = var->template Get(); - if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; + if (inputs_.count(key) > 0) { + auto var_vec_in = inputs_.at(key); + for (int i = 0; i < var_vec_in.size(); ++i) { + auto var = this->scope_->FindVar(var_vec_in[i]); + if (var->IsInitialized() && + var->template IsType()) { + const Tensor *tensor = var->template Get(); + if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; #ifdef PADDLE_MOBILE_FPGA - DLOG << var_vec_in[i]; + DLOG << var_vec_in[i]; #endif + } } + } else { + DLOG << "did not find key (" << key << ") in inputs_"; } } for (const auto key : GetOutKeys()) { - auto var_vec_out = outputs_.at(key); - for (int i = 0; i < var_vec_out.size(); ++i) { - auto var = scope_->FindVar(var_vec_out[i]); - if (var->IsInitialized() && - var->template IsType()) { - const Tensor *tensor = var->template Get(); - if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor; + if (outputs_.count(key) > 0) { + auto var_vec_out = outputs_.at(key); + for (int i = 0; i < var_vec_out.size(); ++i) { + auto var = scope_->FindVar(var_vec_out[i]); + if (var->IsInitialized() && + var->template IsType()) { + const Tensor *tensor = var->template Get(); + if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor; #ifdef PADDLE_MOBILE_FPGA - DLOG << var_vec_out[i]; + DLOG << var_vec_out[i]; #endif + } } + } else { + DLOG << "did not find key (" << key << ") in outputs_"; } } #endif @@ -100,27 +108,37 @@ void OperatorBase::Run() { DLOG << "-------------" << type_ << "----------------------------"; vector input_keys = GetInputKeys(); for (const auto key : input_keys) { - auto var_vec_in = inputs_.at(key); - for (int i = 0; i < var_vec_in.size(); ++i) { - auto var = scope_->FindVar(var_vec_in[i]); - if (var->IsInitialized() && var->template IsType()) { - const CLImage *cl_image = var->template Get(); - if (cl_image) { - DLOG << type_ << " input- " << key << "=" << *cl_image; + if (inputs_.count(key) > 0) { + auto var_vec_in = inputs_.at(key); + for (int i = 0; i < var_vec_in.size(); ++i) { + auto var = scope_->FindVar(var_vec_in[i]); + if (var->IsInitialized() && + var->template IsType()) { + const CLImage *cl_image = var->template Get(); + if (cl_image) { + DLOG << type_ << " input- " << key << "=" << *cl_image; + } } } + } else { + DLOG << "did not find key (" << key << ") in inputs_"; } } for (const auto key : 
GetOutKeys()) { - auto var_vec_out = outputs_.at(key); - for (int i = 0; i < var_vec_out.size(); ++i) { - auto var = scope_->FindVar(var_vec_out[i]); - if (var->IsInitialized() && var->template IsType()) { - const CLImage *cl_image = var->template Get(); - if (cl_image) { - DLOG << type_ << " output- " << key << "=" << *cl_image; + if (outputs_.count(key) > 0) { + auto var_vec_out = outputs_.at(key); + for (int i = 0; i < var_vec_out.size(); ++i) { + auto var = scope_->FindVar(var_vec_out[i]); + if (var->IsInitialized() && + var->template IsType()) { + const CLImage *cl_image = var->template Get(); + if (cl_image) { + DLOG << type_ << " output- " << key << "=" << *cl_image; + } } } + } else { + DLOG << "did not find key (" << key << ") in outputs_"; } } #endif diff --git a/mobile/src/io/opencl_interface.cpp b/mobile/src/io/opencl_interface.cpp index 1df5b48339..636cd1b760 100644 --- a/mobile/src/io/opencl_interface.cpp +++ b/mobile/src/io/opencl_interface.cpp @@ -28,8 +28,26 @@ cl_command_queue getClCommandQueue() { } bool isInitSuccess() { + prepareOpenclRuntime(); return framework::CLEngine::Instance()->isInitSuccess(); } +bool prepareOpenclRuntime() { +#ifdef PREPARE_OPENCL_RUNTIME + DLOG << "cl runtime prepared. "; + cl_uint numPlatforms; // the NO. of platforms + cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms); + if (status == CL_SUCCESS) { + if (numPlatforms > 0) { + cl_platform_id *platforms = reinterpret_cast( + malloc(numPlatforms * sizeof(cl_platform_id))); + status = clGetPlatformIDs(numPlatforms, platforms, NULL); + free(platforms); + } + } +#endif + return true; +} + } // namespace paddle_mobile #endif diff --git a/mobile/src/io/opencl_interface.h b/mobile/src/io/opencl_interface.h index f1039f1373..6a3608790a 100644 --- a/mobile/src/io/opencl_interface.h +++ b/mobile/src/io/opencl_interface.h @@ -21,6 +21,7 @@ namespace paddle_mobile { cl_context getContext(); cl_command_queue getClCommandQueue(); bool isInitSuccess(); +bool prepareOpenclRuntime(); } // namespace paddle_mobile diff --git a/mobile/src/io/paddle_mobile.h b/mobile/src/io/paddle_mobile.h index 8b8f0683ab..8c40b0696a 100644 --- a/mobile/src/io/paddle_mobile.h +++ b/mobile/src/io/paddle_mobile.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "io/paddle_inference_api.h" #ifdef PADDLE_MOBILE_CL #include "framework/cl/cl_engine.h" +#include "io/opencl_interface.h" #endif namespace paddle_mobile { @@ -34,16 +35,24 @@ template class PaddleMobile { public: explicit PaddleMobile(PaddleMobileConfigInternal config) : config_(config) { -#ifndef PADDLE_MOBILE_CL bool is_gpu = std::is_same, Device>::value; +#ifndef PADDLE_MOBILE_CL PADDLE_MOBILE_ENFORCE(!is_gpu, "Please recompile with GPU_CL is on"); +#else + if (is_gpu) { + prepareOpenclRuntime(); + } #endif } PaddleMobile() { -#ifndef PADDLE_MOBILE_CL bool is_gpu = std::is_same, Device>::value; +#ifndef PADDLE_MOBILE_CL PADDLE_MOBILE_ENFORCE(!is_gpu, "Please recompile with GPU_CL is on"); +#else + if (is_gpu) { // recheck when run cpu in with opencl. 
+ prepareOpenclRuntime(); + } #endif } virtual ~PaddleMobile() { Clear(); } diff --git a/mobile/src/operators/expand_op.cpp b/mobile/src/operators/expand_op.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/expand_op.h b/mobile/src/operators/expand_op.h old mode 100755 new mode 100644 diff --git a/mobile/src/operators/grid_sampler_op.cpp b/mobile/src/operators/grid_sampler_op.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/grid_sampler_op.h b/mobile/src/operators/grid_sampler_op.h old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl index 4895c07d20..b7f4d16c3b 100644 --- a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl +++ b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl @@ -110,4 +110,22 @@ __kernel void channel_mul_d2(__global image2d_t input, __global image2d_t bias, half4 in = read_imageh(input, sampler, coords); half4 output = mad(in, biase, 0); write_imageh(outputImage, coords, output); +} + +__kernel void channel_mul_d4(__global image2d_t input, __global image2d_t bias, + __write_only image2d_t outputImage, int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x / w; + coords_bias.y = 0; + half4 in = read_imageh(input, sampler, coords); + half4 biase = read_imageh(bias, sampler, coords_bias); + half4 output = in * biase; + write_imageh(outputImage, coords, output); } \ No newline at end of file diff --git a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_sub_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_sub_kernel.cl old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/cl_kernel/expend.cl b/mobile/src/operators/kernel/cl/cl_kernel/expend.cl old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/cl_kernel/grid_sampler_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/grid_sampler_kernel.cl old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp b/mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp index 8d66b50a99..4261681f3e 100644 --- a/mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp @@ -43,7 +43,10 @@ bool ConvTransposeKernel::Init( this->cl_helper_.AddKernel("conv_transpose3x3s2", "conv_transpose_kernel.cl"); } else { - PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + param->ExecMode() = ConvTransposeParam::EXEC_CONVTRANS_FLOAT; + param->Filter()->InitConv2dTransposeFilterCLImage( + cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("conv_transpose", "conv_transpose_kernel.cl"); } return true; } @@ -58,6 +61,9 @@ void ConvTransposeKernel::Compute( case ConvTransposeParam::EXEC_CONVTRANS3x3s2_FLOAT: ConvTranspose3x3s2AddBnRelu(&this->cl_helper_, param); break; + case ConvTransposeParam::EXEC_CONVTRANS_FLOAT: + ConvTransposeAddBnRelu(&this->cl_helper_, param); + break; default: PADDLE_MOBILE_THROW_EXCEPTION( "Invalid convolution transpose execute mode %d", param.ExecMode()); diff --git 
a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp index fd5b9e6bc3..37034a0189 100644 --- a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp +++ b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp @@ -30,16 +30,23 @@ bool ElementwiseMulKernel::Init( if (bias->dims() == param->InputX()->dims()) { DLOG << "init element wise mul"; this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl"); - } else if (bias->dims().size() == 1) { - DLOG << "init channel_mul"; - this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl"); - } else if (bias->dims().size() == 2) { - // etc. input 1 72 28 28 - // filter 1 72 - DLOG << "init channel_mul_d2"; - this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl"); } else { - PADDLE_MOBILE_ENFORCE(false, "element mul not supported yet"); + const int bias_dim_size = bias->dims().size(); + if (bias_dim_size == 1) { + DLOG << "init channel_mul"; + this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl"); + } else if (bias_dim_size == 2) { + // etc. input 1 72 28 28 + // filter 1 72 + DLOG << "init channel_mul_d2"; + this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl"); + } else if (bias_dim_size == 4) { + DLOG << "init channel_mul_d4"; + this->cl_helper_.AddKernel("channel_mul_d4", "elementwise_mul_kernel.cl"); + } else { + PADDLE_MOBILE_ENFORCE(false, + "element mul not supported this situation yet"); + } } return true; } @@ -71,68 +78,103 @@ void ElementwiseMulKernel::Compute( clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); - } else if (bias->dims().size() == 1) { - DLOG << "channel mul"; - cl_mem input_image = input->GetCLImage(); - cl_mem bias_image = bias->GetCLImage(); - cl_mem output_image = output->GetCLImage(); - int tensor_w = input->dims()[input->dims().size() - 1]; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), - reinterpret_cast(&input_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), - reinterpret_cast(&bias_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_mem), - reinterpret_cast(&output_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), - reinterpret_cast(&tensor_w)); - CL_CHECK_ERRORS(status); - auto width = input->ImageWidth(); - auto height = input->ImageHeight(); - size_t global_work_size[2] = {width, height}; - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } else if (bias->dims().size() == 2) { - DLOG << "channel mul d2"; + } else { + const int bias_dim_size = bias->dims().size(); + if (bias_dim_size == 1) { + DLOG << "channel mul"; + cl_mem input_image = input->GetCLImage(); + cl_mem bias_image = bias->GetCLImage(); + cl_mem output_image = output->GetCLImage(); + int tensor_w = input->dims()[input->dims().size() - 1]; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), + reinterpret_cast(&input_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), + reinterpret_cast(&bias_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), + reinterpret_cast(&output_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), + reinterpret_cast(&tensor_w)); + CL_CHECK_ERRORS(status); 
+ auto width = input->ImageWidth(); + auto height = input->ImageHeight(); + size_t global_work_size[2] = {width, height}; + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } else if (bias_dim_size == 2) { + DLOG << "channel mul d2"; - // etc. input 1 72 28 28 - // filter 1 72 --> 1 1 1 72 - DLOG << "input->ImageDims(): " << input->ImageDims(); - DLOG << "bias->ImageDims(): " << bias->ImageDims(); - DLOG << "out->ImageDims(): " << output->ImageDims(); + // etc. input 1 72 28 28 + // filter 1 72 --> 1 1 1 72 + DLOG << "input->ImageDims(): " << input->ImageDims(); + DLOG << "bias->ImageDims(): " << bias->ImageDims(); + DLOG << "out->ImageDims(): " << output->ImageDims(); - DLOG << "channel mul d2"; - cl_mem input_image = input->GetCLImage(); - cl_mem bias_image = bias->GetCLImage(); - cl_mem output_image = output->GetCLImage(); - int tensor_w = input->dims()[input->dims().size() - 1]; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), - reinterpret_cast(&input_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), - reinterpret_cast(&bias_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_mem), - reinterpret_cast(&output_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), - reinterpret_cast(&tensor_w)); - CL_CHECK_ERRORS(status); - auto width = input->ImageWidth(); - auto height = input->ImageHeight(); - size_t global_work_size[2] = {width, height}; - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); + DLOG << "channel mul d2"; + cl_mem input_image = input->GetCLImage(); + cl_mem bias_image = bias->GetCLImage(); + cl_mem output_image = output->GetCLImage(); + int tensor_w = input->dims()[input->dims().size() - 1]; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), + reinterpret_cast(&input_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), + reinterpret_cast(&bias_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), + reinterpret_cast(&output_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), + reinterpret_cast(&tensor_w)); + CL_CHECK_ERRORS(status); + auto width = input->ImageWidth(); + auto height = input->ImageHeight(); + size_t global_work_size[2] = {width, height}; + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); - // bias->PrintTensor(*bias); - } else { - PADDLE_MOBILE_ENFORCE(false, "element mul not support this situation yet") + // bias->PrintTensor(*bias); + } else if (bias_dim_size == 4) { + DLOG << "channel_mul_d4"; + // etc. 
input 1 72 28 28 + // filter 1 72 --> 1 1 1 72 + DLOG << "input->ImageDims(): " << input->ImageDims(); + DLOG << "bias->ImageDims(): " << bias->ImageDims(); + DLOG << "out->ImageDims(): " << output->ImageDims(); + + DLOG << "channel mul d2"; + cl_mem input_image = input->GetCLImage(); + cl_mem bias_image = bias->GetCLImage(); + cl_mem output_image = output->GetCLImage(); + int tensor_w = input->dims()[input->dims().size() - 1]; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), + reinterpret_cast(&input_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), + reinterpret_cast(&bias_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), + reinterpret_cast(&output_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), + reinterpret_cast(&tensor_w)); + CL_CHECK_ERRORS(status); + auto width = input->ImageWidth(); + auto height = input->ImageHeight(); + size_t global_work_size[2] = {width, height}; + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } else { + PADDLE_MOBILE_ENFORCE(false, "element mul not support this situation yet") + } } } diff --git a/mobile/src/operators/kernel/cl/elementwise_sub_kernel.cpp b/mobile/src/operators/kernel/cl/elementwise_sub_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/expand_kernel.cpp b/mobile/src/operators/kernel/cl/expand_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/grid_sampler_kernel.cpp b/mobile/src/operators/kernel/cl/grid_sampler_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/expand_kernel.h b/mobile/src/operators/kernel/expand_kernel.h old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/grid_sampler_kernel.h b/mobile/src/operators/kernel/grid_sampler_kernel.h old mode 100755 new mode 100644 diff --git a/mobile/src/operators/op_param.h b/mobile/src/operators/op_param.h index e58159fbb7..f588b9fc79 100644 --- a/mobile/src/operators/op_param.h +++ b/mobile/src/operators/op_param.h @@ -344,10 +344,14 @@ class OpParam { template static const T GetAttr(const string &key, const AttributeMap &map) { + PADDLE_MOBILE_ENFORCE(HasAttr(key, map), "%s is not contained in attr map", + key.c_str()) return ((Attribute)map.at(key)).Get(); } static const std::string GetStringAttr(const string &key, const AttributeMap &map) { + PADDLE_MOBILE_ENFORCE(HasAttr(key, map), "%s is not contained in attr map", + key.c_str()) return ((Attribute)map.at(key)).GetString(); } @@ -355,6 +359,10 @@ class OpParam { return map.count(key) > 0; } + static const bool HasVar(const string &key, const VariableNameMap &var_map) { + return var_map.count(key) > 0; + } + template static T 
*GetVarValue(const string &key, const VariableNameMap &var_map, const Scope &scope) { @@ -2624,6 +2632,7 @@ class ConvTransposeParam : public OpParam { EXEC_DECONV4X4_FLOAT, EXEC_DEPTHWISETRANS_FLOAT, EXEC_CONVTRANS3x3s2_FLOAT, + EXEC_CONVTRANS_FLOAT, }; ExecMode &ExecMode() const { return exec_mode_; } @@ -3100,16 +3109,37 @@ class NearestInterpolationParam : public OpParam { const AttributeMap &attrs, Scope *scope) : OpParam(inputs, outputs, attrs, scope) { input_x_ = InputXFrom(inputs, *scope); - input_outsize_ = InputOutSizeFrom(inputs, *scope); + const bool has_out_size = HasVar("OutSize", inputs); + + if (has_out_size) { + input_outsize_ = InputOutSizeFrom(inputs, *scope); + } + out_ = OutFrom(outputs, *scope); - out_h_ = GetAttr("out_h", attrs); - out_w_ = GetAttr("out_w", attrs); + + if (HasAttr("out_h", attrs)) { + out_h_ = GetAttr("out_h", attrs); + } else if (HasAttr("out_h ", attrs)) { + // some models hurts .... attr with space .. + out_h_ = GetAttr("out_h ", attrs); + } + + if (HasAttr("out_w", attrs)) { + out_w_ = GetAttr("out_w", attrs); + } else if (HasAttr("out_w ", attrs)) { + // some models hurts .... attr with space .. + out_w_ = GetAttr("out_w ", attrs); + } + + LOG(kLOG_DEBUG1) << "out_h_: " << out_h_; + LOG(kLOG_DEBUG1) << "out_w_: " << out_w_; + if (HasAttr("scale", attrs)) { has_scale_ = true; scale_ = GetAttr("scale", attrs); } - DLOG << "has_scale_: " << has_scale_; - DLOG << "scale_: " << scale_; + LOG(kLOG_DEBUG1) << "has_scale_: " << has_scale_; + LOG(kLOG_DEBUG1) << "scale_: " << scale_; } const GType *InputX() const { return input_x_; } const GType *InputOutPutSize() const { return input_outsize_; } diff --git a/mobile/src/pass/memory_optimize_cl.cpp b/mobile/src/pass/memory_optimize_cl.cpp index 355123349d..53bb675f17 100644 --- a/mobile/src/pass/memory_optimize_cl.cpp +++ b/mobile/src/pass/memory_optimize_cl.cpp @@ -14,6 +14,7 @@ limitations under the License. 
*/ #ifdef PADDLE_MOBILE_CL #include "pass/memory_optimize_cl.h" #include +#include #include "framework/cl/cl_image.h" #include "framework/lod_tensor.h" namespace paddle_mobile { @@ -79,7 +80,7 @@ void MemoryOptPassCl::operator()( std::vector fetch_var_nodes; for (const auto &op : block->Ops()) { - DLOG << "op_desc->Type(): " << op->Type(); + LOG(kNO_LOG) << "op_desc->Type(): " << op->Type(); for (const auto &outputs : op->GetOutputs()) { for (const auto &output : outputs.second) { // not a persistable and not a exclude one ,then add it to @@ -87,7 +88,7 @@ void MemoryOptPassCl::operator()( if (!IsPersistable(output) && std::find(exclude_var_names.begin(), exclude_var_names.end(), output) == exclude_var_names.end()) { - DLOG << "output: " << output; + LOG(kNO_LOG) << "output: " << output; ClVarNode *node = CreateNode(output); analysis_nodes_.push(node); } @@ -100,7 +101,7 @@ void MemoryOptPassCl::operator()( if (!IsPersistable(input) && std::find(exclude_var_names.begin(), exclude_var_names.end(), input) == exclude_var_names.end()) { - DLOG << "input: " << input; + LOG(kNO_LOG) << "input: " << input; ClVarNode *node = CreateNode(input); analysis_nodes_.push(node); if (op->Type() == "fetch") { @@ -114,7 +115,7 @@ void MemoryOptPassCl::operator()( if (!IsPersistable(output) && std::find(exclude_var_names.begin(), exclude_var_names.end(), output) == exclude_var_names.end()) { - DLOG << "output: " << output; + LOG(kNO_LOG) << "output: " << output; ClVarNode *node = CreateNode(output); analysis_nodes_.push(node); } @@ -164,8 +165,8 @@ void MemoryOptPassCl::ShareData( cl_command_queue command_queue = scope->GetCLScpoe()->CommandQueue(); for (const auto &list : reused_nodes_) { - DLOG << "\n"; - DLOG << "gpu . share memory within these variables"; + LOG(kNO_LOG) << "\n"; + LOG(kNO_LOG) << "gpu . share memory within these variables"; int64_t x_based_max_numl = -1; int64_t y_based_max_numl = -1; int64_t x_based_max_x = -1; diff --git a/mobile/test/CMakeLists.txt b/mobile/test/CMakeLists.txt index 76ddd78f1a..078440f45b 100644 --- a/mobile/test/CMakeLists.txt +++ b/mobile/test/CMakeLists.txt @@ -551,6 +551,12 @@ if (ENABLE_ALL_TEST) ADD_EXECUTABLE(test-inference-api-v2 net/test_inference_api_v2.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-inference-api-v2 paddle-mobile) + + if (GPU_CL) + ADD_EXECUTABLE(test-net-male2fe net/test_mobilenet_male2fe.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-net-male2fe paddle-mobile) + endif() + endif () else () # gen test diff --git a/mobile/test/executor_for_test_opencl.h b/mobile/test/executor_for_test_opencl.h old mode 100755 new mode 100644 diff --git a/mobile/test/net/test_inference_api_v2.cpp b/mobile/test/net/test_inference_api_v2.cpp old mode 100755 new mode 100644 diff --git a/mobile/test/net/test_mobilenet_male2fe.cpp b/mobile/test/net/test_mobilenet_male2fe.cpp new file mode 100644 index 0000000000..eb83b5bafe --- /dev/null +++ b/mobile/test/net/test_mobilenet_male2fe.cpp @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../../src/common/types.h" +#include "../test_helper.h" +#include "../test_include.h" + +void feed(PaddleMobile *paddle_mobile, const DDim &dims, + std::string image_path, std::string feed_name) { + float *input_data_array = new float[product(dims)]; + std::ifstream in(image_path, std::ios::in); + for (int i = 0; i < product(dims); i++) { + float num; + in >> num; + input_data_array[i] = num; + } + in.close(); + framework::Tensor input_tensor(input_data_array, dims); + DLOG << feed_name << " : " << input_tensor; + paddle_mobile->Feed(feed_name, input_tensor); +} + +int main() { + paddle_mobile::PaddleMobile paddle_mobile; + auto time1 = paddle_mobile::time(); +#ifdef PADDLE_MOBILE_CL + paddle_mobile.SetCLPath("/data/local/tmp/bin"); +#endif + + if (paddle_mobile.Load(std::string("../models/nanbiannv") + "/model", + std::string("../models/nanbiannv") + "/params", + true)) { + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" + << std::endl; + + std::vector input; + feed(&paddle_mobile, {1, 3, 256, 256}, "../images/input_1_3_256_256", + "image"); + + auto time3 = paddle_mobile::time(); + paddle_mobile.Predict(); + auto time4 = paddle_mobile::time(); + + std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4) + << "ms" << std::endl; + } + + auto rgb = paddle_mobile.Fetch("rgb"); + auto mask = paddle_mobile.Fetch("mask"); + LOG(kLOG_INFO) << "rgb" << *rgb; + LOG(kLOG_INFO) << "mask" << *mask; + return 0; +} diff --git a/mobile/test/net/test_net_multi_feed.cpp b/mobile/test/net/test_net_multi_feed.cpp old mode 100755 new mode 100644 diff --git a/mobile/test/operators/test_expend_op.cpp b/mobile/test/operators/test_expend_op.cpp old mode 100755 new mode 100644 diff --git a/mobile/tools/python/fluidtools/run_multi_feed.py b/mobile/tools/python/fluidtools/run_multi_feed.py old mode 100755 new mode 100644 -- GitLab