From c28971a94d35c12023abe4647ec8890bb3497529 Mon Sep 17 00:00:00 2001 From: "baolei.an" Date: Fri, 6 Dec 2019 11:39:16 +0800 Subject: [PATCH] merge to newest version --- cmake/cuda.cmake | 25 +- cmake/cudnn.cmake | 6 +- cmake/external/eigen.cmake | 16 +- cmake/external/xbyak.cmake | 6 +- cmake/external/xxhash.cmake | 13 +- cmake/generic.cmake | 5 +- cmake/lite.cmake | 5 +- lite/CMakeLists.txt | 16 +- lite/api/CMakeLists.txt | 31 +- lite/api/benchmark.cc | 7 +- lite/api/cxx_api.cc | 31 +- lite/api/cxx_api.h | 7 + lite/api/mobilenetv1_test.cc | 7 +- lite/api/model_optimize_tool.cc | 161 +- lite/api/model_test.cc | 16 +- lite/api/paddle_api.cc | 9 +- lite/api/paddle_api.h | 2 +- lite/api/paddle_place.cc | 9 +- lite/api/paddle_place.h | 3 +- lite/api/paddle_use_passes.h | 7 +- lite/api/test_step_rnn_lite_x86.cc | 14 - lite/backends/arm/math/CMakeLists.txt | 20 +- lite/backends/arm/math/col_im_transform.cc | 17 +- lite/backends/arm/math/col_im_transform.h | 6 +- .../arm/math/conv3x3_winograd_fp32_c4.cc | 564 ++++ .../arm/math/conv3x3s1_direct_fp32.cc | 10 +- .../arm/math/conv3x3s1_direct_int8.cc | 5 +- .../arm/math/conv3x3s1p01_depthwise_fp32.cc | 2539 +++++++++++++++++ .../arm/math/conv3x3s1px_depthwise_fp32.cc | 541 ++++ .../arm/math/conv3x3s2_direct_fp32.cc | 10 +- .../arm/math/conv3x3s2_direct_int8.cc | 10 +- .../arm/math/conv3x3s2p01_depthwise_fp32.cc | 1862 ++++++++++++ .../arm/math/conv3x3s2px_depthwise_fp32.cc | 362 +++ lite/backends/arm/math/conv_block_utils.h | 1007 +++---- lite/backends/arm/math/conv_depthwise.h | 32 - lite/backends/arm/math/conv_impl.cc | 220 +- lite/backends/arm/math/conv_impl.h | 16 + lite/backends/arm/math/conv_winograd_3x3.cc | 6 +- lite/backends/arm/math/funcs.h | 2 + lite/backends/arm/math/interpolate.cc | 65 +- lite/backends/arm/math/interpolate.h | 5 +- lite/backends/arm/math/layout.cc | 668 +++++ lite/backends/arm/math/layout.h | 30 + lite/backends/arm/math/packed_sgemm.cc | 644 ++++- lite/backends/arm/math/packed_sgemm_c4.cc | 1171 ++++++++ lite/backends/arm/math/packed_sgemm_c4.h | 53 + lite/backends/arm/math/pooling.cc | 18 +- lite/backends/arm/math/sgemv.cc | 570 +++- lite/backends/arm/math/sgemv.h | 9 +- lite/backends/cuda/CMakeLists.txt | 3 +- lite/backends/cuda/cuda_utils.h | 9 + lite/backends/cuda/math/CMakeLists.txt | 9 +- lite/backends/cuda/math/batched_gemm.cc | 134 + lite/backends/cuda/math/batched_gemm.h | 80 + lite/backends/cuda/math/cudnn_conv.cc | 26 +- lite/backends/cuda/math/cudnn_pool.cc | 159 ++ lite/backends/cuda/math/cudnn_pool.h | 79 + lite/backends/cuda/math/elementwise.cu | 94 +- lite/backends/cuda/math/elementwise.h | 21 + lite/backends/cuda/math/gemm.cc | 100 + lite/backends/cuda/math/gemm.h | 74 + lite/backends/cuda/math/utils.h | 18 + lite/backends/fpga/KD/pes/conv_process.hpp | 21 +- .../fpga/KD/pes/depthwise_conv_pe.hpp | 11 +- lite/backends/fpga/KD/pes/pooling_pe.hpp | 10 +- lite/backends/npu/builder.cc | 8 +- lite/backends/npu/builder.h | 111 - lite/backends/opencl/cl_wrapper.cc | 2 +- lite/backends/x86/math/CMakeLists.txt | 3 +- lite/backends/x86/math/beam_search.cc | 1 + lite/backends/x86/math/pooling.cc | 8 +- lite/backends/x86/math/search_fc.cc | 79 + lite/backends/x86/math/search_fc.h | 184 ++ .../x86/math/sequence_topk_avg_pooling.cc | 151 + .../x86/math/sequence_topk_avg_pooling.h | 46 + lite/core/CMakeLists.txt | 4 +- lite/core/arena/framework.cc | 3 + lite/core/context.h | 9 +- lite/core/device_info.cc | 4 +- lite/core/kernel.h | 18 +- lite/core/memory.cc | 2 +- .../mir/fusion/conv_activation_fuse_pass.cc | 1 + 
lite/core/mir/fusion/conv_bn_fuse_pass.cc | 2 +- .../mir/fusion/conv_elementwise_fuse_pass.cc | 3 +- .../elementwise_add_activation_fuse_pass.cc | 1 + lite/core/mir/fusion/fc_fuse_pass.cc | 1 + .../core/mir/fusion/quant_dequant_op_fuser.cc | 4 + lite/core/mir/memory_optimize_pass.cc | 3 +- lite/core/mir/pass.h | 33 +- lite/core/mir/pass_utils.cc | 32 +- lite/core/mir/pass_utils.h | 4 +- lite/core/mir/static_kernel_pick_pass.h | 24 +- .../mir/subgraph/generate_npu_program_pass.cc | 49 +- .../mir/subgraph/generate_npu_program_pass.h | 4 - .../generate_npu_program_pass_test.cc | 4 +- .../mir/subgraph/generate_xpu_program_pass.cc | 49 +- .../mir/subgraph/generate_xpu_program_pass.h | 4 - lite/core/mir/type_layout_cast_pass.cc | 24 +- lite/core/mir/type_target_cast_pass.cc | 34 +- lite/core/mir/variable_place_inference_pass.h | 70 +- lite/core/op_registry.cc | 2 + lite/core/op_registry.h | 16 +- lite/core/optimizer.h | 87 +- lite/core/profile/CMakeLists.txt | 3 +- lite/core/profile/profiler.cc | 117 + lite/core/profile/profiler.h | 59 + lite/core/profile/test_timer.cc | 81 + lite/core/profile/timer.h | 114 + lite/core/program.cc | 8 +- lite/core/program.h | 48 +- lite/demo/cxx/Makefile.def | 14 +- lite/demo/cxx/README.md | 26 +- .../mobile_detection/Makefile.android.armv7 | 61 + .../mobile_detection/Makefile.android.armv8 | 61 + .../mobile_full/Makefile.android.armv7 | 20 +- .../mobile_full/Makefile.android.armv8 | 20 +- .../mobile_light/Makefile.android.armv7 | 14 +- .../mobile_light/Makefile.android.armv8 | 14 +- .../cxx/mobile_detection/mobile_detection.cc | 210 ++ lite/demo/cxx/mobile_detection/test.jpg | Bin 0 -> 127499 bytes .../cxx/mobile_full/mobilenetv1_full_api.cc | 30 +- .../cxx/mobile_light/mobilenetv1_light_api.cc | 26 +- lite/gen_code/CMakeLists.txt | 2 - lite/kernels/arm/CMakeLists.txt | 62 +- lite/kernels/arm/conv_compute.cc | 73 +- lite/kernels/arm/conv_depthwise.cc | 35 +- lite/kernels/arm/conv_gemmlike.h | 17 +- lite/kernels/arm/conv_transpose_compute.cc | 25 +- lite/kernels/arm/conv_winograd.cc | 177 +- lite/kernels/arm/conv_winograd.h | 1 + lite/kernels/arm/fc_compute.cc | 3 +- lite/kernels/arm/fill_constant_compute.cc | 37 + lite/kernels/arm/interpolate_compute.cc | 14 +- lite/kernels/arm/layout_compute.cc | 179 ++ lite/kernels/arm/layout_compute.h | 43 + lite/kernels/arm/lookup_table_compute.cc | 14 +- lite/kernels/arm/lookup_table_compute_test.cc | 115 + lite/kernels/arm/lrn_compute.cc | 7 +- lite/kernels/arm/lrn_compute_test.cc | 4 +- lite/kernels/arm/matmul_compute.cc | 2 +- lite/kernels/arm/mul_compute.cc | 5 +- lite/kernels/arm/pool_compute.cc | 22 +- lite/kernels/arm/pool_compute_test.cc | 176 +- lite/kernels/arm/split_compute.cc | 4 + lite/kernels/cuda/CMakeLists.txt | 38 +- .../cuda/attention_padding_mask_compute.cu | 162 ++ .../cuda/attention_padding_mask_compute.h | 38 + .../attention_padding_mask_compute_test.cc | 134 + lite/kernels/cuda/bilinear_interp_compute.cu | 84 +- .../cuda/bilinear_interp_compute_test.cc | 111 + lite/kernels/cuda/calib_compute_cuda_test.cc | 27 +- lite/kernels/cuda/concat_compute.cu | 4 +- lite/kernels/cuda/conv_compute.cc | 25 +- lite/kernels/cuda/conv_compute_test.cc | 15 +- lite/kernels/cuda/elementwise_compute.cu | 318 +++ lite/kernels/cuda/elementwise_compute.h | 98 + lite/kernels/cuda/elementwise_compute_test.cc | 252 ++ lite/kernels/cuda/feed_compute.cc | 45 +- lite/kernels/cuda/feed_compute.h | 3 +- lite/kernels/cuda/layout_compute.cc | 27 + lite/kernels/cuda/lookup_table_compute.cu | 11 + .../cuda/match_matrix_tensor_compute.cu 
| 145 + .../cuda/match_matrix_tensor_compute.h | 42 + .../cuda/match_matrix_tensor_compute_test.cc | 122 + lite/kernels/cuda/mul_compute_test.cc | 2 + lite/kernels/cuda/nearest_interp_compute.cu | 87 +- .../cuda/nearest_interp_compute_test.cc | 111 + lite/kernels/cuda/pool_compute.cu | 76 +- lite/kernels/cuda/pool_compute.h | 17 + lite/kernels/cuda/pool_compute_test.cc | 252 +- .../cuda/search_aligned_mat_mul_compute.cc | 38 + .../cuda/search_aligned_mat_mul_compute.h | 103 + .../search_aligned_mat_mul_compute_test.cc | 221 ++ lite/kernels/cuda/search_fc_compute.cu | 170 ++ lite/kernels/cuda/search_fc_compute.h | 52 + lite/kernels/cuda/search_fc_compute_test.cc | 110 + lite/kernels/cuda/search_grnn_compute.cu | 351 +++ lite/kernels/cuda/search_grnn_compute.h | 46 + lite/kernels/cuda/search_grnn_compute_test.cc | 103 + .../cuda/search_group_padding_compute.cu | 164 ++ .../cuda/search_group_padding_compute.h | 38 + .../cuda/search_group_padding_compute_test.cc | 127 + .../cuda/search_seq_depadding_compute.cu | 115 + .../cuda/search_seq_depadding_compute.h | 39 + .../cuda/search_seq_depadding_compute_test.cc | 88 + lite/kernels/cuda/search_seq_fc_compute.cu | 98 + lite/kernels/cuda/search_seq_fc_compute.h | 43 + .../cuda/search_seq_fc_compute_test.cc | 175 ++ .../cuda/sequence_arithmetic_compute.cu | 249 ++ .../cuda/sequence_arithmetic_compute.h | 41 + .../cuda/sequence_arithmetic_compute_test.cc | 131 + lite/kernels/cuda/sequence_concat_compute.cu | 151 + lite/kernels/cuda/sequence_concat_compute.h | 40 + .../cuda/sequence_concat_compute_test.cc | 163 ++ lite/kernels/cuda/sequence_pool_compute.cu | 258 ++ lite/kernels/cuda/sequence_pool_compute.h | 35 + .../cuda/sequence_pool_compute_test.cc | 104 + lite/kernels/cuda/sequence_reverse_compute.cu | 130 + lite/kernels/cuda/sequence_reverse_compute.h | 38 + .../cuda/sequence_reverse_compute_test.cc | 105 + .../cuda/sequence_topk_avg_pooling_compute.cu | 209 ++ .../cuda/sequence_topk_avg_pooling_compute.h | 43 + lite/kernels/cuda/softmax_compute.cu | 23 +- lite/kernels/cuda/var_conv_2d_compute.cu | 263 ++ lite/kernels/cuda/var_conv_2d_compute.h | 37 + lite/kernels/cuda/var_conv_2d_compute_test.cc | 360 +++ lite/kernels/fpga/conv_compute.cc | 7 + lite/kernels/fpga/conv_compute_test.cc | 20 +- lite/kernels/npu/bridges/CMakeLists.txt | 9 + lite/kernels/npu/bridges/act_op.cc | 23 +- lite/kernels/npu/bridges/act_op_test.cc | 122 +- lite/kernels/npu/bridges/batch_norm_op.cc | 6 +- lite/kernels/npu/bridges/conv_op.cc | 36 +- lite/kernels/npu/bridges/conv_op_test.cc | 5 +- lite/kernels/npu/bridges/conv_transpose_op.cc | 17 +- .../npu/bridges/conv_transpose_op_test.cc | 3 +- lite/kernels/npu/bridges/elementwise_ops.cc | 85 +- .../npu/bridges/elementwise_ops_test.cc | 87 +- lite/kernels/npu/bridges/interpolate_op.cc | 70 +- lite/kernels/npu/bridges/mul_op.cc | 99 +- .../npu/bridges/paddle_use_npu_bridges.h | 39 +- lite/kernels/npu/bridges/pool_op.cc | 79 +- lite/kernels/npu/bridges/pool_op_test.cc | 5 +- lite/kernels/npu/bridges/reduce_mean_op.cc | 111 + .../npu/bridges/reduce_mean_op_test.cc | 347 +++ lite/kernels/npu/bridges/reshape_op.cc | 6 +- lite/kernels/npu/bridges/sqrt_op.cc | 54 + lite/kernels/npu/bridges/sqrt_op_test.cc | 93 + lite/kernels/npu/bridges/square_op.cc | 55 + lite/kernels/npu/bridges/square_op_test.cc | 92 + lite/kernels/opencl/CMakeLists.txt | 2 +- lite/kernels/opencl/conv_compute.cc | 28 +- lite/kernels/opencl/conv_compute_test.cc | 14 +- .../opencl/depthwise_conv2d_compute.cc | 2 +- .../opencl/depthwise_conv2d_compute_test.cc | 3 
+- lite/kernels/opencl/io_copy_compute.cc | 5 +- lite/kernels/opencl/pool_compute.cc | 14 +- lite/kernels/opencl/pool_compute_test.cc | 4 +- lite/kernels/x86/CMakeLists.txt | 35 +- .../x86/attention_padding_mask_compute.cc | 28 + .../x86/attention_padding_mask_compute.h | 83 + .../attention_padding_mask_compute_test.cc | 132 + lite/kernels/x86/cast_compute.cc | 25 + lite/kernels/x86/cast_compute.h | 80 + lite/kernels/x86/cast_compute_test.cc | 77 + lite/kernels/x86/conv_compute.h | 41 +- lite/kernels/x86/conv_compute_test.cc | 6 +- lite/kernels/x86/fill_constant_compute.cc | 36 + lite/kernels/x86/gather_compute.cc | 32 + lite/kernels/x86/gather_compute.h | 99 + lite/kernels/x86/gather_compute_test.cc | 159 ++ lite/kernels/x86/layer_norm_compute.cc | 29 + lite/kernels/x86/layer_norm_compute.h | 91 + lite/kernels/x86/layer_norm_compute_test.cc | 169 ++ lite/kernels/x86/lookup_table_compute.cc | 10 + lite/kernels/x86/lookup_table_compute.h | 11 +- lite/kernels/x86/lookup_table_compute_test.cc | 82 + .../x86/match_matrix_tensor_compute.cc | 142 + .../kernels/x86/match_matrix_tensor_compute.h | 42 + .../x86/match_matrix_tensor_compute_test.cc | 116 + lite/kernels/x86/mean_compute.cc | 36 - lite/kernels/x86/mul_compute.cc | 18 - lite/kernels/x86/mul_compute.h | 72 - lite/kernels/x86/pool_compute.h | 5 +- lite/kernels/x86/pool_compute_test.cc | 3 +- .../x86/search_aligned_mat_mul_compute.cc | 30 + .../x86/search_aligned_mat_mul_compute.h | 83 + lite/kernels/x86/search_fc_compute.cc | 27 + lite/kernels/x86/search_fc_compute.h | 44 + lite/kernels/x86/search_fc_compute_test.cc | 122 + lite/kernels/x86/search_grnn_compute.cc | 332 +++ lite/kernels/x86/search_grnn_compute.h | 44 + lite/kernels/x86/search_grnn_compute_test.cc | 100 + .../x86/search_group_padding_compute.cc | 28 + .../x86/search_group_padding_compute.h | 105 + .../x86/search_group_padding_compute_test.cc | 92 + .../x86/search_seq_depadding_compute.cc | 76 + .../x86/search_seq_depadding_compute.h | 40 + .../x86/search_seq_depadding_compute_test.cc | 83 + lite/kernels/x86/search_seq_fc_compute.cc | 27 + lite/kernels/x86/search_seq_fc_compute.h | 73 + .../x86/sequence_arithmetic_compute.cc | 38 + .../kernels/x86/sequence_arithmetic_compute.h | 111 + .../x86/sequence_arithmetic_compute_test.cc | 125 + lite/kernels/x86/sequence_concat_compute.cc | 25 + lite/kernels/x86/sequence_concat_compute.h | 84 + .../x86/sequence_concat_compute_test.cc | 163 ++ lite/kernels/x86/sequence_reverse_compute.cc | 32 + lite/kernels/x86/sequence_reverse_compute.h | 63 + .../x86/sequence_reverse_compute_test.cc | 108 + .../x86/sequence_topk_avg_pooling_compute.cc | 29 + .../x86/sequence_topk_avg_pooling_compute.h | 50 + lite/kernels/x86/softmax_compute.cc | 10 + lite/kernels/x86/stack_compute.cc | 25 + lite/kernels/x86/stack_compute.h | 72 + lite/kernels/x86/stack_compute_test.cc | 89 + lite/kernels/x86/var_conv_2d_compute.cc | 27 + lite/kernels/x86/var_conv_2d_compute.h | 213 ++ lite/kernels/x86/var_conv_2d_compute_test.cc | 315 ++ lite/kernels/xpu/bridges/conv_op.cc | 31 +- lite/kernels/xpu/bridges/conv_op_test.cc | 5 +- lite/kernels/xpu/bridges/pool_op_test.cc | 5 +- lite/model_parser/model_parser.cc | 2 +- lite/operators/CMakeLists.txt | 92 +- lite/operators/activation_ops.cc | 1 + lite/operators/attention_padding_mask_op.cc | 70 + lite/operators/attention_padding_mask_op.h | 46 + lite/operators/conv_op.cc | 48 +- lite/operators/conv_op.h | 52 +- lite/operators/conv_transpose_op.cc | 93 +- lite/operators/conv_transpose_op.h | 1 + 
lite/operators/fill_constant_op.cc | 23 + lite/operators/interpolate_op.cc | 55 +- lite/operators/lookup_table_v2_op.cc | 68 + lite/operators/lookup_table_v2_op.h | 46 + lite/operators/lrn_op.cc | 6 +- lite/operators/match_matrix_tensor_op.cc | 105 + lite/operators/match_matrix_tensor_op.h | 49 + lite/operators/op_params.h | 146 +- lite/operators/pool_op.cc | 38 +- lite/operators/pool_op.h | 56 +- lite/operators/search_aligned_mat_mul_op.cc | 101 + lite/operators/search_aligned_mat_mul_op.h | 47 + lite/operators/search_fc_op.cc | 80 + lite/operators/search_fc_op.h | 46 + lite/operators/search_grnn_op.cc | 94 + lite/operators/search_grnn_op.h | 48 + lite/operators/search_group_padding_op.cc | 67 + lite/operators/search_group_padding_op.h | 41 + lite/operators/search_seq_depadding_op.cc | 71 + lite/operators/search_seq_depadding_op.h | 49 + lite/operators/search_seq_fc_op.cc | 80 + lite/operators/search_seq_fc_op.h | 47 + lite/operators/search_seq_softmax_op.cc | 52 + lite/operators/search_seq_softmax_op.h | 47 + lite/operators/sequence_arithmetic_op.cc | 58 + lite/operators/sequence_arithmetic_op.h | 46 + lite/operators/sequence_concat_op.cc | 85 + lite/operators/sequence_concat_op.h | 41 + lite/operators/sequence_reverse_op.cc | 55 + lite/operators/sequence_reverse_op.h | 41 + .../operators/sequence_topk_avg_pooling_op.cc | 85 + lite/operators/sequence_topk_avg_pooling_op.h | 49 + lite/operators/split_op.cc | 31 +- lite/operators/unsqueeze_op.cc | 11 +- lite/operators/var_conv_2d_op.cc | 79 + lite/operators/var_conv_2d_op.h | 41 + lite/tests/cv/image_convert_test.cc | 12 +- lite/tests/kernels/CMakeLists.txt | 2 + .../kernels/bilinear_interp_compute_test.cc | 100 +- .../kernels/conv2d_transpose_compute_test.cc | 185 +- .../kernels/fill_constant_compute_test.cc | 178 ++ lite/tests/kernels/lrn_compute_test.cc | 2 +- .../kernels/nearest_interp_compute_test.cc | 84 +- .../search_aligned_mat_mul_compute_test.cc | 220 ++ .../kernels/search_seq_fc_compute_test.cc | 177 ++ .../kernels/shuffle_channel_compute_test.cc | 15 +- lite/tests/kernels/unsqueeze_compute_test.cc | 5 +- lite/tests/math/CMakeLists.txt | 8 + lite/tests/math/conv_compute_test.cc | 228 +- lite/tests/math/conv_int8_compute_test.cc | 203 +- .../tests/math/conv_transpose_compute_test.cc | 121 +- lite/tests/math/gemm_int8_compute_test.cc | 30 +- lite/tests/math/gemv_int8_compute_test.cc | 30 +- lite/tests/math/layout_compute_test.cc | 608 ++++ lite/tests/math/pool_compute_test.cc | 106 +- lite/tests/math/sgemm_c4_compute_test.cc | 236 ++ lite/tests/math/sgemm_compute_test.cc | 16 +- lite/tests/math/sgemv_compute_test.cc | 194 ++ lite/tests/utils/naive_math_impl.h | 138 +- lite/tools/build.sh | 16 +- lite/tools/build_npu.sh | 4 +- lite/tools/ci_build.sh | 24 +- lite/tools/debug/debug_utils.h | 6 +- lite/utils/cv/paddle_image_preprocess.cc | 234 -- lite/utils/cv/paddle_image_preprocess.h | 6 +- lite/utils/io.h | 60 + lite/utils/logging.cc | 8 +- lite/utils/logging.h | 31 + lite/utils/replace_stl/stream.cc | 58 +- lite/utils/replace_stl/stream.h | 21 +- mobile/src/fpga/V2/api.cpp | 2 +- mobile/src/fpga/V2/image.cpp | 11 +- mobile/src/fpga/V2/pe.cpp | 473 ++- mobile/src/fpga/common/driver.cpp | 2 +- mobile/src/fpga/common/fpga_common.h | 1 + .../fpga/V2/anchor_generator_kernel.cpp | 2 +- .../kernel/fpga/V2/concat_kernel.cpp | 9 + .../kernel/fpga/V2/elementwise_add_kernel.cpp | 33 +- .../fpga/V2/elementwise_add_relu_kernel.cpp | 31 +- .../kernel/fpga/V2/reshape2_kernel.cpp | 23 +- .../kernel/fpga/V2/sigmoid_kernel.cpp | 2 +- 
.../operators/kernel/fpga/V2/slice_kernel.cpp | 29 +- .../src/operators/math/depthwise_conv3x3.cpp | 10 +- 394 files changed, 30742 insertions(+), 3308 deletions(-) create mode 100644 lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc create mode 100644 lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc create mode 100644 lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc create mode 100644 lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc create mode 100644 lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc create mode 100644 lite/backends/arm/math/layout.cc create mode 100644 lite/backends/arm/math/layout.h create mode 100644 lite/backends/arm/math/packed_sgemm_c4.cc create mode 100644 lite/backends/arm/math/packed_sgemm_c4.h create mode 100644 lite/backends/cuda/math/batched_gemm.cc create mode 100644 lite/backends/cuda/math/batched_gemm.h create mode 100644 lite/backends/cuda/math/cudnn_pool.cc create mode 100644 lite/backends/cuda/math/cudnn_pool.h create mode 100644 lite/backends/cuda/math/gemm.cc create mode 100644 lite/backends/cuda/math/gemm.h mode change 100755 => 100644 lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp create mode 100644 lite/backends/x86/math/search_fc.cc create mode 100644 lite/backends/x86/math/search_fc.h create mode 100644 lite/backends/x86/math/sequence_topk_avg_pooling.cc create mode 100644 lite/backends/x86/math/sequence_topk_avg_pooling.h create mode 100644 lite/core/profile/profiler.cc create mode 100644 lite/core/profile/profiler.h create mode 100644 lite/core/profile/test_timer.cc create mode 100644 lite/core/profile/timer.h create mode 100644 lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 create mode 100644 lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 create mode 100644 lite/demo/cxx/mobile_detection/mobile_detection.cc create mode 100644 lite/demo/cxx/mobile_detection/test.jpg create mode 100644 lite/kernels/arm/layout_compute.cc create mode 100644 lite/kernels/arm/layout_compute.h create mode 100644 lite/kernels/arm/lookup_table_compute_test.cc create mode 100644 lite/kernels/cuda/attention_padding_mask_compute.cu create mode 100644 lite/kernels/cuda/attention_padding_mask_compute.h create mode 100644 lite/kernels/cuda/attention_padding_mask_compute_test.cc create mode 100644 lite/kernels/cuda/elementwise_compute.cu create mode 100644 lite/kernels/cuda/elementwise_compute.h create mode 100644 lite/kernels/cuda/elementwise_compute_test.cc create mode 100644 lite/kernels/cuda/match_matrix_tensor_compute.cu create mode 100644 lite/kernels/cuda/match_matrix_tensor_compute.h create mode 100644 lite/kernels/cuda/match_matrix_tensor_compute_test.cc create mode 100644 lite/kernels/cuda/search_aligned_mat_mul_compute.cc create mode 100644 lite/kernels/cuda/search_aligned_mat_mul_compute.h create mode 100644 lite/kernels/cuda/search_aligned_mat_mul_compute_test.cc create mode 100644 lite/kernels/cuda/search_fc_compute.cu create mode 100644 lite/kernels/cuda/search_fc_compute.h create mode 100644 lite/kernels/cuda/search_fc_compute_test.cc create mode 100644 lite/kernels/cuda/search_grnn_compute.cu create mode 100644 lite/kernels/cuda/search_grnn_compute.h create mode 100644 lite/kernels/cuda/search_grnn_compute_test.cc create mode 100644 lite/kernels/cuda/search_group_padding_compute.cu create mode 100644 lite/kernels/cuda/search_group_padding_compute.h create mode 100644 lite/kernels/cuda/search_group_padding_compute_test.cc create mode 100644 lite/kernels/cuda/search_seq_depadding_compute.cu create mode 100644 
lite/kernels/cuda/search_seq_depadding_compute.h create mode 100644 lite/kernels/cuda/search_seq_depadding_compute_test.cc create mode 100644 lite/kernels/cuda/search_seq_fc_compute.cu create mode 100644 lite/kernels/cuda/search_seq_fc_compute.h create mode 100644 lite/kernels/cuda/search_seq_fc_compute_test.cc create mode 100644 lite/kernels/cuda/sequence_arithmetic_compute.cu create mode 100644 lite/kernels/cuda/sequence_arithmetic_compute.h create mode 100644 lite/kernels/cuda/sequence_arithmetic_compute_test.cc create mode 100644 lite/kernels/cuda/sequence_concat_compute.cu create mode 100644 lite/kernels/cuda/sequence_concat_compute.h create mode 100644 lite/kernels/cuda/sequence_concat_compute_test.cc create mode 100644 lite/kernels/cuda/sequence_pool_compute.cu create mode 100644 lite/kernels/cuda/sequence_pool_compute.h create mode 100644 lite/kernels/cuda/sequence_pool_compute_test.cc create mode 100644 lite/kernels/cuda/sequence_reverse_compute.cu create mode 100644 lite/kernels/cuda/sequence_reverse_compute.h create mode 100644 lite/kernels/cuda/sequence_reverse_compute_test.cc create mode 100644 lite/kernels/cuda/sequence_topk_avg_pooling_compute.cu create mode 100644 lite/kernels/cuda/sequence_topk_avg_pooling_compute.h create mode 100644 lite/kernels/cuda/var_conv_2d_compute.cu create mode 100644 lite/kernels/cuda/var_conv_2d_compute.h create mode 100644 lite/kernels/cuda/var_conv_2d_compute_test.cc create mode 100644 lite/kernels/npu/bridges/reduce_mean_op.cc create mode 100644 lite/kernels/npu/bridges/reduce_mean_op_test.cc create mode 100644 lite/kernels/npu/bridges/sqrt_op.cc create mode 100644 lite/kernels/npu/bridges/sqrt_op_test.cc create mode 100644 lite/kernels/npu/bridges/square_op.cc create mode 100644 lite/kernels/npu/bridges/square_op_test.cc create mode 100644 lite/kernels/x86/attention_padding_mask_compute.cc create mode 100644 lite/kernels/x86/attention_padding_mask_compute.h create mode 100644 lite/kernels/x86/attention_padding_mask_compute_test.cc create mode 100644 lite/kernels/x86/cast_compute.cc create mode 100644 lite/kernels/x86/cast_compute.h create mode 100644 lite/kernels/x86/cast_compute_test.cc create mode 100644 lite/kernels/x86/gather_compute.cc create mode 100644 lite/kernels/x86/gather_compute.h create mode 100644 lite/kernels/x86/gather_compute_test.cc create mode 100644 lite/kernels/x86/layer_norm_compute.cc create mode 100644 lite/kernels/x86/layer_norm_compute.h create mode 100644 lite/kernels/x86/layer_norm_compute_test.cc create mode 100644 lite/kernels/x86/lookup_table_compute_test.cc create mode 100644 lite/kernels/x86/match_matrix_tensor_compute.cc create mode 100644 lite/kernels/x86/match_matrix_tensor_compute.h create mode 100644 lite/kernels/x86/match_matrix_tensor_compute_test.cc create mode 100644 lite/kernels/x86/search_aligned_mat_mul_compute.cc create mode 100644 lite/kernels/x86/search_aligned_mat_mul_compute.h create mode 100644 lite/kernels/x86/search_fc_compute.cc create mode 100644 lite/kernels/x86/search_fc_compute.h create mode 100644 lite/kernels/x86/search_fc_compute_test.cc create mode 100644 lite/kernels/x86/search_grnn_compute.cc create mode 100644 lite/kernels/x86/search_grnn_compute.h create mode 100644 lite/kernels/x86/search_grnn_compute_test.cc create mode 100644 lite/kernels/x86/search_group_padding_compute.cc create mode 100644 lite/kernels/x86/search_group_padding_compute.h create mode 100644 lite/kernels/x86/search_group_padding_compute_test.cc create mode 100644 
lite/kernels/x86/search_seq_depadding_compute.cc create mode 100644 lite/kernels/x86/search_seq_depadding_compute.h create mode 100644 lite/kernels/x86/search_seq_depadding_compute_test.cc create mode 100644 lite/kernels/x86/search_seq_fc_compute.cc create mode 100644 lite/kernels/x86/search_seq_fc_compute.h create mode 100644 lite/kernels/x86/sequence_arithmetic_compute.cc create mode 100644 lite/kernels/x86/sequence_arithmetic_compute.h create mode 100644 lite/kernels/x86/sequence_arithmetic_compute_test.cc create mode 100644 lite/kernels/x86/sequence_concat_compute.cc create mode 100644 lite/kernels/x86/sequence_concat_compute.h create mode 100644 lite/kernels/x86/sequence_concat_compute_test.cc create mode 100644 lite/kernels/x86/sequence_reverse_compute.cc create mode 100644 lite/kernels/x86/sequence_reverse_compute.h create mode 100644 lite/kernels/x86/sequence_reverse_compute_test.cc create mode 100644 lite/kernels/x86/sequence_topk_avg_pooling_compute.cc create mode 100644 lite/kernels/x86/sequence_topk_avg_pooling_compute.h create mode 100644 lite/kernels/x86/stack_compute.cc create mode 100644 lite/kernels/x86/stack_compute.h create mode 100644 lite/kernels/x86/stack_compute_test.cc create mode 100644 lite/kernels/x86/var_conv_2d_compute.cc create mode 100644 lite/kernels/x86/var_conv_2d_compute.h create mode 100644 lite/kernels/x86/var_conv_2d_compute_test.cc create mode 100644 lite/operators/attention_padding_mask_op.cc create mode 100644 lite/operators/attention_padding_mask_op.h create mode 100644 lite/operators/lookup_table_v2_op.cc create mode 100644 lite/operators/lookup_table_v2_op.h create mode 100644 lite/operators/match_matrix_tensor_op.cc create mode 100644 lite/operators/match_matrix_tensor_op.h create mode 100644 lite/operators/search_aligned_mat_mul_op.cc create mode 100644 lite/operators/search_aligned_mat_mul_op.h create mode 100644 lite/operators/search_fc_op.cc create mode 100644 lite/operators/search_fc_op.h create mode 100644 lite/operators/search_grnn_op.cc create mode 100644 lite/operators/search_grnn_op.h create mode 100644 lite/operators/search_group_padding_op.cc create mode 100644 lite/operators/search_group_padding_op.h create mode 100644 lite/operators/search_seq_depadding_op.cc create mode 100644 lite/operators/search_seq_depadding_op.h create mode 100644 lite/operators/search_seq_fc_op.cc create mode 100644 lite/operators/search_seq_fc_op.h create mode 100644 lite/operators/search_seq_softmax_op.cc create mode 100644 lite/operators/search_seq_softmax_op.h create mode 100644 lite/operators/sequence_arithmetic_op.cc create mode 100644 lite/operators/sequence_arithmetic_op.h create mode 100644 lite/operators/sequence_concat_op.cc create mode 100644 lite/operators/sequence_concat_op.h create mode 100644 lite/operators/sequence_reverse_op.cc create mode 100644 lite/operators/sequence_reverse_op.h create mode 100644 lite/operators/sequence_topk_avg_pooling_op.cc create mode 100644 lite/operators/sequence_topk_avg_pooling_op.h create mode 100644 lite/operators/var_conv_2d_op.cc create mode 100644 lite/operators/var_conv_2d_op.h create mode 100644 lite/tests/kernels/fill_constant_compute_test.cc create mode 100644 lite/tests/kernels/search_aligned_mat_mul_compute_test.cc create mode 100644 lite/tests/kernels/search_seq_fc_compute_test.cc create mode 100644 lite/tests/math/layout_compute_test.cc create mode 100644 lite/tests/math/sgemm_c4_compute_test.cc create mode 100644 lite/tests/math/sgemv_compute_test.cc mode change 100644 => 100755 
mobile/src/fpga/V2/image.cpp mode change 100644 => 100755 mobile/src/fpga/V2/pe.cpp mode change 100644 => 100755 mobile/src/fpga/common/driver.cpp mode change 100644 => 100755 mobile/src/fpga/common/fpga_common.h mode change 100644 => 100755 mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp mode change 100644 => 100755 mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp mode change 100644 => 100755 mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp mode change 100644 => 100755 mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp mode change 100644 => 100755 mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp mode change 100644 => 100755 mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 9ff908a4c8..a5d3d57218 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -174,15 +174,26 @@ if(NOT WITH_DSO) endif(WIN32) endif(NOT WITH_DSO) -get_filename_component(CUDA_LIB_PATH ${CUDA_curand_LIBRARY} DIRECTORY) -function(import_static_library alias path) +function(add_cuda_static_lib alias cuda_lib_paths file_name) + unset(ABS_PATH CACHE) + find_library(ABS_PATH NAMES ${file_name} PATHS ${${cuda_lib_paths}} NO_DEFAULT_PATH) add_library(${alias} STATIC IMPORTED GLOBAL) - set_property(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${path}) + set_property(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${ABS_PATH}) + set(CUDA_STATIC_MODULES ${CUDA_STATIC_MODULES} ${alias} PARENT_SCOPE) + if (NOT ABS_PATH) + message(FATAL_ERROR "Can not find CUDA static library: ${file_name}") + endif() endfunction() -import_static_library(cudart_static ${CUDA_LIB_PATH}/libcudart_static.a) -import_static_library(cublas_static ${CUDA_LIB_PATH}/libcublas_static.a) -import_static_library(curand_static ${CUDA_LIB_PATH}/libcurand_static.a) -import_static_library(culibos_static ${CUDA_LIB_PATH}/libculibos.a) + +add_cuda_static_lib(cudart_static CUDNN_CHECK_LIBRARY_DIRS libcudart_static.a) +add_cuda_static_lib(cublas_static CUDNN_CHECK_LIBRARY_DIRS libcublas_static.a) +add_cuda_static_lib(curand_static CUDNN_CHECK_LIBRARY_DIRS libcurand_static.a) +add_cuda_static_lib(culibos_static CUDNN_CHECK_LIBRARY_DIRS libculibos.a) +if(NOT ${CUDA_VERSION} LESS 10.1) + add_cuda_static_lib(cublasLt_static CUDNN_CHECK_LIBRARY_DIRS libcublasLt_static.a) +endif() + +set_property(GLOBAL PROPERTY CUDA_STATIC_MODULES cudnn_static ${CUDA_STATIC_MODULES}) # setting nvcc arch flags select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 842b94d47e..574baa86a8 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -26,13 +26,15 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS ${CUDNN_ROOT}/lib64 ${CUDNN_ROOT}/lib ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu - ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/ + /usr/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/ + /usr/lib/${TARGET_ARCH}-linux-gnu/ $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib /usr/lib ${CUDA_TOOLKIT_ROOT_DIR} - ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ) if((${CUDA_VERSION} GREATER 10.0) OR (${CUDA_VERSION} EQUAL 10.0)) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index bd0d117a63..599e7bba7e 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -1,5 +1,6 @@ INCLUDE(ExternalProject) +SET(EIGEN_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/eigen3) SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) 
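# Editor's note, kept as a CMake comment so it does not disturb the surrounding
# hunk: the add_cuda_static_lib() helper introduced in cmake/cuda.cmake above
# replaces import_static_library() and locates each static CUDA library via
# find_library() instead of a hard-coded CUDA_LIB_PATH. A minimal usage sketch,
# assuming CUDNN_CHECK_LIBRARY_DIRS has already been filled in by cmake/cudnn.cmake:
#
#   add_cuda_static_lib(cudart_static CUDNN_CHECK_LIBRARY_DIRS libcudart_static.a)
#
#   # which expands to roughly:
#   find_library(ABS_PATH NAMES libcudart_static.a
#                PATHS ${CUDNN_CHECK_LIBRARY_DIRS} NO_DEFAULT_PATH)
#   add_library(cudart_static STATIC IMPORTED GLOBAL)
#   set_property(TARGET cudart_static PROPERTY IMPORTED_LOCATION ${ABS_PATH})
#   # and raises FATAL_ERROR if the library cannot be found.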
SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3) INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) @@ -16,9 +17,12 @@ if(WITH_AMD_GPU) ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" - GIT_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e + GIT_TAG + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Fhipeigen-upstream-702834151eaebcf955fd09ed0ad83c06.zip + DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} + DOWNLOAD_NO_PROGRESS 1 PREFIX ${EIGEN_SOURCE_DIR} + DOWNLOAD_NAME "hipeigen-upstream-702834151eaebcf955fd09ed0ad83c06.zip" UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" @@ -29,12 +33,14 @@ else() ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen - GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c + GIT_TAG + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip + DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} + DOWNLOAD_NO_PROGRESS 1 PREFIX ${EIGEN_SOURCE_DIR} - DOWNLOAD_NAME "eigen" + DOWNLOAD_NAME "eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip" UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake index 1d61154c0d..5166b494c4 100644 --- a/cmake/external/xbyak.cmake +++ b/cmake/external/xbyak.cmake @@ -20,6 +20,7 @@ endif() include(ExternalProject) +SET(XBYAK_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/xbyak) set(XBYAK_PROJECT extern_xbyak) set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak) set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak) @@ -38,8 +39,11 @@ ExternalProject_Add( ${XBYAK_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS "" - GIT_REPOSITORY "https://github.com/herumi/xbyak.git" GIT_TAG "v5.661" # Jul 26th + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Fxbyak-5.66.zip + DOWNLOAD_DIR ${XBYAK_SOURCECODE_DIR} + DOWNLOAD_NAME "xbyak-5.66.zip" + DOWNLOAD_NO_PROGRESS 1 PREFIX ${XBYAK_PREFIX_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT} diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index 23b1e02108..fdc20351e8 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -1,5 +1,6 @@ INCLUDE(ExternalProject) +SET(XXHASH_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/xxhash) set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash) set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash) set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include") @@ -18,10 +19,12 @@ if(WIN32) ExternalProject_Add( extern_xxhash ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" GIT_TAG "v0.6.5" + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2FxxHash-0.6.5.zip + DOWNLOAD_DIR ${XXHASH_SOURCECODE_DIR} + DOWNLOAD_NAME "xxHash-0.6.5.zip" + DOWNLOAD_NO_PROGRESS 1 PREFIX ${XXHASH_SOURCE_DIR} - DOWNLOAD_NAME "xxhash" UPDATE_COMMAND "" BUILD_IN_SOURCE 1 PATCH_COMMAND @@ -41,10 +44,12 @@ else() ExternalProject_Add( extern_xxhash ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" GIT_TAG "v0.6.5" + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2FxxHash-0.6.5.zip + DOWNLOAD_DIR ${XXHASH_SOURCECODE_DIR} + DOWNLOAD_NO_PROGRESS 1 PREFIX 
${XXHASH_SOURCE_DIR} - DOWNLOAD_NAME "xxhash" + DOWNLOAD_NAME "xxHash-0.6.5.zip" UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_IN_SOURCE 1 diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 415eb451a9..225a3c19a1 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -490,6 +490,9 @@ function(nv_binary TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS}) + target_link_libraries(${TARGET_NAME} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES}) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${os_dependency_modules}) if(nv_binary_DEPS) target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS}) add_dependencies(${TARGET_NAME} ${nv_binary_DEPS}) @@ -507,7 +510,7 @@ function(nv_test TARGET_NAME) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest -gflags glog ${os_dependency_modules} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES} ) + gflags glog ${os_dependency_modules} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES} ) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest gflags glog) common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) diff --git a/cmake/lite.cmake b/cmake/lite.cmake index 4423e27e1a..3b9b4ece23 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -164,7 +164,9 @@ function(lite_cc_library TARGET) endfunction() function(lite_cc_binary TARGET) - set(options "") + if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + set(options " -g ") + endif() set(oneValueArgs "") set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) @@ -255,6 +257,7 @@ endfunction() set(arm_kernels CACHE INTERNAL "arm kernels") set(x86_kernels CACHE INTERNAL "x86 kernels") +set(cuda_kernels CACHE INTERNAL "cuda kernels") set(fpga_kernels CACHE INTERNAL "fpga kernels") set(npu_kernels CACHE INTERNAL "npu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels") diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index 173f04126e..036df2a824 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -5,6 +5,7 @@ message(STATUS "LIGHT_FRAMEWORK:\t${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}") message(STATUS "LITE_WITH_CUDA:\t${LITE_WITH_CUDA}") message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}") message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") +message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}") message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") @@ -121,6 +122,9 @@ if (LITE_WITH_X86) add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3) endif() +if(LITE_WITH_CUDA) + add_dependencies(publish_inference paddle_full_api_shared) +endif(LITE_WITH_CUDA) if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (NOT LITE_ON_TINY_PUBLISH) # add cxx lib @@ -161,7 +165,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/include" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include" COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/lib" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" 
"${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include" ) add_dependencies(tiny_publish_lib bundle_light_api) add_dependencies(publish_inference tiny_publish_lib) @@ -177,6 +181,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) ) add_dependencies(tiny_publish_cxx_lib paddle_light_api_shared) add_dependencies(publish_inference tiny_publish_cxx_lib) + add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD + COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so) endif() endif() endif() @@ -199,7 +205,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) endif() endif() - if ((ARM_TARGET_OS STREQUAL "android") AND (NOT LITE_WITH_OPENCL) AND + if ((ARM_TARGET_OS STREQUAL "android") AND ((ARM_TARGET_ARCH_ABI STREQUAL armv7) OR (ARM_TARGET_ARCH_ABI STREQUAL armv8))) if (NOT LITE_ON_TINY_PUBLISH) # copy @@ -214,6 +220,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include" ) add_dependencies(publish_inference_android_cxx_demos logging gflags) add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) @@ -225,6 +234,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/README.md" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile" + ) add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos) endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index aef0fc396e..e660bbcdd6 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -9,7 +9,7 @@ if (LITE_ON_TINY_PUBLISH) set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG") endif() set(light_lib_DEPS light_api paddle_api paddle_api_light optimizer) -if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) +if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) #full api dynamic library add_library(paddle_full_api_shared SHARED "") 
target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc) @@ -19,7 +19,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "and add_dependencies(paddle_full_api_shared xxhash) target_link_libraries(paddle_full_api_shared xxhash) endif() - + if(LITE_WITH_CUDA) + target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive") + endif(LITE_WITH_CUDA) #light api dynamic library lite_cc_library(paddle_light_api_shared MODULE SRCS light_api_shared.cc @@ -65,6 +67,7 @@ endif() message(STATUS "get ops ${ops}") message(STATUS "get X86 kernels ${x86_kernels}") +message(STATUS "get CUDA kernels ${cuda_kernels}") message(STATUS "get Host kernels ${host_kernels}") message(STATUS "get ARM kernels ${arm_kernels}") message(STATUS "get NPU kernels ${npu_kernels}") @@ -83,18 +86,17 @@ if (NOT LITE_ON_TINY_PUBLISH) ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass - CL_DEPS ${opencl_kenrels} - FPGA_DEPS ${fpga_kenrels} - BM_DEPS ${bm_kenrels}) + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels}) + BM_DEPS ${bm_kernels}) endif() # for light api set(light_api_deps scope target_wrapper_host model_parser program) if(LITE_WITH_CUDA) + get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES) set(light_api_deps ${light_api_deps} target_wrapper_cuda) - set(cuda_static_deps cudart_static cublas_static curand_static - cudnn_static culibos_static) endif() lite_cc_library(light_api SRCS light_api.cc DEPS scope target_wrapper_host model_parser @@ -104,9 +106,9 @@ lite_cc_library(light_api SRCS light_api.cc ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} - CL_DEPS ${opencl_kenrels} - FPGA_DEPS ${fpga_kenrels} - BM_DEPS ${bm_kenrels}) + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels}) + BM_DEPS ${bm_kernels}) include(ExternalProject) set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING @@ -305,9 +307,10 @@ if(NOT IOS) NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} - X86_DEPS ${x86_kernels}) + FPGA_DEPS ${fpga_kernels} + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} @@ -316,7 +319,9 @@ if(NOT IOS) CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} - X86_DEPS ${x86_kernels}) + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) + endif() #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc diff --git a/lite/api/benchmark.cc b/lite/api/benchmark.cc index 462a5e2381..c137324b57 100644 --- a/lite/api/benchmark.cc +++ b/lite/api/benchmark.cc @@ -44,9 +44,10 @@ void OutputOptModel(const std::string& load_model_dir, const std::vector>& input_shapes) { lite_api::CxxConfig config; config.set_model_dir(load_model_dir); - std::vector vaild_places = {Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}, - Place{TARGET(kOpenCL), PRECISION(kFloat)}}; + std::vector vaild_places = { + Place{TARGET(kARM), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kFloat)}, + }; if (FLAGS_is_quantized_model) { vaild_places.insert(vaild_places.begin(), Place{TARGET(kARM), PRECISION(kInt8)}); diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index a2b538aa77..4647f20bbe 100644 
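// Editor's note, kept as a comment (illustrative only): the benchmark.cc hunk
// above trims the default valid places to ARM and X86 float kernels. A minimal
// sketch of how a CxxConfig is filled with those places, including the int8
// place that is prepended for quantized models, assuming the usual lite_api
// headers are included:
//
//   lite_api::CxxConfig config;
//   config.set_model_dir(load_model_dir);
//   std::vector<Place> valid_places = {
//       Place{TARGET(kARM), PRECISION(kFloat)},
//       Place{TARGET(kX86), PRECISION(kFloat)},
//   };
//   if (FLAGS_is_quantized_model) {
//     valid_places.insert(valid_places.begin(),
//                         Place{TARGET(kARM), PRECISION(kInt8)});
//   }
//   config.set_valid_places(valid_places);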
--- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -24,13 +24,6 @@ namespace paddle { namespace lite { -static const char TAILORD_OPS_SOURCE_LIST_FILENAME[] = - ".tailored_ops_source_list"; -static const char TAILORD_OPS_LIST_NAME[] = ".tailored_ops_list"; -static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] = - ".tailored_kernels_source_list"; -static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list"; - void Predictor::SaveModel(const std::string &dir, lite_api::LiteModelType model_type, bool record_info) { @@ -140,21 +133,35 @@ lite::Tensor *Predictor::GetInput(size_t offset) { // get inputs names std::vector Predictor::GetInputNames() { return input_names_; } + // get outputnames std::vector Predictor::GetOutputNames() { return output_names_; } + // append the names of inputs and outputs into input_names_ and output_names_ void Predictor::PrepareFeedFetch() { + std::vector feeds; + std::vector fetchs; +#if defined(LITE_WITH_NPU) || defined(LITE_WITH_XPU) + // The shape of input tensors must be determined before generating NPU and XPU + // program. auto current_block = program_desc_.GetBlock(0); - std::vector feeds; - std::vector fetchs; for (size_t i = 0; i < current_block->OpsSize(); i++) { auto op = current_block->GetOp(i); +#else + if (!program_) { + GenRuntimeProgram(); + } + const auto &insts = program_->instructions(); + for (size_t i = 0; i < program_->num_instructions(); i++) { + const auto &op = insts[i].op()->op_info(); +#endif if (op->Type() == "feed") { feeds.push_back(op); } else if (op->Type() == "fetch") { fetchs.push_back(op); } } + input_names_.resize(feeds.size()); output_names_.resize(fetchs.size()); for (size_t i = 0; i < feeds.size(); i++) { @@ -190,6 +197,7 @@ std::vector Predictor::GetOutputs() const { const cpp::ProgramDesc &Predictor::program_desc() const { return program_desc_; } + const RuntimeProgram &Predictor::runtime_program() const { return *program_; } void Predictor::Build(const lite_api::CxxConfig &config, @@ -246,16 +254,18 @@ void Predictor::Build(const cpp::ProgramDesc &desc, const std::vector &valid_places, const std::vector &passes) { program_desc_ = desc; + // `inner_places` is used to optimize passes std::vector inner_places = valid_places; inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)); inner_places.emplace_back( TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); Program program(desc, scope_, inner_places); - /// The first place in valid_places is + core::KernelPickFactor factor; factor.ConsiderTarget(); factor.ConsiderPrecision(); factor.ConsiderDataLayout(); + optimizer_.Run(std::move(program), inner_places, factor, passes); exec_scope_ = optimizer_.exec_scope(); PrepareFeedFetch(); @@ -271,6 +281,7 @@ const lite::Tensor *Predictor::GetTensor(const std::string &name) const { auto *var = exec_scope_->FindVar(name); return &var->Get(); } + // get input by name lite::Tensor *Predictor::GetInputByName(const std::string &name) { auto element = std::find(input_names_.begin(), input_names_.end(), name); diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h index 502ce812e1..504710d9fa 100644 --- a/lite/api/cxx_api.h +++ b/lite/api/cxx_api.h @@ -29,6 +29,13 @@ namespace paddle { namespace lite { +static const char TAILORD_OPS_SOURCE_LIST_FILENAME[] = + ".tailored_ops_source_list"; +static const char TAILORD_OPS_LIST_NAME[] = ".tailored_ops_list"; +static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] = + ".tailored_kernels_source_list"; +static const char TAILORD_KERNELS_LIST_NAME[] = 
".tailored_kernels_list"; + /* * Predictor for inference, input a model, it will optimize and execute it. */ diff --git a/lite/api/mobilenetv1_test.cc b/lite/api/mobilenetv1_test.cc index 63a401745b..79f9bea762 100644 --- a/lite/api/mobilenetv1_test.cc +++ b/lite/api/mobilenetv1_test.cc @@ -123,8 +123,11 @@ TEST(MobileNetV1, test_arm) { #ifdef LITE_WITH_OPENCL TEST(MobileNetV1, test_opencl) { std::vector valid_places({ - Place{TARGET(kOpenCL), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)}, + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)}, + TARGET(kARM), // enable kARM CPU kernel when no opencl kernel }); TestModel(valid_places); diff --git a/lite/api/model_optimize_tool.cc b/lite/api/model_optimize_tool.cc index 1aef522b2a..1c426e8568 100644 --- a/lite/api/model_optimize_tool.cc +++ b/lite/api/model_optimize_tool.cc @@ -20,6 +20,7 @@ // model_optimize_tool's compiling period #include "all_kernel_faked.cc" // NOLINT #include "kernel_src_map.h" // NOLINT +#include "lite/api/cxx_api.h" #include "lite/api/paddle_api.h" #include "lite/api/paddle_use_ops.h" #include "lite/api/paddle_use_passes.h" @@ -31,6 +32,18 @@ DEFINE_string(model_dir, "", "path of the model. This option will be ignored if model_file " "and param_file are exist"); +DEFINE_string(model_filename, + "", + "model topo filename of the model in models set. This option" + " will be used to specific tailoring"); +DEFINE_string(param_filename, + "", + "model param filename of the model in models set. This option" + " will be used to specific tailoring"); +DEFINE_string(model_set_dir, + "", + "path of the models set. This option will be used to specific" + " tailoring"); DEFINE_string(model_file, "", "model file path of the combined-param model"); DEFINE_string(param_file, "", "param file path of the combined-param model"); DEFINE_string( @@ -58,29 +71,23 @@ void DisplayKernels() { LOG(INFO) << ::paddle::lite::KernelRegistry::Global().DebugString(); } -void Main() { - if (!FLAGS_model_file.empty() && !FLAGS_param_file.empty()) { - LOG(WARNING) - << "Load combined-param model. 
Option model_dir will be ignored"; - } - - if (FLAGS_display_kernels) { - DisplayKernels(); - exit(0); - } - - lite_api::CxxConfig config; - config.set_model_dir(FLAGS_model_dir); - config.set_model_file(FLAGS_model_file); - config.set_param_file(FLAGS_param_file); - +std::vector ParserValidPlaces() { std::vector valid_places; - auto target_reprs = lite::Split(FLAGS_valid_targets, " "); + auto target_reprs = lite::Split(FLAGS_valid_targets, ","); for (auto& target_repr : target_reprs) { if (target_repr == "arm") { valid_places.emplace_back(TARGET(kARM)); } else if (target_repr == "opencl") { - valid_places.emplace_back(TARGET(kOpenCL)); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)}); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)}); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)}); + valid_places.emplace_back( + TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel } else if (target_repr == "x86") { valid_places.emplace_back(TARGET(kX86)); } else { @@ -100,26 +107,130 @@ void Main() { valid_places.insert(valid_places.begin(), Place{TARGET(kARM), PRECISION(kInt8)}); } + return valid_places; +} + +void RunOptimize(const std::string& model_dir, + const std::string& model_file, + const std::string& param_file, + const std::string& optimize_out, + const std::string& optimize_out_type, + const std::vector& valid_places, + bool record_tailoring_info) { + if (!model_file.empty() && !param_file.empty()) { + LOG(WARNING) + << "Load combined-param model. Option model_dir will be ignored"; + } + + lite_api::CxxConfig config; + config.set_model_dir(model_dir); + config.set_model_file(model_file); + config.set_param_file(param_file); + config.set_valid_places(valid_places); auto predictor = lite_api::CreatePaddlePredictor(config); LiteModelType model_type; - if (FLAGS_optimize_out_type == "protobuf") { + if (optimize_out_type == "protobuf") { model_type = LiteModelType::kProtobuf; - } else if (FLAGS_optimize_out_type == "naive_buffer") { + } else if (optimize_out_type == "naive_buffer") { model_type = LiteModelType::kNaiveBuffer; } else { - LOG(FATAL) << "Unsupported Model type :" << FLAGS_optimize_out_type; + LOG(FATAL) << "Unsupported Model type :" << optimize_out_type; } - OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); + OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); predictor->SaveOptimizedModel( - FLAGS_optimize_out, model_type, FLAGS_record_tailoring_info); - if (FLAGS_record_tailoring_info) { + optimize_out, model_type, record_tailoring_info); + if (record_tailoring_info) { LOG(INFO) << "Record the information of tailored model into :" - << FLAGS_optimize_out; + << optimize_out; + } +} + +void CollectModelMetaInfo(const std::string& output_dir, + const std::vector& models, + const std::string& filename) { + std::set total; + for (const auto& name : models) { + std::string model_path = + lite::Join({output_dir, name, filename}, "/"); + auto lines = lite::ReadLines(model_path); + total.insert(lines.begin(), lines.end()); + } + std::string output_path = + lite::Join({output_dir, filename}, "/"); + lite::WriteLines(std::vector(total.begin(), total.end()), + output_path); +} + +void Main() { + if (FLAGS_display_kernels) { + DisplayKernels(); + exit(0); } + + auto valid_places = ParserValidPlaces(); + if (FLAGS_model_set_dir == "") { + 
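// Editor's note (illustrative comment, not part of the patch): with this
// refactoring --valid_targets is split on commas instead of spaces, and the
// tool can optimize either a single model or every model under a directory.
// A hypothetical invocation of the model-set path, using only flags defined
// above (the file names __model__ and __params__ are placeholders):
//
//   ./model_optimize_tool \
//       --model_set_dir=./models \
//       --model_filename=__model__ --param_filename=__params__ \
//       --optimize_out=./models_opt --optimize_out_type=naive_buffer \
//       --valid_targets=arm,opencl \
//       --record_tailoring_info=true
//
// Each sub-directory of --model_set_dir is optimized into a sub-directory of
// --optimize_out with the same name, and the per-model tailoring lists are
// merged afterwards by CollectModelMetaInfo(). The branch below handles the
// single-model case (empty --model_set_dir) and returns early.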
RunOptimize(FLAGS_model_dir, + FLAGS_model_file, + FLAGS_param_file, + FLAGS_optimize_out, + FLAGS_optimize_out_type, + valid_places, + FLAGS_record_tailoring_info); + return; + } + + if (!FLAGS_record_tailoring_info) { + LOG(WARNING) << "--model_set_dir option only be used with " + "--record_tailoring_info=true together"; + return; + } + + auto model_dirs = lite::ListDir(FLAGS_model_set_dir, true); + if (model_dirs.size() == 0) { + LOG(FATAL) << "[" << FLAGS_model_set_dir << "] does not contain any model"; + } + // Optimize models in FLAGS_model_set_dir + for (const auto& name : model_dirs) { + std::string input_model_dir = + lite::Join({FLAGS_model_set_dir, name}, "/"); + std::string output_model_dir = + lite::Join({FLAGS_optimize_out, name}, "/"); + + std::string model_file = ""; + std::string param_file = ""; + + if (FLAGS_model_filename != "" && FLAGS_param_filename != "") { + model_file = + lite::Join({input_model_dir, FLAGS_model_filename}, "/"); + param_file = + lite::Join({input_model_dir, FLAGS_param_filename}, "/"); + } + + LOG(INFO) << "Start optimize model: " << input_model_dir; + RunOptimize(input_model_dir, + model_file, + param_file, + output_model_dir, + FLAGS_optimize_out_type, + valid_places, + FLAGS_record_tailoring_info); + LOG(INFO) << "Optimize done. "; + } + + // Collect all models information + CollectModelMetaInfo( + FLAGS_optimize_out, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + FLAGS_optimize_out, model_dirs, lite::TAILORD_OPS_LIST_NAME); + CollectModelMetaInfo(FLAGS_optimize_out, + model_dirs, + lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + FLAGS_optimize_out, model_dirs, lite::TAILORD_KERNELS_LIST_NAME); } } // namespace lite_api diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc index 1358267000..a04e86b7d2 100644 --- a/lite/api/model_test.cc +++ b/lite/api/model_test.cc @@ -21,14 +21,14 @@ #include "lite/api/paddle_use_passes.h" #include "lite/api/test_helper.h" #include "lite/core/device_info.h" -#include "lite/tests/utils/timer.h" +#include "lite/core/profile/timer.h" #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" #ifdef LITE_WITH_PROFILE #include "lite/core/profile/basic_profiler.h" #endif // LITE_WITH_PROFILE -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DEFINE_string(input_shape, "1,3,224,224", @@ -102,20 +102,20 @@ void Run(const std::vector>& input_shapes, Timer ti; for (int j = 0; j < repeat; ++j) { - ti.start(); + ti.Start(); predictor->Run(); - ti.end(); - LOG(INFO) << "iter: " << j << ", time: " << ti.latest_time() << " ms"; + float t = ti.Stop(); + LOG(INFO) << "iter: " << j << ", time: " << t << " ms"; } LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "Model: " << model_dir << ", power_mode: " << static_cast(power_mode) << ", threads num " << thread_num << ", warmup: " << warmup_times - << ", repeats: " << repeat << ", avg time: " << ti.get_average_ms() + << ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg() << " ms" - << ", min time: " << ti.get_min_time() << " ms" - << ", max time: " << ti.get_max_time() << " ms."; + << ", min time: " << ti.LapTimes().Min() << " ms" + << ", max time: " << ti.LapTimes().Max() << " ms."; auto output = predictor->GetOutput(0); auto out = output->data(); diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index f148096bb6..aabb535292 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -93,7 +93,7 @@ void Tensor::CopyFromCpu(const T 
*src_data) { } } template -void Tensor::CopyToCpu(T *data) { +void Tensor::CopyToCpu(T *data) const { const T *src_data = tensor(raw_tensor_)->data(); int64_t num = tensor(raw_tensor_)->numel(); CHECK(num > 0) << "You should call Resize interface first"; @@ -121,12 +121,13 @@ template void Tensor::CopyFromCpu(const int *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); template void Tensor::CopyFromCpu(const int *); +template void Tensor::CopyFromCpu(const int64_t *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); -template void Tensor::CopyToCpu(int8_t *); -template void Tensor::CopyToCpu(float *); -template void Tensor::CopyToCpu(int *); +template void Tensor::CopyToCpu(int8_t *) const; +template void Tensor::CopyToCpu(float *) const; +template void Tensor::CopyToCpu(int *) const; shape_t Tensor::shape() const { return ctensor(raw_tensor_)->dims().Vectorize(); diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index 42b455da81..c578769bd5 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -49,7 +49,7 @@ struct LITE_API Tensor { void CopyFromCpu(const T* data); template - void CopyToCpu(T* data); + void CopyToCpu(T* data) const; /// Shape of the tensor. shape_t shape() const; TargetType target() const; diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index 3d7d496afb..894d839185 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -55,8 +55,7 @@ const std::string& TargetToStr(TargetType target) { "any", "fpga", "npu", - "xpu", - "bm"}; + "xpu"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -94,8 +93,7 @@ const std::string& TargetRepr(TargetType target) { "kAny", "kFPGA", "kNPU", - "kXPU", - "kBM"}; + "kXPU"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -131,8 +129,7 @@ std::set ExpandValidTargets(TargetType target) { TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU), - TARGET(kFPGA), - TARGET(kBM)}); + TARGET(kFPGA)}); if (target == TARGET(kAny)) { return valid_set; } diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index a13abb699c..07284be095 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -52,9 +52,8 @@ enum class TargetType : int { kFPGA = 7, kNPU = 8, kXPU = 9, - kBM = 10, kAny = 6, // any target - NUM = 11, // number of fields. + NUM = 10, // number of fields. }; enum class PrecisionType : int { kUnk = 0, diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 70355fdf89..9d56d262ab 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -20,7 +20,12 @@ USE_MIR_PASS(static_kernel_pick_pass); USE_MIR_PASS(variable_place_inference_pass); USE_MIR_PASS(type_target_cast_pass); USE_MIR_PASS(generate_program_pass); -USE_MIR_PASS(subgraph_program_pass); +#ifdef LITE_WITH_NPU +USE_MIR_PASS(generate_npu_program_pass); +#endif +#ifdef LITE_WITH_XPU +USE_MIR_PASS(generate_xpu_program_pass); +#endif USE_MIR_PASS(io_copy_kernel_pick_pass); USE_MIR_PASS(argument_type_display_pass); diff --git a/lite/api/test_step_rnn_lite_x86.cc b/lite/api/test_step_rnn_lite_x86.cc index c483373dc7..5314c5ed75 100644 --- a/lite/api/test_step_rnn_lite_x86.cc +++ b/lite/api/test_step_rnn_lite_x86.cc @@ -12,20 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - #include #include #include diff --git a/lite/backends/arm/math/CMakeLists.txt b/lite/backends/arm/math/CMakeLists.txt index cbbcf49a5f..076c791daa 100644 --- a/lite/backends/arm/math/CMakeLists.txt +++ b/lite/backends/arm/math/CMakeLists.txt @@ -57,9 +57,10 @@ endif() if (NOT HAS_ARM_MATH_LIB_DIR) # TODO(xxx): seperate them and do not deps proto, eigen3 - cc_library(math_arm SRCS - funcs.cc + cc_library(math_arm SRCS + funcs.cc packed_sgemm.cc + packed_sgemm_c4.cc sgemm.cc gemm_prepacked_int8.cc gemm_s8.cc @@ -67,8 +68,10 @@ if (NOT HAS_ARM_MATH_LIB_DIR) gemv_arm_int8.cc conv3x3s1_direct_fp32.cc conv3x3s2_direct_fp32.cc - conv3x3s1_depthwise_fp32.cc - conv3x3s2_depthwise_fp32.cc + conv3x3s1p01_depthwise_fp32.cc + conv3x3s2p01_depthwise_fp32.cc + conv3x3s1px_depthwise_fp32.cc + conv3x3s2px_depthwise_fp32.cc conv3x3s1_direct_int8.cc conv3x3s2_direct_int8.cc conv3x3s1_depthwise_int8.cc @@ -76,16 +79,14 @@ if (NOT HAS_ARM_MATH_LIB_DIR) conv5x5s1_depthwise_int8.cc conv5x5s1_depthwise_fp32.cc conv5x5s2_depthwise_fp32.cc - conv_depthwise_3x3p0.cc - conv_depthwise_3x3p1.cc - conv_depthwise_3x3s1.cc - conv_depthwise_3x3s2.cc + conv3x3_winograd_fp32_c4.cc conv_winograd_3x3.cc conv_impl.cc - softmax.cc + softmax.cc scale.cc pooling.cc elementwise.cc + layout.cc lrn.cc decode_bboxes.cc concat.cc @@ -121,4 +122,3 @@ if (NOT HAS_ARM_MATH_LIB_DIR) anchor_generator.cc DEPS ${lite_kernel_deps} context tensor) endif() - diff --git a/lite/backends/arm/math/col_im_transform.cc b/lite/backends/arm/math/col_im_transform.cc index b5d2c6af13..38be1d689d 100644 --- a/lite/backends/arm/math/col_im_transform.cc +++ b/lite/backends/arm/math/col_im_transform.cc @@ -32,8 +32,10 @@ void col2im(const float* data_col, const int width, const int kernel_h, const int kernel_w, - const int pad_h, - const int pad_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, const int stride_h, const int stride_w, const int dilation_h, @@ -41,19 +43,22 @@ void col2im(const float* data_col, float* data_im) { memset(data_im, 0, height * width * channels * sizeof(float)); const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / stride_w + + 1; const int channel_size = height * width; for (int channel = channels; channel--; data_im += channel_size) { for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; + int input_row = -pad_h0 + kernel_row * dilation_h; for (int output_rows = output_h; output_rows; output_rows--) { if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { data_col += output_w; } else { - int input_col = -pad_w + 
kernel_col * dilation_w; + int input_col = -pad_w0 + kernel_col * dilation_w; for (int output_col = output_w; output_col; output_col--) { if (is_a_ge_zero_and_a_lt_b(input_col, width)) { data_im[input_row * width + input_col] += *data_col; diff --git a/lite/backends/arm/math/col_im_transform.h b/lite/backends/arm/math/col_im_transform.h index 8560679d7f..e3e32c4715 100644 --- a/lite/backends/arm/math/col_im_transform.h +++ b/lite/backends/arm/math/col_im_transform.h @@ -26,8 +26,10 @@ void col2im(const Dtype* data_col, const int width, const int kernel_h, const int kernel_w, - const int pad_h, - const int pad_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, const int stride_h, const int stride_w, const int dilation_h, diff --git a/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc new file mode 100644 index 0000000000..5834461b8f --- /dev/null +++ b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc @@ -0,0 +1,564 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/arm/math/conv_block_utils.h" +#include "lite/backends/arm/math/conv_impl.h" +#include "lite/backends/arm/math/packed_sgemm_c4.h" +#ifdef ARM_WITH_OMP +#include +#endif +#include + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +void input_trans_c4(const float* src, + int src_stride, + float* dest, + int dest_stride); +void output_trans_c4(const float* src, + int src_stride, + float* dest, + int dest_stride); +void output_trans_c4_post(const float* src, + int src_stride, + float* dest, + int dest_stride, + float* bias_value, + bool has_relu); +void weight_trans_c4( + float* dest, const float* src, int ic, int oc, void* workspace); + +/* +*The following function conv_compute_6x6_3x3 is base on +*MNN[https://github.com/alibaba/MNN] +* +*Copyright © 2018, Alibaba Group Holding Limited +*/ +void conv_compute_6x6_3x3(const float* input, + float* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const float* weight, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx) { + const int pad_h = (*param.paddings)[0]; + const int pad_w = (*param.paddings)[2]; + float* tmp_work_space = + ctx->workspace_data() + ctx->llc_size() / sizeof(float); + + int in_n_stride = chin * hin * win; + int out_n_stride = chout * hout * wout; + int ic_stride = win * hin; + int oc_stride = wout * hout; + int ic_4 = (chin + 3) / 4; + int oc_4 = (chout + 3) / 4; + + int tile_w = (wout + 5) / 6; + int tile_h = (hout + 5) / 6; + int size_tile = tile_h * tile_w; + float zero_ptr[8]; + memset(zero_ptr, 0, 8 * sizeof(float)); + + int w_pad = win + pad_w * 2; + int h_pad = hin + pad_h * 2; + float* input_c4 = tmp_work_space; + int new_h_stride = w_pad * 4; + int new_c_stride = new_h_stride * h_pad; + + int ic_4_stride = w_pad * h_pad * 4; + int oc_4_stride = wout * hout * 4; + + int tile_block = 
8; +#ifdef __aarch64__ + tile_block = 16; +#endif + int block_count = (size_tile + tile_block - 1) / tile_block; + + int threads = ctx->threads(); + float* g_tmp_data = tmp_work_space + ic_4 * new_c_stride; + int tmp_data_thread_stride = tile_block * (oc_4 + ic_4) * 256; + memset(g_tmp_data, 0, threads * tmp_data_thread_stride * sizeof(float)); + float* g_trans_tmp_data = g_tmp_data + threads * tmp_data_thread_stride; + float* g_trans_remain_tmp_data = g_trans_tmp_data + threads * 256; + + // begin compute + for (int ni = 0; ni < num; ++ni) { + // trans input to c4 + for (int i = 0; i < ic_4; ++i) { + prepack_input_nxwc4_dw(input + ni * in_n_stride, + input_c4 + i * new_c_stride, + i * 4, + -pad_h, + hin + pad_h, + -pad_w, + win + pad_w, + chin, + win, + hin, + zero_ptr); + } + float* output_ptr = output + ni * out_n_stride; + + const float* weight_ptr = weight; + const float* bias_ptr = bias; +#pragma omp parallel for num_threads(threads) + for (int tbi = 0; tbi < block_count; ++tbi) { +#ifdef ARM_WITH_OMP + float* tmp_data = + g_tmp_data + omp_get_thread_num() * tmp_data_thread_stride; + float* trans_tmp_data = g_trans_tmp_data + omp_get_thread_num() * 256; + float* trans_remain_tmp_data = + g_trans_remain_tmp_data + omp_get_thread_num() * 256; +#else + float* tmp_data = g_tmp_data; + float* trans_tmp_data = g_trans_tmp_data; + float* trans_remain_tmp_data = g_trans_remain_tmp_data; +#endif + int tile_index = tbi * tile_block; + int tile_remain = size_tile - tile_index; + int tile_count = tile_remain > tile_block ? tile_block : tile_remain; + + // input trans + int c_gi_stride = tile_count * oc_4 * 4; + int b_gi_stride = tile_count * ic_4 * 4; + //* + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int src_x = tw_index * 6; + int src_y = th_index * 6; + int ex = src_x + 8 > w_pad ? w_pad - src_x : 8; + int ey = src_y + 8 > h_pad ? 
h_pad - src_y : 8; + + float* dst_ptr = tmp_data + ti * 4; + const float* src_ptr = input_c4 + (src_y * w_pad + src_x) * 4; + + if (ex == 8 && ey == 8) { + // trans input + for (int ci = 0; ci < ic_4; ++ci) { + const float* src_ci = src_ptr + ci * ic_4_stride; + for (int i = 0; i < 8; ++i) { + const float* ci_ptr = src_ci + i * w_pad * 4; + input_trans_c4(ci_ptr, 4, trans_tmp_data + i * 4, 32); + } + float* dst_ci = dst_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + input_trans_c4(trans_tmp_data + i * 32, + 4, + dst_ci + i * b_gi_stride * 8, + b_gi_stride); + } + } + } else { + // trans remain input + int x_size = ex; + for (int ci = 0; ci < ic_4; ++ci) { + const float* src_ci = src_ptr + ci * ic_4_stride; + // pad + memset(trans_remain_tmp_data, 0, 256 * sizeof(float)); + if (x_size > 0) { + for (int yi = 0; yi < ey; ++yi) { + float* dst_yi = trans_remain_tmp_data + yi * 32; + const float* src_yi = src_ci + w_pad * yi * 4; + memcpy(dst_yi, src_yi, x_size * sizeof(float) * 4); + } + } + + // trans + for (int i = 0; i < 8; ++i) { + float* ci_ptr = trans_remain_tmp_data + i * 32; + input_trans_c4(ci_ptr, 4, trans_tmp_data + i * 4, 32); + } + float* dst_ci = dst_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + input_trans_c4(trans_tmp_data + i * 32, + 4, + dst_ci + i * b_gi_stride * 8, + b_gi_stride); + } + } // for ci_4 + } + } + //*/ + // input trans end + // *begin compute dot + // * + //* + float* dst_temp_data = tmp_data + tile_block * ic_4 * 256; + float* b_ptr = tmp_data; + int w_gi_stride = ic_4 * oc_4 * 16; + for (int gi = 0; gi < 64; ++gi) { + float* origin_C = dst_temp_data + gi * c_gi_stride; + float* origin_B = b_ptr + gi * b_gi_stride; + const float* origin_A = weight + gi * w_gi_stride; + sgemm_prepack_c4_small(oc_4 * 4, + tile_count, + ic_4 * 4, + origin_A, + origin_B, + origin_C, + nullptr, + false, + false, + ctx); + } + //*/ + //* + // output trans + float bias_value[4]; + memset(bias_value, 0, 4 * sizeof(float)); + + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int dst_x = tw_index * 6; + int dst_y = th_index * 6; + + int ex = dst_x + 6 > wout ? wout - dst_x : 6; + int ey = dst_y + 6 > hout ? 
hout - dst_y : 6; + + float* dst_ptr = output + (dst_y * wout + dst_x) * 4; + float* src_ptr = dst_temp_data + ti * 4; + + if (ex == 6) { + // trans output + for (int ci = 0; ci < oc_4; ++ci) { + if (param.bias) { + bias_value[0] = bias[ci * 4]; + bias_value[1] = bias[ci * 4 + 1]; + bias_value[2] = bias[ci * 4 + 2]; + bias_value[3] = bias[ci * 4 + 3]; + } + + float* dst_ci = dst_ptr + ci * oc_4_stride; + float* src_ci = src_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + output_trans_c4(src_ci + i * c_gi_stride * 8, + c_gi_stride, + trans_tmp_data + i * 4, + 32); + } + for (int i = 0; i < ey; ++i) { + output_trans_c4_post(trans_tmp_data + i * 32, + 4, + trans_remain_tmp_data + i * 24, + 4, + bias_value, + param.fuse_relu); + } + write_to_output_c4_fp32(trans_remain_tmp_data, + output_ptr, + ci * 4, + ci * 4 + 4, + dst_y, + dst_y + ey, + dst_x, + dst_x + ex, + chout, + hout, + wout, + false, + zero_ptr); + } + } else { + for (int ci = 0; ci < oc_4; ++ci) { + if (param.bias) { + bias_value[0] = bias[ci * 4]; + bias_value[1] = bias[ci * 4 + 1]; + bias_value[2] = bias[ci * 4 + 2]; + bias_value[3] = bias[ci * 4 + 3]; + } + // trans output + float* dst_ci = dst_ptr + ci * oc_4_stride; + float* src_ci = src_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + output_trans_c4(src_ci + i * c_gi_stride * 8, + c_gi_stride, + trans_tmp_data + i * 4, + 32); + } + for (int i = 0; i < ey; ++i) { + output_trans_c4_post(trans_tmp_data + i * 32, + 4, + trans_remain_tmp_data + i * 24, + 4, + bias_value, + param.fuse_relu); + } + // copy to dest + memset(trans_tmp_data, 0, 144 * sizeof(float)); + for (int i = 0; i < ey; ++i) { + memcpy(trans_tmp_data + i * ex * 4, + trans_remain_tmp_data + i * 24, + ex * sizeof(float) * 4); + } + write_to_output_c4_fp32(trans_tmp_data, + output_ptr, + ci * 4, + ci * 4 + 4, + dst_y, + dst_y + ey, + dst_x, + dst_x + ex, + chout, + hout, + wout, + false, + zero_ptr); + } + } + } + //*/ + } // for block_count + } // for num +} // conv_compute + +void output_trans_c4(const float* src, + int src_stride, + float* dest, + int dest_stride) { + const float32x4_t src0 = vld1q_f32(src); + const float32x4_t src1 = vld1q_f32(src + src_stride); + const float32x4_t src2 = vld1q_f32(src + src_stride * 2); + const float32x4_t src3 = vld1q_f32(src + src_stride * 3); + const float32x4_t src4 = vld1q_f32(src + src_stride * 4); + const float32x4_t src5 = vld1q_f32(src + src_stride * 5); + const float32x4_t src6 = vld1q_f32(src + src_stride * 6); + const float32x4_t src7 = vld1q_f32(src + src_stride * 7); + + float32x4_t tmp024a = vaddq_f32(src1, src2); + float32x4_t tmp135a = vsubq_f32(src1, src2); + float32x4_t tmp024b = vaddq_f32(src3, src4); + float32x4_t tmp135b = vsubq_f32(src3, src4); + float32x4_t tmp024c = vaddq_f32(src5, src6); + float32x4_t tmp135c = vsubq_f32(src5, src6); + + float32x4_t dest0 = + vaddq_f32(vaddq_f32(vaddq_f32(src0, tmp024a), tmp024b), tmp024c); + float32x4_t dest2 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 4)), + vmulq_n_f32(tmp024c, 0.25f)); + float32x4_t dest4 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 16)), + vmulq_n_f32(tmp024c, 0.0625f)); + + float32x4_t dest1 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 2)), + vmulq_n_f32(tmp135c, 0.5f)); + float32x4_t dest3 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 8)), + vmulq_n_f32(tmp135c, 0.125f)); + float32x4_t dest5 = + vaddq_f32(src7, + vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 32)), + vmulq_n_f32(tmp135c, 0.03125f))); + + vst1q_f32(dest, dest0); + 
vst1q_f32(dest + dest_stride, dest1); + vst1q_f32(dest + dest_stride * 2, dest2); + vst1q_f32(dest + dest_stride * 3, dest3); + vst1q_f32(dest + dest_stride * 4, dest4); + vst1q_f32(dest + dest_stride * 5, dest5); +} +void output_trans_c4_post(const float* src, + int src_stride, + float* dest, + int dest_stride, + float* bias_value, + bool has_relu = false) { + const float32x4_t src0 = vld1q_f32(src); + const float32x4_t src1 = vld1q_f32(src + src_stride); + const float32x4_t src2 = vld1q_f32(src + src_stride * 2); + const float32x4_t src3 = vld1q_f32(src + src_stride * 3); + const float32x4_t src4 = vld1q_f32(src + src_stride * 4); + const float32x4_t src5 = vld1q_f32(src + src_stride * 5); + const float32x4_t src6 = vld1q_f32(src + src_stride * 6); + const float32x4_t src7 = vld1q_f32(src + src_stride * 7); + + float32x4_t tmp024a = vaddq_f32(src1, src2); + float32x4_t tmp135a = vsubq_f32(src1, src2); + float32x4_t tmp024b = vaddq_f32(src3, src4); + float32x4_t tmp135b = vsubq_f32(src3, src4); + float32x4_t tmp024c = vaddq_f32(src5, src6); + float32x4_t tmp135c = vsubq_f32(src5, src6); + + float32x4_t dest0 = + vaddq_f32(vaddq_f32(vaddq_f32(src0, tmp024a), tmp024b), tmp024c); + float32x4_t dest2 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 4)), + vmulq_n_f32(tmp024c, 0.25f)); + float32x4_t dest4 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 16)), + vmulq_n_f32(tmp024c, 0.0625f)); + + float32x4_t dest1 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 2)), + vmulq_n_f32(tmp135c, 0.5f)); + float32x4_t dest3 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 8)), + vmulq_n_f32(tmp135c, 0.125f)); + float32x4_t dest5 = + vaddq_f32(src7, + vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 32)), + vmulq_n_f32(tmp135c, 0.03125f))); + + if (bias_value) { + float32x4_t bias = vld1q_f32(bias_value); + dest0 = vaddq_f32(dest0, bias); + dest1 = vaddq_f32(dest1, bias); + dest2 = vaddq_f32(dest2, bias); + dest3 = vaddq_f32(dest3, bias); + dest4 = vaddq_f32(dest4, bias); + dest5 = vaddq_f32(dest5, bias); + } + + if (has_relu) { + float32x4_t zeros = vdupq_n_f32(0); + dest0 = vmaxq_f32(dest0, zeros); + dest1 = vmaxq_f32(dest1, zeros); + dest2 = vmaxq_f32(dest2, zeros); + dest3 = vmaxq_f32(dest3, zeros); + dest4 = vmaxq_f32(dest4, zeros); + dest5 = vmaxq_f32(dest5, zeros); + } + + vst1q_f32(dest, dest0); + vst1q_f32(dest + dest_stride, dest1); + vst1q_f32(dest + dest_stride * 2, dest2); + vst1q_f32(dest + dest_stride * 3, dest3); + vst1q_f32(dest + dest_stride * 4, dest4); + vst1q_f32(dest + dest_stride * 5, dest5); +} + +void input_trans_c4(const float* src, + int src_stride, + float* dest, + int dest_stride) { + float32x4_t src0 = vld1q_f32(src); + float32x4_t src1 = vld1q_f32(src + src_stride); + float32x4_t src2 = vld1q_f32(src + src_stride * 2); + float32x4_t src3 = vld1q_f32(src + src_stride * 3); + float32x4_t src4 = vld1q_f32(src + src_stride * 4); + float32x4_t src5 = vld1q_f32(src + src_stride * 5); + float32x4_t src6 = vld1q_f32(src + src_stride * 6); + float32x4_t src7 = vld1q_f32(src + src_stride * 7); + + float32x4_t dst0 = vaddq_f32(vsubq_f32(src0, src6), + vmulq_n_f32(vsubq_f32(src4, src2), 5.25)); + float32x4_t dst7 = vaddq_f32(vsubq_f32(src7, src1), + vmulq_n_f32(vsubq_f32(src3, src5), 5.25)); + + float32x4_t tmp12a = + vsubq_f32(vaddq_f32(src2, src6), vmulq_n_f32(src4, 4.25)); + float32x4_t tmp12b = + vsubq_f32(vaddq_f32(src1, src5), vmulq_n_f32(src3, 4.25)); + float32x4_t dst1 = vaddq_f32(tmp12a, tmp12b); + float32x4_t dst2 = vsubq_f32(tmp12a, tmp12b); + + float32x4_t 
tmp34a = vsubq_f32(vaddq_f32(src6, vmulq_n_f32(src2, 0.25)), + vmulq_n_f32(src4, 1.25)); + float32x4_t tmp34b = + vaddq_f32(vsubq_f32(vmulq_n_f32(src1, 0.5), vmulq_n_f32(src3, 2.5)), + vmulq_n_f32(src5, 2)); + float32x4_t dst3 = vaddq_f32(tmp34a, tmp34b); + float32x4_t dst4 = vsubq_f32(tmp34a, tmp34b); + + float32x4_t tmp56a = + vaddq_f32(src6, vmulq_n_f32(vsubq_f32(src2, vmulq_n_f32(src4, 1.25)), 4)); + float32x4_t tmp56b = + vaddq_f32(vsubq_f32(vmulq_n_f32(src1, 2), vmulq_n_f32(src3, 2.5)), + vmulq_n_f32(src5, 0.5)); + float32x4_t dst5 = vaddq_f32(tmp56a, tmp56b); + float32x4_t dst6 = vsubq_f32(tmp56a, tmp56b); + + vst1q_f32(dest, dst0); + vst1q_f32(dest + dest_stride, dst1); + vst1q_f32(dest + dest_stride * 2, dst2); + vst1q_f32(dest + dest_stride * 3, dst3); + vst1q_f32(dest + dest_stride * 4, dst4); + vst1q_f32(dest + dest_stride * 5, dst5); + vst1q_f32(dest + dest_stride * 6, dst6); + vst1q_f32(dest + dest_stride * 7, dst7); +} +void weight_trans_c4( + float* dest, const float* din, int ch_in, int ch_out, void* workspace) { + const float coeff[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {32.0f / 45, 16.0f / 45, 8.0f / 45}, + {32.0f / 45, -16.0f / 45, 8.0f / 45}, + {0.0f, 0.0f, 1.0f}}; + + float* ptr_out = static_cast(workspace); + + for (int i = 0; i < ch_out; i++) { + for (int j = 0; j < ch_in; j++) { + const float* kernel0 = + static_cast(din) + (i * ch_in + j) * 9; + float* ptr_channel = ptr_out + (i * ch_in + j) * 64; + + //! transform kernel, transposed + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + //! h + float tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = + k0[0] * coeff[i][0] + k0[1] * coeff[i][1] + k0[2] * coeff[i][2]; + tmp[i][1] = + k1[0] * coeff[i][0] + k1[1] * coeff[i][1] + k1[2] * coeff[i][2]; + tmp[i][2] = + k2[0] * coeff[i][0] + k2[1] * coeff[i][1] + k2[2] * coeff[i][2]; + } + + //! 
v + for (int j = 0; j < 8; j++) { + float* tmpp = &tmp[j][0]; + for (int i = 0; i < 8; i++) { + ptr_channel[j * 8 + i] = tmpp[0] * coeff[i][0] + + tmpp[1] * coeff[i][1] + + tmpp[2] * coeff[i][2]; + } + } + } + } + + int oc_pad = (ch_out + 3) / 4 * 4; + int ic_pad = (ch_in + 3) / 4 * 4; + int c_stride = ic_pad * oc_pad; + for (int i = 0; i < ch_out * ch_in * 64; ++i) { + int new_c = i % 64; + int new_oc = i / ch_in / 64 / 4; + int new_ic = i / 64 % (ch_in * 4) % ch_in; + int new_inner = i / ch_in / 64 % 4; + int dest_ind = + new_c * c_stride + new_oc * ic_pad * 4 + new_ic * 4 + new_inner; + dest[dest_ind] = ptr_out[i]; + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s1_direct_fp32.cc b/lite/backends/arm/math/conv3x3s1_direct_fp32.cc index 6a1fa37681..b4972a1eca 100644 --- a/lite/backends/arm/math/conv3x3s1_direct_fp32.cc +++ b/lite/backends/arm/math/conv3x3s1_direct_fp32.cc @@ -35,9 +35,10 @@ size_t conv3x3s1_direct_workspace_size(const operators::ConvParam& param, auto dim_in = param.x->dims(); auto dim_out = param.output->dims(); const int threads = ctx->threads(); + auto paddings = *param.paddings; int llc_size = ctx->llc_size() / sizeof(float); - const int pad_w = param.paddings[1]; - const int pad_h = param.paddings[0]; + const int pad_w = paddings[2]; + const int pad_h = paddings[0]; int ow = dim_out[3]; int oh = dim_out[2]; int ic = dim_in[1]; @@ -74,9 +75,10 @@ void conv_3x3s1_direct_fp32(const float* i_data, ARMContext* ctx) { const int threads = ctx->threads(); int l2_size = ctx->llc_size() / sizeof(float); + auto paddings = *param.paddings; - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; const int wout_round = ROUNDUP(ow, OUT_W_BLOCK); const int win_round = wout_round + 2; bool flag_relu = param.fuse_relu; diff --git a/lite/backends/arm/math/conv3x3s1_direct_int8.cc b/lite/backends/arm/math/conv3x3s1_direct_int8.cc index f966313e11..64e72bc441 100644 --- a/lite/backends/arm/math/conv3x3s1_direct_int8.cc +++ b/lite/backends/arm/math/conv3x3s1_direct_int8.cc @@ -41,10 +41,11 @@ void conv_3x3s1_direct_int8(const int8_t* din, const operators::ConvParam& param, Context* ctx, const float* scale) { + auto paddings = *param.paddings; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; const int threads = ctx->threads(); int llc_size = ctx->llc_size() / 4; diff --git a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc new file mode 100644 index 0000000000..e4c9fb99ef --- /dev/null +++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc @@ -0,0 +1,2539 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
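+// This file implements the 3x3, stride-1 depthwise convolution kernels for
+// padding 0 and padding 1. conv_depthwise_3x3s1_fp32 (below) dispatches to the
+// main implementations conv_depthwise_3x3s1p0_bias / conv_depthwise_3x3s1p1_bias
+// for wide inputs, and falls back to the small-width variants (*_bias_s) when
+// w_in <= 5 (pad 0) or w_in <= 4 (pad 1).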
+ +#include +#include "lite/backends/arm/math/conv_depthwise.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void conv_depthwise_3x3s1p0_bias(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx); + +void conv_depthwise_3x3s1p0_bias_s(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx); + +void conv_depthwise_3x3s1p1_bias(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx); + +void conv_depthwise_3x3s1p1_bias_s(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx); + +void conv_depthwise_3x3s1_fp32(const float *din, + float *dout, + int num, + int ch_out, + int h_out, + int w_out, + int ch_in, + int h_in, + int w_in, + const float *weights, + const float *bias, + int pad, + bool flag_bias, + bool flag_relu, + ARMContext *ctx) { + if (pad == 0) { + if (w_in > 5) { + conv_depthwise_3x3s1p0_bias(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } else { + conv_depthwise_3x3s1p0_bias_s(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } + if (pad == 1) { + if (w_in > 4) { + conv_depthwise_3x3s1p1_bias(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } else { + conv_depthwise_3x3s1p1_bias_s(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } +} + +#ifdef __aarch64__ +#define INIT_S1 \ + "PRFM PLDL1KEEP, [%[din_ptr0]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr1]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr2]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr3]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr4]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr5]] \n" \ + "movi v21.4s, #0x0\n" /* out0 = 0 */ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ + +#define LEFT_COMPUTE_S1 \ + "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ /* r0 */ \ + "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * w0[1]*/ \ + \ + "ld1 
{v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * w0[0]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * w0[2]*/ \ + \ + "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * w1[1]*/ \ + "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16=1234 */ \ + "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ \ + \ + /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ \ + 
"ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ + +#define LEFT_RESULT_S1 \ + /* r4 */ \ + "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ /* r5 */ \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "cmp %w[cnt], #1 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "blt 3f \n" + +#define MID_COMPUTE_S1 \ + "1: \n" /* r0 */ \ + "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, 
%[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ + +#define MID_RESULT_S1 \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "bne 1b \n" + +#define RIGHT_COMPUTE_S1 \ + "3: \n" \ + "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" \ + "ld1 {v22.4s}, [%[doutr0]] \n" \ + "ld1 {v23.4s}, [%[doutr1]] \n" \ + "ld1 {v24.4s}, [%[doutr2]] \n" \ + "ld1 {v25.4s}, [%[doutr3]] \n" \ + \ + "bif v0.16b, %[vzero].16b, v18.16b \n" \ + "bif v1.16b, %[vzero].16b, v19.16b \n" \ + "bif v2.16b, %[vzero].16b, v18.16b \n" \ + "bif v3.16b, %[vzero].16b, v19.16b \n" \ + \ + "bif v4.16b, %[vzero].16b, v18.16b \n" \ + "bif 
v5.16b, %[vzero].16b, v19.16b \n" \ + "bif v6.16b, %[vzero].16b, v18.16b \n" \ + "bif v7.16b, %[vzero].16b, v19.16b \n" \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ /* r0 */ \ + "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v8.16b, %[vzero].16b, v18.16b \n" \ + "bif v9.16b, %[vzero].16b, v19.16b \n" \ + "bif v10.16b, %[vzero].16b, v18.16b \n" \ + "bif v11.16b, %[vzero].16b, v19.16b \n" \ + \ + "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v18.4s}, [%[rmask]] \n" \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ + +#define RIGHT_RESULT_S1 \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v12.16b, v22.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 
+= din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v14.16b, v24.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "bif v15.16b, v25.16b, v18.16b \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define LEFT_RESULT_S1_RELU \ + /* r4 */ \ + "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ /* r5*/ \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "cmp %w[cnt], #1 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "blt 3f \n" + +#define MID_RESULT_S1_RELU \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += 
din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + \ + /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "bne 1b \n" + +#define RIGHT_RESULT_S1_RELU \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v12.16b, v22.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ 
\ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v14.16b, v24.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "bif v15.16b, v25.16b, v18.16b \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define COMPUTE_S_S1 \ + "prfm pldl1keep, [%[din0]]\n" \ + "prfm pldl1keep, [%[din1]]\n" \ + "prfm pldl1keep, [%[din2]]\n" \ + "prfm pldl1keep, [%[din3]]\n" \ + \ + "ld1 {v0.4s}, [%[din0]], #16\n" \ + "ld1 {v1.4s}, [%[din1]], #16\n" \ + "ld1 {v2.4s}, [%[din2]], #16\n" \ + "ld1 {v3.4s}, [%[din3]], #16\n" \ + \ + "bif v0.16b, %[zero].16b, %[mask].16b\n" \ + "bif v1.16b, %[zero].16b, %[mask].16b\n" \ + "bif v2.16b, %[zero].16b, %[mask].16b\n" \ + "bif v3.16b, %[zero].16b, %[mask].16b\n" \ + \ + "ext v4.16b, %[zero].16b, v0.16b, #12\n" \ + "ext v5.16b, %[zero].16b, v1.16b, #12\n" \ + "ext v6.16b, %[zero].16b, v2.16b, #12\n" \ + "ext v7.16b, %[zero].16b, v3.16b, #12\n" \ + \ + "ext v8.16b, v0.16b, %[zero].16b, #4\n" \ + "ext v9.16b, v1.16b, %[zero].16b, #4\n" \ + "ext v10.16b, v2.16b, %[zero].16b, #4\n" \ + "ext v11.16b, v3.16b, %[zero].16b, #4\n" \ + \ + "fmul v12.4s, v0.4s, %[wr0].s[1]\n" \ + "fmul v13.4s, v1.4s, %[wr0].s[1]\n" \ + \ + "fmul v14.4s, v1.4s, %[wr1].s[1]\n" \ + "fmul v15.4s, v2.4s, %[wr1].s[1]\n" \ + \ + "fmul v16.4s, v2.4s, %[wr2].s[1]\n" \ + "fmul v17.4s, v3.4s, %[wr2].s[1]\n" \ + \ + "fmla v12.4s, v4.4s, %[wr0].s[0]\n" \ + "fmla v13.4s, v5.4s, %[wr0].s[0]\n" \ + \ + "fmla v14.4s, v5.4s, %[wr1].s[0]\n" \ + "fmla v15.4s, v6.4s, %[wr1].s[0]\n" \ + \ + "fmla v16.4s, v6.4s, %[wr2].s[0]\n" \ + "fmla v17.4s, v7.4s, %[wr2].s[0]\n" \ + \ + "fmla v12.4s, v8.4s, %[wr0].s[2]\n" \ + "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ + \ + "fmla v14.4s, v9.4s, %[wr1].s[2]\n" \ + "fmla v15.4s, v10.4s, %[wr1].s[2]\n" \ + \ + "fmla v16.4s, v10.4s, %[wr2].s[2]\n" \ + "fmla v17.4s, v11.4s, %[wr2].s[2]\n" \ + \ + "fadd v12.4s, v12.4s, v14.4s\n" \ + "fadd v12.4s, v12.4s, v16.4s\n" \ + \ + "fadd v13.4s, v13.4s, v15.4s\n" \ + "fadd v13.4s, v13.4s, v17.4s\n" \ + \ + "fadd v12.4s, v12.4s, %[bias].4s\n" \ + "fadd v13.4s, v13.4s, %[bias].4s\n" + +#define RESULT_S_S1 \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "st1 {v12.4s}, [%[out1]]\n" \ + "st1 {v13.4s}, [%[out2]]\n" + +#define RESULT_S_S1_RELU \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "fmax v12.4s, v12.4s, %[zero].4s\n" \ + "fmax v13.4s, v13.4s, %[zero].4s\n" \ + \ + "st1 {v12.4s}, [%[out1]]\n" \ + "st1 {v13.4s}, [%[out2]]\n" + +#define COMPUTE_S_S1_P0 \ + "prfm pldl1keep, [%[din0]]\n" \ + "prfm pldl1keep, [%[din1]]\n" \ + "prfm pldl1keep, [%[din2]]\n" \ + "prfm pldl1keep, [%[din3]]\n" \ + \ + "ld1 {v0.4s, v1.4s}, [%[din0]]\n" \ + "ld1 {v2.4s, v3.4s}, [%[din1]]\n" \ + "ld1 {v4.4s, v5.4s}, [%[din2]]\n" \ + "ld1 {v6.4s, v7.4s}, [%[din3]]\n" \ + \ + "bif v0.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v1.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v2.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v3.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v4.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v5.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v6.16b, %[zero].16b, %[mask1].16b\n" \ + "bif 
v7.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "ext v8.16b, v0.16b, v1.16b, #4\n" \ + "ext v9.16b, v0.16b, v1.16b, #8\n" \ + \ + "and v12.16b, %[vbias].16b, %[vbias].16b \n" \ + "and v13.16b, %[vbias].16b, %[vbias].16b \n" /* r0 */ \ + "fmul v10.4s, v0.4s, %[wr0].s[0]\n" \ + "fmul v11.4s, v8.4s, %[wr0].s[1]\n" \ + "fmla v12.4s, v9.4s, %[wr0].s[2]\n" \ + \ + "ext v8.16b, v2.16b, v3.16b, #4\n" \ + "ext v9.16b, v2.16b, v3.16b, #8\n" /* r1 */ \ + "fmul v14.4s, v2.4s, %[wr0].s[0]\n" \ + "fmla v10.4s, v2.4s, %[wr1].s[0]\n" \ + \ + "fmul v15.4s, v8.4s, %[wr0].s[1]\n" \ + "fmla v11.4s, v8.4s, %[wr1].s[1]\n" \ + \ + "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ + "fmla v12.4s, v9.4s, %[wr1].s[2]\n" \ + \ + "ext v8.16b, v4.16b, v5.16b, #4\n" \ + "ext v9.16b, v4.16b, v5.16b, #8\n" /* r2 */ \ + "fmla v14.4s, v4.4s, %[wr1].s[0]\n" \ + "fmla v10.4s, v4.4s, %[wr2].s[0]\n" \ + \ + "fmla v15.4s, v8.4s, %[wr1].s[1]\n" \ + "fmla v11.4s, v8.4s, %[wr2].s[1]\n" \ + \ + "fmla v13.4s, v9.4s, %[wr1].s[2]\n" \ + "fmla v12.4s, v9.4s, %[wr2].s[2]\n" \ + \ + "ext v8.16b, v6.16b, v7.16b, #4\n" \ + "ext v9.16b, v6.16b, v7.16b, #8\n" \ + \ + "fmla v14.4s, v6.4s, %[wr2].s[0]\n" \ + \ + "fmla v15.4s, v8.4s, %[wr2].s[1]\n" \ + \ + "fadd v12.4s, v12.4s, v10.4s\n" \ + \ + "fmla v13.4s, v9.4s, %[wr2].s[2]\n" \ + \ + "fadd v12.4s, v12.4s, v11.4s\n" \ + "fadd v13.4s, v13.4s, v14.4s\n" \ + "fadd v13.4s, v13.4s, v15.4s\n" // \ + // "prfm pldl1keep, [%[out1]]\n" \ + // "prfm pldl1keep, [%[out2]]\n" \ + // \ + // "st1 {v12.4s}, [%[out1]]\n" \ + // "st1 {v13.4s}, [%[out2]]\n" \ + + +#else +#define INIT_S1 \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" \ + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" \ + "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" \ + "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" \ + \ + "vdup.32 q4, %[bias_val] @ and \n" \ + "vdup.32 q5, %[bias_val] @ and \n" + +#define LEFT_COMPUTE_S1 \ + "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" \ + "vext.32 q7, q8, q9, #1 @ 1234\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" \ + "vext.32 q7, q10, q11, #1 @ 1234\n" \ + \ + /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ + "vld1.32 {d20-d21}, [%[din1_ptr]]! 
@ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ + "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" \ + "vext.32 q7, q12, q13, #1 @ 1234\n" \ + \ + /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" \ + "vext.32 q7, q14, q15, #1 @ 1234\n" + +#define LEFT_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + +#define MID_COMPUTE_S1 \ + "1: @ right pad entry\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" + +#define MID_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + +#define RIGHT_COMPUTE_S1 \ + "3: @ right pad entry\n" \ + "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ + \ + "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d31}, [%[vmask]]! @ load din r0\n" \ + \ + "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[rmask]]! 
@ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" \ + "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" + +#define RIGHT_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" + +#define LEFT_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + +#define MID_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + +#define RIGHT_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" + +#define COMPUTE_S_S1 \ + "pld [%[din0]]\n" \ + "pld [%[din1]]\n" \ + "pld [%[din2]]\n" \ + "pld [%[din3]]\n" \ + \ + "vld1.32 {d12-d13}, [%[din0]]!\n" \ + "vld1.32 {d14-d15}, [%[din1]]!\n" \ + "vld1.32 {d16-d17}, [%[din2]]!\n" \ + "vld1.32 {d18-d19}, [%[din3]]!\n" \ + \ + "vbif q6, %q[vzero], %q[mask]\n" \ + "vbif q7, %q[vzero], %q[mask]\n" \ + "vbif q8, %q[vzero], %q[mask]\n" \ + "vbif q9, %q[vzero], %q[mask]\n" \ + \ + "vmul.f32 q14, q6, %e[wr0][1]\n" \ + "vmul.f32 q15, q7, %e[wr0][1]\n" \ + \ + "vmla.f32 q14, q7, %e[wr1][1]\n" \ + "vmla.f32 q15, q8, %e[wr1][1]\n" \ + \ + "vmla.f32 q14, q8, %e[wr2][1]\n" \ + "vmla.f32 q15, q9, %e[wr2][1]\n" \ + \ + "vext.32 q10, %q[vzero], q6, #3\n" \ + "vext.32 q11, %q[vzero], q7, #3\n" \ + "vext.32 q12, %q[vzero], q8, #3\n" \ + "vext.32 q13, %q[vzero], q9, #3\n" \ + \ + "vmla.f32 q14, q10, %e[wr0][0]\n" \ + "vmla.f32 q15, q11, %e[wr0][0]\n" \ + \ + "vmla.f32 q14, q11, %e[wr1][0]\n" \ + "vmla.f32 q15, q12, %e[wr1][0]\n" \ + \ + "vmla.f32 q14, q12, %e[wr2][0]\n" \ + "vmla.f32 q15, q13, %e[wr2][0]\n" \ + \ + "vext.32 q10, q6, %q[vzero], #1\n" \ + "vext.32 q11, q7, %q[vzero], #1\n" \ + "vext.32 q12, q8, %q[vzero], #1\n" \ + "vext.32 q13, q9, %q[vzero], #1\n" \ + \ + "vmla.f32 q14, q10, %f[wr0][0]\n" \ + "vmla.f32 q15, q11, %f[wr0][0]\n" \ + \ + "vmla.f32 q14, q11, %f[wr1][0]\n" \ + "vmla.f32 q15, q12, %f[wr1][0]\n" \ + \ + "vmla.f32 q14, q12, %f[wr2][0]\n" \ + "vmla.f32 q15, q13, %f[wr2][0]\n" \ + \ + "vadd.f32 q14, q14, %q[bias]\n" \ + "vadd.f32 q15, q15, %q[bias]\n" + +#define RESULT_S_S1 \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + +#define RESULT_S_S1_RELU \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vmax.f32 q14, q14, %q[vzero]\n" \ + "vmax.f32 q15, q15, %q[vzero]\n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + +#define COMPUTE_S_S1_P0 \ + "pld [%[din0]]\n" \ + "pld [%[din1]]\n" \ + "pld [%[din2]]\n" \ + "pld [%[din3]]\n" \ + "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" \ + "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" \ + "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" \ + "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" \ + \ + "vdup.32 q4, %[bias_val] @ and \n" \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ + \ + "vld1.32 {d27}, [%[vmask]]! 
@ load din r0\n" \ + \ + "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ + \ + "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ + \ + "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ + "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + "vadd.f32 q4, q4, q10 @ q4 += q10 \n" \ + \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vadd.f32 q14, q4, q11 @ q4 += q10 \n" \ + \ + "vadd.f32 q5, q5, q8 @ q4 += q10 \n" \ + "vadd.f32 q15, q5, q9 @ q4 += q10 \n" + +#endif +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width > 4 + */ +void conv_depthwise_3x3s1p1_bias(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! pad is done implicit + const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + //! 
for 4x6 convolution window + const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + + float *zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float *write_ptr = zero_ptr + w_in; + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + int w_stride = 9; + + int tile_w = (w_in + 3) >> 2; + int cnt_col = tile_w - 2; + + unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in); + + uint32x4_t vmask_rp1 = + vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_rp2 = + vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_result = + vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + unsigned int rmask[4]; + vst1q_u32(rmask, vmask_result); + + float32x4_t vzero = vdupq_n_f32(0.f); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int c = 0; c < ch_in; c++) { + float *dout_ptr = dout_batch + c * size_out_channel; + + const float *din_ch_ptr = din_batch + c * size_in_channel; + + float bias_val = flag_bias ? bias[c] : 0.f; + float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; + + const float *wei_ptr = weights + c * w_stride; + + float32x4_t wr0 = vld1q_f32(wei_ptr); + float32x4_t wr1 = vld1q_f32(wei_ptr + 3); + float32x4_t wr2 = vld1q_f32(wei_ptr + 6); + + float *doutr0 = dout_ptr; + float *doutr1 = doutr0 + w_out; + float *doutr2 = doutr1 + w_out; + float *doutr3 = doutr2 + w_out; + + const float *dr0 = din_ch_ptr; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + const float *dr4 = dr3 + w_in; + const float *dr5 = dr4 + w_in; + + const float *din_ptr0 = dr0; + const float *din_ptr1 = dr1; + const float *din_ptr2 = dr2; + const float *din_ptr3 = dr3; + const float *din_ptr4 = dr4; + const float *din_ptr5 = dr5; + float *ptr_zero = const_cast(zero); +#ifdef __aarch64__ + for (int i = 0; i < h_in; i += 4) { + //! process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + din_ptr4 = dr4; + din_ptr5 = dr5; + + doutr0 = dout_ptr; + doutr1 = doutr0 + w_out; + doutr2 = doutr1 + w_out; + doutr3 = doutr2 + w_out; + if (i == 0) { + din_ptr0 = zero_ptr; + din_ptr1 = dr0; + din_ptr2 = dr1; + din_ptr3 = dr2; + din_ptr4 = dr3; + din_ptr5 = dr4; + dr0 = dr3; + dr1 = dr4; + dr2 = dr5; + } else { + dr0 = dr4; + dr1 = dr5; + dr2 = dr1 + w_in; + } + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + dr5 = dr4 + w_in; + + //! process bottom pad + if (i + 5 > h_in) { + switch (i + 5 - h_in) { + case 5: + din_ptr1 = zero_ptr; + case 4: + din_ptr2 = zero_ptr; + case 3: + din_ptr3 = zero_ptr; + case 2: + din_ptr4 = zero_ptr; + case 1: + din_ptr5 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 4 > h_out) { + switch (i + 4 - h_out) { + case 3: + doutr1 = write_ptr; + case 2: + doutr2 = write_ptr; + case 1: + doutr3 = write_ptr; + default: + break; + } + } + + int cnt = cnt_col; + if (flag_relu) { + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } else { + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 + MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } + dout_ptr = dout_ptr + 4 * w_out; + } +#else + for (int i = 0; i < h_in; i += 2) { + //! process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + + doutr0 = dout_ptr; + doutr1 = dout_ptr + w_out; + // unsigned int* rst_mask = rmask; + + if (i == 0) { + din_ptr0 = zero_ptr; + din_ptr1 = dr0; + din_ptr2 = dr1; + din_ptr3 = dr2; + dr0 = dr1; + dr1 = dr2; + dr2 = dr3; + dr3 = dr2 + w_in; + } else { + dr0 = dr2; + dr1 = dr3; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + } + //! process bottom pad + if (i + 3 > h_in) { + switch (i + 3 - h_in) { + case 3: + din_ptr1 = zero_ptr; + case 2: + din_ptr2 = zero_ptr; + case 1: + din_ptr3 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 2 > h_out) { + doutr1 = write_ptr; + } + int cnt = cnt_col; + unsigned int *rmask_ptr = rmask; + unsigned int *vmask_ptr = vmask; + if (flag_relu) { + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 + MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + dout_ptr += 2 * w_out; + } //! end of processing mid rows +#endif + } + } +} + +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width <= 4 + */ +void conv_depthwise_3x3s1p1_bias_s(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! 3x3s1 convolution, implemented by direct algorithm + //! pad is done implicit + //! 
for 4x6 convolution window + const int right_pad_idx[4] = {3, 2, 1, 0}; + const float zero[4] = {0.f, 0.f, 0.f, 0.f}; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask_rp = + vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + float *dout_channel = dout_batch + i * size_out_channel; + const float *din_channel = din_batch + i * size_in_channel; + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } + + int hs = -1; + int he = 3; + + float out_buf1[4]; + float out_buf2[4]; + float trash_buf[4]; + + int h_cnt = (h_out + 1) >> 1; + float *doutr0 = dout_channel; + float *doutr1 = dout_channel + w_out; + + for (int j = 0; j < h_cnt; ++j) { + const float *dr0 = din_channel + hs * w_in; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + + if (hs == -1) { + dr0 = zero; + } + + switch (he - h_in) { + case 2: + dr2 = zero; + doutr1 = trash_buf; + case 1: + dr3 = zero; + default: + break; + } +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [zero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + } else { + asm volatile(COMPUTE_S_S1 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [zero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S1 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *doutr0++ = out_buf1[w]; + *doutr1++ = out_buf2[w]; + } + doutr0 = doutr1; + doutr1 += w_out; + hs += 2; + he += 2; + } // end of processing heights + } // end of processing channels + } // end of processing 
batchs +} + +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width > 4 + */ +void conv_depthwise_3x3s1p0_bias(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! pad is done implicit + const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + //! for 4x6 convolution window + const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + + float *zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float *write_ptr = zero_ptr + w_in; + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + int w_stride = 9; + + int tile_w = w_out >> 2; + int remain = w_out % 4; + + unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); + const int remian_idx[4] = {0, 1, 2, 3}; + + uint32x4_t vmask_rp1 = + vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_rp2 = + vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_result = + vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + unsigned int rmask[4]; + vst1q_u32(rmask, vmask_result); + + float32x4_t vzero = vdupq_n_f32(0.f); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int c = 0; c < ch_in; c++) { + float *dout_ptr = dout_batch + c * size_out_channel; + + const float *din_ch_ptr = din_batch + c * size_in_channel; + + float bias_val = flag_bias ? bias[c] : 0.f; + float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; + + const float *wei_ptr = weights + c * w_stride; + + float32x4_t wr0 = vld1q_f32(wei_ptr); + float32x4_t wr1 = vld1q_f32(wei_ptr + 3); + float32x4_t wr2 = vld1q_f32(wei_ptr + 6); + + float *doutr0 = dout_ptr; + float *doutr1 = doutr0 + w_out; + float *doutr2 = doutr1 + w_out; + float *doutr3 = doutr2 + w_out; + + const float *dr0 = din_ch_ptr; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + const float *dr4 = dr3 + w_in; + const float *dr5 = dr4 + w_in; + + const float *din_ptr0 = dr0; + const float *din_ptr1 = dr1; + const float *din_ptr2 = dr2; + const float *din_ptr3 = dr3; + const float *din_ptr4 = dr4; + const float *din_ptr5 = dr5; + + float *ptr_zero = const_cast(zero); +#ifdef __aarch64__ + for (int i = 0; i < h_out; i += 4) { + //! process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + din_ptr4 = dr4; + din_ptr5 = dr5; + + doutr0 = dout_ptr; + doutr1 = doutr0 + w_out; + doutr2 = doutr1 + w_out; + doutr3 = doutr2 + w_out; + + dr0 = dr4; + dr1 = dr5; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + dr5 = dr4 + w_in; + + //! process bottom pad + if (i + 5 >= h_in) { + switch (i + 5 - h_in) { + case 4: + din_ptr1 = zero_ptr; + case 3: + din_ptr2 = zero_ptr; + case 2: + din_ptr3 = zero_ptr; + case 1: + din_ptr4 = zero_ptr; + case 0: + din_ptr5 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 4 > h_out) { + switch (i + 4 - h_out) { + case 3: + doutr1 = write_ptr; + case 2: + doutr2 = write_ptr; + case 1: + doutr3 = write_ptr; + default: + break; + } + } + + int cnt = tile_w; + if (flag_relu) { + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_RELU + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } else { + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1 + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1 "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } + dout_ptr = dout_ptr + 4 * w_out; + } +#else + for (int i = 0; i < h_out; i += 2) { + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + + doutr0 = dout_ptr; + doutr1 = dout_ptr + w_out; + + dr0 = dr2; + dr1 = dr3; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + //! process bottom pad + if (i + 3 >= h_in) { + switch (i + 3 - h_in) { + case 3: + din_ptr1 = zero_ptr; + case 2: + din_ptr2 = zero_ptr; + case 1: + din_ptr3 = zero_ptr; + case 0: + din_ptr3 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 2 > h_out) { + doutr1 = write_ptr; + } + int cnt = tile_w; + unsigned int *rmask_ptr = rmask; + unsigned int *vmask_ptr = vmask; + if (flag_relu) { + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_RELU + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1 + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1 "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + dout_ptr += 2 * w_out; + } //! end of processing mid rows +#endif + } + } +} +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width <= 4 + */ +void conv_depthwise_3x3s1p0_bias_s(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! 3x3s1 convolution, implemented by direct algorithm + //! pad is done implicit + //! 
for 4x6 convolution window + const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask_rp1 = + vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); + uint32x4_t vmask_rp2 = + vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + float *dout_channel = dout_batch + i * size_out_channel; + const float *din_channel = din_batch + i * size_in_channel; + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#endif // __aarch64__ + + float out_buf1[4]; + float out_buf2[4]; + float trash_buf[4]; + + float *doutr0 = dout_channel; + float *doutr1 = dout_channel + w_out; + + for (int j = 0; j < h_out; j += 2) { + const float *dr0 = din_channel + j * w_in; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + + doutr0 = dout_channel + j * w_out; + doutr1 = doutr0 + w_out; + + if (j + 3 >= h_in) { + switch (j + 3 - h_in) { + case 3: + dr1 = zero_ptr; + case 2: + dr2 = zero_ptr; + case 1: + dr3 = zero_ptr; + doutr1 = trash_buf; + case 0: + dr3 = zero_ptr; + doutr1 = trash_buf; + default: + break; + } + } +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [zero] "w"(vzero), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } else { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [zero] "w"(vzero), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } +#else + unsigned int *vmask_ptr = vmask; + float bias_val = flag_bias ? 
bias[i] : 0.f; + if (flag_relu) { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *doutr0++ = out_buf1[w]; + *doutr1++ = out_buf2[w]; + } + } // end of processing heights + } // end of processing channels + } // end of processing batchs +} +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc new file mode 100644 index 0000000000..08e5efecd7 --- /dev/null +++ b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc @@ -0,0 +1,541 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
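The NEON kernels above (and the channel-blocked variant that follows in conv3x3s1px_depthwise_fp32.cc) all compute a 3x3, stride-1 depthwise convolution with implicit zero padding, a per-channel bias, and an optional fused ReLU. As a plain scalar reference for what each assembly path produces, here is a minimal sketch; the function name and the explicit pad argument are illustrative assumptions, not APIs from this patch:

#include <algorithm>

// Scalar reference for a 3x3, stride-1 depthwise convolution in NCHW layout.
// One 3x3 filter (9 floats) per channel; pad = 1 corresponds to the *p1
// kernels above, pad = 0 to the *p0 kernels.
void conv_depthwise_3x3s1_ref(float* dout, const float* din,
                              const float* weights, const float* bias,
                              bool flag_bias, bool flag_relu, int num,
                              int ch_in, int h_in, int w_in, int h_out,
                              int w_out, int pad) {
  for (int n = 0; n < num; ++n) {
    for (int c = 0; c < ch_in; ++c) {
      const float* din_c = din + (n * ch_in + c) * h_in * w_in;
      const float* wc = weights + c * 9;
      float* dout_c = dout + (n * ch_in + c) * h_out * w_out;
      float bias_val = flag_bias ? bias[c] : 0.f;
      for (int oh = 0; oh < h_out; ++oh) {
        for (int ow = 0; ow < w_out; ++ow) {
          float sum = bias_val;
          for (int kh = 0; kh < 3; ++kh) {
            for (int kw = 0; kw < 3; ++kw) {
              int ih = oh - pad + kh;  // input row, zero outside the image
              int iw = ow - pad + kw;  // input col, zero outside the image
              if (ih >= 0 && ih < h_in && iw >= 0 && iw < w_in) {
                sum += din_c[ih * w_in + iw] * wc[kh * 3 + kw];
              }
            }
          }
          dout_c[oh * w_out + ow] = flag_relu ? std::max(sum, 0.f) : sum;
        }
      }
    }
  }
}

The assembly versions tile exactly this computation: four (armv8) or two (armv7) output rows per pass, vectorized over four output columns, with the bif/vbif bit-select masks standing in for the border checks of the scalar loop.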
+ +#include <arm_neon.h> +#include "lite/backends/arm/math/conv_block_utils.h" +#include "lite/backends/arm/math/conv_impl.h" +#include "lite/core/context.h" +#include "lite/operators/op_params.h" +#ifdef ARM_WITH_OMP +#include <omp.h> +#endif + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +void conv_3x3s1_depthwise_fp32(const float* i_data, + float* o_data, + int bs, + int oc, + int oh, + int ow, + int ic, + int ih, + int win, + const float* weights, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx) { + int threads = ctx->threads(); + + auto paddings = *param.paddings; + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; + + const int out_c_block = 4; + const int out_h_kernel = 2; + const int out_w_kernel = 4; + const int win_ext = ow + 2; + const int ow_round = ROUNDUP(ow, 4); + const int win_round = ROUNDUP(win_ext, 4); + const int hin_round = oh + 2; + const int prein_size = win_round * hin_round * out_c_block; + auto workspace_size = + threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/; + ctx->ExtendWorkspace(sizeof(float) * workspace_size); + + bool flag_relu = param.fuse_relu; + bool flag_bias = param.bias != nullptr; + + /// get workspace + float* ptr_zero = ctx->workspace_data<float>(); + memset(ptr_zero, 0, sizeof(float) * win_round); + float* ptr_write = ptr_zero + win_round; + + int size_in_channel = win * ih; + int size_out_channel = ow * oh; + + int ws = -pad_w; + int we = ws + win_round; + int hs = -pad_h; + int he = hs + hin_round; + int w_loop = ow_round / 4; + auto remain = w_loop * 4 - ow; + bool flag_remain = remain > 0; + remain = 4 - remain; + remain = remain > 0 ? remain : 0; + int row_len = win_round * out_c_block; + + for (int n = 0; n < bs; ++n) { + const float* din_batch = i_data + n * ic * size_in_channel; + float* dout_batch = o_data + n * oc * size_out_channel; +#pragma omp parallel for num_threads(threads) + for (int c = 0; c < oc; c += out_c_block) { +#ifdef ARM_WITH_OMP + float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size; +#else + float* pre_din = ptr_write + ow_round; +#endif + /// const array size + float pre_out[out_c_block * out_w_kernel * out_h_kernel]; // NOLINT + prepack_input_nxwc4_dw( + din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero); + const float* weight_c = weights + c * 9; // kernel_w * kernel_h + float* dout_c00 = dout_batch + c * size_out_channel; + float bias_local[4] = {0, 0, 0, 0}; + if (flag_bias) { + bias_local[0] = bias[c]; + bias_local[1] = bias[c + 1]; + bias_local[2] = bias[c + 2]; + bias_local[3] = bias[c + 3]; + } + float32x4_t vbias = vld1q_f32(bias_local); +#ifdef __aarch64__ + float32x4_t w0 = vld1q_f32(weight_c); // w0, v23 + float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24 + float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25 + float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26 + float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27 + float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28 + float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29 + float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30 + float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31 +#endif + for (int h = 0; h < oh; h += out_h_kernel) { + float* outc00 = dout_c00 + h * ow; + float* outc01 = outc00 + ow; + float* outc10 = outc00 + size_out_channel; + float* outc11 = outc10 + ow; + float* outc20 = outc10 + size_out_channel; + float* outc21 = outc20 + ow; + float* outc30 = outc20 + size_out_channel; + float* outc31 = outc30 + ow; + const float* inr0 =
pre_din + h * row_len; + const float* inr1 = inr0 + row_len; + const float* inr2 = inr1 + row_len; + const float* inr3 = inr2 + row_len; + if (c + out_c_block > oc) { + switch (c + out_c_block - oc) { + case 3: + outc10 = ptr_write; + outc11 = ptr_write; + case 2: + outc20 = ptr_write; + outc21 = ptr_write; + case 1: + outc30 = ptr_write; + outc31 = ptr_write; + default: + break; + } + } + if (h + out_h_kernel > oh) { + outc01 = ptr_write; + outc11 = ptr_write; + outc21 = ptr_write; + outc31 = ptr_write; + } + float* outl[] = {outc00, + outc10, + outc20, + outc30, + outc01, + outc11, + outc21, + outc31, + reinterpret_cast(bias_local), + reinterpret_cast(flag_relu)}; + void* outl_ptr = reinterpret_cast(outl); + for (int w = 0; w < w_loop; ++w) { + bool flag_mask = (w == w_loop - 1) && flag_remain; + float* out0 = pre_out; +// clang-format off +#ifdef __aarch64__ + asm volatile( + "ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ + "ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ + "ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ + "ldp q8, q9, [%[inr1]], #32\n" /* load input r1*/ + "ldp q4, q5, [%[inr0]]\n" /* load input r0*/ + "ldp q10, q11, [%[inr1]]\n" /* load input r1*/ + /* r0, r1, mul w0, get out r0, r1 */ + "fmul v15.4s , %[w0].4s, v0.4s\n" /* outr00 = w0 * r0, 0*/ + "fmul v16.4s , %[w0].4s, v1.4s\n" /* outr01 = w0 * r0, 1*/ + "fmul v17.4s , %[w0].4s, v2.4s\n" /* outr02 = w0 * r0, 2*/ + "fmul v18.4s , %[w0].4s, v3.4s\n" /* outr03 = w0 * r0, 3*/ + "fmul v19.4s , %[w0].4s, v6.4s\n" /* outr10 = w0 * r1, 0*/ + "fmul v20.4s , %[w0].4s, v7.4s\n" /* outr11 = w0 * r1, 1*/ + "fmul v21.4s , %[w0].4s, v8.4s\n" /* outr12 = w0 * r1, 2*/ + "fmul v22.4s , %[w0].4s, v9.4s\n" /* outr13 = w0 * r1, 3*/ + /* r0, r1, mul w1, get out r0, r1 */ + "fmla v15.4s , %[w1].4s, v1.4s\n" /* outr00 = w1 * r0[1]*/ + "ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ + "fmla v16.4s , %[w1].4s, v2.4s\n" /* outr01 = w1 * r0[2]*/ + "fmla v17.4s , %[w1].4s, v3.4s\n" /* outr02 = w1 * r0[3]*/ + "fmla v18.4s , %[w1].4s, v4.4s\n" /* outr03 = w1 * r0[4]*/ + "fmla v19.4s , %[w1].4s, v7.4s\n" /* outr10 = w1 * r1[1]*/ + "fmla v20.4s , %[w1].4s, v8.4s\n" /* outr11 = w1 * r1[2]*/ + "fmla v21.4s , %[w1].4s, v9.4s\n" /* outr12 = w1 * r1[3]*/ + "fmla v22.4s , %[w1].4s, v10.4s\n"/* outr13 = w1 * r1[4]*/ + /* r0, r1, mul w2, get out r0, r1 */ + "fmla v15.4s , %[w2].4s, v2.4s\n" /* outr00 = w2 * r0[2]*/ + "fmla v16.4s , %[w2].4s, v3.4s\n" /* outr01 = w2 * r0[3]*/ + "ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ + "fmla v17.4s , %[w2].4s, v4.4s\n" /* outr02 = w2 * r0[4]*/ + "fmla v18.4s , %[w2].4s, v5.4s\n" /* outr03 = w2 * r0[5]*/ + "ldp q4, q5, [%[inr2]]\n" /* load input r2*/ + "fmla v19.4s , %[w2].4s, v8.4s\n" /* outr10 = w2 * r1[2]*/ + "fmla v20.4s , %[w2].4s, v9.4s\n" /* outr11 = w2 * r1[3]*/ + "fmla v21.4s , %[w2].4s, v10.4s\n"/* outr12 = w2 * r1[4]*/ + "fmla v22.4s , %[w2].4s, v11.4s\n"/* outr13 = w2 * r1[5]*/ + /* r1, r2, mul w3, get out r0, r1 */ + "fmla v15.4s , %[w3].4s, v6.4s\n" /* outr00 = w3 * r1[0]*/ + "fmla v16.4s , %[w3].4s, v7.4s\n" /* outr01 = w3 * r1[1]*/ + "fmla v17.4s , %[w3].4s, v8.4s\n" /* outr02 = w3 * r1[2]*/ + "fmla v18.4s , %[w3].4s, v9.4s\n" /* outr03 = w3 * r1[3]*/ + "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr10 = w3 * r2[0]*/ + "fmla v20.4s , %[w3].4s, v1.4s\n" /* outr11 = w3 * r2[1]*/ + "fmla v21.4s , %[w3].4s, v2.4s\n" /* outr12 = w3 * r2[2]*/ + "fmla v22.4s , %[w3].4s, v3.4s\n" /* outr13 = w3 * r2[3]*/ + /* r1, r2, mul w4, get out r0, r1 */ + "fmla v15.4s , %[w4].4s, v7.4s\n" /* outr00 = w4 * r1[1]*/ + 
"ldp q6, q7, [%[inr3]], #32\n" /* load input r3*/ + "fmla v16.4s , %[w4].4s, v8.4s\n" /* outr01 = w4 * r1[2]*/ + "fmla v17.4s , %[w4].4s, v9.4s\n" /* outr02 = w4 * r1[3]*/ + "fmla v18.4s , %[w4].4s, v10.4s\n"/* outr03 = w4 * r1[4]*/ + "ldp x0, x1, [%[outl]] \n" + "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr10 = w4 * r2[1]*/ + "fmla v20.4s , %[w4].4s, v2.4s\n" /* outr11 = w4 * r2[2]*/ + "fmla v21.4s , %[w4].4s, v3.4s\n" /* outr12 = w4 * r2[3]*/ + "fmla v22.4s , %[w4].4s, v4.4s\n" /* outr13 = w4 * r2[4]*/ + /* r1, r2, mul w5, get out r0, r1 */ + "fmla v15.4s , %[w5].4s, v8.4s\n" /* outr00 = w5 * r1[2]*/ + "fmla v16.4s , %[w5].4s, v9.4s\n" /* outr01 = w5 * r1[3]*/ + "ldp q8, q9, [%[inr3]], #32\n" /* load input r3*/ + "fmla v17.4s , %[w5].4s, v10.4s\n"/* outr02 = w5 * r1[4]*/ + "fmla v18.4s , %[w5].4s, v11.4s\n"/* outr03 = w5 * r1[5]*/ + "ldp q10, q11, [%[inr3]]\n" /* load input r3*/ + "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr10 = w5 * r2[2]*/ + "fmla v20.4s , %[w5].4s, v3.4s\n" /* outr11 = w5 * r2[3]*/ + "fmla v21.4s , %[w5].4s, v4.4s\n" /* outr12 = w5 * r2[4]*/ + "fmla v22.4s , %[w5].4s, v5.4s\n" /* outr13 = w5 * r2[5]*/ + /* r2, r3, mul w6, get out r0, r1 */ + "fmla v15.4s , %[w6].4s, v0.4s\n" /* outr00 = w6 * r2[0]*/ + "fmla v16.4s , %[w6].4s, v1.4s\n" /* outr01 = w6 * r2[1]*/ + "fmla v17.4s , %[w6].4s, v2.4s\n" /* outr02 = w6 * r2[2]*/ + "fmla v18.4s , %[w6].4s, v3.4s\n" /* outr03 = w6 * r2[3]*/ + "ldp x2, x3, [%[outl], #16] \n" + "fmla v19.4s , %[w6].4s, v6.4s\n" /* outr10 = w6 * r3[0]*/ + "fmla v20.4s , %[w6].4s, v7.4s\n" /* outr11 = w6 * r3[1]*/ + "fmla v21.4s , %[w6].4s, v8.4s\n" /* outr12 = w6 * r3[2]*/ + "fmla v22.4s , %[w6].4s, v9.4s\n" /* outr13 = w6 * r3[3]*/ + /* r2, r3, mul w7, get out r0, r1 */ + "fmla v15.4s , %[w7].4s, v1.4s\n" /* outr00 = w7 * r2[1]*/ + "fmla v16.4s , %[w7].4s, v2.4s\n" /* outr01 = w7 * r2[2]*/ + "fmla v17.4s , %[w7].4s, v3.4s\n" /* outr02 = w7 * r2[3]*/ + "fmla v18.4s , %[w7].4s, v4.4s\n" /* outr03 = w7 * r2[4]*/ + "ldp x4, x5, [%[outl], #32] \n" + "fmla v19.4s , %[w7].4s, v7.4s\n" /* outr10 = w7 * r3[1]*/ + "fmla v20.4s , %[w7].4s, v8.4s\n" /* outr11 = w7 * r3[2]*/ + "fmla v21.4s , %[w7].4s, v9.4s\n" /* outr12 = w7 * r3[3]*/ + "fmla v22.4s , %[w7].4s, v10.4s\n"/* outr13 = w7 * r3[4]*/ + /* r2, r3, mul w8, get out r0, r1 */ + "fmla v15.4s , %[w8].4s, v2.4s\n" /* outr00 = w8 * r2[2]*/ + "fmla v16.4s , %[w8].4s, v3.4s\n" /* outr01 = w8 * r2[3]*/ + "fmla v17.4s , %[w8].4s, v4.4s\n" /* outr02 = w8 * r2[0]*/ + "fmla v18.4s , %[w8].4s, v5.4s\n" /* outr03 = w8 * r2[1]*/ + "ldp x6, x7, [%[outl], #48] \n" + "fmla v19.4s , %[w8].4s, v8.4s\n" /* outr10 = w8 * r3[2]*/ + "fmla v20.4s , %[w8].4s, v9.4s\n" /* outr11 = w8 * r3[3]*/ + "fmla v21.4s , %[w8].4s, v10.4s\n"/* outr12 = w8 * r3[0]*/ + "fmla v22.4s , %[w8].4s, v11.4s\n"/* outr13 = w8 * r3[1]*/ + + "fadd v15.4s, v15.4s, %[vbias].4s\n"/* add bias */ + "fadd v16.4s, v16.4s, %[vbias].4s\n"/* add bias */ + "fadd v17.4s, v17.4s, %[vbias].4s\n"/* add bias */ + "fadd v18.4s, v18.4s, %[vbias].4s\n"/* add bias */ + "fadd v19.4s, v19.4s, %[vbias].4s\n"/* add bias */ + "fadd v20.4s, v20.4s, %[vbias].4s\n"/* add bias */ + "fadd v21.4s, v21.4s, %[vbias].4s\n"/* add bias */ + "fadd v22.4s, v22.4s, %[vbias].4s\n"/* add bias */ + + /* transpose */ + "trn1 v0.4s, v15.4s, v16.4s\n" /* r0: a0a1c0c1*/ + "trn2 v1.4s, v15.4s, v16.4s\n" /* r0: b0b1d0d1*/ + "trn1 v2.4s, v17.4s, v18.4s\n" /* r0: a2a3c2c3*/ + "trn2 v3.4s, v17.4s, v18.4s\n" /* r0: b2b3d2d3*/ + "trn1 v4.4s, v19.4s, v20.4s\n" /* r1: a0a1c0c1*/ + "trn2 v5.4s, v19.4s, v20.4s\n" /* 
r1: b0b1d0d1*/ + "trn1 v6.4s, v21.4s, v22.4s\n" /* r1: a2a3c2c3*/ + "trn2 v7.4s, v21.4s, v22.4s\n" /* r1: b2b3d2d3*/ + "trn1 v15.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ + "trn2 v19.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ + "trn1 v17.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ + "trn2 v21.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/ + "trn1 v16.2d, v4.2d, v6.2d\n" /* r1: a0a1a2a3*/ + "trn2 v20.2d, v4.2d, v6.2d\n" /* r1: c0c1c2c3*/ + "trn1 v18.2d, v5.2d, v7.2d\n" /* r1: b0b1b2b3*/ + "trn2 v22.2d, v5.2d, v7.2d\n" /* r1: d0d1d2d3*/ + + "cbz %w[flag_relu], 0f\n" /* skip relu*/ + "movi v0.4s, #0\n" /* for relu */ + "fmax v15.4s, v15.4s, v0.4s\n" + "fmax v16.4s, v16.4s, v0.4s\n" + "fmax v17.4s, v17.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v0.4s\n" + "fmax v19.4s, v19.4s, v0.4s\n" + "fmax v20.4s, v20.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v0.4s\n" + "fmax v22.4s, v22.4s, v0.4s\n" + "0:\n" + "cbnz %w[flag_mask], 1f\n" + "str q15, [x0]\n" /* save outc00 */ + "str q16, [x4]\n" /* save outc01 */ + "str q17, [x1]\n" /* save outc10 */ + "str q18, [x5]\n" /* save outc11 */ + "str q19, [x2]\n" /* save outc20 */ + "str q20, [x6]\n" /* save outc21 */ + "str q21, [x3]\n" /* save outc30 */ + "str q22, [x7]\n" /* save outc31 */ + "b 2f\n" + "1:\n" + "str q15, [%[out]], #16 \n" /* save remain to pre_out */ + "str q17, [%[out]], #16 \n" /* save remain to pre_out */ + "str q19, [%[out]], #16 \n" /* save remain to pre_out */ + "str q21, [%[out]], #16 \n" /* save remain to pre_out */ + "str q16, [%[out]], #16 \n" /* save remain to pre_out */ + "str q18, [%[out]], #16 \n" /* save remain to pre_out */ + "str q20, [%[out]], #16 \n" /* save remain to pre_out */ + "str q22, [%[out]], #16 \n" /* save remain to pre_out */ + "2:\n" + :[inr0] "+r"(inr0), [inr1] "+r"(inr1), + [inr2] "+r"(inr2), [inr3] "+r"(inr3), + [out]"+r"(out0) + :[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), + [w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5), + [w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8), + [vbias]"w" (vbias), [outl] "r" (outl_ptr), + [flag_mask] "r" (flag_mask), [flag_relu] "r" (flag_relu) + : "cc", "memory", + "v0","v1","v2","v3","v4","v5","v6","v7", + "v8", "v9", "v10", "v11", "v15", + "v16","v17","v18","v19","v20","v21","v22", + "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7" + ); +#else + asm volatile( + /* load weights */ + "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, w1, to q5, q6\n" + "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, to q7\n" + /* load r0, r1 */ + "vld1.32 {d0-d3}, [%[r0]]! @ load r0, q0, q1\n" + "vld1.32 {d4-d7}, [%[r0]]! @ load r0, q2, q3\n" + /* main loop */ + "0: @ main loop\n" + /* mul r0 with w0, w1, w2, get out r0 */ + "vmul.f32 q8, q5, q0 @ w0 * inr00\n" + "vmul.f32 q9, q5, q1 @ w0 * inr01\n" + "vmul.f32 q10, q5, q2 @ w0 * inr02\n" + "vmul.f32 q11, q5, q3 @ w0 * inr03\n" + "vmla.f32 q8, q6, q1 @ w1 * inr01\n" + "vld1.32 {d0-d3}, [%[r0]] @ load r0, q0, q1\n" + "vmla.f32 q9, q6, q2 @ w1 * inr02\n" + "vmla.f32 q10, q6, q3 @ w1 * inr03\n" + "vmla.f32 q11, q6, q0 @ w1 * inr04\n" + "vmla.f32 q8, q7, q2 @ w2 * inr02\n" + "vmla.f32 q9, q7, q3 @ w2 * inr03\n" + "vld1.32 {d4-d7}, [%[r1]]! @ load r0, q2, q3\n" + "vmla.f32 q10, q7, q0 @ w2 * inr04\n" + "vmla.f32 q11, q7, q1 @ w2 * inr05\n" + "vld1.32 {d0-d3}, [%[r1]]! @ load r0, q0, q1\n" + "vld1.32 {d8-d9}, [%[wc0]]! @ load w3 to q4\n" + /* mul r1 with w0-w5, get out r0, r1 */ + "vmul.f32 q12, q5, q2 @ w0 * inr10\n" + "vmul.f32 q13, q5, q3 @ w0 * inr11\n" + "vmul.f32 q14, q5, q0 @ w0 * inr12\n" + "vmul.f32 q15, q5, q1 @ w0 * inr13\n" + "vld1.32 {d10-d11}, [%[wc0]]! 
@ load w4 to q5\n" + "vmla.f32 q8, q4, q2 @ w3 * inr10\n" + "vmla.f32 q9, q4, q3 @ w3 * inr11\n" + "vmla.f32 q10, q4, q0 @ w3 * inr12\n" + "vmla.f32 q11, q4, q1 @ w3 * inr13\n" + /* mul r1 with w1, w4, get out r1, r0 */ + "vmla.f32 q8, q5, q3 @ w4 * inr11\n" + "vmla.f32 q12, q6, q3 @ w1 * inr11\n" + "vld1.32 {d4-d7}, [%[r1]] @ load r1, q2, q3\n" + "vmla.f32 q9, q5, q0 @ w4 * inr12\n" + "vmla.f32 q13, q6, q0 @ w1 * inr12\n" + "vmla.f32 q10, q5, q1 @ w4 * inr13\n" + "vmla.f32 q14, q6, q1 @ w1 * inr13\n" + "vmla.f32 q11, q5, q2 @ w4 * inr14\n" + "vmla.f32 q15, q6, q2 @ w1 * inr14\n" + "vld1.32 {d12-d13}, [%[wc0]]! @ load w5 to q6\n" + /* mul r1 with w2, w5, get out r1, r0 */ + "vmla.f32 q12, q7, q0 @ w2 * inr12\n" + "vmla.f32 q13, q7, q1 @ w2 * inr13\n" + "vmla.f32 q8, q6, q0 @ w5 * inr12\n" + "vmla.f32 q9, q6, q1 @ w5 * inr13\n" + "vld1.32 {d0-d3}, [%[r2]]! @ load r2, q0, q1\n" + "vmla.f32 q14, q7, q2 @ w2 * inr14\n" + "vmla.f32 q15, q7, q3 @ w2 * inr15\n" + "vmla.f32 q10, q6, q2 @ w5 * inr14\n" + "vmla.f32 q11, q6, q3 @ w5 * inr15\n" + "vld1.32 {d4-d7}, [%[r2]]! @ load r2, q0, q1\n" + "vld1.32 {d14-d15}, [%[wc0]]! @ load w6, to q7\n" + /* mul r2 with w3-w8, get out r0, r1 */ + "vmla.f32 q12, q4, q0 @ w3 * inr20\n" + "vmla.f32 q13, q4, q1 @ w3 * inr21\n" + "vmla.f32 q14, q4, q2 @ w3 * inr22\n" + "vmla.f32 q15, q4, q3 @ w3 * inr23\n" + "vld1.32 {d8-d9}, [%[wc0]]! @ load w7, to q4\n" + "vmla.f32 q8, q7, q0 @ w6 * inr20\n" + "vmla.f32 q9, q7, q1 @ w6 * inr21\n" + "vmla.f32 q10, q7, q2 @ w6 * inr22\n" + "vmla.f32 q11, q7, q3 @ w6 * inr23\n" + /* mul r2 with w4, w7, get out r1, r0 */ + "vmla.f32 q8, q4, q1 @ w7 * inr21\n" + "vmla.f32 q12, q5, q1 @ w4 * inr21\n" + "vld1.32 {d0-d3}, [%[r2]] @ load r2, q0, q1\n" + "vmla.f32 q9, q4, q2 @ w7 * inr22\n" + "vmla.f32 q13, q5, q2 @ w4 * inr22\n" + "vmla.f32 q10, q4, q3 @ w7 * inr23\n" + "vmla.f32 q14, q5, q3 @ w4 * inr23\n" + "vmla.f32 q11, q4, q0 @ w7 * inr24\n" + "vmla.f32 q15, q5, q0 @ w4 * inr24\n" + "vld1.32 {d10-d11}, [%[wc0]]! @ load w8 to q5\n" + /* mul r1 with w5, w8, get out r1, r0 */ + "vmla.f32 q12, q6, q2 @ w5 * inr22\n" + "vmla.f32 q13, q6, q3 @ w5 * inr23\n" + "vmla.f32 q8, q5, q2 @ w8 * inr22\n" + "vmla.f32 q9, q5, q3 @ w8 * inr23\n" + "vld1.32 {d4-d7}, [%[r3]]! @ load r3, q2, q3\n" + "ldr r4, [%[outl], #32] @ load bias addr to r4\n" + "vmla.f32 q14, q6, q0 @ w5 * inr24\n" + "vmla.f32 q15, q6, q1 @ w5 * inr25\n" + "vmla.f32 q10, q5, q0 @ w8 * inr24\n" + "vmla.f32 q11, q5, q1 @ w8 * inr25\n" + "vld1.32 {d0-d3}, [%[r3]]! 
@ load r3, q0, q1\n" + "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" + /* mul r3 with w6, w7, w8, get out r1 */ + "vmla.f32 q12, q7, q2 @ w6 * inr30\n" + "vmla.f32 q13, q7, q3 @ w6 * inr31\n" + "vmla.f32 q14, q7, q0 @ w6 * inr32\n" + "vmla.f32 q15, q7, q1 @ w6 * inr33\n" + "vmla.f32 q12, q4, q3 @ w7 * inr31\n" + "vld1.32 {d4-d7}, [%[r3]] @ load r3, q2, q3\n" + "vld1.32 {d12-d13}, [r4] @ load bias\n" + "vmla.f32 q13, q4, q0 @ w7 * inr32\n" + "vmla.f32 q14, q4, q1 @ w7 * inr33\n" + "vmla.f32 q15, q4, q2 @ w7 * inr34\n" + "ldr r0, [%[outl]] @ load outc00 to r0\n" + "vmla.f32 q12, q5, q0 @ w8 * inr32\n" + "vmla.f32 q13, q5, q1 @ w8 * inr33\n" + "ldr r5, [%[outl], #36] @ load flag_relu to r5\n" + "vmla.f32 q14, q5, q2 @ w8 * inr34\n" + "vmla.f32 q15, q5, q3 @ w8 * inr35\n" + "ldr r1, [%[outl], #4] @ load outc10 to r1\n" + "vadd.f32 q8, q8, q6 @ r00 add bias\n" + "vadd.f32 q9, q9, q6 @ r01 add bias\n" + "vadd.f32 q10, q10, q6 @ r02 add bias\n" + "vadd.f32 q11, q11, q6 @ r03 add bias\n" + "ldr r2, [%[outl], #8] @ load outc20 to r2\n" + "vadd.f32 q12, q12, q6 @ r10 add bias\n" + "vadd.f32 q13, q13, q6 @ r11 add bias\n" + "vadd.f32 q14, q14, q6 @ r12 add bias\n" + "vadd.f32 q15, q15, q6 @ r13 add bias\n" + "ldr r3, [%[outl], #12] @ load outc30 to r3\n" + "vmov.u32 q7, #0 @ mov zero to q7\n" + "cmp r5, #0 @ cmp flag relu\n" + "beq 1f @ skip relu\n" + "vmax.f32 q8, q8, q7 @ r00 relu\n" + "vmax.f32 q9, q9, q7 @ r01 relu\n" + "vmax.f32 q10, q10, q7 @ r02 relu\n" + "vmax.f32 q11, q11, q7 @ r03 relu\n" + "vmax.f32 q12, q12, q7 @ r10 relu\n" + "vmax.f32 q13, q13, q7 @ r11 relu\n" + "vmax.f32 q14, q14, q7 @ r12 relu\n" + "vmax.f32 q15, q15, q7 @ r13 relu\n" + "1:\n" + "ldr r4, [%[outl], #16] @ load outc01 to r4\n" + "vtrn.32 q8, q9 @ r0: q8 : a0a1c0c1, q9 : b0b1d0d1\n" + "vtrn.32 q10, q11 @ r0: q10: a2a3c2c3, q11: b2b3d2d3\n" + "vtrn.32 q12, q13 @ r1: q12: a0a1c0c1, q13: b0b1d0d1\n" + "vtrn.32 q14, q15 @ r1: q14: a2a3c2c3, q15: b2b3d2d3\n" + "ldr r5, [%[outl], #20] @ load outc11 to r5\n" + "vswp d17, d20 @ r0: q8 : a0a1a2a3, q10: c0c1c2c3 \n" + "vswp d19, d22 @ r0: q9 : b0b1b2b3, q11: d0d1d2d3 \n" + "vswp d25, d28 @ r1: q12: a0a1a2a3, q14: c0c1c2c3 \n" + "vswp d27, d30 @ r1: q13: b0b1b2b3, q15: d0d1d2d3 \n" + "cmp %[flag_mask], #0 @ cmp flag mask\n" + "bne 2f\n" + "vst1.32 {d16-d17}, [r0] @ save outc00\n" + "vst1.32 {d18-d19}, [r1] @ save outc10\n" + "vst1.32 {d20-d21}, [r2] @ save outc20\n" + "vst1.32 {d22-d23}, [r3] @ save outc30\n" + "vst1.32 {d24-d25}, [r4] @ save outc01\n" + "vst1.32 {d26-d27}, [r5] @ save outc11\n" + "ldr r0, [%[outl], #24] @ load outc21 to r0\n" + "ldr r1, [%[outl], #28] @ load outc31 to r1\n" + "vst1.32 {d28-d29}, [r0] @ save outc21\n" + "vst1.32 {d30-d31}, [r1] @ save outc31\n" + "b 3f @ branch end\n" + "2: \n" + "vst1.32 {d16-d17}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d18-d19}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d20-d21}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d22-d23}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d24-d25}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d26-d27}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d28-d29}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d30-d31}, [%[out0]]! 
@ save remain to pre_out\n" + "3: \n" + : [r0] "+r"(inr0), [r1] "+r"(inr1), + [r2] "+r"(inr2), [r3] "+r"(inr3), + [out0] "+r"(out0), [wc0] "+r"(weight_c) + : [flag_mask] "r" (flag_mask), [outl] "r" (outl_ptr) + : "cc", "memory", + "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13","q14", "q15", "r0", "r1", "r2", "r3", "r4", "r5" + ); +#endif // __arch64__ + // clang-format on + outl[0] += 4; + outl[1] += 4; + outl[2] += 4; + outl[3] += 4; + outl[4] += 4; + outl[5] += 4; + outl[6] += 4; + outl[7] += 4; + if (flag_mask) { + memcpy(outl[0] - 4, pre_out, remain * sizeof(float)); + memcpy(outl[1] - 4, pre_out + 4, remain * sizeof(float)); + memcpy(outl[2] - 4, pre_out + 8, remain * sizeof(float)); + memcpy(outl[3] - 4, pre_out + 12, remain * sizeof(float)); + memcpy(outl[4] - 4, pre_out + 16, remain * sizeof(float)); + memcpy(outl[5] - 4, pre_out + 20, remain * sizeof(float)); + memcpy(outl[6] - 4, pre_out + 24, remain * sizeof(float)); + memcpy(outl[7] - 4, pre_out + 28, remain * sizeof(float)); + } + } + } + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s2_direct_fp32.cc b/lite/backends/arm/math/conv3x3s2_direct_fp32.cc index 8260718a50..807135f57d 100644 --- a/lite/backends/arm/math/conv3x3s2_direct_fp32.cc +++ b/lite/backends/arm/math/conv3x3s2_direct_fp32.cc @@ -32,10 +32,11 @@ size_t conv3x3s2_direct_workspace_size(const operators::ConvParam& param, ARMContext* ctx) { auto dim_in = param.x->dims(); auto dim_out = param.output->dims(); + auto paddings = *param.paddings; const int threads = ctx->threads(); int llc_size = ctx->llc_size() / sizeof(float); - const int pad_w = param.paddings[1]; - const int pad_h = param.paddings[0]; + const int pad_w = paddings[2]; + const int pad_h = paddings[0]; int ow = dim_out[3]; int oh = dim_out[2]; int ic = dim_in[1]; @@ -73,10 +74,11 @@ void conv_3x3s2_direct_fp32(const float* i_data, //! 3x3s2 convolution, implemented by direct algorithm //! prepack input to tmp buffer //! write output to tmp buffer + auto paddings = *param.paddings; const int threads = ctx->threads(); int l2_size = ctx->llc_size() / sizeof(float); - const int pad_w = param.paddings[1]; - const int pad_h = param.paddings[0]; + const int pad_w = paddings[2]; + const int pad_h = paddings[0]; const int wout_round = ROUNDUP(ow, OUT_W_BLOCK); const int win_round = wout_round * 2 /*stride_w*/ + 1; bool flag_relu = param.fuse_relu; diff --git a/lite/backends/arm/math/conv3x3s2_direct_int8.cc b/lite/backends/arm/math/conv3x3s2_direct_int8.cc index 01b7a812eb..26829544bf 100644 --- a/lite/backends/arm/math/conv3x3s2_direct_int8.cc +++ b/lite/backends/arm/math/conv3x3s2_direct_int8.cc @@ -46,10 +46,11 @@ void conv_3x3s2_direct_int8(const int8_t* din, //! 3x3s2 int8 convolution, implemented by direct algorithm //! prepack input to tmp buffer //! write output to tmp buffer + auto paddings = *param.paddings; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int pad_h = paddings[0]; + int pad_w = paddings[1]; const int threads = ctx->threads(); int llc_size = ctx->llc_size() / 4; @@ -472,10 +473,11 @@ void conv_3x3s2_direct_int8(const int8_t* din, //! 3x3s2 int8 convolution, implemented by direct algorithm //! prepack input to tmp buffer //! 
write output to tmp buffer + auto paddings = *param.paddings; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int pad_h = paddings[0]; + int pad_w = paddings[1]; const int threads = ctx->threads(); //! set 1/4 l2 cache int llc_size = ctx->llc_size() / 4; diff --git a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc new file mode 100644 index 0000000000..455781e37e --- /dev/null +++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc @@ -0,0 +1,1862 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/backends/arm/math/conv_depthwise.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +void conv_depthwise_3x3s2p0_bias(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p0_bias_s(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p1_bias(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p1_bias_s(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2_fp32(const float* din, + float* dout, + int num, + int ch_out, + int h_out, + int w_out, + int ch_in, + int h_in, + int w_in, + const float* weights, + const float* bias, + int pad, + bool flag_bias, + bool flag_relu, + ARMContext* ctx) { + if (pad == 0) { + if (w_in > 7) { + conv_depthwise_3x3s2p0_bias(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } else { + conv_depthwise_3x3s2p0_bias_s(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } + if (pad == 1) { + if (w_in > 7) { + conv_depthwise_3x3s2p1_bias(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } else { + conv_depthwise_3x3s2p1_bias_s(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } +} +#ifdef __aarch64__ +#define INIT_S2 \ + "prfm pldl1keep, [%[inptr0]] \n" \ + "prfm 
pldl1keep, [%[inptr1]] \n" \ + "prfm pldl1keep, [%[inptr2]] \n" \ + "prfm pldl1keep, [%[inptr3]] \n" \ + "prfm pldl1keep, [%[inptr4]] \n" \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" + +#define LEFT_COMPUTE_S2 \ + "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[1] \n" /* {0,2,4,6} * w01 */ \ + "fmul v12.4s, v1.4s, %[w0].s[2] \n" /* {1,3,5,7} * w02 */ \ + "fmla v16.4s, v10.4s, %[w0].s[0] \n" /* {0,1,3,5} * w00*/ \ + \ + "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" /* v10 = {0,1,3,5} */ \ + \ + "sub %[inptr0], %[inptr0], #4 \n" \ + "sub %[inptr1], %[inptr1], #4 \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[1] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" \ + \ + "sub %[inptr2], %[inptr2], #4 \n" \ + "sub %[inptr3], %[inptr3], #4 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[1] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[1] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[2] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[2] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[0] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" \ + \ + "sub %[inptr4], %[inptr4], #4 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[1] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" + +#define LEFT_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "cmp %w[cnt], #1 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "blt 1f \n" + +#define MID_COMPUTE_S2 \ + "2: \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ + "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ + \ + "ext v10.16b, v2.16b, v18.16b, #4 \n" \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v4.16b, v19.16b, #4 \n" \ + \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ext v10.16b, v6.16b, v20.16b, #4 \n" \ + \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 
\n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v8.16b, v21.16b, #4 \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" + +#define MID_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "bne 2b \n" + +#define RIGHT_COMPUTE_S2 \ + "1: \n" \ + "cmp %w[remain], #1 \n" \ + "blt 4f \n" \ + "3: \n" \ + "bif v0.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v1.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "bif v2.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v3.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "bif v4.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v5.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" \ + \ + "bif v6.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v7.16b, %[vzero].16b, %[mask2].16b \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ + "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ + \ + "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" \ + "bif v8.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v9.16b, %[vzero].16b, %[mask2].16b \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" \ + "ld1 {v0.4s}, [%[outptr0]] \n" \ + \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" \ + "ld1 {v1.4s}, [%[outptr1]] \n" + +#define RIGHT_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "bif v16.16b, v0.16b, %[wmask].16b \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "bif v17.16b, v1.16b, %[wmask].16b \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + "4: \n" + +#define LEFT_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ + \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" \ + \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "st1 
{v16.4s}, [%[outptr0]], #16 \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" \ + \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "cmp %w[cnt], #1 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "blt 1f \n" + +#define MID_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "bne 2b \n" + +#define RIGHT_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "bif v16.16b, v0.16b, %[wmask].16b \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ + \ + "bif v17.16b, v1.16b, %[wmask].16b \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + "4: \n" + +#define COMPUTE_S_S2 \ + "movi v9.4s, #0 \n" \ + "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ + \ + "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" \ + "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" \ + "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ + \ + "bif v10.16b, v9.16b, v6.16b \n" \ + "bif v11.16b, v9.16b, v7.16b \n" \ + "bif v12.16b, v9.16b, v6.16b \n" \ + "bif v13.16b, v9.16b, v7.16b \n" \ + "bif v14.16b, v9.16b, v6.16b \n" \ + "bif v15.16b, v9.16b, v7.16b \n" \ + \ + "ext v6.16b, v9.16b, v11.16b, #12 \n" \ + "ext v7.16b, v9.16b, v13.16b, #12 \n" \ + "ext v8.16b, v9.16b, v15.16b, #12 \n" \ + \ + "fmul v4.4s, v10.4s, %[wr0].s[1] \n" \ + "fmul v5.4s, v11.4s, %[wr0].s[2] \n" \ + "fmul v6.4s, v6.4s, %[wr0].s[0] \n" \ + \ + "fmla v4.4s, v12.4s, %[wr1].s[1] \n" \ + "fmla v5.4s, v13.4s, %[wr1].s[2] \n" \ + "fmla v6.4s, v7.4s, %[wr1].s[0] \n" \ + \ + "fmla v4.4s, v14.4s, %[wr2].s[1] \n" \ + "fmla v5.4s, v15.4s, %[wr2].s[2] \n" \ + "fmla v6.4s, v8.4s, %[wr2].s[0] \n" \ + \ + "fadd v4.4s, v4.4s, v5.4s \n" \ + "fadd v4.4s, v4.4s, v6.4s \n" + +#define RESULT_S_S2 \ + "fadd v4.4s, v4.4s, %[bias].4s \n" \ + \ + "st1 {v4.4s}, [%[out]] \n" + +#define RESULT_S_S2_RELU \ + "fadd v4.4s, v4.4s, %[bias].4s \n" \ + "fmax v4.4s, v4.4s, v9.4s \n" \ + \ + "st1 {v4.4s}, [%[out]] \n" + +#define COMPUTE_S_S2_P0 \ + "movi v9.4s, #0 \n" \ + "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ + \ + "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" \ + "ld2 {v12.4s, v13.4s}, 
[%[din1_ptr]], #32 \n" \ + "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ + "and v4.16b, %[bias].16b, %[bias].16b \n" \ + \ + "bif v10.16b, v9.16b, v6.16b \n" \ + "bif v11.16b, v9.16b, v7.16b \n" \ + "bif v12.16b, v9.16b, v6.16b \n" \ + "bif v13.16b, v9.16b, v7.16b \n" \ + "bif v14.16b, v9.16b, v6.16b \n" \ + "bif v15.16b, v9.16b, v7.16b \n" \ + \ + "ext v6.16b, v10.16b, v9.16b, #4 \n" \ + "ext v7.16b, v12.16b, v9.16b, #4 \n" \ + "ext v8.16b, v14.16b, v9.16b, #4 \n" \ + \ + "fmla v4.4s, v10.4s, %[wr0].s[0] \n" \ + "fmul v5.4s, v11.4s, %[wr0].s[1] \n" \ + "fmul v16.4s, v6.4s, %[wr0].s[2] \n" \ + \ + "fmla v4.4s, v12.4s, %[wr1].s[0] \n" \ + "fmla v5.4s, v13.4s, %[wr1].s[1] \n" \ + "fmla v16.4s, v7.4s, %[wr1].s[2] \n" \ + \ + "fmla v4.4s, v14.4s, %[wr2].s[0] \n" \ + "fmla v5.4s, v15.4s, %[wr2].s[1] \n" \ + "fmla v16.4s, v8.4s, %[wr2].s[2] \n" \ + \ + "fadd v4.4s, v4.4s, v5.4s \n" \ + "fadd v4.4s, v4.4s, v16.4s \n" + +#define RESULT_S_S2_P0 "st1 {v4.4s}, [%[out]] \n" + +#define RESULT_S_S2_P0_RELU \ + "fmax v4.4s, v4.4s, v9.4s \n" \ + "st1 {v4.4s}, [%[out]] \n" + +#else +#define INIT_S2 \ + "vmov.u32 q9, #0 \n" \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + \ + "vdup.32 q3, %[bias] @ and \n" + +#define LEFT_COMPUTE_S2 \ + "vext.32 q6, q9, q11, #3 @ shift right 1 data\n" \ + "vext.32 q7, q9, q13, #3 @ shift right 1 data\n" \ + "vext.32 q8, q9, q15, #3 @ shift right 1 data\n" \ + "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, out0\n" \ + "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, out0\n" \ + \ + "sub %[din0_ptr], #4 @ inpitr0 - 1\n" \ + "sub %[din1_ptr], #4 @ inpitr1 - 1\n" \ + "sub %[din2_ptr], #4 @ inpitr2 - 1\n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ + \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, out1\n" \ + "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, out1\n" \ + "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, out1\n" \ + \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define LEFT_RESULT_S2 \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "cmp %[cnt], #1 \n" \ + "blt 1f \n" + +#define MID_COMPUTE_S2 \ + "2: \n" \ + "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + "vext.32 q6, q10, q8, #1 @ shift left 1 \n" \ + "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vext.32 q7, q12, q8, #1 @ shift left 1 \n" \ + "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vext.32 q6, q14, q8, #1 @ shift left 1 \n" \ + \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! 
@ load din r1\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define MID_RESULT_S2 \ + "subs %[cnt], #1 \n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "bne 2b \n" + +#define RIGHT_COMPUTE_S2 \ + "1: \n" \ + "cmp %[remain], #1 \n" \ + "blt 3f \n" \ + \ + "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ + "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vext.32 q6, q14, q9, #1 @ shift left 1 \n" \ + "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RIGHT_RESULT_S2 \ + "vbif.f32 q3, q10, q11 @ write mask\n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "3: \n" + +#define LEFT_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "cmp %[cnt], #1 \n" \ + "blt 1f \n" + +#define MID_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "subs %[cnt], #1 \n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "bne 2b \n" + +#define RIGHT_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vbif.f32 q3, q10, q11 @ write mask\n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "3: \n" + +#define COMPUTE_S_S2 \ + "vmov.u32 q9, #0 \n" \ + "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r2\n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q9, q11, #3 @ shift left 1 \n" \ + "vext.32 q7, q9, q13, #3 @ shift left 1 \n" \ + "vext.32 q8, q9, q15, #3 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 0, out0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RESULT_S_S2 "vst1.32 {d6-d7}, [%[out]] \n" + +#define RESULT_S_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu\n" \ + \ + "vst1.32 {d6-d7}, [%[out]] \n" + +#define COMPUTE_S_S2_P0 \ + "vmov.u32 q9, #0 \n" \ + "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ + "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ + "vext.32 q8, q14, q9, #1 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RESULT_S_S2_P0 "vst1.32 {d6-d7}, [%[out]] \n" + +#define RESULT_S_S2_P0_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vst1.32 {d6-d7}, [%[out]] \n" + +#endif + +/** + * \brief depthwise convolution kernel 3x3, stride 2 + * w_in > 7 + */ +void conv_depthwise_3x3s2p1_bias(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + int size_pad_bottom = h_out * 2 - h_in; + + int cnt_col = (w_out >> 2) - 2; + int size_right_remain = w_in - (7 + cnt_col * 8); + if 
(size_right_remain >= 9) { + cnt_col++; + size_right_remain -= 8; + } + int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // + + int size_right_pad = w_out * 2 - w_in; + + uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + uint32x4_t wmask = + vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + float* zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float* write_ptr = zero_ptr + w_in; + + unsigned int dmask[12]; + + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + vst1q_u32(dmask + 8, wmask); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float32x4_t vzero = vdupq_n_f32(0.f); +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#else + float bias_c = 0.f; + if (flag_bias) { + bias_c = bias[i]; + } +#endif // __aarch64__ + + const float* dr0 = din_channel; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + const float* dr3 = dr2 + w_in; + const float* dr4 = dr3 + w_in; + + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + const float* din3_ptr = dr3; + const float* din4_ptr = dr4; + + float* doutr0 = dout_channel; + float* doutr0_ptr = nullptr; + float* doutr1_ptr = nullptr; + +#ifdef __aarch64__ + for (int i = 0; i < h_in; i += 4) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + din3_ptr = dr3; + din4_ptr = dr4; + + doutr0_ptr = doutr0; + doutr1_ptr = doutr0 + w_out; + + if (i == 0) { + din0_ptr = zero_ptr; + din1_ptr = dr0; + din2_ptr = dr1; + din3_ptr = dr2; + din4_ptr = dr3; + dr0 = dr3; + dr1 = dr4; + } else { + dr0 = dr4; + dr1 = dr0 + w_in; + } + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + + //! process bottom pad + if (i + 4 > h_in) { + switch (i + 4 - h_in) { + case 4: + din1_ptr = zero_ptr; + case 3: + din2_ptr = zero_ptr; + case 2: + din3_ptr = zero_ptr; + case 1: + din4_ptr = zero_ptr; + default: + break; + } + } + //! 
process output pad + if (i / 2 + 2 > h_out) { + doutr1_ptr = write_ptr; + } + int cnt = cnt_col; + if (flag_relu) { + asm volatile( + INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 + MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } else { + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 + MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } + doutr0 = doutr0 + 2 * w_out; + } +#else + for (int i = 0; i < h_in; i += 2) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + doutr0_ptr = doutr0; + + if (i == 0) { + din0_ptr = zero_ptr; + din1_ptr = dr0; + din2_ptr = dr1; + dr0 = dr1; + dr1 = dr2; + dr2 = dr1 + w_in; + } else { + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + } + + //! 
process bottom pad + if (i + 2 > h_in) { + switch (i + 2 - h_in) { + case 2: + din1_ptr = zero_ptr; + case 1: + din2_ptr = zero_ptr; + default: + break; + } + } + int cnt = cnt_col; + unsigned int* mask_ptr = dmask; + if (flag_relu) { + asm volatile( + INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 + MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 + MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + doutr0 = doutr0 + w_out; + } +#endif + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 + */ +void conv_depthwise_3x3s2p1_bias_s(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + float zeros[8] = {0.0f}; + + uint32x4_t vmask_rp1 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + unsigned int dmask[8]; + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float bias_c = 0.f; + + if (flag_bias) { + bias_c = bias[i]; + } + float32x4_t vbias = vdupq_n_f32(bias_c); + int hs = -1; + int he = 2; + float out_buf[4]; + for (int j = 0; j < h_out; ++j) { + const float* dr0 = din_channel + hs * w_in; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + if (hs == -1) { + dr0 = zeros; + } + if (he > h_in) { + dr2 = zeros; + } + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + + unsigned int* mask_ptr = dmask; +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "v4", + "v5", 
+ "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } else { + asm volatile(COMPUTE_S_S2 RESULT_S_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S2 RESULT_S_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *dout_channel++ = out_buf[w]; + } + hs += 2; + he += 2; + } + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2 + */ +// w_in > 7 +void conv_depthwise_3x3s2p0_bias(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + + int tile_w = w_out >> 2; + int cnt_remain = w_out % 4; + + unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3)); + + uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + uint32x4_t wmask = + vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + float* zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float* write_ptr = zero_ptr + w_in; + + unsigned int dmask[12]; + + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + vst1q_u32(dmask + 8, wmask); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float32x4_t vzero = vdupq_n_f32(0.f); + +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#else + float bias_c = 0.f; + if (flag_bias) { + bias_c = bias[i]; + } +#endif // __aarch64__ + + const float* dr0 = din_channel; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + const float* dr3 = dr2 
+ w_in; + const float* dr4 = dr3 + w_in; + + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + const float* din3_ptr = dr3; + const float* din4_ptr = dr4; + + float* doutr0 = dout_channel; + float* doutr0_ptr = nullptr; + float* doutr1_ptr = nullptr; + +#ifdef __aarch64__ + for (int i = 0; i < h_out; i += 2) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + din3_ptr = dr3; + din4_ptr = dr4; + + doutr0_ptr = doutr0; + doutr1_ptr = doutr0 + w_out; + + dr0 = dr4; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + + //! process bottom pad + if (i * 2 + 5 > h_in) { + switch (i * 2 + 5 - h_in) { + case 4: + din1_ptr = zero_ptr; + case 3: + din2_ptr = zero_ptr; + case 2: + din3_ptr = zero_ptr; + case 1: + din4_ptr = zero_ptr; + case 0: + din4_ptr = zero_ptr; + default: + break; + } + } + //! process output pad + if (i + 2 > h_out) { + doutr1_ptr = write_ptr; + } + int cnt = tile_w; + if (flag_relu) { + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + MID_COMPUTE_S2 MID_RESULT_S2_RELU + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_RELU + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } else { + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + MID_COMPUTE_S2 MID_RESULT_S2 + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2 + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } + doutr0 = doutr0 + 2 * w_out; + } +#else + for (int i = 0; i < h_out; i++) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + doutr0_ptr = doutr0; + + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + + //! 
process bottom pad + if (i * 2 + 3 > h_in) { + switch (i * 2 + 3 - h_in) { + case 2: + din1_ptr = zero_ptr; + case 1: + din2_ptr = zero_ptr; + default: + break; + } + } + int cnt = tile_w; + unsigned int* mask_ptr = dmask; + if (flag_relu) { + asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU + RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + doutr0 = doutr0 + w_out; + } +#endif + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 + */ +void conv_depthwise_3x3s2p0_bias_s(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + float zeros[8] = {0.0f}; + const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; + + uint32x4_t vmask_rp1 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + unsigned int dmask[8]; + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float bias_c = 0.f; + + if (flag_bias) { + bias_c = bias[i]; + } + float32x4_t vbias = vdupq_n_f32(bias_c); + float out_buf[4]; + const float* dr0 = din_channel; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + for (int j = 0; j < h_out; j++) { + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + if (j * 2 + 2 >= h_in) { + switch (j + 2 - h_in) { + case 1: + din1_ptr = zero_ptr; + case 0: + din2_ptr = zero_ptr; + default: + break; + } + } + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + + unsigned int* mask_ptr = dmask; +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + 
[wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "cc", + "memory", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16"); + } else { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "cc", + "memory", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf), + [mask_ptr] "r"(dmask) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf), + [mask_ptr] "r"(dmask) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *dout_channel++ = out_buf[w]; + } + } + } + } +} +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc new file mode 100644 index 0000000000..9852c0f84e --- /dev/null +++ b/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc @@ -0,0 +1,362 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "lite/backends/arm/math/conv_block_utils.h" +#include "lite/backends/arm/math/conv_impl.h" +#include "lite/core/context.h" +#include "lite/operators/op_params.h" +#ifdef ARM_WITH_OMP +#include +#endif + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void conv_3x3s2_depthwise_fp32(const float* i_data, + float* o_data, + int bs, + int oc, + int oh, + int ow, + int ic, + int ih, + int win, + const float* weights, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx) { + auto paddings = *param.paddings; + int threads = ctx->threads(); + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; + const int out_c_block = 4; + const int out_h_kernel = 1; + const int out_w_kernel = 4; + const int win_ext = ow * 2 + 1; + const int ow_round = ROUNDUP(ow, 4); + const int win_round = ROUNDUP(win_ext, 4); + const int hin_round = oh * 2 + 1; + const int prein_size = win_round * hin_round * out_c_block; + auto workspace_size = + threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/; + ctx->ExtendWorkspace(sizeof(float) * workspace_size); + + bool flag_relu = param.fuse_relu; + bool flag_bias = param.bias != nullptr; + + /// get workspace + auto ptr_zero = ctx->workspace_data(); + memset(ptr_zero, 0, sizeof(float) * win_round); + float* ptr_write = ptr_zero + win_round; + + int size_in_channel = win * ih; + int size_out_channel = ow * oh; + + int ws = -pad_w; + int we = ws + win_round; + int hs = -pad_h; + int he = hs + hin_round; + int w_loop = ow_round / 4; + auto remain = w_loop * 4 - ow; + bool flag_remain = remain > 0; + remain = 4 - remain; + remain = remain > 0 ? remain : 0; + int row_len = win_round * out_c_block; + + for (int n = 0; n < bs; ++n) { + const float* din_batch = i_data + n * ic * size_in_channel; + float* dout_batch = o_data + n * oc * size_out_channel; +#pragma omp parallel for num_threads(threads) + for (int c = 0; c < oc; c += out_c_block) { +#ifdef ARM_WITH_OMP + float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size; +#else + float* pre_din = ptr_write + ow_round; +#endif + /// const array size + prepack_input_nxwc4_dw( + din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero); + const float* weight_c = weights + c * 9; // kernel_w * kernel_h + float* dout_c00 = dout_batch + c * size_out_channel; + float bias_local[4] = {0, 0, 0, 0}; + if (flag_bias) { + bias_local[0] = bias[c]; + bias_local[1] = bias[c + 1]; + bias_local[2] = bias[c + 2]; + bias_local[3] = bias[c + 3]; + } +#ifdef __aarch64__ + float32x4_t w0 = vld1q_f32(weight_c); // w0, v23 + float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24 + float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25 + float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26 + float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27 + float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28 + float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29 + float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30 + float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31 +#endif + for (int h = 0; h < oh; h += out_h_kernel) { + float* outc0 = dout_c00 + h * ow; + float* outc1 = outc0 + size_out_channel; + float* outc2 = outc1 + size_out_channel; + float* outc3 = outc2 + size_out_channel; + const float* inr0 = pre_din + h * 2 * row_len; + const float* inr1 = inr0 + row_len; + const float* inr2 = inr1 + row_len; + if (c + out_c_block > oc) { + switch (c + out_c_block - oc) { + case 3: + outc1 = ptr_write; + case 2: + outc2 = ptr_write; + case 1: + 
outc3 = ptr_write; + default: + break; + } + } + auto c0 = outc0; + auto c1 = outc1; + auto c2 = outc2; + auto c3 = outc3; + float pre_out[16]; + for (int w = 0; w < w_loop; ++w) { + bool flag_mask = (w == w_loop - 1) && flag_remain; + if (flag_mask) { + c0 = outc0; + c1 = outc1; + c2 = outc2; + c3 = outc3; + outc0 = pre_out; + outc1 = pre_out + 4; + outc2 = pre_out + 8; + outc3 = pre_out + 12; + } +// clang-format off +#ifdef __aarch64__ + asm volatile( + "ldr q8, [%[bias]]\n" /* load bias */ + "ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ + "and v19.16b, v8.16b, v8.16b\n" + "ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ + "and v20.16b, v8.16b, v8.16b\n" + "ldp q4, q5, [%[inr0]], #32\n" /* load input r0*/ + "and v21.16b, v8.16b, v8.16b\n" + "ldp q6, q7, [%[inr0]], #32\n" /* load input r0*/ + "and v22.16b, v8.16b, v8.16b\n" + "ldr q8, [%[inr0]]\n" /* load input r0*/ + /* r0 mul w0-w2, get out */ + "fmla v19.4s , %[w0].4s, v0.4s\n" /* outr0 = w0 * r0, 0*/ + "fmla v20.4s , %[w0].4s, v2.4s\n" /* outr1 = w0 * r0, 2*/ + "fmla v21.4s , %[w0].4s, v4.4s\n" /* outr2 = w0 * r0, 4*/ + "fmla v22.4s , %[w0].4s, v6.4s\n" /* outr3 = w0 * r0, 6*/ + "fmla v19.4s , %[w1].4s, v1.4s\n" /* outr0 = w1 * r0, 1*/ + "ldp q0, q1, [%[inr1]], #32\n" /* load input r1*/ + "fmla v20.4s , %[w1].4s, v3.4s\n" /* outr1 = w1 * r0, 3*/ + "fmla v21.4s , %[w1].4s, v5.4s\n" /* outr2 = w1 * r0, 5*/ + "fmla v22.4s , %[w1].4s, v7.4s\n" /* outr3 = w1 * r0, 7*/ + "fmla v19.4s , %[w2].4s, v2.4s\n" /* outr0 = w0 * r0, 2*/ + "ldp q2, q3, [%[inr1]], #32\n" /* load input r1*/ + "fmla v20.4s , %[w2].4s, v4.4s\n" /* outr1 = w0 * r0, 4*/ + "ldp q4, q5, [%[inr1]], #32\n" /* load input r1*/ + "fmla v21.4s , %[w2].4s, v6.4s\n" /* outr2 = w0 * r0, 6*/ + "ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ + "fmla v22.4s , %[w2].4s, v8.4s\n" /* outr3 = w0 * r0, 8*/ + "ldr q8, [%[inr1]]\n" /* load input r1*/ + /* r1, mul w3-w5, get out */ + "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr0 = w3 * r1, 0*/ + "fmla v20.4s , %[w3].4s, v2.4s\n" /* outr1 = w3 * r1, 2*/ + "fmla v21.4s , %[w3].4s, v4.4s\n" /* outr2 = w3 * r1, 4*/ + "fmla v22.4s , %[w3].4s, v6.4s\n" /* outr3 = w3 * r1, 6*/ + "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr0 = w4 * r1, 1*/ + "ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ + "fmla v20.4s , %[w4].4s, v3.4s\n" /* outr1 = w4 * r1, 3*/ + "fmla v21.4s , %[w4].4s, v5.4s\n" /* outr2 = w4 * r1, 5*/ + "fmla v22.4s , %[w4].4s, v7.4s\n" /* outr3 = w4 * r1, 7*/ + "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr0 = w5 * r1, 2*/ + "ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ + "fmla v20.4s , %[w5].4s, v4.4s\n" /* outr1 = w5 * r1, 4*/ + "ldp q4, q5, [%[inr2]], #32\n" /* load input r2*/ + "fmla v21.4s , %[w5].4s, v6.4s\n" /* outr2 = w5 * r1, 6*/ + "ldp q6, q7, [%[inr2]], #32\n" /* load input r2*/ + "fmla v22.4s , %[w5].4s, v8.4s\n" /* outr3 = w5 * r1, 8*/ + "ldr q8, [%[inr2]]\n" /* load input r2*/ + /* r2, mul w6-w8, get out r0, r1 */ + "fmla v19.4s , %[w6].4s, v0.4s\n" /* outr0 = w6 * r2, 0*/ + "fmla v20.4s , %[w6].4s, v2.4s\n" /* outr1 = w6 * r2, 2*/ + "fmla v21.4s , %[w6].4s, v4.4s\n" /* outr2 = w6 * r2, 4*/ + "fmla v22.4s , %[w6].4s, v6.4s\n" /* outr3 = w6 * r2, 6*/ + "fmla v19.4s , %[w7].4s, v1.4s\n" /* outr0 = w7 * r2, 1*/ + "fmla v20.4s , %[w7].4s, v3.4s\n" /* outr1 = w7 * r2, 3*/ + "fmla v21.4s , %[w7].4s, v5.4s\n" /* outr2 = w7 * r2, 5*/ + "fmla v22.4s , %[w7].4s, v7.4s\n" /* outr3 = w7 * r2, 7*/ + "fmla v19.4s , %[w8].4s, v2.4s\n" /* outr0 = w8 * r2, 2*/ + "fmla v20.4s , %[w8].4s, v4.4s\n" /* outr1 = w8 * r2, 4*/ + "fmla v21.4s , %[w8].4s, 
v6.4s\n" /* outr2 = w8 * r2, 6*/ + "fmla v22.4s , %[w8].4s, v8.4s\n" /* outr3 = w8 * r2, 8*/ + /* transpose */ + "trn1 v0.4s, v19.4s, v20.4s\n" /* r0: a0a1c0c1*/ + "trn2 v1.4s, v19.4s, v20.4s\n" /* r0: b0b1d0d1*/ + "trn1 v2.4s, v21.4s, v22.4s\n" /* r0: a2a3c2c3*/ + "trn2 v3.4s, v21.4s, v22.4s\n" /* r0: b2b3d2d3*/ + "trn1 v19.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ + "trn2 v21.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ + "trn1 v20.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ + "trn2 v22.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/ + /* relu */ + "cbz %w[flag_relu], 0f\n" /* skip relu*/ + "movi v0.4s, #0\n" /* for relu */ + "fmax v19.4s, v19.4s, v0.4s\n" + "fmax v20.4s, v20.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v0.4s\n" + "fmax v22.4s, v22.4s, v0.4s\n" + /* save result */ + "0:\n" + "str q19, [%[outc0]], #16\n" + "str q20, [%[outc1]], #16\n" + "str q21, [%[outc2]], #16\n" + "str q22, [%[outc3]], #16\n" + :[inr0] "+r"(inr0), [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [outc0]"+r"(outc0), [outc1]"+r"(outc1), + [outc2]"+r"(outc2), [outc3]"+r"(outc3) + :[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), + [w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5), + [w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8), + [bias] "r" (bias_local), [flag_relu]"r"(flag_relu) + : "cc", "memory", + "v0","v1","v2","v3","v4","v5","v6","v7", + "v8", "v19","v20","v21","v22" + ); +#else + asm volatile( + /* fill with bias */ + "vld1.32 {d16-d17}, [%[bias]]\n" /* load bias */ + /* load weights */ + "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w0-2, to q9-11 */ + "vld1.32 {d0-d3}, [%[r0]]!\n" /* load input r0, 0,1*/ + "vand.i32 q12, q8, q8\n" + "vld1.32 {d4-d7}, [%[r0]]!\n" /* load input r0, 2,3*/ + "vand.i32 q13, q8, q8\n" + "vld1.32 {d8-d11}, [%[r0]]!\n" /* load input r0, 4,5*/ + "vand.i32 q14, q8, q8\n" + "vld1.32 {d12-d15}, [%[r0]]!\n" /* load input r0, 6,7*/ + "vand.i32 q15, q8, q8\n" + "vld1.32 {d16-d17}, [%[r0]]\n" /* load input r0, 8*/ + /* mul r0 with w0, w1, w2 */ + "vmla.f32 q12, q9, q0 @ w0 * inr0\n" + "vmla.f32 q13, q9, q2 @ w0 * inr2\n" + "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w2, to q11 */ + "vmla.f32 q14, q9, q4 @ w0 * inr4\n" + "vmla.f32 q15, q9, q6 @ w0 * inr6\n" + "vmla.f32 q12, q10, q1 @ w1 * inr1\n" + "vld1.32 {d0-d3}, [%[r1]]! @ load r1, 0, 1\n" + "vmla.f32 q13, q10, q3 @ w1 * inr3\n" + "vmla.f32 q14, q10, q5 @ w1 * inr5\n" + "vmla.f32 q15, q10, q7 @ w1 * inr7\n" + "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w3-4, to q9-10 */ + "vmla.f32 q12, q11, q2 @ w2 * inr2\n" + "vld1.32 {d4-d7}, [%[r1]]! @ load r1, 2, 3\n" + "vmla.f32 q13, q11, q4 @ w2 * inr4\n" + "vld1.32 {d8-d11}, [%[r1]]! @ load r1, 4, 5\n" + "vmla.f32 q14, q11, q6 @ w2 * inr6\n" + "vld1.32 {d12-d15}, [%[r1]]! @ load r1, 6, 7\n" + "vmla.f32 q15, q11, q8 @ w2 * inr8\n" + /* mul r1 with w3, w4, w5 */ + "vmla.f32 q12, q9, q0 @ w3 * inr0\n" + "vmla.f32 q13, q9, q2 @ w3 * inr2\n" + "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w5, to q11 */ + "vmla.f32 q14, q9, q4 @ w3 * inr4\n" + "vmla.f32 q15, q9, q6 @ w3 * inr6\n" + "vld1.32 {d16-d17}, [%[r1]]\n" /* load input r1, 8*/ + "vmla.f32 q12, q10, q1 @ w4 * inr1\n" + "vld1.32 {d0-d3}, [%[r2]]! @ load r2, 0, 1\n" + "vmla.f32 q13, q10, q3 @ w4 * inr3\n" + "vmla.f32 q14, q10, q5 @ w4 * inr5\n" + "vmla.f32 q15, q10, q7 @ w4 * inr7\n" + "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w6-7, to q9-10 */ + "vmla.f32 q12, q11, q2 @ w5 * inr2\n" + "vld1.32 {d4-d7}, [%[r2]]! @ load r2, 2, 3\n" + "vmla.f32 q13, q11, q4 @ w5 * inr4\n" + "vld1.32 {d8-d11}, [%[r2]]! @ load r2, 4, 5\n" + "vmla.f32 q14, q11, q6 @ w5 * inr6\n" + "vld1.32 {d12-d15}, [%[r2]]! 
@ load r2, 6, 7\n" + "vmla.f32 q15, q11, q8 @ w5 * inr8\n" + /* mul r2 with w6, w7, w8 */ + "vmla.f32 q12, q9, q0 @ w6 * inr0\n" + "vmla.f32 q13, q9, q2 @ w6 * inr2\n" + "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w8, to q11 */ + "vmla.f32 q14, q9, q4 @ w6 * inr4\n" + "vmla.f32 q15, q9, q6 @ w6 * inr6\n" + "vld1.32 {d16-d17}, [%[r2]]\n" /* load input r2, 8*/ + "vmla.f32 q12, q10, q1 @ w7 * inr1\n" + "vmla.f32 q13, q10, q3 @ w7 * inr3\n" + "vmla.f32 q14, q10, q5 @ w7 * inr5\n" + "vmla.f32 q15, q10, q7 @ w7 * inr7\n" + "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" + "vmla.f32 q12, q11, q2 @ w8 * inr2\n" + "vmla.f32 q13, q11, q4 @ w8 * inr4\n" + "vmla.f32 q14, q11, q6 @ w8 * inr6\n" + "vmla.f32 q15, q11, q8 @ w8 * inr8\n" + /* transpose */ + "vtrn.32 q12, q13\n" /* a0a1c0c1, b0b1d0d1*/ + "vtrn.32 q14, q15\n" /* a2a3c2c3, b2b3d2d3*/ + "vswp d25, d28\n" /* a0a1a2a3, c0c1c2c3*/ + "vswp d27, d30\n" /* b0b1b2b3, d0d1d2d3*/ + "cmp %[flag_relu], #0\n" + "beq 0f\n" /* skip relu*/ + "vmov.u32 q0, #0\n" + "vmax.f32 q12, q12, q0\n" + "vmax.f32 q13, q13, q0\n" + "vmax.f32 q14, q14, q0\n" + "vmax.f32 q15, q15, q0\n" + "0:\n" + "vst1.32 {d24-d25}, [%[outc0]]!\n" /* save outc0*/ + "vst1.32 {d26-d27}, [%[outc1]]!\n" /* save outc1*/ + "vst1.32 {d28-d29}, [%[outc2]]!\n" /* save outc2*/ + "vst1.32 {d30-d31}, [%[outc3]]!\n" /* save outc3*/ + :[r0] "+r"(inr0), [r1] "+r"(inr1), + [r2] "+r"(inr2), [wc0] "+r" (weight_c), + [outc0]"+r"(outc0), [outc1]"+r"(outc1), + [outc2]"+r"(outc2), [outc3]"+r"(outc3) + :[bias] "r" (bias_local), + [flag_relu]"r"(flag_relu) + :"cc", "memory", + "q0","q1","q2","q3","q4","q5","q6","q7", + "q8", "q9","q10","q11","q12","q13","q14","q15" + ); +#endif // __aarch64__ + // clang-format on + if (flag_mask) { + for (int i = 0; i < remain; ++i) { + c0[i] = pre_out[i]; + c1[i] = pre_out[i + 4]; + c2[i] = pre_out[i + 8]; + c3[i] = pre_out[i + 12]; + } + } + } + } + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv_block_utils.h b/lite/backends/arm/math/conv_block_utils.h index b2d16d18d2..e4279d9a72 100644 --- a/lite/backends/arm/math/conv_block_utils.h +++ b/lite/backends/arm/math/conv_block_utils.h @@ -254,6 +254,7 @@ inline void prepack_input_nxwc4_dw(const float* din, LOG(FATAL) << "prepack_dw_input, valid height must > zero"; } float32x4_t vzero = vdupq_n_f32(0.f); + auto out_data = dout; int size_w = we - ws; int w0 = ws < 0 ? 0 : ws; @@ -269,6 +270,7 @@ inline void prepack_input_nxwc4_dw(const float* din, bool flag_ext_l = left_remain > 0; int left_sl = 4 - left_remain; + int left_valid_sl = left_sl > width ?
width : left_sl; uint32x4_t vmask_padl; bool flag_mask_l = false; if (flag_ext_l) { @@ -290,6 +292,7 @@ inline void prepack_input_nxwc4_dw(const float* din, } int size_c = width * height; for (int h = hs; h < he; ++h) { + dout = out_data + (h - hs) * 4 * size_w; auto ptr_c0 = din + cs * size_c + h * width; auto ptr_c1 = ptr_c0 + size_c; auto ptr_c2 = ptr_c1 + size_c; @@ -351,10 +354,10 @@ inline void prepack_input_nxwc4_dw(const float* din, } transpose_4x4(vc0, vc1, vc2, vc3, dout); dout += 16; - ptr_c0 += left_sl; - ptr_c1 += left_sl; - ptr_c2 += left_sl; - ptr_c3 += left_sl; + ptr_c0 += left_valid_sl; + ptr_c1 += left_valid_sl; + ptr_c2 += left_valid_sl; + ptr_c3 += left_valid_sl; } /// valid for (int i = 0; i < cnt_valid; ++i) { @@ -722,7 +725,57 @@ inline bool write_to_output_c1_fp32(const float* din, } return true; } - +#ifdef __aarch64__ +#define NCHWC2_TRANS_FP32_COMPUTE \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1*/ \ + "movi v20.4s, #0 \n" /* for relu */ \ + "1: \n" /* main loop*/ \ + "trn1 v2.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \ + "trn2 v3.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1*/ \ + "trn1 v4.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ \ + "trn2 v5.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ + +#define NCHWC2_TRANS_FP32_RELU \ + "fmax v4.4s, v4.4s, v20.4s \n" /*relu*/ \ + "fmax v5.4s, v5.4s, v20.4s \n" /*relu*/ + +#define NCHWC2_TRANS_FP32_STORE \ + "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ \ + \ + "str q4, [%[doutc0r0]], #16 \n" /* store c0r0*/ \ + "str q5, [%[doutc1r0]], #16 \n" /* store c1r0*/ \ + \ + "bne 1b \n" /* jump to main loop*/ +#else +#define NCHWC2_TRANS_FP32_COMPUTE \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, " \ + "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" \ + "vmov.u32 q15, #0 @ dump zero\n" \ + "1: @ main loop\n" \ + "vtrn.32 d0, d1 @ trans data:c0r0, c0r1, " \ + "c1r0, c1r1 \n" \ + "vtrn.32 d2, d3 @ trans data:c0r2, c0r3, " \ + "c1r2, c1r3 \n" \ + \ + "vswp d1, d2 @ swap data\n" + +#define NCHWC2_TRANS_FP32_RELU \ + "vmax.f32 q0, q0, q15 @ relu\n" \ + "vmax.f32 q1, q1, q15 @ relu\n" + +#define NCHWC2_TRANS_FP32_STORE \ + "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " \ + "pointer\n" \ + \ + "subs %[cnt], %[cnt], #1 @ loop count - 1\n" \ + \ + "vld1.32 {d0-d3}, [%[ptr_din]]!
@ load data \n" \ + \ + "bne 1b @ jump to main loop\n" +#endif /*wirte result in outputs * input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] */ @@ -777,127 +830,41 @@ inline bool write_to_output_c2_fp32(const float* din, int cnt_loop = cnt; if (flag_relu) { #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v2.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v3.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "trn1 v4.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - "trn2 v5.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - - "fmax v2.4s, v4.4s, v20.4s \n" /*relu*/ - "fmax v3.4s, v5.4s, v20.4s \n" /*relu*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - - "str q2, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q3, [%[doutc1r0]], #16 \n" /* store c2r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v20"); + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_RELU + NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v20"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, " - "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 d0, d1 @ trans data:c0r0, c0r1, " - "c1r0, c1r1 \n" - "vtrn.32 d2, d3 @ trans data:c0r2, c0r3, " - "c1r2, c1r3 \n" - - "vswp d1, d2 @ swap data\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! 
@ load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_RELU + NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q15"); #endif } else { #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "1: \n" /* main loop*/ - "trn1 v2.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v3.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "trn1 v4.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - "trn2 v5.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - - "str q4, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q5, [%[doutc1r0]], #16 \n" /* store c2r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5"); + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", "v1", "v2", "v3", "v4", "v5"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, " - "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" - "1: @ main loop\n" - "vtrn.32 d0, d1 @ trans data:c0r0, c0r1, " - "c1r0, c1r1 \n" - "vtrn.32 d2, d3 @ trans data:c0r2, c0r3, " - "c1r2, c1r3 \n" - - "vswp d1, d2 @ swap data\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! 
@ load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q15"); #endif } } @@ -922,6 +889,70 @@ inline bool write_to_output_c2_fp32(const float* din, return true; } +#ifdef __aarch64__ +#define NCHWC4_TRANS_FP32_COMPUTE \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + "movi v20.4s, #0 \n" /* for relu */ \ + "1: \n" /* main loop*/ \ + "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \ + "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ \ + "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ \ + "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ \ + "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ \ + "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ \ + "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ + +#define NCHWC4_TRANS_FP32_RELU \ + "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ \ + "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ \ + "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ \ + "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ + +#define NCHWC4_TRANS_FP32_STORE \ + "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ \ + "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ \ + "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ \ + "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ \ + "bne 1b \n" /* jump to main loop*/ +#else +#define NCHWC4_TRANS_FP32_COMPUTE \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" \ + "vmov.u32 q15, #0 @ dump zero\n" \ + "1: @ main loop\n" \ + "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " \ + "\n" \ + "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " \ + "\n" \ + \ + "vswp d1, d4 @ swap data\n" \ + "vswp d3, d6 @ swap data\n" + +#define NCHWC4_TRANS_FP32_RELU \ + "vmax.f32 q0, q0, q15 @ relu\n" \ + "vmax.f32 q1, q1, q15 @ relu\n" \ + "vmax.f32 q2, q2, q15 @ relu\n" \ + "vmax.f32 q3, q3, q15 @ relu\n" + +#define NCHWC4_TRANS_FP32_STORE \ + "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" \ + "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" \ + "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" \ + "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], %[cnt], #1 @ loop count - 1\n" \ + \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" \ + \ + "bne 1b @ jump to main loop\n" +#endif /*wirte result in outputs * input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] */ @@ -958,7 +989,9 @@ inline bool write_to_output_c4_fp32(const float* din, int size_h = (he > height ? height : he) - hs; // size_h == hei_n - int cnt = (width - ws) / w4; + int valid_we = we > width ? 
width : we; + int cnt = (valid_we - ws) / w4; + int remain = valid_we - ws - cnt * w4; for (int i = 0; i < size_h; i++) { int size_w = i * width; @@ -983,185 +1016,88 @@ inline bool write_to_output_c4_fp32(const float* din, int cnt_loop = cnt; if (flag_relu) { #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ - "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ - "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ - "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v16", - "v17", - "v18", - "v19", - "v20"); + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_RELU + NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v16", + "v17", + "v18", + "v19", + "v20"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " - "\n" - "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " - "\n" - - "vswp d1, d4 @ swap data\n" - "vswp d3, d6 @ swap data\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - "vmax.f32 q2, q2, q15 @ relu\n" - "vmax.f32 q3, q3, q15 @ relu\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_RELU + NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q15"); #endif } else { #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v8", - "v9", - "v10", - "v11", - "v16", - "v17", - "v18", - "v19"); + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", + "v1", + "v2", + "v3", + "v8", + "v9", + "v10", + "v11", + "v16", + "v17", + "v18", + "v19"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "1: @ main loop\n" - "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " - "\n" - "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " - "\n" - - "vswp d1, d4 @ swap data\n" - "vswp d3, d6 @ swap data\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3"); + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3"); #endif } } - if (we > width) { + if (remain > 0) { int offset = i * w_round * c4 + c4 * w4 * cnt; din_hei_ptr = ptr_din + offset; - int j = we - w4; + int j = 0; if (flag_relu) { - for (; j < width; ++j) { + for (; j < remain; ++j) { *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0.f); @@ -1169,7 +1105,7 @@ inline bool write_to_output_c4_fp32(const float* din, din_hei_ptr += w4; } } else { - for (; j < width; ++j) { + for (; j < remain; ++j) { *(doutc0_ptr++) = din_hei_ptr[0]; *(doutc1_ptr++) = din_hei_ptr[1]; *(doutc2_ptr++) = din_hei_ptr[2]; @@ -1182,6 +1118,120 @@ inline bool write_to_output_c4_fp32(const float* din, return true; } +#ifdef __aarch64__ +#define NCHWC8_TRANS_FP32_COMPUTE \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + "movi v20.4s, #0 \n" /* for relu */ \ + "1: \n" /* main loop*/ \ + "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ \ + "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ \ + "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ \ + "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + \ + "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ \ + "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ \ + "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ \ + "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ \ + "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + \ + "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ \ + "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ \ + "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ \ + "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ \ + "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + \ + "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ \ + "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ \ + "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ \ + "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ \ + "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ + +#define NCHWC8_TRANS_FP32_RELU \ + "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ \ + "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ \ + "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ \ + "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ \ + \ + "fmax v8.4s, v8.4s, v20.4s \n" /*relu*/ \ + "fmax v9.4s, v9.4s, v20.4s \n" /*relu*/ \ + "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ \ + "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ + +#define NCHWC8_TRANS_FP32_STORE \ + "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ \ + "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ \ + "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ \ + "str q19, [%[doutc3r0]], #16 
\n" /* store c3r0*/ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ \ + "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ \ + "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ \ + "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ \ + "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ \ + \ + "bne 1b \n" /* jump to main loop*/ +#else +#define NCHWC8_TRANS_FP32_COMPUTE \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" \ + "vmov.u32 q15, #0 @ dump zero\n" \ + "1: @ main loop\n" \ + "vtrn.32 q0, q2 @ trans q0, q2 \n" \ + "vtrn.32 q4, q6 @ trans q4, q6 \n" \ + "vswp.32 d1, d8 @ swap d1, d8 \n" \ + "vswp.32 d5, d12 @ swap d5, d12\n" \ + \ + "vtrn.32 q1, q3 @ trans q1, q3 \n" \ + "vtrn.32 q5, q7 @ trans q5, q7 \n" \ + "vswp.32 d3, d10 @ swap d3, d10\n" \ + "vswp.32 d7, d14 @ swap d7, d14\n" + +#define NCHWC8_TRANS_FP32_RELU \ + "vmax.f32 q0, q0, q15 @ relu\n" \ + "vmax.f32 q1, q1, q15 @ relu\n" \ + "vmax.f32 q2, q2, q15 @ relu\n" \ + "vmax.f32 q3, q3, q15 @ relu\n" \ + \ + "vmax.f32 q4, q4, q15 @ relu\n" \ + "vmax.f32 q5, q5, q15 @ relu\n" \ + "vmax.f32 q6, q6, q15 @ relu\n" \ + "vmax.f32 q7, q7, q15 @ relu\n" + +#define NCHWC8_TRANS_FP32_STORE \ + "subs %[cnt], %[cnt], #1 @ loop count - 1\n" \ + "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add " \ + "pointer\n" \ + \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" \ + \ + "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " \ + "pointer\n" \ + \ + "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" \ + \ + "bne 1b @ jump to main loop\n" + +#endif /*wirte result in outputs * input din: [n, c / 8, h, w * 8], output dout: [n, c, h, w] */ @@ -1261,158 +1311,54 @@ inline bool write_to_output_c8_fp32(const float* din, if (cnt > 0) { int cnt_loop = cnt; #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ - "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ - "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ - "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ - - "fmax v8.4s, v8.4s, v20.4s \n" /*relu*/ - "fmax v9.4s, v9.4s, v20.4s \n" /*relu*/ - "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ - "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); + asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_RELU + NCHWC8_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [doutc4r0] "+r"(doutc4_ptr), + [doutc5r0] "+r"(doutc5_ptr), + [doutc6r0] "+r"(doutc6_ptr), + [doutc7r0] "+r"(doutc7_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] 
"+r"(din_hei_ptr) + : + : "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - "vmax.f32 q2, q2, q15 @ relu\n" - "vmax.f32 q3, q3, q15 @ relu\n" - - "vmax.f32 q4, q4, q15 @ relu\n" - "vmax.f32 q5, q5, q15 @ relu\n" - "vmax.f32 q6, q6, q15 @ relu\n" - "vmax.f32 q7, q7, q15 @ relu\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4", "q15"); + asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_RELU + NCHWC8_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [doutc4r0] "+r"(doutc4_ptr), + [doutc5r0] "+r"(doutc5_ptr), + [doutc6r0] "+r"(doutc6_ptr), + [doutc7r0] "+r"(doutc7_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q4", "q15"); #endif } if (we > width) { @@ -1468,138 +1414,53 @@ inline bool write_to_output_c8_fp32(const float* din, if (cnt > 0) { int cnt_loop = cnt; #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); + asm volatile(NCHWC8_TRANS_FP32_COMPUTE 
NCHWC8_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [doutc4r0] "+r"(doutc4_ptr), + [doutc5r0] "+r"(doutc5_ptr), + [doutc6r0] "+r"(doutc6_ptr), + [doutc7r0] "+r"(doutc7_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "1: @ main loop\n" - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4"); + asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [doutc4r0] "+r"(doutc4_ptr), + [doutc5r0] "+r"(doutc5_ptr), + [doutc6r0] "+r"(doutc6_ptr), + [doutc7r0] "+r"(doutc7_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q4"); #endif } if (we > width) { diff --git a/lite/backends/arm/math/conv_depthwise.h b/lite/backends/arm/math/conv_depthwise.h index 1a23982cd5..b6c3478880 100644 --- a/lite/backends/arm/math/conv_depthwise.h +++ b/lite/backends/arm/math/conv_depthwise.h @@ -85,38 +85,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, bool flag_relu, ARMContext* ctx); -void conv_depthwise_3x3p0_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_3x3p1_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - template void conv_depthwise_3x3s1_int8(Dtype* dout, const int8_t* din, diff --git a/lite/backends/arm/math/conv_impl.cc b/lite/backends/arm/math/conv_impl.cc index 010563bf93..dc68e65f42 100644 --- a/lite/backends/arm/math/conv_impl.cc +++ b/lite/backends/arm/math/conv_impl.cc @@ -107,29 +107,35 @@ void im2col(const Dtype* data_im, int width, int kernel_h, int kernel_w, - int pad_h, - int pad_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, int stride_h, int stride_w, int dilation_h, int dilation_w, Dtype* data_col) { const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + (height + pad_top + pad_bottom - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + (width + pad_left + pad_right - (dilation_w * (kernel_w - 1) + 1)) / + stride_w + + 1; const int channel_size = height * width; for (int channel = channels; channel--; data_im += channel_size) { for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; + int input_row = -pad_top + kernel_row * dilation_h; for (int output_rows = output_h; output_rows; output_rows--) { if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { for (int output_cols = output_w; output_cols; output_cols--) { *(data_col++) = 0; } } else { - int input_col = -pad_w + kernel_col * dilation_w; + int input_col = -pad_left + kernel_col * dilation_w; for (int output_col = output_w; output_col; output_col--) { if (is_a_ge_zero_and_a_lt_b(input_col, width)) { *(data_col++) = data_im[input_row * width + input_col]; @@ -202,7 +208,8 @@ void conv1x1s1_gemm(const float* i_data, k, flag_bias, bias_group, - flag_relu); + flag_relu, + ctx); } 
else { sgemm_prepack(false, m, @@ -361,6 +368,8 @@ void conv_im2col_gemm(const float* i_data, float* tmp_work_space = ctx->workspace_data() + ctx->llc_size() / sizeof(float); + auto paddings = *param.paddings; + auto dilations = *param.dilations; //! use gemv when the output channel size = 1 for (int b = 0; b < num; ++b) { // dC @@ -378,12 +387,14 @@ void conv_im2col_gemm(const float* i_data, win, kernel_h, kernel_w, - param.paddings[0], - param.paddings[1], + paddings[0], + paddings[1], + paddings[2], + paddings[3], param.strides[0], param.strides[1], - param.dilations[0], - param.dilations[1], + dilations[0], + dilations[1], dB); if (n == 1) { @@ -395,7 +406,8 @@ void conv_im2col_gemm(const float* i_data, k, flag_bias, bias_group, - flag_relu); + flag_relu, + ctx); } else { int ldb = n; sgemm_prepack(false, @@ -434,14 +446,16 @@ void conv_im2col_gemm_int8(const int8_t* i_data, const float* scale) { int group = param.groups; auto filter_dims = param.filter->dims(); + auto paddings = *param.paddings; + auto dilations = *param.dilations; int kernel_h = filter_dims[2]; int kernel_w = filter_dims[3]; int stride_h = param.strides[0]; int stride_w = param.strides[1]; - int dila_h = param.dilations[0]; - int dila_w = param.dilations[1]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int dila_h = dilations[0]; + int dila_w = dilations[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; const int m = oc / group; const int n = oh * ow; const int k = ic * kernel_h * kernel_w / group; @@ -482,7 +496,9 @@ void conv_im2col_gemm_int8(const int8_t* i_data, kernel_h, kernel_w, pad_h, + paddings[1], pad_w, + paddings[3], stride_h, stride_w, dila_h, @@ -562,90 +578,83 @@ void conv_depthwise_3x3_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; - if (pad_w != pad_h) { - LOG(FATAL) << "fp32 depthwise conv3x3 pad_w: " << pad_w - << ", pad_h: " << pad_h << " must be equal"; - return; - } + auto paddings = *param.paddings; + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; int stride = param.strides[1]; int pad = pad_w; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; - if (stride == 1 && pad < 2) { // support pad = [0, 1] - conv_depthwise_3x3s1_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - pad, - flag_bias, - flag_relu, - ctx); - } else if (stride == 2 && pad < 2) { // support pad = [0, 1] - conv_depthwise_3x3s2_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - pad, - flag_bias, - flag_relu, - ctx); - } else { - LOG(FATAL) << "fp32 depthwise conv3x3 stride: " << stride - << " or pad(<2): " << pad << " unsupported"; - } -#if 0 - if (pad == 1) { - conv_depthwise_3x3p1_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - stride, - flag_bias, - flag_relu, - ctx); - } else if (pad == 0 && h_in > 2) { - conv_depthwise_3x3p0_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - stride, - flag_bias, - flag_relu, - ctx); + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (stride == 1) { + if 
(pads_equal && (pad_h == pad_w) && (pad < 2)) { // support pad = [0, 1] + conv_depthwise_3x3s1_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + pad, + flag_bias, + flag_relu, + ctx); + } else { + conv_3x3s1_depthwise_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + param, + ctx); + } + + } else if (stride == 2) { + if (pad_h == pad_w && (pad < 2)) { // support pad = [0, 1] + conv_depthwise_3x3s2_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + pad, + flag_bias, + flag_relu, + ctx); + } else { + conv_3x3s2_depthwise_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + param, + ctx); + } } else { - LOG(FATAL) << "unsupport this type 3x3 dw conv"; + LOG(FATAL) << "fp32 depthwise conv3x3 stride: " << stride << " unsupported"; } -#endif } void conv_depthwise_5x5_fp32(const void* din, @@ -662,7 +671,8 @@ void conv_depthwise_5x5_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad = param.paddings[1]; + auto paddings = *param.paddings; + int pad = paddings[0]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -718,8 +728,9 @@ void conv_depthwise_3x3_int8_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -776,8 +787,9 @@ void conv_depthwise_3x3_int8_int8(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -834,8 +846,9 @@ void conv_depthwise_5x5_int8_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -875,8 +888,9 @@ void conv_depthwise_5x5_int8_int8(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; diff --git a/lite/backends/arm/math/conv_impl.h b/lite/backends/arm/math/conv_impl.h index c5baa31e14..f4d00039aa 100644 --- a/lite/backends/arm/math/conv_impl.h +++ b/lite/backends/arm/math/conv_impl.h @@ -314,7 +314,23 @@ void fill_bias_int8(int* tensor, const int* bias, int channel, int channel_size); +// new winograd +void weight_trans_c4( + float* dest, const float* src, int ic, int oc, 
void* workspace); +void conv_compute_6x6_3x3(const float* input, + float* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const float* weight, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx); } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/conv_winograd_3x3.cc b/lite/backends/arm/math/conv_winograd_3x3.cc index 87b08f6310..894b946a32 100644 --- a/lite/backends/arm/math/conv_winograd_3x3.cc +++ b/lite/backends/arm/math/conv_winograd_3x3.cc @@ -37,9 +37,9 @@ void conv_winograd3x3(const float* din, const operators::ConvParam& param, ARMContext* ctx) { int threads = ctx->threads(); - - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + const int pad_h = paddings[0]; + const int pad_w = paddings[1]; int size_in_channel = win * hin; int size_out_channel = wout * hout; bool flag_relu = param.fuse_relu; diff --git a/lite/backends/arm/math/funcs.h b/lite/backends/arm/math/funcs.h index d8ef6ff47d..8977b5712c 100644 --- a/lite/backends/arm/math/funcs.h +++ b/lite/backends/arm/math/funcs.h @@ -39,10 +39,12 @@ #include "lite/backends/arm/math/im2sequence.h" #include "lite/backends/arm/math/increment.h" #include "lite/backends/arm/math/interpolate.h" +#include "lite/backends/arm/math/layout.h" #include "lite/backends/arm/math/lrn.h" #include "lite/backends/arm/math/negative.h" #include "lite/backends/arm/math/norm.h" #include "lite/backends/arm/math/packed_sgemm.h" +#include "lite/backends/arm/math/packed_sgemm_c4.h" #include "lite/backends/arm/math/pad2d.h" #include "lite/backends/arm/math/pooling.h" #include "lite/backends/arm/math/power.h" diff --git a/lite/backends/arm/math/interpolate.cc b/lite/backends/arm/math/interpolate.cc index f89410ad11..e9e18043df 100644 --- a/lite/backends/arm/math/interpolate.cc +++ b/lite/backends/arm/math/interpolate.cc @@ -22,6 +22,28 @@ namespace lite { namespace arm { namespace math { +inline std::vector get_new_shape( + std::vector list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + vec_new_shape.push_back(static_cast(*tensor->data())); + } + + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + vec_new_data = + std::vector(new_data, new_data + new_data_tensor->dims().production()); + return vec_new_data; +} + // The following function bilinear_interp is partially base on // https://github.com/Tencent/ncnn/blob/master/src/layer/arm/interp_arm.cpp // Tencent is pleased to support the open source community by making ncnn @@ -472,33 +494,52 @@ void nearest_interp(const float* src, void interpolate(lite::Tensor* X, lite::Tensor* OutSize, + std::vector SizeTensor, + lite::Tensor* Scale, lite::Tensor* Out, int out_height, int out_width, - float height_scale, - float width_scale, + float scale, bool with_align, std::string interpolate_type) { + int in_h = X->dims()[2]; + int in_w = X->dims()[3]; + if (SizeTensor.size() > 0) { + auto new_size = get_new_shape(SizeTensor); + out_height = new_size[0]; + out_width = new_size[1]; + } else { + auto scale_tensor = Scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) 
{ + out_height = static_cast(in_h * scale); + out_width = static_cast(in_w * scale); + } + auto out_size = OutSize; + if (out_size != nullptr) { + auto out_size_data = get_new_data_from_tensor(out_size); + out_height = static_cast(out_size_data[0]); + out_width = static_cast(out_size_data[1]); + } + } + float height_scale = scale; + float width_scale = scale; if (out_width > 0 && out_height > 0) { height_scale = static_cast(out_height / X->dims()[2]); width_scale = static_cast(out_width / X->dims()[3]); } - if (OutSize != nullptr) { - auto OutSize_data = OutSize->data(); - int h_out = OutSize_data[0]; // HW - int w_out = OutSize_data[1]; // HW - int num_cout = Out->dims()[0]; - int c_cout = Out->dims()[1]; - Out->Resize({num_cout, c_cout, h_out, w_out}); - } + int num_cout = X->dims()[0]; + int c_cout = X->dims()[1]; + Out->Resize({num_cout, c_cout, out_height, out_width}); float* dout = Out->mutable_data(); const float* din = X->data(); int out_num = Out->dims()[0]; int out_c = Out->dims()[1]; int count = out_num * out_c; - int in_h = X->dims()[2]; - int in_w = X->dims()[3]; int out_h = Out->dims()[2]; int out_w = Out->dims()[3]; int spatial_in = in_h * in_w; diff --git a/lite/backends/arm/math/interpolate.h b/lite/backends/arm/math/interpolate.h index be250f6a5e..e9c41c5bc8 100644 --- a/lite/backends/arm/math/interpolate.h +++ b/lite/backends/arm/math/interpolate.h @@ -44,11 +44,12 @@ void nearest_interp(const float* src, void interpolate(lite::Tensor* X, lite::Tensor* OutSize, + std::vector SizeTensor, + lite::Tensor* Scale, lite::Tensor* Out, int out_height, int out_width, - float height_scale, - float width_scale, + float scale, bool with_align, std::string interpolate_type); diff --git a/lite/backends/arm/math/layout.cc b/lite/backends/arm/math/layout.cc new file mode 100644 index 0000000000..fd9126ab48 --- /dev/null +++ b/lite/backends/arm/math/layout.cc @@ -0,0 +1,668 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
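For orientation: the TRANS_C4/TRANS_C8 kernels defined in this new file are tiled, register-transpose implementations of a plain NCHW-to-NHWC copy, and the scalar remainder loops later in the file perform the same copy element by element. A minimal scalar sketch of the transform they are expected to match is given below; the helper name nchw_to_nhwc_reference is illustrative only and not part of this patch.

template <typename T>
void nchw_to_nhwc_reference(int N, int C, int size, const T* X, T* Y) {
  // size = H * W; copies X in NCHW layout into Y in NHWC layout,
  // i.e. Y[n][s][c] = X[n][c][s] for every spatial index s.
  for (int n = 0; n < N; ++n) {
    const T* x = X + n * C * size;
    T* y = Y + n * C * size;
    for (int c = 0; c < C; ++c) {
      for (int s = 0; s < size; ++s) {
        y[s * C + c] = x[c * size + s];
      }
    }
  }
}

The vector paths below process four fp32 channels (TRANS_C4) or eight int8 channels (TRANS_C8) per step, transposing a small channel-by-position tile in registers with trn1/trn2 on armv8 and vtrn/vswp on armv7, and fall back to the per-element copy for the channel and spatial tails.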
+ +#include "lite/backends/arm/math/layout.h" +#include +#include +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +#ifdef __aarch64__ +#define TRANS_C4 \ + "ld1 {v0.4s}, [%[din0_ptr]] \n" \ + "ld1 {v1.4s}, [%[din1_ptr]] \n" \ + "ld1 {v2.4s}, [%[din2_ptr]] \n" \ + "ld1 {v3.4s}, [%[din3_ptr]] \n" \ + \ + "1: \n" \ + "trn1 v4.4s, v0.4s, v1.4s \n" /*00 10 02 12 */ \ + "trn1 v5.4s, v2.4s, v3.4s \n" /*20 30 22 32 */ \ + "trn2 v6.4s, v0.4s, v1.4s \n" /*01 11 03 13 */ \ + "trn2 v7.4s, v2.4s, v3.4s \n" /*21 31 23 33 */ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride] \n" /* din+=c*size*/ \ + \ + "trn1 v8.2d, v4.2d, v5.2d \n" /*00 10 20 30 */ \ + "trn1 v9.2d, v6.2d, v7.2d \n" /*01 11 21 31 */ \ + "trn2 v10.2d, v4.2d, v5.2d \n" /*02 12 22 32 */ \ + "trn2 v11.2d, v6.2d, v7.2d \n" /*03 13 23 33 */ \ + \ + "ld1 {v0.4s}, [%[din0_ptr]] \n" \ + "ld1 {v1.4s}, [%[din1_ptr]] \n" \ + "ld1 {v2.4s}, [%[din2_ptr]] \n" \ + "ld1 {v3.4s}, [%[din3_ptr]] \n" \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + "str q8, [%[out0_ptr]], #16 \n" \ + "str q9, [%[out1_ptr]], #16 \n" \ + "str q10, [%[out2_ptr]], #16 \n" \ + "str q11, [%[out3_ptr]], #16 \n" \ + "bne 1b \n" + +#define TRANS_C8 \ + "1: \n" \ + "ld1 {v0.8b}, [%[din0_ptr]] \n" \ + "ld1 {v1.8b}, [%[din1_ptr]] \n" \ + "ld1 {v2.8b}, [%[din2_ptr]] \n" \ + "ld1 {v3.8b}, [%[din3_ptr]] \n" \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "trn1 v8.8b, v0.8b, v1.8b \n" /*00 10 02 12 04 14 06 16 */ \ + "trn1 v9.8b, v2.8b, v3.8b \n" /*20 30 22 32 */ \ + "trn2 v12.8b, v0.8b, v1.8b \n" /*01 11 03 13 05 15 07 17 */ \ + "trn2 v13.8b, v2.8b, v3.8b \n" /*21 31 23 33 */ \ + \ + "ld1 {v4.8b}, [%[din0_ptr]] \n" \ + "ld1 {v5.8b}, [%[din1_ptr]] \n" \ + "ld1 {v6.8b}, [%[din2_ptr]] \n" \ + "ld1 {v7.8b}, [%[din3_ptr]] \n" \ + \ + "trn1 v10.8b, v4.8b, v5.8b \n" /*40 50 42 52 */ \ + "trn1 v11.8b, v6.8b, v7.8b \n" /*60 70 62 72 */ \ + "trn2 v14.8b, v4.8b, v5.8b \n" /*41 51 43 53 */ \ + "trn2 v15.8b, v6.8b, v7.8b \n" /*61 71 63 73 */ \ + \ + "trn1 v0.4h, v8.4h, v9.4h \n" /*00 10 20 30 04 14 24 34*/ \ + "trn1 v2.4h, v12.4h, v13.4h \n" /*01 11 21 31 05 15 25 35*/ \ + "trn1 v1.4h, v10.4h, v11.4h \n" /*40 50 60 70 44 54 64 74*/ \ + "trn1 v3.4h, v14.4h, v15.4h \n" /*41 51 61 71 45 55 65 75*/ \ + \ + "trn2 v4.4h, v8.4h, v9.4h \n" /*02 10 20 30 06 14 24 34*/ \ + "trn2 v6.4h, v12.4h, v13.4h \n" /*03 11 21 31 07 15 25 35*/ \ + "trn2 v5.4h, v10.4h, v11.4h \n" /*42 50 60 70 46 54 64 74*/ \ + "trn2 v7.4h, v14.4h, v15.4h \n" /*43 51 61 71 47 55 65 75*/ \ + \ + "trn1 v8.2s, v0.2s, v1.2s \n" /*00 10 20 30 40 50 60 70*/ \ + "trn1 v9.2s, v2.2s, v3.2s \n" /*01 11 21 31 41 51 61 71*/ \ + "trn1 v10.2s, v4.2s, v5.2s \n" /*02 12 22 32 42 50 60 70*/ \ + "trn1 v11.2s, v6.2s, v7.2s \n" /*03 13 23 33 41 51 61 71*/ \ + \ + "trn2 v12.2s, v0.2s, v1.2s \n" /*04 14 24 34 44 54 64 74*/ \ + "trn2 v13.2s, v2.2s, v3.2s \n" /*05 15 25 35 45 55 65 75*/ \ + "trn2 v14.2s, v4.2s, v5.2s \n" /*06 16 22 32 42 50 60 70*/ \ + "trn2 v15.2s, v6.2s, v7.2s \n" /*07 17 23 33 41 51 61 71*/ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* 
din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + "st1 {v8.8b}, [%[out0_ptr]], #8 \n" \ + "st1 {v9.8b}, [%[out1_ptr]], #8 \n" \ + "st1 {v10.8b}, [%[out2_ptr]], #8 \n" \ + "st1 {v11.8b}, [%[out3_ptr]], #8 \n" \ + \ + "st1 {v11.8b}, [%[out4_ptr]], #8 \n" \ + "st1 {v12.8b}, [%[out5_ptr]], #8 \n" \ + "st1 {v13.8b}, [%[out6_ptr]], #8 \n" \ + "st1 {v14.8b}, [%[out7_ptr]], #8 \n" \ + "bne 1b \n" + +#else +#define TRANS_C4 \ + "1: \n" \ + "vld1.32 {d0-d1}, [%[din0_ptr]] \n" \ + "vld1.32 {d2-d3}, [%[din1_ptr]] \n" \ + "vld1.32 {d4-d5}, [%[din2_ptr]] \n" \ + "vld1.32 {d6-d7}, [%[din3_ptr]] \n" \ + \ + "vtrn.32 q0, q1 \n" /*00 10 02 12 01 11 03 13*/ \ + "vtrn.32 q2, q3 \n" /*20 30 22 32 21 31 23 33 */ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride] \n" /* din+=c*size*/ \ + "vswp d1, d4 \n" \ + "vswp d3, d6 \n" \ + \ + "subs %[cnt], %[cnt], #1 \n" \ + "vst1.32 {d0-d1}, [%[out0_ptr]]! \n" \ + "vst1.32 {d2-d3}, [%[out1_ptr]]! \n" \ + "vst1.32 {d4-d5}, [%[out2_ptr]]! \n" \ + "vst1.32 {d6-d7}, [%[out3_ptr]]! \n" \ + "bne 1b \n" + +#define TRANS_C8 \ + "1: \n" \ + "vld1.8 d0, [%[din0_ptr]] \n" \ + "vld1.8 d1, [%[din1_ptr]] \n" \ + "vld1.8 d2, [%[din2_ptr]] \n" \ + "vld1.8 d3, [%[din3_ptr]] \n" \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "vtrn.8 d0, d1 \n" /*00 10 02 12 04 14 06 16*/ \ + "vtrn.8 d2, d3 \n" /*20 30 22 32 24 34 26 36 */ \ + \ + "vld1.8 d4, [%[din0_ptr]] \n" \ + "vld1.8 d5, [%[din1_ptr]] \n" \ + "vld1.8 d6, [%[din2_ptr]] \n" \ + "vld1.8 d7, [%[din3_ptr]] \n" \ + \ + "vtrn.16 d0, d2 \n" /*00 10 20 30 04 14 24 34*/ \ + "vtrn.16 d1, d3 \n" /* 01 11 21 31 05 15 25 35 */ \ + "vtrn.8 d4, d5 \n" /*40 50 02 12 04 14 06 16*/ \ + "vtrn.8 d6, d7 \n" /*60 70 22 32 24 34 26 36 */ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "vtrn.16 d4, d6 \n" /*40 50 60 70 04 14 24 34*/ \ + "vtrn.16 d5, d7 \n" /* 41 51 61 71 05 15 25 35 */ \ + \ + "vtrn.32 d0, d4 \n" /*00 10 20 30 40 50 60 70*/ \ + "vtrn.32 d1, d5 \n" /* 01 11 21 31 41 51 61 71 */ \ + "vtrn.32 d2, d6 \n" /*02 12 22 32 42 52 62 72*/ \ + "vtrn.32 d3, d7 \n" /* 03 11 21 33 43 53 63 73 */ \ + \ + "subs %[cnt], %[cnt], #1 \n" \ + "vst1.8 {d0}, [%[out0_ptr]]! \n" \ + "vst1.8 {d1}, [%[out1_ptr]]! \n" \ + "vst1.8 {d2}, [%[out2_ptr]]! \n" \ + "vst1.8 {d3}, [%[out3_ptr]]! \n" \ + "vst1.8 {d4}, [%[out4_ptr]]! \n" \ + "vst1.8 {d5}, [%[out5_ptr]]! \n" \ + "vst1.8 {d6}, [%[out6_ptr]]! \n" \ + "vst1.8 {d7}, [%[out7_ptr]]! 
\n" \ + "bne 1b \n" + +#endif +template <> +void NCHW2NHWC(int N, int C, int size, const float* X, float* Y) { + int cnt = C >> 2; + int remain = C % 4; + int sum = C * size; + int stride = size << 4; // 4 * size + int stride_w = stride >> 2; + for (int n = 0; n < N; n++) { + const float* din = X + n * sum; + float* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < size - 3; s += 4) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + size; + const float* din2_ptr = din1_ptr + size; + const float* din3_ptr = din2_ptr + size; + float* out0_ptr = dout + s * C; + float* out1_ptr = out0_ptr + C; + float* out2_ptr = out1_ptr + C; + float* out3_ptr = out2_ptr + C; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] "+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12"); +#else + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] "+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + for (int i = 0; i < remain; i++) { + const float* ptr = din0_ptr; + *out0_ptr++ = *ptr++; + *out1_ptr++ = *ptr++; + *out2_ptr++ = *ptr++; + *out3_ptr++ = *ptr++; + din0_ptr += size; + } + } + // remain size + for (; s < size; s++) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + size; + const float* din2_ptr = din1_ptr + size; + const float* din3_ptr = din2_ptr + size; + float* out0_ptr = dout + s * C; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + din0_ptr += stride_w; + din1_ptr += stride_w; + din2_ptr += stride_w; + din3_ptr += stride_w; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += size; + } + } + } +} +template <> +void NCHW2NHWC(int N, int C, int size, const int8_t* X, int8_t* Y) { + int cnt = C >> 3; + int remain = C % 8; + int sum = C * size; + int stride = size << 3; // 8 * size + int stride_w = size << 4; // 4 * size * 4 + for (int n = 0; n < N; n++) { + const int8_t* din = X + n * sum; + int8_t* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < size - 7; s += 8) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + size; + const int8_t* din2_ptr = din1_ptr + size; + const int8_t* din3_ptr = din2_ptr + size; + int8_t* out0_ptr = dout + s * C; + int8_t* out1_ptr = out0_ptr + C; + int8_t* out2_ptr = out1_ptr + C; + int8_t* out3_ptr = out2_ptr + C; + int8_t* out4_ptr = out3_ptr + C; + int8_t* out5_ptr = out4_ptr + C; + int8_t* out6_ptr = out5_ptr + C; + int8_t* out7_ptr = out6_ptr + C; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] 
"+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] "+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + // const int8_t* din_ptr = din + 8 * cnt * size + s; // remain channel + for (int i = 0; i < remain; i++) { + const int8_t* ptr = din0_ptr; + *out0_ptr = *ptr++; + *out1_ptr = *ptr++; + *out2_ptr = *ptr++; + *out3_ptr = *ptr++; + din0_ptr += size; + *out4_ptr = *ptr++; + *out5_ptr = *ptr++; + *out6_ptr = *ptr++; + *out7_ptr = *ptr++; + } + } + // remain size + for (; s < size; s++) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + size; + const int8_t* din2_ptr = din1_ptr + size; + const int8_t* din3_ptr = din2_ptr + size; + const int8_t* din4_ptr = din3_ptr + size; + const int8_t* din5_ptr = din4_ptr + size; + const int8_t* din6_ptr = din5_ptr + size; + const int8_t* din7_ptr = din6_ptr + size; + int8_t* out0_ptr = dout + s * C; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + *out0_ptr++ = *din4_ptr; + *out0_ptr++ = *din5_ptr; + *out0_ptr++ = *din6_ptr; + *out0_ptr++ = *din7_ptr; + din0_ptr += stride; + din1_ptr += stride; + din2_ptr += stride; + din3_ptr += stride; + din4_ptr += stride; + din5_ptr += stride; + din6_ptr += stride; + din7_ptr += stride; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += size; + } + } + } +} +template <> +void NHWC2NCHW(int N, int C, int size, const float* X, float* Y) { + int cnt = size >> 2; + int remain = size % 4; + int sum = C * size; + int stride = C << 4; // 4 * size + int stride_w = C << 2; + for (int n = 0; n < N; n++) { + const float* din = X + n * sum; + float* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < C - 3; s += 4) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + C; + const float* din2_ptr = din1_ptr + C; + const float* din3_ptr = din2_ptr + C; + float* out0_ptr = dout + s * size; + float* out1_ptr = out0_ptr + size; + float* out2_ptr = out1_ptr + size; + float* out3_ptr = out2_ptr + size; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] "+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11"); +#else + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] 
"+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + for (int i = 0; i < remain; i++) { + const float* ptr = din0_ptr; + *out0_ptr++ = *ptr++; + *out1_ptr++ = *ptr++; + *out2_ptr++ = *ptr++; + *out3_ptr++ = *ptr++; + din0_ptr += C; + } + } + // remain size + for (; s < C; s++) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + C; + const float* din2_ptr = din1_ptr + C; + const float* din3_ptr = din2_ptr + C; + float* out0_ptr = dout + s * size; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + din0_ptr += stride_w; + din1_ptr += stride_w; + din2_ptr += stride_w; + din3_ptr += stride_w; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += C; + } + } + } +} +template <> +void NHWC2NCHW(int N, int C, int size, const int8_t* X, int8_t* Y) { + int cnt = size >> 3; + int remain = size % 8; + int sum = C * size; + int stride = C << 3; // 8 * size + int stride_w = C << 4; // 4 * size + for (int n = 0; n < N; n++) { + const int8_t* din = X + n * sum; + int8_t* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < C - 7; s += 8) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + C; + const int8_t* din2_ptr = din1_ptr + C; + const int8_t* din3_ptr = din2_ptr + C; + const int8_t* din4_ptr = din3_ptr + C; + const int8_t* din5_ptr = din4_ptr + C; + const int8_t* din6_ptr = din5_ptr + C; + const int8_t* din7_ptr = din6_ptr + C; + int8_t* out0_ptr = dout + s * size; + int8_t* out1_ptr = out0_ptr + size; + int8_t* out2_ptr = out1_ptr + size; + int8_t* out3_ptr = out2_ptr + size; + int8_t* out4_ptr = out3_ptr + size; + int8_t* out5_ptr = out4_ptr + size; + int8_t* out6_ptr = out5_ptr + size; + int8_t* out7_ptr = out6_ptr + size; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] "+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] "+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + for (int i = 0; i < remain; i++) { + const int8_t* ptr = din0_ptr; + *out0_ptr++ = *ptr++; + *out1_ptr++ = *ptr++; + *out2_ptr++ = *ptr++; + *out3_ptr++ = *ptr++; + *out4_ptr++ = *ptr++; + *out5_ptr++ = *ptr++; + *out6_ptr++ = *ptr++; + *out7_ptr++ = *ptr++; + din0_ptr += C; + } + } + // remain size + for (; s < C; s++) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + C; + const int8_t* din2_ptr = din1_ptr + C; + const int8_t* din3_ptr = din2_ptr + C; + const int8_t* din4_ptr = din3_ptr + C; + 
const int8_t* din5_ptr = din4_ptr + C; + const int8_t* din6_ptr = din5_ptr + C; + const int8_t* din7_ptr = din6_ptr + C; + int8_t* out0_ptr = dout + s * size; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + *out0_ptr++ = *din4_ptr; + *out0_ptr++ = *din5_ptr; + *out0_ptr++ = *din6_ptr; + *out0_ptr++ = *din7_ptr; + din0_ptr += stride; + din1_ptr += stride; + din2_ptr += stride; + din3_ptr += stride; + din4_ptr += stride; + din5_ptr += stride; + din6_ptr += stride; + din7_ptr += stride; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += C; + } + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/layout.h b/lite/backends/arm/math/layout.h new file mode 100644 index 0000000000..ed0e2f8b78 --- /dev/null +++ b/lite/backends/arm/math/layout.h @@ -0,0 +1,30 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +template +void NCHW2NHWC(int N, int C, int HxW, const T* X, T* Y); + +template +void NHWC2NCHW(int N, int C, int HxW, const T* X, T* Y); + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/packed_sgemm.cc b/lite/backends/arm/math/packed_sgemm.cc index 0d6eed9904..092e6937c4 100644 --- a/lite/backends/arm/math/packed_sgemm.cc +++ b/lite/backends/arm/math/packed_sgemm.cc @@ -53,6 +53,38 @@ void sgemm_prepacked_8x12(bool is_transB, bool has_bias, bool has_relu, ARMContext *ctx); + +void pack_m4(float *out, + const float *in, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax); + +void pack_trans_m4(float *out, + const float *in, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax); +void sgemm_prepacked_4x4(bool is_transB, + int M, + int N, + int K, + const float *A_packed, + const float *B, + int ldb, + float beta, + float *C, + int ldc, + const float *bias, + bool has_bias, + bool has_relu, + ARMContext *ctx); #else // for kA72 void prepackA_6x8(float *out, @@ -139,13 +171,21 @@ void prepackA(float *out, bool is_trans, ARMContext *ctx) { #ifdef __aarch64__ - if (is_trans) { - prepackA_trans_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + if (mmax <= 4) { + if (is_trans) { + pack_trans_m4(out, in, alpha, ldin, m0, mmax, k0, kmax); + } else { + pack_m4(out, in, alpha, ldin, m0, mmax, k0, kmax); + } } else { - prepackA_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + if (is_trans) { + prepackA_trans_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + } else { + prepackA_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + } } #else - if (ctx->arch() == kA73) { + if (ctx->arch() == kA73 || mmax <= 4) { if (is_trans) { prepackA_trans_4x8(out, in, alpha, ldin, m0, mmax, k0, kmax); } else { @@ -212,22 +252,39 @@ void 
sgemm_prepack(bool is_transB, bool has_relu, ARMContext *ctx) { #ifdef __aarch64__ - sgemm_prepacked_8x12(is_transB, - M, - N, - K, - A_packed, - B, - ldb, - beta, - C, - ldc, - bias, - has_bias, - has_relu, - ctx); + if (M <= 4) { + sgemm_prepacked_4x4(is_transB, + M, + N, + K, + A_packed, + B, + ldb, + beta, + C, + ldc, + bias, + has_bias, + has_relu, + ctx); + } else { + sgemm_prepacked_8x12(is_transB, + M, + N, + K, + A_packed, + B, + ldb, + beta, + C, + ldc, + bias, + has_bias, + has_relu, + ctx); + } #else // armv7 - if (ctx->arch() == kA73) { + if (ctx->arch() == kA73 || M <= 4) { sgemm_prepacked_4x8(is_transB, M, N, @@ -522,6 +579,147 @@ void prepackA_8x12(float *dout, } } } +void pack_m4(float *dout, + const float *inptr, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax) { + int x_len = kmax - k0; + int stride = x_len * 4; + float zerobuff[x_len]; // NOLINT + memset(zerobuff, 0, sizeof(float) * x_len); + bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; + +#pragma omp parallel for + for (int y = m0; y < mmax; y += 4) { + float *outptr = dout + stride * (y - m0) / 4; + + const float *inptr0 = inptr + y * ldin + k0; + const float *inptr1 = inptr0 + ldin; + const float *inptr2 = inptr1 + ldin; + const float *inptr3 = inptr2 + ldin; + + asm volatile( + "prfm pldl1keep, [%[ptr0]] \n" + "prfm pldl1keep, [%[ptr0], #64] \n" + "prfm pldl1keep, [%[ptr1]] \n" + "prfm pldl1keep, [%[ptr1], #64] \n" + "prfm pldl1keep, [%[ptr2]] \n" + "prfm pldl1keep, [%[ptr2], #64] \n" + "prfm pldl1keep, [%[ptr3]] \n" + "prfm pldl1keep, [%[ptr3], #64] \n" + : + : [ptr0] "r"(inptr0), + [ptr1] "r"(inptr1), + [ptr2] "r"(inptr2), + [ptr3] "r"(inptr3) + : "memory"); + + int x = x_len; + //! cope with row index exceed real size, set to zero buffer + if ((y + 3) >= mmax) { + switch ((y + 3) - mmax) { + case 2: + inptr1 = zerobuff; + case 1: + inptr2 = zerobuff; + case 0: + inptr3 = zerobuff; + default: + break; + } + } + for (; x > 7; x -= 8) { + asm volatile( + "cbz %w[has_alpha], 0f\n" /* check alpha == 1.f? 
*/ + "dup v31.4s, %w[alpha]\n" /* alpha to vector */ + "ldp q0, q1, [%[inptr0]], #32\n" /* load r0, a0~a7 */ + "ldp q2, q3, [%[inptr1]], #32\n" /* load r1, b0~b7 */ + "fmul v0.4s, v31.4s, v0.4s\n" /* mul alpha */ + "fmul v1.4s, v31.4s, v1.4s\n" /* mul alpha */ + "ldp q4, q5, [%[inptr2]], #32\n" /* load r2, c0~c7 */ + "fmul v2.4s, v31.4s, v2.4s\n" /* mul alpha */ + "fmul v3.4s, v31.4s, v3.4s\n" /* mul alpha */ + "ldp q6, q7, [%[inptr3]], #32\n" /* load r3, d0~d7 */ + "fmul v4.4s, v31.4s, v4.4s\n" /* mul alpha */ + "fmul v5.4s, v31.4s, v5.4s\n" /* mul alpha */ + "fmul v6.4s, v31.4s, v6.4s\n" /* mul alpha */ + "fmul v7.4s, v31.4s, v7.4s\n" /* mul alpha */ + "b 1f\n" /* to main process */ + "0: \n" /* alpha == 1 */ + "ldp q0, q1, [%[inptr0]], #32\n" /* load r0, a0~a7 */ + "ldp q2, q3, [%[inptr1]], #32\n" /* load r1, b0~b7 */ + "ldp q4, q5, [%[inptr2]], #32\n" /* load r2, c0~c7 */ + "ldp q6, q7, [%[inptr3]], #32\n" /* load r3, d0~d7 */ + "1: \n" /* main process */ + "trn1 v8.4s, v0.4s, v2.4s\n" /* a0b0a2b2*/ + "trn2 v9.4s, v0.4s, v2.4s\n" /* a1b1a3b3*/ + "trn1 v10.4s, v1.4s, v3.4s\n" /* a4b4a6b6*/ + "trn2 v11.4s, v1.4s, v3.4s\n" /* a5b5a7b7*/ + + "trn1 v12.4s, v4.4s, v6.4s\n" /* c0d0c2d2*/ + "trn2 v13.4s, v4.4s, v6.4s\n" /* c1d1c3d3*/ + "trn1 v14.4s, v5.4s, v7.4s\n" /* c4d4c6d6*/ + "trn2 v15.4s, v5.4s, v7.4s\n" /* c5d5c7d7*/ + + "trn1 v0.2d, v8.2d, v12.2d\n" /* a0b0c0d0 */ + "trn1 v1.2d, v9.2d, v13.2d\n" /* a1b1c1d1 */ + "trn1 v2.2d, v10.2d, v14.2d\n" /* a4b4c4d4 */ + "trn1 v3.2d, v11.2d, v15.2d\n" /* a5b5c5d5 */ + + "trn2 v4.2d, v8.2d, v12.2d\n" /* a2b2c2d2 */ + "trn2 v5.2d, v9.2d, v13.2d\n" /* a3b3c3d3 */ + "stp q0, q1, [%[outptr]], #32\n" /* save q0, q1, a0~h0*/ + "trn2 v6.2d, v10.2d, v14.2d\n" /* a6b6c6d6 */ + "trn2 v7.2d, v11.2d, v15.2d\n" /* a7b7c7d7 */ + "stp q4, q5, [%[outptr]], #32\n" /* save q2, q3, a1~h1*/ + "stp q2, q3, [%[outptr]], #32\n" /* save q4, q5, a2~h2*/ + "stp q6, q7, [%[outptr]], #32\n" /* save q6, q7, a3~h3*/ + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr] "+r"(outptr) + : [alpha] "r"(alpha), [has_alpha] "r"(has_alpha) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "cc", + "memory"); + } + + for (; x > 0; x--) { + if (has_alpha) { + *outptr++ = *inptr0++ * alpha; + *outptr++ = *inptr1++ * alpha; + *outptr++ = *inptr2++ * alpha; + *outptr++ = *inptr3++ * alpha; + } else { + *outptr++ = *inptr0++; + *outptr++ = *inptr1++; + *outptr++ = *inptr2++; + *outptr++ = *inptr3++; + } + } + } +} void prepackA_trans_8x12(float *outptr, const float *in, @@ -682,6 +880,128 @@ void prepackA_trans_8x12(float *outptr, } } } +void pack_trans_m4(float *outptr, + const float *in, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax) { + auto inptr = in + k0 * ldin + m0; + uint32_t mask_buffer[4] = {0, 1, 2, 3}; + int x_len = mmax - m0; + int y_len = kmax - k0; + int right_remain = x_len - 4 * (x_len / 4); + int stride_out = 4 * y_len; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask1 = + vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); + + bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; + float32x4_t valpha = vdupq_n_f32(alpha); + +#pragma omp parallel for + for (int y = 0; y < y_len - 3; y += 4) { + const float *ptr0 = inptr + y * ldin; + const float *ptr1 = ptr0 + ldin; + const float *ptr2 = ptr1 + ldin; + const float *ptr3 = ptr2 + ldin; + + asm volatile( + "prfm pldl1keep, [%[ptr0]] \n" + "prfm 
pldl1keep, [%[ptr0], #64] \n" + "prfm pldl1keep, [%[ptr1]] \n" + "prfm pldl1keep, [%[ptr1], #64] \n" + "prfm pldl1keep, [%[ptr2]] \n" + "prfm pldl1keep, [%[ptr2], #64] \n" + "prfm pldl1keep, [%[ptr3]] \n" + "prfm pldl1keep, [%[ptr3], #64] \n" + : + : [ptr0] "r"(ptr0), [ptr1] "r"(ptr1), [ptr2] "r"(ptr2), [ptr3] "r"(ptr3) + : "memory"); + + float *outptr_row_col = outptr + y * 4; + int i = 0; + for (; i < x_len - 3; i += 4) { + float32x4_t vr00 = vld1q_f32(ptr0); + float32x4_t vr10 = vld1q_f32(ptr1); + float32x4_t vr20 = vld1q_f32(ptr2); + float32x4_t vr30 = vld1q_f32(ptr3); + if (has_alpha) { + vr00 = vmulq_f32(vr00, valpha); + vr10 = vmulq_f32(vr10, valpha); + vr20 = vmulq_f32(vr20, valpha); + vr30 = vmulq_f32(vr30, valpha); + } + + vst1q_f32(outptr_row_col, vr00); + vst1q_f32(outptr_row_col + 4, vr10); + vst1q_f32(outptr_row_col + 8, vr20); + vst1q_f32(outptr_row_col + 12, vr30); + + ptr0 += 4; + ptr1 += 4; + ptr2 += 4; + ptr3 += 4; + + outptr_row_col += stride_out; + } + if (right_remain > 0) { + float32x4_t vr00 = vld1q_f32(ptr0); + float32x4_t vr10 = vld1q_f32(ptr1); + float32x4_t vr20 = vld1q_f32(ptr2); + float32x4_t vr30 = vld1q_f32(ptr3); + + if (has_alpha) { + vr00 = vmulq_f32(vr00, valpha); + vr10 = vmulq_f32(vr10, valpha); + vr20 = vmulq_f32(vr20, valpha); + vr30 = vmulq_f32(vr30, valpha); + } + + float32x4_t vr00_1 = vbslq_f32(vmask1, vr00, vzero); + float32x4_t vr10_1 = vbslq_f32(vmask1, vr10, vzero); + float32x4_t vr20_1 = vbslq_f32(vmask1, vr20, vzero); + float32x4_t vr30_1 = vbslq_f32(vmask1, vr30, vzero); + + vst1q_f32(outptr_row_col, vr00_1); + vst1q_f32(outptr_row_col + 4, vr10_1); + vst1q_f32(outptr_row_col + 8, vr20_1); + vst1q_f32(outptr_row_col + 12, vr30_1); + } + } + +#pragma omp parallel for + for (int y = 4 * (y_len / 4); y < y_len; ++y) { + const float *ptr0 = inptr + y * ldin; + float *outptr_row_col = outptr + y * 4; + int i = 0; + for (; i < x_len - 3; i += 4) { + float32x4_t vr0 = vld1q_f32(ptr0); + if (has_alpha) { + vr0 = vmulq_f32(vr0, valpha); + } + vst1q_f32(outptr_row_col, vr0); + + ptr0 += 4; + + outptr_row_col += stride_out; + } + if (right_remain > 0) { + float32x4_t vr0 = vld1q_f32(ptr0); + + if (has_alpha) { + vr0 = vmulq_f32(vr0, valpha); + } + + float32x4_t vr0_1 = vbslq_f32(vmask1, vr0, vzero); + + vst1q_f32(outptr_row_col, vr0_1); + } + } +} #else // __aarch64__ void prepackA_6x8(float* outptr, @@ -2592,6 +2912,292 @@ void sgemm_prepacked_8x12(bool is_transB, } } } + +void sgemm_prepacked_4x4(bool is_transB, + int M, + int N, + int K, + const float *A_packed, + const float *B, + int ldb, + float beta, + float *C, + int ldc, + const float *bias, + bool has_bias, + bool has_relu, + ARMContext *ctx) { + size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; + auto workspace = ctx->workspace_data(); + int threads = ctx->threads(); + + const int n_block = 4; + const int m_block = 4; + //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 + int x_block = (l2_cache - (m_block * K)) / (sizeof(float) * (K + m_block)); + x_block /= n_block; + x_block *= n_block; + int x_num = (N + (x_block - 1)) / x_block; + x_block = (N + x_num - 1) / x_num; + x_block = (x_block + n_block - 1) / n_block; + x_block *= n_block; + x_block = x_block < n_block ? n_block : x_block; + + // unroll 2 loop + int tail_pre = (K & (KBLOCK - 1)); + int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; + if (tail_pre == 0) { + tail_pre = KBLOCK; + } + + bool flag_p_remain = false; + int remain = 0; + + int has_beta = fabsf(beta) > 1e-8f ? 1 : 0; + //! 
apanel is pre_compute outside gemm + for (unsigned int x0 = 0; x0 < N; x0 += x_block) { + unsigned int xmax = x0 + x_block; + if (xmax > N) { + xmax = N; + } + int bblocks = (xmax - x0 + n_block - 1) / n_block; + remain = xmax - x0 - (bblocks - 1) * n_block; + if (remain > 0) { + flag_p_remain = true; + } + //! load bpanel + float *b_pannel = workspace; + if (is_transB) { + pack_m4(b_pannel, B, 1.0f, ldb, x0, xmax, 0, K); + } else { + pack_trans_m4(b_pannel, B, 1.0f, ldb, x0, xmax, 0, K); + } +#pragma omp parallel for num_threads(threads) + for (unsigned int y = 0; y < M; y += m_block) { + unsigned int ymax = y + m_block; + if (ymax > M) { + ymax = M; + } + + float bias_local[4] = {0}; + if (has_bias) { + bias_local[0] = bias[y]; + bias_local[1] = bias[y + 1]; + bias_local[2] = bias[y + 2]; + bias_local[3] = bias[y + 3]; + } + + float cout0[n_block]; // NOLINT + float cout1[n_block]; // NOLINT + float cout2[n_block]; // NOLINT + float cout3[n_block]; // NOLINT + + float *c_ptr0 = C + y * ldc + x0; + float *c_ptr1 = c_ptr0 + ldc; + float *c_ptr2 = c_ptr1 + ldc; + float *c_ptr3 = c_ptr2 + ldc; + + float *pout0 = c_ptr0; + float *pout1 = c_ptr1; + float *pout2 = c_ptr2; + float *pout3 = c_ptr3; + + const float *a_ptr_l = A_packed + y * K; + const float *b_ptr_l = b_pannel; + for (int xb = 0; xb < bblocks; xb++) { + if ((y + 3) >= ymax) { + switch ((y + 3) - ymax) { + case 2: + c_ptr1 = cout1; + case 1: + c_ptr2 = cout2; + case 0: + c_ptr3 = cout3; + default: + break; + } + } + if (flag_p_remain && (xb == bblocks - 1)) { + pout0 = c_ptr0; + pout1 = c_ptr1; + pout2 = c_ptr2; + pout3 = c_ptr3; + + c_ptr0 = cout0; + c_ptr1 = cout1; + c_ptr2 = cout2; + c_ptr3 = cout3; + if (has_beta) { + for (int i = 0; i < remain; ++i) { + cout0[i] = pout0[i]; + cout1[i] = pout1[i]; + cout2[i] = pout2[i]; + cout3[i] = pout3[i]; + } + } + } + const float *a_ptr = a_ptr_l; + const float *b_ptr = b_ptr_l + xb * K * 4; + int tail = tail_pre; + int k = k_pre; + // clang-format off + asm volatile( + "prfm pldl1keep, [%[a_ptr]]\n" /* preload a*/ + "ld1 {v2.4s}, [%[bias_ptr]]\n" /* load bias to q2, q3*/ + "dup v8.4s, v2.s[0]\n" /* out0 = 0 */ + "prfm pldl1keep, [%[b_ptr]]\n" /* preload b*/ + "dup v9.4s, v2.s[1]\n" /* out1 = 0*/ + "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ + "dup v10.4s, v2.s[2]\n" /* out2 = 0*/ + "prfm pldl1keep, [%[b_ptr], #64]\n" /* preload b*/ + "dup v11.4s, v2.s[3]\n" /* out3 = 0*/ + "cbz %w[has_beta], 0f\n" /* check beta == 0? 
*/ + /* process beta */ + "dup v7.4s, %w[beta]\n" /* beta to vector */ + "ld1 {v0.4s}, [%[c_ptr0]]\n" /* load output r0 */ + "ld1 {v1.4s}, [%[c_ptr1]]\n" /* load output r1 */ + "fmla v8.4s, v0.4s, v7.4s\n" /* cr00 += beta * c_r00*/ + "fmla v9.4s, v1.4s, v7.4s\n" /* cr10 += beta * c_r10*/ + "ld1 {v2.4s}, [%[c_ptr2]]\n" + "ld1 {v3.4s}, [%[c_ptr3]]\n" + "fmla v10.4s, v2.4s, v7.4s\n" /* cr20 += beta * c_r20*/ + "fmla v11.4s, v3.4s, v7.4s\n" /* cr30 += beta * c_r30*/ + + "0: \n" /* check loop count */ + "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00,a10 to q0, q1*/ + "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ + "cbz %w[k], 2f\n" /* check loop count > 0 */ + /* main loop */ + /* unrool 0*/ + "1:\n" /* main loop */ + "fmla v8.4s, v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ + "fmla v9.4s, v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 */ + "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b3 to q6, q7 */ + "fmla v10.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ + "fmla v11.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ + + "ldp q2, q3, [%[a_ptr]], #32\n" /* load a20, a30 to q2, q3 */ + "fmla v8.4s, v5.4s, v1.s[0]\n" /* out0 = b1 * a10[0], b1 =q5 */ + "fmla v9.4s, v5.4s, v1.s[1]\n" /* out1 = b1 * a10[1], b1 =q5 */ + "fmla v10.4s, v5.4s, v1.s[2]\n" /* out2 = b1 * a10[2], b1 =q5 */ + "fmla v11.4s, v5.4s, v1.s[3]\n" /* out3 = b1 * a10[3], b1 =q5 */ + "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ + + "fmla v8.4s, v6.4s, v2.s[0]\n" /* out0 = b2 * a20[0], b2 =q6 */ + "fmla v9.4s, v6.4s, v2.s[1]\n" /* out1 = b2 * a20[1], b2 =q6 */ + "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b2 =q6*/ + "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b2 =q6*/ + "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a10 to q0, q1 */ + + "fmla v8.4s, v7.4s, v3.s[0]\n" /* out0 = b3 * a30[0], b3 =q7*/ + "fmla v9.4s, v7.4s, v3.s[1]\n" /* out1 = b3 * a30[1], b3 =q7*/ + "subs %w[k], %w[k], #1\n" /* loop count - 1*/ + "fmla v10.4s, v7.4s, v3.s[2]\n" /* out2 = b3 * a30[2], b3 =q7*/ + "fmla v11.4s, v7.4s, v3.s[3]\n" /* out3 = b3 * a30[3], b3 =q7*/ + + "bne 1b\n" + "2:\n" /* process tail*/ + "subs %w[tail], %w[tail], #1\n" /* tail--*/ + "beq 3f\n" /*jump to tail = 1*/ + /* final unrool 0*/ + /* unrool 0, tail > 1*/ + "fmla v8.4s, v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ + "fmla v9.4s, v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 */ + "subs %w[tail], %w[tail], #1\n" /* tail--*/ + "fmla v10.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ + "fmla v11.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ + + "beq 4f\n" /*jump to tail = 2*/ + /* unrool 1, tail > 2*/ + "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b3 to q6, q7 */ + + "fmla v8.4s, v5.4s, v1.s[0]\n" /* out0 = b1 * a10[0], b1 =q5 */ + "fmla v9.4s, v5.4s, v1.s[1]\n" /* out1 = b1 * a10[1], b1 =q5*/ + "subs %w[tail], %w[tail], #1\n" /* tail--*/ + "fmla v10.4s, v5.4s, v1.s[2]\n" /* out2 = b1 * a10[2], b1 =q5 */ + "fmla v11.4s, v5.4s, v1.s[3]\n" /* out3 = b1 * a10[3], b1 =q5 */ + "ldp q2, q3, [%[a_ptr]], #32\n" /* load a20, a30 to q2, q3 */ + + "beq 5f\n" /*jump to tail = 3*/ + /* unrool 2, tail = 4*/ + "fmla v8.4s, v6.4s, v2.s[0]\n" /* out0 = b2 * a20[0], b1 =q6 */ + "fmla v9.4s, v6.4s, v2.s[1]\n" /* out1 = b2 * a20[1], b1 =q6 */ + "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b1 =q6*/ + "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b1 =q6*/ + + /* unrool 3, tail = 4*/ + + "fmla v8.4s, v7.4s, v3.s[0]\n" /* out0 = b3 * a30[0], b3 =q7*/ + "fmla v9.4s, v7.4s, v3.s[1]\n" /* out1 = b3 * a30[1], 
b3 =q7*/ + "fmla v10.4s, v7.4s, v3.s[2]\n" /* out2 = b3 * a30[2], b3 =q7*/ + "fmla v11.4s, v7.4s, v3.s[3]\n" /* out3 = b3 * a30[3], b3 =q7*/ + + "b 11f\n" + /* tails==1 final tail*/ + "3: \n" /* tail=1*/ + "fmla v8.4s, v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ + "fmla v9.4s, v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 */ + "fmla v10.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ + "fmla v11.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ + + "b 11f\n" + /* tails==2 final tail*/ + "4:\n" /* tail = 2*/ + + "fmla v8.4s, v5.4s, v1.s[0]\n" /* out0 = b1 * a10[0], b1 =q5 */ + "fmla v9.4s, v5.4s, v1.s[1]\n" /* out1 = b1 * a10[1], b1 =q5*/ + "fmla v10.4s, v5.4s, v1.s[2]\n" /* out2 = b1 * a10[2], b1 =q5 */ + "fmla v11.4s, v5.4s, v1.s[3]\n" /* out3 = b1 * a10[3], b1 =q5 */ + + "b 11f\n" + /* tails==3 final tail*/ + "5:\n" /* tail = 3*/ + "fmla v8.4s, v6.4s, v2.s[0]\n" /* out0 = b2 * a20[0], b1 =q6 */ + "fmla v9.4s, v6.4s, v2.s[1]\n" /* out1 = b2 * a20[1], b1 =q6 */ + "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b1 =q6*/ + "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b1 =q6*/ + + "11: \n" /* check if relu */ + "cbz %w[relu], 12f\n" /* skip relu */ + "movi v2.4s, #0\n" /* for relu*/ + "fmax v8.4s, v8.4s, v2.4s\n" /* relu*/ + "fmax v9.4s, v9.4s, v2.4s\n" /* relu*/ + "fmax v10.4s, v10.4s, v2.4s\n" /* relu*/ + "fmax v11.4s, v11.4s, v2.4s\n" /* relu*/ + "12: \n" + "st1 {v8.4s}, [%[c_ptr0]], #16\n" /* store r0 */ + "st1 {v9.4s}, [%[c_ptr1]], #16\n" /* store r1 */ + "st1 {v10.4s}, [%[c_ptr2]], #16\n" /* store r2 */ + "st1 {v11.4s}, [%[c_ptr3]], #16\n" /* store r3 */ + + : [a_ptr] "+r"(a_ptr), + [b_ptr] "+r"(b_ptr), + [k] "+r"(k), + [tail] "+r"(tail), + [c_ptr0] "+r"(c_ptr0), + [c_ptr1] "+r"(c_ptr1), + [c_ptr2] "+r"(c_ptr2), + [c_ptr3] "+r"(c_ptr3) + : [bias_ptr] "r"(bias_local), + [relu] "r"(has_relu), + [has_beta] "r"(has_beta), + [beta] "r"(beta) + : "cc","memory", + "v0","v1","v2","v3","v4","v5","v6","v7", + "v8","v9","v10","v11"); + // clang-format on + if (flag_p_remain && (xb == bblocks - 1)) { + for (int i = 0; i < remain; ++i) { + *pout0++ = cout0[i]; + *pout1++ = cout1[i]; + *pout2++ = cout2[i]; + *pout3++ = cout3[i]; + } + } + } + } + } +} #else // __aarch64__ /** * \brief gemm with ablock = 6, bblock = 8, output 6x8 diff --git a/lite/backends/arm/math/packed_sgemm_c4.cc b/lite/backends/arm/math/packed_sgemm_c4.cc new file mode 100644 index 0000000000..8087e0337b --- /dev/null +++ b/lite/backends/arm/math/packed_sgemm_c4.cc @@ -0,0 +1,1171 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
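+// C4-packed SGEMM kernels. A is pre-packed into 4-row (MBLOCK_C4) blocks,
+// loadb_c4 repacks B into NBLOCK_C4-wide column panels, and the two drivers
+// below accumulate with NEON fmla/vmla: sgemm_prepack_c4_common tiles N
+// against the L2 cache size, while sgemm_prepack_c4_small is a simpler path
+// without the N-chunking. Both support an optional bias and a fused ReLU.
+// A scalar sketch of the math being computed (the data itself is stored in
+// the interleaved c4 layout), assuming logical shapes A[M][K], B[K][N]:
+//   C[m][n] = bias[m] + sum_k A[m][k] * B[k][n];  // then max(0, .) if has_relu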
+ +#include "lite/backends/arm/math/packed_sgemm_c4.h" +#include + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void loadb_c4(float* out, + const float* in, + const int xstart, + const int xend, + const int k_round, + const int n) { + const int xlen = (xend - xstart + NBLOCK_C4 - 1) / NBLOCK_C4 * NBLOCK_C4; + int xloop = xlen / NBLOCK_C4; + const int flag_remain = n < xstart + xlen; + int remain = 0; + int remain4 = 0; + int remain1 = 0; + if (flag_remain) { + remain = (n - xstart) - (xloop - 1) * NBLOCK_C4; + remain4 = remain >> 2; + remain1 = remain & 3; + xloop -= 1; + } + const int ldo = NBLOCK_C4 * k_round; + const int kloop = k_round >> 2; + in += xstart * 4; + if (xloop > 0) { +#pragma omp parallel for + for (int i = 0; i < kloop; ++i) { + float* out_ptr = out + 4 * NBLOCK_C4 * i; + const float* in_ptr = in + i * 4 * n; + for (int j = 0; j < xloop; ++j) { + float* out_p = out_ptr + j * ldo; +#ifdef __aarch64__ + asm volatile( + "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" + "st1 {v0.4s, v1.4s}, [%[out]], #32 \n" + "ld1 {v4.4s, v5.4s}, [%[in]], #32 \n" + "st1 {v2.4s, v3.4s}, [%[out]], #32 \n" + "ld1 {v6.4s, v7.4s}, [%[in]], #32 \n" + "st1 {v4.4s, v5.4s}, [%[out]], #32 \n" + "st1 {v6.4s, v7.4s}, [%[out]], #32 \n" + : [in] "+r"(in_ptr), [out] "+r"(out_p) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +#else + asm volatile( + "vld1.32 {d0-d3}, [%[in]]! \n" + "vld1.32 {d4-d7}, [%[in]]! \n" + "vst1.32 {d0-d3}, [%[out]]! \n" + "vld1.32 {d8-d11}, [%[in]]! \n" + "vst1.32 {d4-d7}, [%[out]]! \n" + "vld1.32 {d12-d15}, [%[in]]! \n" + "vst1.32 {d8-d11}, [%[out]]! \n" + "vst1.32 {d12-d15}, [%[out]]! \n" + : [in] "+r"(in_ptr), [out] "+r"(out_p) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +#endif // __aarch674__ + } + } + } + float* out_remain4 = out + xloop * k_round * NBLOCK_C4; + const float* in_remain4 = in + xloop * NBLOCK_C4 * 4; + if (remain4) { +#pragma omp parallel for + for (int i = 0; i < kloop; ++i) { + float* out_ptr = out_remain4 + 4 * 4 * i; + const float* in_ptr = in_remain4 + i * 4 * n; +#ifdef __aarch64__ + asm volatile( + "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" + "st1 {v0.4s, v1.4s}, [%[out]], #32 \n" + "st1 {v2.4s, v3.4s}, [%[out]], #32 \n" + : [in] "+r"(in_ptr), [out] "+r"(out_ptr) + : + : "v0", "v1", "v2", "v3"); +#else + asm volatile( + "vld1.32 {d0-d3}, [%[in]]! \n" + "vld1.32 {d4-d7}, [%[in]]! \n" + "vst1.32 {d0-d3}, [%[out]]! \n" + "vst1.32 {d4-d7}, [%[out]]! \n" + : [in] "+r"(in_ptr), [out] "+r"(out_ptr) + : + : "q0", "q1", "q2", "q3"); +#endif // __aarch64__ + } + } + float* out_remain1 = out_remain4 + remain4 * k_round * 4; + const float* in_remain1 = in_remain4 + remain4 * 4 * 4; + if (remain1) { +#pragma omp parallel for + for (int i = 0; i < kloop; ++i) { + float* out_ptr = out_remain1 + 4 * remain1 * i; + const float* in_ptr = in_remain1 + i * 4 * n; + for (int j = 0; j < remain1; ++j) { + float32x4_t vin = vld1q_f32(in_ptr); + in_ptr += 4; + vst1q_f32(out_ptr, vin); + out_ptr += 4; + } + } + } +} + +void sgemm_prepack_c4_common(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx) { + const int m_round = (M + 3) / 4 * 4; + const int k_round = (K + 3) / 4 * 4; + size_t l2_cache = ctx->llc_size() > 0 ? 
ctx->llc_size() : 512 * 1024; + int threads = ctx->threads(); + auto workspace = ctx->workspace_data(); + // l2 = ablock * K * threads + K * bchunk_w + threads * ablock * bchunk_w; + int bchunk_w = (l2_cache - threads * k_round * sizeof(float)) / + ((k_round + threads * MBLOCK_C4) * sizeof(float)); + bchunk_w = bchunk_w > N ? N : bchunk_w; + bchunk_w = bchunk_w / NBLOCK_C4 * NBLOCK_C4; + bchunk_w = bchunk_w > NBLOCK_C4 ? bchunk_w : NBLOCK_C4; + int bchunk_loop = (N + bchunk_w - 1) / bchunk_w; + + const int h_loop = m_round >> 2; // MBLOCK_C4 == 4; + const int kcnt = (k_round + KBLOCK_C4 - 1) / KBLOCK_C4; + const int ldc = N * 4; + const int lda = k_round * 4; + float bias_buf[m_round]; // NOLINT + if (has_bias) { + memcpy(bias_buf, bias, M * sizeof(float)); + memset(bias_buf + M, 0, (m_round - M) * sizeof(float)); + } else { + memset(bias_buf, 0, m_round * sizeof(float)); + } + // bchunk_loop + float* c = C; + for (int n = 0; n < bchunk_loop; ++n) { + int x_start = n * bchunk_w; + int x_end = x_start + bchunk_w; + int w_loop = bchunk_w / NBLOCK_C4; + int flag_remain = 0; + int w_loop4 = 0; + int remain = 0; + if (x_end > N) { + w_loop = (N - x_start) / NBLOCK_C4; + int w_loop_rem = (N - x_start) - w_loop * NBLOCK_C4; + w_loop4 = w_loop_rem >> 2; + remain = w_loop_rem & 3; + x_end = N; + flag_remain = 1; + } + float* bchunk = workspace; + loadb_c4(bchunk, B, x_start, x_end, k_round, N); + float* cchunk = c + n * bchunk_w * 4; + int has_remain = (n == bchunk_loop - 1) && flag_remain; +#pragma omp parallel for num_threads(threads) + for (int h = 0; h < h_loop; ++h) { + float* bias_h = bias_buf + h * 4; +#ifdef __aarch64__ + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t vbias = vld1q_f32(bias_h); +#endif + const float* ablock = A_packed + h * lda; + const float* bblock = bchunk; + float* cblock = cchunk + h * ldc; + for (int w = 0; w < w_loop; ++w) { + int cnt = kcnt; + const float* ablock_ptr = ablock; +// clang-format off +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[a]] \n" + "prfm pldl1keep, [%[b]] \n" + "prfm pldl1keep, [%[b], #64] \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vbias].16b \n" /* mov bias to c2*/ + "mov v12.16b, %[vbias].16b \n" /* mov bias to c3*/ + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "mov v13.16b, %[vbias].16b \n" /* mov bias to c4*/ + "mov v14.16b, %[vbias].16b \n" /* mov bias to c5*/ + "mov v15.16b, %[vbias].16b \n" /* mov bias to c6*/ + "mov v16.16b, %[vbias].16b \n" /* mov bias to c7*/ + "1:\n" + /* load b0b1b2b3 to v5-v8 */ + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v7.4s, v8.4s}, [%[b]], #32 \n" + "prfm pldl1keep, [%[b]] \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v1.4s, v7.s[0] \n" + "fmla v12.4s, v1.4s, v8.s[0] \n" + /* load b4b5b6b7 to v25-v28 */ + "ld1 {v25.4s, v26.4s}, [%[b]], #32 \n" + "ld1 {v27.4s, v28.4s}, [%[b]], #32 \n" + "prfm pldl1keep, [%[a], #32] \n" + "fmla v9.4s, v2.4s, v5.s[1] \n" + "fmla v10.4s, v2.4s, v6.s[1] \n" + "fmla v11.4s, v2.4s, v7.s[1] \n" + "fmla v12.4s, v2.4s, v8.s[1] \n" + "prfm pldl1keep, [%[b], #64] \n" + "fmla v13.4s, v1.4s, v25.s[0] \n" + "fmla v14.4s, v1.4s, v26.s[0] \n" + "fmla v15.4s, v1.4s, v27.s[0] \n" + "fmla v16.4s, v1.4s, v28.s[0] \n" + /* load a2a3 to v3-v4 */ + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "prfm pldl1keep, [%[b], #128] \n" + "fmla v13.4s, v2.4s, v25.s[1] \n" + "fmla v14.4s, v2.4s, v26.s[1] \n" + "fmla v15.4s, v2.4s, v27.s[1] \n" + "fmla 
v16.4s, v2.4s, v28.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v3.4s, v7.s[2] \n" + "fmla v12.4s, v3.4s, v8.s[2] \n" + "fmla v13.4s, v3.4s, v25.s[2] \n" + "fmla v14.4s, v3.4s, v26.s[2] \n" + "fmla v15.4s, v3.4s, v27.s[2] \n" + "fmla v16.4s, v3.4s, v28.s[2] \n" + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "fmla v9.4s, v4.4s, v5.s[3] \n" + "fmla v10.4s, v4.4s, v6.s[3] \n" + "fmla v11.4s, v4.4s, v7.s[3] \n" + "fmla v12.4s, v4.4s, v8.s[3] \n" + + "fmla v13.4s, v4.4s, v25.s[3] \n" + "fmla v14.4s, v4.4s, v26.s[3] \n" + "fmla v15.4s, v4.4s, v27.s[3] \n" + "fmla v16.4s, v4.4s, v28.s[3] \n" + "bne 1b\n" + "cbz %w[relu], 2f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "fmax v12.4s, v12.4s, %[vzero].4s \n" + "fmax v13.4s, v13.4s, %[vzero].4s \n" + "fmax v14.4s, v14.4s, %[vzero].4s \n" + "fmax v15.4s, v15.4s, %[vzero].4s \n" + "fmax v16.4s, v16.4s, %[vzero].4s \n" + "2:\n" + "st1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[c]], #64 \n" + "st1 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[c]], #64 \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), [relu] "r"(has_relu), + [vbias] "w"(vbias), [vzero] "w" (vzero) + : "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", + "v25", "v26", "v27", "v28", "cc", "memory"); +#else + asm volatile( + "vld1.32 {d6-d7}, [%[bias]] \n" + "pld [%[a]] \n" + "pld [%[b]] \n" + "pld [%[b], #64] \n" + "vmov.32 q8, q3 \n" /* mov bias to c0*/ + "vmov.32 q9, q3 \n" /* mov bias to c1*/ + "vmov.32 q10, q3 \n" /* mov bias to c2*/ + "vmov.32 q11, q3 \n" /* mov bias to c3*/ + "vld1.32 {d0-d3}, [%[a]]! \n" + "vmov.32 q12, q3 \n" /* mov bias to c4*/ + "vmov.32 q13, q3 \n" /* mov bias to c5*/ + "vmov.32 q14, q3 \n" /* mov bias to c6*/ + "vmov.32 q15, q3 \n" /* mov bias to c7*/ + "1:\n" + /* c0c1c2c3 */ + "vld1.32 {d8-d11}, [%[b]]! \n" + "vld1.32 {d12-d15}, [%[b]]! \n" + "pld [%[b]] \n" + "vmla.f32 q8, q0, d8[0] \n" + "vmla.f32 q9, q0, d10[0] \n" + "vmla.f32 q10, q0, d12[0] \n" + "vmla.f32 q11, q0, d14[0] \n" + "vld1.32 {d4-d7}, [%[a]]! \n" + "vmla.f32 q8, q1, d8[1] \n" + "vmla.f32 q9, q1, d10[1] \n" + "vmla.f32 q10, q1, d12[1] \n" + "vmla.f32 q11, q1, d14[1] \n" + "pld [%[b], #64] \n" + "vmla.f32 q8, q2, d9[0] \n" + "vmla.f32 q9, q2, d11[0] \n" + "vmla.f32 q10, q2, d13[0] \n" + "vmla.f32 q11, q2, d15[0] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q8, q3, d9[1] \n" + "vmla.f32 q9, q3, d11[1] \n" + "vld1.f32 {d8-d11}, [%[b]]! \n" + "vmla.f32 q10, q3, d13[1] \n" + "vmla.f32 q11, q3, d15[1] \n" + "vld1.32 {d12-d15}, [%[b]]! \n" + /* c4c5c6c7 */ + "vmla.f32 q12, q0, d8[0] \n" + "vmla.f32 q13, q0, d10[0] \n" + "vmla.f32 q14, q0, d12[0] \n" + "vmla.f32 q15, q0, d14[0] \n" + "pld [%[a], #32] \n" + "vmla.f32 q12, q1, d8[1] \n" + "vmla.f32 q13, q1, d10[1] \n" + "vmla.f32 q14, q1, d12[1] \n" + "vmla.f32 q15, q1, d14[1] \n" + "vld1.32 {d0-d3}, [%[a]]! 
\n" + "vmla.f32 q12, q2, d9[0] \n" + "vmla.f32 q13, q2, d11[0] \n" + "vmla.f32 q14, q2, d13[0] \n" + "vmla.f32 q15, q2, d15[0] \n" + "pld [%[b], #64] \n" + "vmla.f32 q12, q3, d9[1] \n" + "vmla.f32 q13, q3, d11[1] \n" + "vmla.f32 q14, q3, d13[1] \n" + "vmla.f32 q15, q3, d15[1] \n" + "bne 1b\n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmov.u32 q0, #0 \n" + "vmax.f32 q8, q8, q0 \n" + "vmax.f32 q9, q9, q0 \n" + "vmax.f32 q10, q10, q0 \n" + "vmax.f32 q11, q11, q0 \n" + "vmax.f32 q12, q12, q0 \n" + "vmax.f32 q13, q13, q0 \n" + "vmax.f32 q14, q14, q0 \n" + "vmax.f32 q15, q15, q0 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]! \n" + "vst1.32 {d20-d23}, [%[c]]! \n" + "vst1.32 {d24-d27}, [%[c]]! \n" + "vst1.32 {d28-d31}, [%[c]]! \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), + [relu] "r"(has_relu) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"); +#endif + // clang-format on + } + if (has_remain) { + if (w_loop4 > 0) { + int cnt = kcnt; + const float* ablock_ptr = ablock; +// clang-format off +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[a]] \n" + "prfm pldl1keep, [%[b]] \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vbias].16b \n" /* mov bias to c2*/ + "mov v12.16b, %[vbias].16b \n" /* mov bias to c3*/ + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "1:\n" + /* load b0b1b2b3 to v5-v8 */ + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v7.4s, v8.4s}, [%[b]], #32 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v1.4s, v7.s[0] \n" + "fmla v12.4s, v1.4s, v8.s[0] \n" + /* load a2a3 to v3-v4 */ + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "prfm pldl1keep, [%[a]] \n" + "fmla v9.4s, v2.4s, v5.s[1] \n" + "fmla v10.4s, v2.4s, v6.s[1] \n" + "fmla v11.4s, v2.4s, v7.s[1] \n" + "fmla v12.4s, v2.4s, v8.s[1] \n" + "prfm pldl1keep, [%[b]] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v3.4s, v7.s[2] \n" + "fmla v12.4s, v3.4s, v8.s[2] \n" + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "fmla v9.4s, v4.4s, v5.s[3] \n" + "fmla v10.4s, v4.4s, v6.s[3] \n" + "fmla v11.4s, v4.4s, v7.s[3] \n" + "fmla v12.4s, v4.4s, v8.s[3] \n" + "bne 1b\n" + "cbz %w[relu], 2f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "fmax v12.4s, v12.4s, %[vzero].4s \n" + "2:\n" + "st1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[c]], #64 \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), + [relu] "r"(has_relu), + [vbias] "w"(vbias), + [vzero] "w" (vzero) + : "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "cc", "memory"); +#else + asm volatile( + "pld [%[a]] \n" + "pld [%[b]] \n" + "vld1.32 {d6-d7}, [%[bias]] \n" + "vld1.32 {d0-d3}, [%[a]]! \n" /* load a0 a1 */ + "vmov.32 q8, q3 \n" /* mov bias to c0 */ + "vmov.32 q9, q3 \n" /* mov bias to c1 */ + "vmov.32 q10, q3 \n" /* mov bias to c2 */ + "vmov.32 q11, q3 \n" /* mov bias to c3 */ + "1:\n" + /* c0c1c2c3 */ + "vld1.32 {d8-d11}, [%[b]]! \n" + "vld1.32 {d12-d15}, [%[b]]! \n" + "pld [%[b]] \n" + "vmla.f32 q8, q0, d8[0] \n" + "vmla.f32 q9, q0, d10[0] \n" + "vmla.f32 q10, q0, d12[0] \n" + "vmla.f32 q11, q0, d14[0] \n" + "vld1.32 {d4-d7}, [%[a]]! 
\n" + "pld [%[a]] \n" + "vmla.f32 q8, q1, d8[1] \n" + "vmla.f32 q9, q1, d10[1] \n" + "vmla.f32 q10, q1, d12[1] \n" + "vmla.f32 q11, q1, d14[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q8, q2, d9[0] \n" + "vmla.f32 q9, q2, d11[0] \n" + "vmla.f32 q10, q2, d13[0] \n" + "vmla.f32 q11, q2, d15[0] \n" + "vld1.32 {d0-d3}, [%[a]]! \n" + "vmla.f32 q8, q3, d9[1] \n" + "vmla.f32 q9, q3, d11[1] \n" + "vmla.f32 q10, q3, d13[1] \n" + "vmla.f32 q11, q3, d15[1] \n" + "bne 1b\n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmov.u32 q0, #0 \n" + "vmax.f32 q8, q8, q0 \n" + "vmax.f32 q9, q9, q0 \n" + "vmax.f32 q10, q10, q0 \n" + "vmax.f32 q11, q11, q0 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]! \n" + "vst1.32 {d20-d23}, [%[c]]! \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), [relu] "r"(has_relu) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "cc", "memory"); +#endif + // clang-format on + } + if (remain > 0) { + int cnt = kcnt; + const float* ablock_ptr = ablock; +// clang-format off +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[a]] \n" + "prfm pldl1keep, [%[b]] \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "cmp %w[remain], #3 \n" + "beq 1f \n" + "cmp %w[remain], #2 \n" + "beq 2f \n" + /* remain 1 */ + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vzero].16b \n" /* mov zero to c1*/ + "3: \n" + "ld1 {v5.4s}, [%[b]], #16 \n" + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v2.4s, v5.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v4.4s, v5.s[3] \n" + "bne 3b \n" + "fadd v9.4s, v9.4s, v10.4s \n" + "cbz %w[relu], 6f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "6: \n" + "st1 {v9.4s}, [%[c]], #16 \n" + "b 9f \n" + /* remain 2 */ + "2: \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vzero].16b \n" /* mov zero to c2*/ + "mov v12.16b, %[vzero].16b \n" /* mov zero to c3*/ + "4: \n" + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v2.4s, v5.s[1] \n" + "fmla v12.4s, v2.4s, v6.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v4.4s, v5.s[3] \n" + "fmla v12.4s, v4.4s, v6.s[3] \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "bne 4b \n" + "fadd v9.4s, v9.4s, v11.4s \n" + "fadd v10.4s, v10.4s, v12.4s \n" + "cbz %w[relu], 7f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "7: \n" + "st1 {v9.4s, v10.4s}, [%[c]], #32 \n" + "b 9f \n" + /* remain 3 */ + "1: \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vbias].16b \n" /* mov bias to c2*/ + "5: \n" + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v7.4s}, [%[b]], #16 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v1.4s, v7.s[0] \n" + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "fmla v9.4s, v2.4s, v5.s[1] \n" + "fmla v10.4s, v2.4s, v6.s[1] \n" + "fmla v11.4s, v2.4s, v7.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v3.4s, v7.s[2] \n" + "prfm pldl1keep, [%[a]] \n" + "fmla v9.4s, v4.4s, v5.s[3] \n" + "fmla v10.4s, v4.4s, v6.s[3] \n" + "fmla v11.4s, 
v4.4s, v7.s[3] \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "bne 5b \n" + "cbz %w[relu], 8f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "8: \n" + "st1 {v9.4s, v10.4s}, [%[c]], #32 \n" + "st1 {v11.4s}, [%[c]], #16 \n" + "9:\n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), [relu] "r"(has_relu), + [remain] "r"(remain), [vbias] "w"(vbias), + [vzero] "w" (vzero) + : "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v9", + "v10", "v11", "v12", "cc","memory"); +#else + asm volatile( + "pld [%[a]] \n" + "pld [%[b]] \n" + "vld1.32 {d0-d1}, [%[bias]] \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "vmov.u32 q15, #0 \n" + "cmp %[remain], #3 \n" + "beq 1f \n" + "cmp %[remain], #2 \n" + "beq 2f \n" + /* remain 1 */ + "vmov.32 q9, q0 \n" /* mov bias to c0*/ + "vmov.32 q10, q15 \n" /* mov zero to c1*/ + "3: \n" + "vld1.32 {d10-d11}, [%[b]]! \n" + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q9, q1, d10[0] \n" + "vmla.f32 q10, q2, d10[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "vmla.f32 q9, q3, d11[0] \n" + "vmla.f32 q10, q4, d11[1] \n" + "bne 3b \n" + "vadd.f32 q9, q9, q10 \n" + "cmp %[relu], #0 \n" + "beq 6f \n" + "vmax.f32 q9, q9, q15 \n" + "6: \n" + "vst1.32 {d18-d19}, [%[c]]! \n" + "b 9f \n" + /* remain 2 */ + "2: \n" + "vmov.u32 q9, q0 \n" /* mov bias to c0*/ + "vmov.u32 q10, q0 \n" /* mov bias to c1*/ + "vmov.u32 q11, q15 \n" /* mov zero to c2*/ + "vmov.u32 q12, q15 \n" /* mov zero to c3*/ + "4: \n" + "vld1.32 {d10-d13}, [%[b]]! \n" + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q9, q1, d10[0] \n" + "vmla.f32 q10, q1, d12[0] \n" + "vmla.f32 q11, q2, d10[1] \n" + "vmla.f32 q12, q2, d12[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q9, q3, d11[0] \n" + "vmla.f32 q10, q3, d13[0] \n" + "vmla.f32 q11, q4, d11[1] \n" + "vmla.f32 q12, q4, d13[1] \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "bne 4b \n" + "vadd.f32 q9, q9, q11 \n" + "vadd.f32 q10, q10, q12 \n" + "cmp %[relu], #0 \n" + "beq 7f \n" + "vmax.f32 q9, q9, q15 \n" + "vmax.f32 q10, q10, q15 \n" + "7: \n" + "vst1.32 {d18-d21}, [%[c]]! \n" + "b 9f \n" + /* remain 3 */ + "1: \n" + "vmov.u32 q9, q0 \n" /* mov bias to c0*/ + "vmov.u32 q10, q0 \n" /* mov bias to c1*/ + "vmov.u32 q11, q0 \n" /* mov bias to c2*/ + "5: \n" + "vld1.32 {d10-d13}, [%[b]]! \n" + "vld1.32 {d14-d15}, [%[b]]! \n" + "vmla.f32 q9, q1, d10[0] \n" + "vmla.f32 q10, q1, d12[0] \n" + "vmla.f32 q11, q1, d14[0] \n" + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q9, q2, d10[1] \n" + "vmla.f32 q10, q2, d12[1] \n" + "vmla.f32 q11, q2, d14[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q9, q3, d11[0] \n" + "vmla.f32 q10, q3, d13[0] \n" + "vmla.f32 q11, q3, d15[0] \n" + "pld [%[a]] \n" + "vmla.f32 q9, q4, d11[1] \n" + "vmla.f32 q10, q4, d13[1] \n" + "vmla.f32 q11, q4, d15[1] \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "bne 5b \n" + "cmp %[relu], #0 \n" + "beq 8f \n" + "vmax.f32 q9, q9, q15 \n" + "vmax.f32 q10, q10, q15 \n" + "vmax.f32 q11, q11, q15 \n" + "8: \n" + "vst1.32 {d18-d21}, [%[c]]! \n" + "vst1.32 {d22-d23}, [%[c]]! 
\n" + "9:\n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), + [relu] "r"(has_relu), + [remain] "r"(remain) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q9", + "q10", "q11", "q12", "q15", "cc","memory"); +#endif + // clang-format on + } + } + } + } +} + +void sgemm_prepack_c4_small(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx) { + const int m_round = (M + 3) / 4 * 4; + const int k_round = (K + 3) / 4 * 4; + const int mloop = m_round >> 2; + const int lda = 4 * k_round; + const int ldb_byte = 4 * N * sizeof(float); + const int kcnt = k_round >> 2; + float bias_buf[m_round]; // NOLINT + if (has_bias) { + memcpy(bias_buf, bias, M * sizeof(float)); + memset(bias_buf + M, 0, (m_round - M) * sizeof(float)); + } else { + memset(bias_buf, 0, m_round * sizeof(float)); + } +#ifdef __aarch64__ + float32x4_t vzero = vdupq_n_f32(0.f); +#endif + const float* bias_ptr = bias_buf; + for (int m = 0; m < mloop; ++m) { +#ifdef __aarch64__ + float32x4_t vbias = vld1q_f32(bias_ptr); +#endif + const float* b = B; + int n = N; +#ifdef __aarch64__ + for (; n > 7; n -= 8) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + // clang-format off + asm volatile( + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* mov bias to c0-c7*/ + "mov v8.16b, %[vbias].16b \n" + "mov v9.16b, %[vbias].16b \n" + "mov v10.16b, %[vbias].16b \n" + "mov v11.16b, %[vbias].16b \n" + /* load b0, b1 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "mov v12.16b, %[vbias].16b \n" + "mov v13.16b, %[vbias].16b \n" + "mov v14.16b, %[vbias].16b \n" + "mov v15.16b, %[vbias].16b \n" + "1:\n" + /* load b2, b3 */ + "ld1 {v2.4s, v3.4s}, [%[b]], #32 \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v16.4s, v1.s[0] \n" + "fmla v10.4s, v16.4s, v2.s[0] \n" + "fmla v11.4s, v16.4s, v3.s[0] \n" + "prfm pldl1keep, [%[b]] \n" + "fmla v8.4s, v17.4s, v0.s[1] \n" + "fmla v9.4s, v17.4s, v1.s[1] \n" + "fmla v10.4s, v17.4s, v2.s[1] \n" + "fmla v11.4s, v17.4s, v3.s[1] \n" + /* load b4, b5 */ + "ld1 {v4.4s, v5.4s}, [%[b]], #32 \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v18.4s, v1.s[2] \n" + "fmla v10.4s, v18.4s, v2.s[2] \n" + "fmla v11.4s, v18.4s, v3.s[2] \n" + /* load b6, b7 */ + "ld1 {v6.4s, v7.4s}, [%[b]], #32 \n" + "fmla v8.4s, v19.4s, v0.s[3] \n" + "fmla v9.4s, v19.4s, v1.s[3] \n" + "fmla v10.4s, v19.4s, v2.s[3] \n" + "fmla v11.4s, v19.4s, v3.s[3] \n" + "sub %[b], %[b], #128 \n" + "fmla v12.4s, v16.4s, v4.s[0] \n" + "fmla v13.4s, v16.4s, v5.s[0] \n" + "fmla v14.4s, v16.4s, v6.s[0] \n" + "fmla v15.4s, v16.4s, v7.s[0] \n" + "add %[b], %[b], %[ldb] \n" + "fmla v12.4s, v17.4s, v4.s[1] \n" + "fmla v13.4s, v17.4s, v5.s[1] \n" + "fmla v14.4s, v17.4s, v6.s[1] \n" + "fmla v15.4s, v17.4s, v7.s[1] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "fmla v12.4s, v18.4s, v4.s[2] \n" + "fmla v13.4s, v18.4s, v5.s[2] \n" + "fmla v14.4s, v18.4s, v6.s[2] \n" + "fmla v15.4s, v18.4s, v7.s[2] \n" + /* load b0, b1 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "fmla v12.4s, v19.4s, v4.s[3] \n" + "fmla v13.4s, v19.4s, v5.s[3] \n" + "fmla v14.4s, v19.4s, v6.s[3] \n" + "fmla v15.4s, v19.4s, v7.s[3] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "bne 1b \n" + "cbz %w[relu], 2f \n" + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + 
"fmax v11.4s, v11.4s, %[vzero].4s \n" + "fmax v12.4s, v12.4s, %[vzero].4s \n" + "fmax v13.4s, v13.4s, %[vzero].4s \n" + "fmax v14.4s, v14.4s, %[vzero].4s \n" + "fmax v15.4s, v15.4s, %[vzero].4s \n" + "2:\n" + "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[c]], #64 \n" + "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%[c]], #64 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [vbias] "w" (vbias), + [vzero] "w" (vzero) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "cc", "memory" + ); + b += 4 * 8; + } + for (; n > 3; n -= 4) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* mov bias to c0-c3*/ + "mov v8.16b, %[vbias].16b \n" + "mov v9.16b, %[vbias].16b \n" + "mov v10.16b, %[vbias].16b \n" + "mov v11.16b, %[vbias].16b \n" + "1:\n" + /* load b0-b3 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[b]], #32 \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v16.4s, v1.s[0] \n" + "fmla v10.4s, v16.4s, v2.s[0] \n" + "fmla v11.4s, v16.4s, v3.s[0] \n" + "sub %[b], %[b], #64 \n" + "fmla v8.4s, v17.4s, v0.s[1] \n" + "fmla v9.4s, v17.4s, v1.s[1] \n" + "fmla v10.4s, v17.4s, v2.s[1] \n" + "fmla v11.4s, v17.4s, v3.s[1] \n" + "add %[b], %[b], %[ldb] \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v18.4s, v1.s[2] \n" + "fmla v10.4s, v18.4s, v2.s[2] \n" + "fmla v11.4s, v18.4s, v3.s[2] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "fmla v8.4s, v19.4s, v0.s[3] \n" + "fmla v9.4s, v19.4s, v1.s[3] \n" + "fmla v10.4s, v19.4s, v2.s[3] \n" + "fmla v11.4s, v19.4s, v3.s[3] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "bne 1b \n" + "cbz %w[relu], 2f \n" + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "2:\n" + "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[c]], #64 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [vbias] "w" (vbias), + [vzero] "w" (vzero) + : "v0", "v1", "v2", "v3", "v8", "v9", + "v10", "v11", "v16", "v17", "v18", + "v19", "cc", "memory" + ); + b += 4 * 4; + } + for (; n > 0; n--) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* mov bias to c0 */ + "mov v8.16b, %[vbias].16b \n" + "mov v9.16b, %[vzero].16b \n" + "1:\n" + /* load b0 */ + "ld1 {v0.4s}, [%[b]], #16 \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v17.4s, v0.s[1] \n" + "sub %[b], %[b], #16 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "add %[b], %[b], %[ldb] \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v19.4s, v0.s[3] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "bne 1b \n" + "fadd v8.4s, v8.4s, v9.4s \n" + "cbz %w[relu], 2f \n" + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "2:\n" + "st1 {v8.4s}, [%[c]], #16 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [vbias] "w" (vbias), + [vzero] "w" (vzero) + : "v0", "v8", "v9", "v16", "v17", + "v18", "v19", "cc", "memory" + ); + b += 4; + } +#else + for (; n > 7; n -= 
8) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + // clang-format off + asm volatile( + "vld1.32 {d6-d7}, [%[bias]] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + /* mov bias to c0-c7*/ + "vmov.u32 q8, q3 \n" + "vmov.u32 q9, q3 \n" + "vmov.u32 q10, q3 \n" + "vmov.u32 q11, q3 \n" + /* load b0, b1 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmov.u32 q12, q3 \n" + "vmov.u32 q13, q3 \n" + "vmov.u32 q14, q3 \n" + "vmov.u32 q15, q3 \n" + "1:\n" + /* load b2, b3 */ + "vld1.32 {d4-d7}, [%[b]]! \n" + /* load a2, a3 */ + "vld1.32 {d12-d15}, [%[a]]! \n" + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q4, d2[0] \n" + "vmla.f32 q10, q4, d4[0] \n" + "vmla.f32 q11, q4, d6[0] \n" + "pld [%[b]] \n" + "vmla.f32 q8, q5, d0[1] \n" + "vmla.f32 q9, q5, d2[1] \n" + "vmla.f32 q10, q5, d4[1] \n" + "vmla.f32 q11, q5, d6[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q8, q6, d1[0] \n" + "vmla.f32 q9, q6, d3[0] \n" + "vmla.f32 q10, q6, d5[0] \n" + "vmla.f32 q11, q6, d7[0] \n" + "pld [%[b], #64] \n" + "vmla.f32 q8, q7, d1[1] \n" + "vmla.f32 q9, q7, d3[1] \n" + /* load b4, b5 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmla.f32 q10, q7, d5[1] \n" + "vmla.f32 q11, q7, d7[1] \n" + /* load b6, b7 */ + "vld1.32 {d4-d7}, [%[b]]! \n" + "vmla.f32 q12, q4, d0[0] \n" + "vmla.f32 q13, q4, d2[0] \n" + "vmla.f32 q14, q4, d4[0] \n" + "vmla.f32 q15, q4, d6[0] \n" + "sub %[b], %[b], #128 \n" + "vmla.f32 q12, q5, d0[1] \n" + "vmla.f32 q13, q5, d2[1] \n" + "vmla.f32 q14, q5, d4[1] \n" + "vmla.f32 q15, q5, d6[1] \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q12, q6, d1[0] \n" + "vmla.f32 q13, q6, d3[0] \n" + "vmla.f32 q14, q6, d5[0] \n" + "vmla.f32 q15, q6, d7[0] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + "vmla.f32 q12, q7, d1[1] \n" + "vmla.f32 q13, q7, d3[1] \n" + /* load b0, b1 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmla.f32 q14, q7, d5[1] \n" + "vmla.f32 q15, q7, d7[1] \n" + "bne 1b \n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmov.u32 q0, #0 \n" + "vmax.f32 q8, q8, q0 \n" + "vmax.f32 q9, q9, q0 \n" + "vmax.f32 q10, q10, q0 \n" + "vmax.f32 q11, q11, q0 \n" + "vmax.f32 q12, q12, q0 \n" + "vmax.f32 q13, q13, q0 \n" + "vmax.f32 q14, q14, q0 \n" + "vmax.f32 q15, q15, q0 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]! \n" + "vst1.32 {d20-d23}, [%[c]]! \n" + "vst1.32 {d24-d27}, [%[c]]! \n" + "vst1.32 {d28-d31}, [%[c]]! \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [bias] "r" (bias_ptr) + : "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15", "cc", "memory" + ); + b += 4 * 8; + } + for (; n > 3; n -= 4) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + "vld1.32 {d24-d25}, [%[bias]] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + /* mov bias to c0-c3*/ + "vmov.u32 q8, q12 \n" + "vmov.u32 q9, q12 \n" + "vmov.u32 q10, q12 \n" + "vmov.u32 q11, q12 \n" + "vmov.u32 q13, #0 \n" + "1:\n" + /* load b0-b3 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vld1.32 {d4-d7}, [%[b]]! 
\n" + /* load a2, a3 */ + "vld1.32 {d12-d15}, [%[a]]!\n" + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q4, d2[0] \n" + "vmla.f32 q10, q4, d4[0] \n" + "vmla.f32 q11, q4, d6[0] \n" + "sub %[b], %[b], #64 \n" + "vmla.f32 q8, q5, d0[1] \n" + "vmla.f32 q9, q5, d2[1] \n" + "vmla.f32 q10, q5, d4[1] \n" + "vmla.f32 q11, q5, d6[1] \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q8, q6, d1[0] \n" + "vmla.f32 q9, q6, d3[0] \n" + "vmla.f32 q10, q6, d5[0] \n" + "vmla.f32 q11, q6, d7[0] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + "vmla.f32 q8, q7, d1[1] \n" + "vmla.f32 q9, q7, d3[1] \n" + "vmla.f32 q10, q7, d5[1] \n" + "vmla.f32 q11, q7, d7[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "bne 1b \n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmax.f32 q8, q8, q13 \n" + "vmax.f32 q9, q9, q13 \n" + "vmax.f32 q10, q10, q13 \n" + "vmax.f32 q11, q11, q13 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]!\n" + "vst1.32 {d20-d23}, [%[c]]!\n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [bias] "r" (bias_ptr) + : "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "cc", "memory" + ); + b += 4 * 4; + } + for (; n > 0; n--) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + "vld1.32 {d14-d15}, [%[bias]] \n" + "vmov.u32 q8, #0 \n" + /* load a0, a1 */ + "vld1.32 {d2-d5}, [%[a]]! \n" + /* mov bias to c0 */ + "vmov.u32 q5, q7 \n" + "vmov.u32 q6, q8 \n" + "1:\n" + /* load b0 */ + "vld1.32 {d0-d1}, [%[b]]! \n" + /* load a2, a3 */ + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q5, q1, d0[0] \n" + "vmla.f32 q6, q2, d0[1] \n" + "sub %[b], %[b], #16 \n" + "subs %[cnt], %[cnt], #1 \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q5, q3, d1[0] \n" + "vmla.f32 q6, q4, d1[1] \n" + /* load a0, a1 */ + "vld1.32 {d2-d5}, [%[a]]! \n" + "bne 1b \n" + "vadd.f32 q5, q5, q6 \n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmax.f32 q5, q5, q8 \n" + "2:\n" + "vst1.32 {d10-d11}, [%[c]]!\n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [bias] "r" (bias_ptr) + : "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q8", "cc", "memory" + ); + // clang-format on + b += 4; + } +#endif + bias_ptr += 4; + A_packed += lda; + } +} + +void sgemm_prepack_c4(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx) { + if (N > 16) { + sgemm_prepack_c4_common( + M, N, K, A_packed, B, C, bias, has_bias, has_relu, ctx); + } else { + sgemm_prepack_c4_small( + M, N, K, A_packed, B, C, bias, has_bias, has_relu, ctx); + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/packed_sgemm_c4.h b/lite/backends/arm/math/packed_sgemm_c4.h new file mode 100644 index 0000000000..21e5af6343 --- /dev/null +++ b/lite/backends/arm/math/packed_sgemm_c4.h @@ -0,0 +1,53 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +constexpr int MBLOCK_C4 = 4; +constexpr int NBLOCK_C4 = 8; +constexpr int KBLOCK_C4 = 4; + +void sgemm_prepack_c4(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx); +void sgemm_prepack_c4_small(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx); +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc index a857e9830c..8524d7376f 100644 --- a/lite/backends/arm/math/pooling.cc +++ b/lite/backends/arm/math/pooling.cc @@ -46,7 +46,7 @@ void pooling_basic(const float* din, int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int size_channel_in = win * hin; int size_channel_out = wout * hout; if (global_pooling) { @@ -125,18 +125,22 @@ void pooling_basic(const float* din, int bh = kernel_h; int bw = kernel_w; if (ew == win) { - bw = sw + kernel_w >= win + pad_w ? win + pad_w - : sw + kernel_w; + bw = (sw + kernel_w) >= (win + paddings[3]) + ? (win + paddings[3]) + : (sw + kernel_w); bw -= sw; - if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) { + if ((sw - pad_w) < 0 && + (sw + kernel_w) > (win + paddings[3])) { bw += pad_w; } } if (eh == hin) { - bh = sh + kernel_h >= hin + pad_h ? hin + pad_h - : sh + kernel_h; + bh = (sh + kernel_h) >= (hin + paddings[1]) + ? 
(hin + paddings[1]) + : (sh + kernel_h); bh -= sh; - if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) { + if ((sh - pad_h) < 0 && + (sh + kernel_h) > (hin + paddings[1])) { bh += pad_h; } } diff --git a/lite/backends/arm/math/sgemv.cc b/lite/backends/arm/math/sgemv.cc index 506451932d..1830423136 100644 --- a/lite/backends/arm/math/sgemv.cc +++ b/lite/backends/arm/math/sgemv.cc @@ -14,6 +14,7 @@ #include "lite/backends/arm/math/sgemv.h" #include +#include #include "lite/utils/cp_logging.h" namespace paddle { @@ -50,6 +51,495 @@ void sgemv_bias_relu(const bool transA, const float *x, float *y, const float *bias); +#ifdef __aarch64__ +void sgemv_trans(const int M, + const int N, + const float *A, + const float *x, + float *y, + bool flag_bias, + const float *bias, + bool flag_relu, + const ARMContext *ctx) { + int m_cnt16 = M >> 4; + int m_cnt8 = (M & 15) >> 3; + int m_cnt4 = (M & 15 & 7) >> 2; + int m_remain = M & 15 & 7 & 3; + int ths = ctx->threads(); + int valid_ths = std::min((N + 3) / 4, ths); + int valid_block = std::max(4, (N / valid_ths + 3) / 4 * 4); + valid_ths = (N + valid_block - 1) / valid_block; + int block_cnt = valid_block / 4; + float zero_buf[M]; // NOLINT + float y_buf[valid_ths * M]; // NOLINT + memset(zero_buf, 0, M * sizeof(float)); + if (flag_bias) { + memcpy(y_buf, bias, M * sizeof(float)); + memset(y_buf + M, 0, (valid_ths - 1) * M * sizeof(float)); + } else { + memset(y_buf, 0, valid_ths * M * sizeof(float)); + } +#pragma omp parallel for + for (int t = 0; t < valid_ths; ++t) { + float *block_y = y_buf + t * M; + const float *block_x = x + t * valid_block; + const float *block_A = A + t * valid_block * M; + for (int i = 0; i < block_cnt; ++i) { + float *y_ptr = block_y; + const float *x_ptr = block_x + i * 4; + const float *in0_ptr = block_A + i * 4 * M; + const float *in1_ptr = in0_ptr + M; + const float *in2_ptr = in1_ptr + M; + const float *in3_ptr = in2_ptr + M; + int offset = t * valid_block + (i + 1) * 4 - N; + if (offset > 0) { + if (offset > 3) { + in0_ptr = zero_buf; + in1_ptr = zero_buf; + in2_ptr = zero_buf; + in3_ptr = zero_buf; + } else { + switch (offset) { + case 3: + in1_ptr = zero_buf; + case 2: + in2_ptr = zero_buf; + case 1: + in3_ptr = zero_buf; + default: + break; + } + } + } + // clang-format off + if (m_cnt16 > 0) { + int cnt16 = m_cnt16; + asm volatile( + "ld1 {v4.4s}, [%[x]] \n" /* load x to v4 */ + "ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[in0]], #64 \n" /* load in0 to v5, v6, v7, v8 */ + "ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[in1]], #64 \n" /* load in1 to v9, v10, v11, v12 */ + "ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[in2]], #64 \n" /* load in2 to v13, v14, v15, v16 */ + "ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[in3]], #64 \n" /* load in3 to v17, v18, v19, v20 */ + "1:\n" + "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[y]] \n" /*load y to v0, v1, v2, v3 */ + "fmla v0.4s, v5.4s, v4.s[0] \n" /* v0 += v5 * v4[0] */ + "fmla v1.4s, v6.4s, v4.s[0] \n" /* v1 += v6 * v4[0] */ + "fmla v2.4s, v7.4s, v4.s[0] \n" /* v2 += v7 * v4[0] */ + "fmla v3.4s, v8.4s, v4.s[0] \n" /* v3 += v8 * v4[0] */ + "ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[in0]], #64 \n" /* load in0 to v5, v6, v7, v8 */ + "fmla v0.4s, v9.4s, v4.s[1] \n" /* v0 += v9 * v4[1] */ + "fmla v1.4s, v10.4s, v4.s[1] \n" /* v1 += v10 * v4[1] */ + "fmla v2.4s, v11.4s, v4.s[1] \n" /* v2 += v11 * v4[1] */ + "fmla v3.4s, v12.4s, v4.s[1] \n" /* v3 += v12 * v4[1] */ + "ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[in1]], #64 \n" /* load in1 to v9, v10, v11, v12 */ + "fmla v0.4s, v13.4s, v4.s[2] \n" /* v0 += v13 * v4[2] */ 
+ "fmla v1.4s, v14.4s, v4.s[2] \n" /* v1 += v14 * v4[2] */ + "fmla v2.4s, v15.4s, v4.s[2] \n" /* v2 += v15 * v4[2] */ + "fmla v3.4s, v16.4s, v4.s[2] \n" /* v3 += v16 * v4[2] */ + "ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[in2]], #64 \n" /* load in2 to v13, v14, v15, v16 */ + "fmla v0.4s, v17.4s, v4.s[3] \n" /* v0 += v17 * v4[3] */ + "fmla v1.4s, v18.4s, v4.s[3] \n" /* v1 += v18 * v4[3] */ + "fmla v2.4s, v19.4s, v4.s[3] \n" /* v2 += v19 * v4[3] */ + "fmla v3.4s, v20.4s, v4.s[3] \n" /* v3 += v20 * v4[3] */ + "ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[in3]], #64 \n" /* load in3 to v17, v18, v19, v20 */ + "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ + "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[y]], #64 \n" /* store v0, v1, v2, v3 to y */ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #64 \n" /* restore in0 address */ + "sub %[in1], %[in1], #64 \n" /* restore in1 address */ + "sub %[in2], %[in2], #64 \n" /* restore in2 address */ + "sub %[in3], %[in3], #64 \n" /* restore in3 address */ + : [cnt] "+r"(cnt16), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", + "v17", "v18", "v19", "v20", "cc", "memory" + ); + } + if (m_cnt8 > 0) { + int cnt8 = m_cnt8; + asm volatile( + "ld1 {v2.4s}, [%[x]] \n" /* load x to v2 */ + "ld1 {v3.4s, v4.4s}, [%[in0]], #32 \n" /* load in0 to v3, v4 */ + "ld1 {v5.4s, v6.4s}, [%[in1]], #32 \n" /* load in1 to v5, v6 */ + "ld1 {v7.4s, v8.4s}, [%[in2]], #32 \n" /* load in2 to v7, v8 */ + "ld1 {v9.4s, v10.4s}, [%[in3]], #32 \n" /* load in3 to v9, v10*/ + "1:\n" + "ld1 {v0.4s, v1.4s}, [%[y]] \n" /* load y to v0, v1 */ + "fmla v0.4s, v3.4s, v2.s[0] \n" /* v0 += v3 * v2[0] */ + "fmla v1.4s, v4.4s, v2.s[0] \n" /* v1 += v4 * v2[0] */ + "prfm pldl1keep, [%[in0]] \n" /* preload in0 */ + "ld1 {v3.4s, v4.4s}, [%[in0]], #32 \n" /* load in0 to v3, v4 */ + "fmla v0.4s, v5.4s, v2.s[1] \n" /* v0 += v5 * v2[1] */ + "fmla v1.4s, v6.4s, v2.s[1] \n" /* v1 += v6 * v2[1] */ + "prfm pldl1keep, [%[in1]] \n" /* preload in1 */ + "ld1 {v5.4s, v6.4s}, [%[in1]], #32 \n" /* load in0 to v5, v6 */ + "fmla v0.4s, v7.4s, v2.s[2] \n" /* v0 += v7 * v2[2] */ + "fmla v1.4s, v8.4s, v2.s[2] \n" /* v1 += v8 * v2[2] */ + "prfm pldl1keep, [%[in2]] \n" /* preload in2 */ + "ld1 {v7.4s, v8.4s}, [%[in2]], #32 \n" /* load in0 to v7, v8 */ + "fmla v0.4s, v9.4s, v2.s[3] \n" /* v0 += v9 * v2[3] */ + "fmla v1.4s, v10.4s, v2.s[3] \n" /* v1 += v10 * v2[3] */ + "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ + "prfm pldl1keep, [%[in3]] \n" /* preload in3 */ + "st1 {v0.4s, v1.4s}, [%[y]], #32 \n" /* store v0, v1 to y */ + "ld1 {v9.4s, v10.4s},[%[in3]], #32 \n" /* load in0 to v9, v10*/ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #32 \n" /* restore in0 address */ + "sub %[in1], %[in1], #32 \n" /* restore in1 address */ + "sub %[in2], %[in2], #32 \n" /* restore in2 address */ + "sub %[in3], %[in3], #32 \n" /* restore in3 address */ + : [cnt] "+r"(cnt8), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "cc", "memory" + ); + } + if (m_cnt4 > 0) { + int cnt4 = m_cnt4; + asm volatile( + "ld1 {v1.4s}, [%[in0]], #16 \n" /* load in0 to v1 */ + "ld1 {v2.4s}, [%[in1]], #16 \n" /* load in1 to v2 */ + "ld1 {v3.4s}, [%[in2]], #16 \n" /* load in2 to v3 */ + "ld1 {v4.4s}, [%[in3]], 
#16 \n" /* load in3 to v4 */ + "ld1 {v5.4s}, [%[x]] \n" /* load x to v5 */ + "1:\n" + "ld1 {v0.4s}, [%[y]] \n" /* load y to v0 */ + "fmla v0.4s, v1.4s, v5.s[0] \n" /* v0 += v1 * v5[0] */ + "prfm pldl1keep, [%[in0]] \n" /* preload in0 */ + "ld1 {v1.4s}, [%[in0]], #16 \n" /* load in0 to v1 */ + "fmla v0.4s, v2.4s, v5.s[1] \n" /* v0 += v2 * v5[1] */ + "prfm pldl1keep, [%[in1]] \n" /* preload in1 */ + "ld1 {v2.4s}, [%[in1]], #16 \n" /* load in1 to v2 */ + "fmla v0.4s, v3.4s, v5.s[2] \n" /* v0 += v3 * v5[2] */ + "prfm pldl1keep, [%[in2]] \n" /* preload in2 */ + "ld1 {v3.4s}, [%[in2]], #16 \n" /* load in2 to v3 */ + "fmla v0.4s, v4.4s, v5.s[3] \n" /* v0 += v4 * v5[3] */ + "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ + "prfm pldl1keep, [%[in3]] \n" /* preload in3 */ + "st1 {v0.4s}, [%[y]], #16 \n" /* store v0 to y */ + "ld1 {v4.4s}, [%[in3]], #16 \n" /* load in3 to v4 */ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #16 \n" /* restore in0 address*/ + "sub %[in1], %[in1], #16 \n" /* restore in1 address*/ + "sub %[in2], %[in2], #16 \n" /* restore in2 address*/ + "sub %[in3], %[in3], #16 \n" /* restore in3 address*/ + : [cnt] "+r"(cnt4), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "cc", "memory" + ); + } + // clang-format on + for (int r = 0; r < m_remain; ++r) { + float val0 = x_ptr[0] * in0_ptr[r]; + float val1 = x_ptr[1] * in1_ptr[r]; + float val2 = x_ptr[2] * in2_ptr[r]; + float val3 = x_ptr[3] * in3_ptr[r]; + y_ptr[r] += val0 + val1 + val2 + val3; + } + } + } + int cnt4 = M >> 2; + int remain = M & 3; + //! do reduction + int rdc_ths = valid_ths >> 1; + while (rdc_ths > 0) { +#pragma omp parallel for + for (int t = 0; t < rdc_ths; ++t) { + float *y0 = y_buf + t * M; + for (int i = t + rdc_ths; i < valid_ths; i += rdc_ths) { + float *y0_ptr = y0; + float *y_ptr = y_buf + i * M; + for (int j = 0; j < cnt4; ++j) { + float32x4_t val0 = vld1q_f32(y0_ptr + j * 4); + float32x4_t val1 = vld1q_f32(y_ptr + j * 4); + float32x4_t val = vaddq_f32(val0, val1); + vst1q_f32(y0_ptr + j * 4, val); + } + y0_ptr += cnt4 * 4; + y_ptr += cnt4 * 4; + for (int j = 0; j < remain; ++j) { + y0_ptr[j] += y_ptr[j]; + } + } + } + valid_ths = rdc_ths; + rdc_ths = rdc_ths >> 1; + } + if (flag_relu) { + float *in_y = y_buf; + float32x4_t vzero = vdupq_n_f32(0.f); + if (cnt4 > 0) { + int cnt = cnt4; + asm volatile( + "ld1 {v0.4s}, [%[in_y]], #16 \n" /* load y to v0 */ + "1:\n" + "fmax v1.4s, v0.4s, %[vzero].4s \n" /* v0 relu */ + "ld1 {v0.4s}, [%[in_y]], #16 \n" /* load y to v0 */ + "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ + "st1 {v1.4s}, [%[out_y]], #16 \n" /* store v1 to y */ + "bne 1b \n" /* branch to label 1*/ + "sub %[in_y], %[in_y], #16 \n" /* restore in_y */ + : [cnt] "+r"(cnt), [in_y] "+r"(in_y), [out_y] "+r"(y) + : [vzero] "w"(vzero) + : "v0", "v1", "cc", "memory"); + } + for (int r = 0; r < remain; ++r) { + y[r] = in_y[r] > 0.f ? 
in_y[r] : 0.f; + } + } else { + memcpy(y, y_buf, M * sizeof(float)); + } +} +#else +void sgemv_trans(const int M, + const int N, + const float *A, + const float *x, + float *y, + bool flag_bias, + const float *bias, + bool flag_relu, + const ARMContext *ctx) { + int m_cnt8 = M >> 3; + int m_cnt4 = (M & 7) >> 2; + int m_remain = M & 7 & 3; + int ths = ctx->threads(); + int valid_ths = std::min((N + 3) / 4, ths); + int valid_block = std::max(4, (N / valid_ths + 3) / 4 * 4); + valid_ths = (N + valid_block - 1) / valid_block; + int block_cnt = valid_block / 4; + float zero_buf[M]; // NOLINT + float y_buf[valid_ths * M]; // NOLINT + memset(zero_buf, 0, M * sizeof(float)); + if (flag_bias) { + memcpy(y_buf, bias, M * sizeof(float)); + memset(y_buf + M, 0, (valid_ths - 1) * M * sizeof(float)); + } else { + memset(y_buf, 0, valid_ths * M * sizeof(float)); + } +#pragma omp parallel for + for (int t = 0; t < valid_ths; ++t) { + float *block_y = y_buf + t * M; + const float *block_x = x + t * valid_block; + const float *block_A = A + t * valid_block * M; + for (int i = 0; i < block_cnt; ++i) { + float *y_ptr = block_y; + const float *x_ptr = block_x + i * 4; + const float *in0_ptr = block_A + i * 4 * M; + const float *in1_ptr = in0_ptr + M; + const float *in2_ptr = in1_ptr + M; + const float *in3_ptr = in2_ptr + M; + int offset = t * valid_block + (i + 1) * 4 - N; + if (offset > 0) { + if (offset > 3) { + in0_ptr = zero_buf; + in1_ptr = zero_buf; + in2_ptr = zero_buf; + in3_ptr = zero_buf; + } else { + switch (offset) { + case 3: + in1_ptr = zero_buf; + case 2: + in2_ptr = zero_buf; + case 1: + in3_ptr = zero_buf; + default: + break; + } + } + } + // clang-format off + if (m_cnt8 > 0) { + int cnt8 = m_cnt8; + asm volatile( + "vld1.32 {d4-d5}, [%[x]] \n" /* load x to q2 */ + "vld1.32 {d6-d9}, [%[in0]]! \n" /* load in0 to q3, q4 */ + "vld1.32 {d10-d13},[%[in1]]! \n" /* load in1 to q5, q6 */ + "vld1.32 {d14-d17},[%[in2]]! \n" /* load in2 to q7, q8 */ + "vld1.32 {d18-d21},[%[in3]]! \n" /* load in3 to q9, q10*/ + "1:\n" + "vld1.32 {d0-d3}, [%[y]] \n" /* load y to q0, q1 */ + "vmla.f32 q0, q3, d4[0] \n" /* q0 += q3 * q2[0] */ + "vmla.f32 q1, q4, d4[0] \n" /* q1 += q4 * q2[0] */ + "pld [%[in0]] \n" /* preload in0 */ + "vld1.32 {d6-d9}, [%[in0]]! \n" /* load in0 to q3, q4 */ + "vmla.f32 q0, q5, d4[1] \n" /* q0 += q5 * q2[1] */ + "vmla.f32 q1, q6, d4[1] \n" /* q1 += q6 * q2[1] */ + "pld [%[in1]] \n" /* preload in1 */ + "vld1.32 {d10-d13},[%[in1]]! \n" /* load in0 to q5, q6 */ + "vmla.f32 q0, q7, d5[0] \n" /* q0 += q7 * q2[2] */ + "vmla.f32 q1, q8, d5[0] \n" /* q1 += q8 * q2[2] */ + "pld [%[in2]] \n" /* preload in2 */ + "vld1.32 {d14-d17},[%[in2]]! \n" /* load in0 to q7, q8 */ + "vmla.f32 q0, q9, d5[1] \n" /* q0 += q9 * q2[3] */ + "vmla.f32 q1, q10, d5[1] \n" /* q1 += q10 * q2[3] */ + "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ + "pld [%[in3]] \n" /* preload in3 */ + "vst1.32 {d0-d3}, [%[y]]! \n" /* store q0, q1 to y */ + "vld1.32 {d18-d21},[%[in3]]! 
\n" /* load in0 to q9, q10*/ + "pld [%[y], #32] \n" /* preload y */ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #32 \n" /* restore in0 address */ + "sub %[in1], %[in1], #32 \n" /* restore in1 address */ + "sub %[in2], %[in2], #32 \n" /* restore in2 address */ + "sub %[in3], %[in3], #32 \n" /* restore in3 address */ + : [cnt] "+r"(cnt8), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", + "q7", "q8", "q9", "q10", "cc", "memory" + ); + } + if (m_cnt4 > 0) { + int cnt4 = m_cnt4; + asm volatile( + "vld1.32 {d2-d3}, [%[in0]]! \n" /* load in0 to q1 */ + "vld1.32 {d4-d5}, [%[in1]]! \n" /* load in1 to q2 */ + "vld1.32 {d6-d7}, [%[in2]]! \n" /* load in2 to q3 */ + "vld1.32 {d8-d9}, [%[in3]]! \n" /* load in3 to q4 */ + "vld1.32 {d10-d11},[%[x]] \n" /* load x to q5 */ + "1:\n" + "vld1.32 {d0-d1}, [%[y]] \n" /* load y to q0 */ + "vmla.f32 q0, q1, d10[0] \n" /* q0 += q1 * q5[0] */ + "pld [%[in0]] \n" /* preload in0 */ + "vld1.32 {d2-d3}, [%[in0]]! \n" /* load in0 to q1 */ + "vmla.f32 q0, q2, d10[1] \n" /* q0 += q2 * q5[1] */ + "pld [%[in1]] \n" /* preload in1 */ + "vld1.32 {d4-d5}, [%[in1]]! \n" /* load in0 to q2 */ + "vmla.f32 q0, q3, d11[0] \n" /* q0 += q3 * q5[2] */ + "pld [%[in2]] \n" /* preload in2 */ + "vld1.32 {d6-d7}, [%[in2]]! \n" /* load in0 to q3 */ + "vmla.f32 q0, q4, d11[1] \n" /* q0 += q4 * q5[3] */ + "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ + "pld [%[in3]] \n" /* preload in3 */ + "vst1.32 {d0-d1}, [%[y]]! \n" /* store q0 to y */ + "vld1.32 {d8-d9}, [%[in3]]! \n" /* load in0 to q4 */ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #16 \n" /* restore in0 address*/ + "sub %[in1], %[in1], #16 \n" /* restore in1 address*/ + "sub %[in2], %[in2], #16 \n" /* restore in2 address*/ + "sub %[in3], %[in3], #16 \n" /* restore in3 address*/ + : [cnt] "+r"(cnt4), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "q0", "q1", "q2", "q3", "q4", "q5", "cc", "memory" + ); + } + // clang-format on + for (int r = 0; r < m_remain; ++r) { + float val0 = x_ptr[0] * in0_ptr[r]; + float val1 = x_ptr[1] * in1_ptr[r]; + float val2 = x_ptr[2] * in2_ptr[r]; + float val3 = x_ptr[3] * in3_ptr[r]; + y_ptr[r] += val0 + val1 + val2 + val3; + } + } + } + //! 
do reduction + int rdc_ths = valid_ths >> 1; + while (rdc_ths > 0) { +#pragma omp parallel for + for (int t = 0; t < rdc_ths; ++t) { + float *y0 = y_buf + t * M; + for (int i = t + rdc_ths; i < valid_ths; i += rdc_ths) { + float *y0_ptr = y0; + float *y_ptr = y_buf + i * M; + for (int j = 0; j < m_cnt8; ++j) { + float32x4_t val00 = vld1q_f32(y0_ptr + j * 8); + float32x4_t val01 = vld1q_f32(y0_ptr + j * 8 + 4); + float32x4_t val10 = vld1q_f32(y_ptr + j * 8); + float32x4_t val11 = vld1q_f32(y_ptr + j * 8 + 4); + float32x4_t val0 = vaddq_f32(val00, val10); + float32x4_t val1 = vaddq_f32(val01, val11); + vst1q_f32(y0_ptr + j * 8, val0); + vst1q_f32(y0_ptr + j * 8 + 4, val1); + } + y0_ptr += m_cnt8 * 8; + y_ptr += m_cnt8 * 8; + for (int j = 0; j < m_cnt4; ++j) { + float32x4_t val0 = vld1q_f32(y0_ptr + j * 4); + float32x4_t val1 = vld1q_f32(y_ptr + j * 4); + float32x4_t val = vaddq_f32(val0, val1); + vst1q_f32(y0_ptr + j * 4, val); + } + y0_ptr += m_cnt4 * 4; + y_ptr += m_cnt4 * 4; + for (int j = 0; j < m_remain; ++j) { + y0_ptr[j] += y_ptr[j]; + } + } + } + valid_ths = rdc_ths; + rdc_ths = rdc_ths >> 1; + } + if (flag_relu) { + float *in_y = y_buf; + float32x4_t vzero = vdupq_n_f32(0.f); + if (m_cnt8 > 0) { + int cnt8 = m_cnt8; + asm volatile( + "vld1.32 {d0-d3}, [%[in_y]]! \n" /* load y to q0, q1 */ + "1:\n" + "vmax.f32 q2, q0, %q[vzero] \n" /* q0 relu */ + "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ + "vmax.f32 q3, q1, %q[vzero] \n" /* q1 relu */ + "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ + "vst1.32 {d4-d7}, [%[out_y]]! \n" /* store q0, q1 to y*/ + "vld1.32 {d2-d3}, [%[in_y]]! \n" /* load y to q0 */ + "bne 1b \n" /* branch to label 1*/ + "sub %[in_y], %[in_y], #32 \n" /* restore in_y */ + : [cnt] "+r"(cnt8), [in_y] "+r"(in_y), [out_y] "+r"(y) + : [vzero] "w"(vzero) + : "q0", "q1", "q2", "q3", "cc", "memory"); + } + if (m_cnt4 > 0) { + int cnt4 = m_cnt4; + asm volatile( + "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ + "1:\n" + "vmax.f32 q1, q0, %q[vzero] \n" /* q0 relu */ + "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ + "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ + "vst1.32 {d2-d3}, [%[out_y]]! \n" /* store q1 to y */ + "bne 1b \n" /* branch to label 1*/ + "sub %[in_y], %[in_y], #16 \n" /* restore in_y */ + : [cnt] "+r"(cnt4), [in_y] "+r"(in_y), [out_y] "+r"(y) + : [vzero] "w"(vzero) + : "q0", "q1", "cc", "memory"); + } + for (int r = 0; r < m_remain; ++r) { + y[r] = in_y[r] > 0.f ? in_y[r] : 0.f; + } + } else { + memcpy(y, y_buf, M * sizeof(float)); + } +} +#endif // __aarch64__ bool sgemv(const float *A, const float *x, @@ -59,33 +549,34 @@ bool sgemv(const float *A, int N, bool is_bias, const float *bias, - bool is_relu) { + bool is_relu, + const ARMContext *ctx) { if (transA) { - LOG(ERROR) << " sgemv, transA is not supported now"; - return false; - } - if (is_bias) { - //! with bias - if (is_relu) { - //! with relu - sgemv_bias_relu(transA, M, N, A, x, y, bias); - } else { - //! without relu - sgemv_bias(transA, M, N, A, x, y, bias); - } + sgemv_trans(M, N, A, x, y, is_bias, bias, is_relu, ctx); } else { - //! without bias - if (is_relu) { - //! with relu - sgemv_relu(transA, M, N, A, x, y); + if (is_bias) { + //! with bias + if (is_relu) { + //! with relu + sgemv_bias_relu(transA, M, N, A, x, y, bias); + } else { + //! without relu + sgemv_bias(transA, M, N, A, x, y, bias); + } } else { - //! without relu - sgemv(transA, M, N, A, x, y); + //! without bias + if (is_relu) { + //! with relu + sgemv_relu(transA, M, N, A, x, y); + } else { + //! 
without relu + sgemv(transA, M, N, A, x, y); + } } } return true; } - +// clang-format off //! define compute kernel #ifdef __aarch64__ #define SGEMV_IN_8 \ @@ -179,8 +670,8 @@ bool sgemv(const float *A, "fmla v5.4s, v9.4s, v21.4s \n" /* mul + add*/ \ "fmla v6.4s, v9.4s, v23.4s \n" /* mul + add*/ \ "fmla v7.4s, v9.4s, v25.4s \n" /* mul + add*/ \ - "bne 1b \n" /* jump to main loop */ /* pair add to final \ - result */ \ + "bne 1b \n" /* jump to main loop */ \ + /* pair add to final result */ \ "2: \n" /* reduce to scale */ \ "faddp v16.4s, v0.4s, v0.4s\n" /* pair add to vector */ \ "faddp s8, v16.2s \n" /* pair add to scale */ \ @@ -231,8 +722,8 @@ bool sgemv(const float *A, "fmla v0.4s, v8.4s, v10.4s \n" /* mul + add*/ \ "subs %w[cnt], %w[cnt], #1 \n" /* sub main loop count */ \ "fmla v1.4s, v9.4s, v11.4s \n" /* mul + add*/ \ - "bne 1b \n" /* jump to main loop */ /* pair add to final \ - result */ \ + "bne 1b \n" /* jump to main loop */ \ + /* pair add to final result */ \ "2: \n" /* reduce to scale */ \ "fadd v9.4s, v0.4s, v1.4s \n" /* add 2 vector */ \ "faddp v10.4s, v9.4s, v9.4s\n" /* pair add to vector */ \ @@ -283,7 +774,7 @@ bool sgemv(const float *A, "fmax s8, s8, s0 \n" /* relu */ \ "str s8, [%[out]] \n" /* save result */ -#else //__aarch64__ +#else // __aarch64__ #define SGEMV_IN_4 \ "pld [%[in]] @ preload cache line, input\n" \ @@ -349,8 +840,8 @@ bool sgemv(const float *A, "vmla.f32 q1, q5, q9 @ mul add\n" \ "vmla.f32 q2, q5, q11 @ mul add\n" \ "vmla.f32 q3, q5, q13 @ mul add\n" \ - "bne 1b @ jump to main loop\n" /* pair add to final \ - result */ \ + "bne 1b @ jump to main loop\n" \ + /* pair add to final result */ \ "2: @ pair add \n" \ "vpadd.f32 d8, d0, d1 @ pair add, first step\n" \ "vpadd.f32 d9, d2, d3 @ pair add, first step\n" \ @@ -382,13 +873,10 @@ bool sgemv(const float *A, "vmla.f32 q0, q12, q14 @ mul add\n" \ "vmla.f32 q0, q13, q15 @ mul add\n" \ "subs %[cnt] , #1 @ sub loop count \n" \ - "bne 1b @ jump to main loop\n" /* pair add to \ - final result \ - */ \ + "bne 1b @ jump to main loop\n" \ "2: @ end processing\n" \ "vpadd.f32 d2, d0, d1 @ pair add, first step\n" \ - "vpadd.f32 d0, d2, d2 @ pair add, final step\n" /* check tails \ - */ \ + "vpadd.f32 d0, d2, d2 @ pair add, final step\n"/*check tails*/ \ "cmp %[tail], #1 @ check whether has mid cols\n" \ "blt 4f @ jump to end\n" \ "3: @ tail loop\n" \ @@ -422,7 +910,7 @@ bool sgemv(const float *A, "vmax.f32 d0, d0, d1 @ relu\n" \ "vst1.32 {d0[0]}, [%[out]] @ save result\n" #endif - +// clang-format on void sgemv(const bool transA, const int M, const int N, @@ -523,7 +1011,7 @@ void sgemv(const bool transA, [tmp4] "r"(tmp4) : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); } -#else //__aarch64__ +#else // __aarch64__ int out_cnt = M >> 2; #pragma omp parallel for for (int j = 0; j < out_cnt; j++) { @@ -579,7 +1067,7 @@ void sgemv(const bool transA, : [out] "r"(ptr_out) : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); } -#endif //__aarch64__ +#endif // __aarch64__ } void sgemv_relu(const bool transA, @@ -671,7 +1159,7 @@ void sgemv_relu(const bool transA, : [out] "r"(ptr_out) : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); } -#else //__aarch64__ +#else // __aarch64__ int out_cnt = M >> 2; #pragma omp parallel for for (int j = 0; j < out_cnt; j++) { @@ -727,7 +1215,7 @@ void sgemv_relu(const bool transA, : [out] "r"(ptr_out) : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); } -#endif //__aarch64__ +#endif // __aarch64__ } void sgemv_bias(const bool 
transA, @@ -822,7 +1310,7 @@ void sgemv_bias(const bool transA, : [out] "r"(ptr_out), [bias0] "r"(bias0) : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); } -#else //__aarch64__ +#else // __aarch64__ int out_cnt = M >> 2; #pragma omp parallel for for (int j = 0; j < out_cnt; j++) { @@ -887,7 +1375,7 @@ void sgemv_bias(const bool transA, : [out] "r"(ptr_out), [bias0] "r"(bias0) : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); } -#endif //__aarch64__ +#endif // __aarch64__ } void sgemv_bias_relu(const bool transA, @@ -980,7 +1468,7 @@ void sgemv_bias_relu(const bool transA, : [out] "r"(ptr_out), [bias0] "r"(bias0) : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); } -#else //__aarch64__ +#else // __aarch64__ int out_cnt = M >> 2; #pragma omp parallel for for (int j = 0; j < out_cnt; j++) { @@ -1045,7 +1533,7 @@ void sgemv_bias_relu(const bool transA, : [out] "r"(ptr_out), [bias0] "r"(bias0) : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); } -#endif //__aarch64__ +#endif // __aarch64__ } } // namespace math diff --git a/lite/backends/arm/math/sgemv.h b/lite/backends/arm/math/sgemv.h index 4d74006f93..aa17349c99 100644 --- a/lite/backends/arm/math/sgemv.h +++ b/lite/backends/arm/math/sgemv.h @@ -15,6 +15,8 @@ #pragma once #include +#include "lite/core/context.h" +#include "lite/core/device_info.h" namespace paddle { namespace lite { @@ -28,9 +30,10 @@ bool sgemv(const float* A, bool transA, int M, int N, - bool is_bias = false, - const float* bias = nullptr, - bool is_relu = false); + bool is_bias, + const float* bias, + bool is_relu, + const ARMContext* ctx); } // namespace math } // namespace arm diff --git a/lite/backends/cuda/CMakeLists.txt b/lite/backends/cuda/CMakeLists.txt index a6c3fcc66a..f73b4120e6 100644 --- a/lite/backends/cuda/CMakeLists.txt +++ b/lite/backends/cuda/CMakeLists.txt @@ -1,8 +1,7 @@ if(NOT LITE_WITH_CUDA) return() endif() -set(cuda_static_deps cudnn_static cublas_static curand_static - culibos_static cudart_static) +get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES) nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_static_deps}) nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_static_deps}) diff --git a/lite/backends/cuda/cuda_utils.h b/lite/backends/cuda/cuda_utils.h index 13bf8190ef..9da70262f5 100644 --- a/lite/backends/cuda/cuda_utils.h +++ b/lite/backends/cuda/cuda_utils.h @@ -56,6 +56,15 @@ CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << CudnnGetErrorInfo(status); \ } +const int CUDA_NUM_THREADS = 512; +// CUDA: number of blocks for threads. 
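+// CUDA_GET_BLOCKS below is a ceiling division, i.e. it returns
+// ceil(N / CUDA_NUM_THREADS) (or ceil(N / base) for the two-argument form),
+// so a 1-D launch covers all N elements with one thread each. A typical,
+// purely illustrative launch would look like:
+//   some_kernel<<<CUDA_GET_BLOCKS(n), CUDA_NUM_THREADS, 0, stream>>>(n, ...);
+// where some_kernel, n and stream are placeholders, not symbols added here.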
+inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} +inline int CUDA_GET_BLOCKS(const int N, const int base) { + return (N + base - 1) / base; +} + namespace paddle { namespace lite { namespace cuda { diff --git a/lite/backends/cuda/math/CMakeLists.txt b/lite/backends/cuda/math/CMakeLists.txt index a5ee25643b..fafd74ae7a 100644 --- a/lite/backends/cuda/math/CMakeLists.txt +++ b/lite/backends/cuda/math/CMakeLists.txt @@ -2,8 +2,7 @@ if(NOT LITE_WITH_CUDA) return() endif() -set(cuda_static_deps cudnn_static cublas_static curand_static - culibos_static cudart_static) +get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES) nv_library(cuda_activation SRCS activation.cu DEPS ${cuda_static_deps}) nv_library(cuda_scale SRCS scale.cu DEPS ${cuda_static_deps}) @@ -12,6 +11,9 @@ nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps}) nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans ${cuda_static_deps}) nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps}) +nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps}) +nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps}) +nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps}) set ( math_cuda @@ -21,6 +23,9 @@ set ( cuda_type_trans cuda_transpose cuda_elementwise + cudnn_pool + cuda_gemm + cuda_batched_gemm ) set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda") diff --git a/lite/backends/cuda/math/batched_gemm.cc b/lite/backends/cuda/math/batched_gemm.cc new file mode 100644 index 0000000000..e815109276 --- /dev/null +++ b/lite/backends/cuda/math/batched_gemm.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/math/batched_gemm.h" +#include +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template <> +bool BatchedGemm::init(const bool trans_a, + const bool trans_b, + const int max_batch_size, + Context *ctx) { + if (cu_handle_ == nullptr) { + this->exe_stream_ = ctx->exec_stream(); + CUBLAS_CALL(cublasCreate(&cu_handle_)); + CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_)); + } + cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N; + cudaMalloc(reinterpret_cast(&A_), + 3 * max_batch_size * sizeof(float *)); + return true; +} + +template <> +bool BatchedGemm::run(const float alpha, + const float beta, + const float *a[], + const float *b[], + float *c[], + const int m, + const int n, + const int k, + const int batch_size) { + CHECK(a != nullptr); + CHECK(b != nullptr); + CHECK(c != nullptr); + lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m; + ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? 
n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + cudaMemcpyAsync(A_, + a, + batch_size * sizeof(const float *), + cudaMemcpyHostToDevice, + exe_stream_); + cudaMemcpyAsync(A_ + batch_size, + b, + batch_size * sizeof(const float *), + cudaMemcpyHostToDevice, + exe_stream_); + cudaMemcpyAsync(A_ + batch_size * 2, + c, + batch_size * sizeof(float *), + cudaMemcpyHostToDevice, + exe_stream_); + CUBLAS_CALL(cublasSgemmBatched(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + const_cast(A_ + batch_size), + ldb_, + const_cast(A_), + lda_, + &beta, + A_ + batch_size * 2, + ldc_, + batch_size)); + return true; +} + +template <> +bool BatchedGemm::run(const float alpha, + const float beta, + const float *a[], + const int m, + const int n, + const int k, + const int batch_size) { + CHECK(a != nullptr); + lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m; + ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + cudaMemcpyAsync(A_, + a, + 3 * batch_size * sizeof(const float *), + cudaMemcpyDefault, + exe_stream_); + CUBLAS_CALL(cublasSgemmBatched(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + const_cast(A_ + batch_size), + ldb_, + const_cast(A_), + lda_, + &beta, + A_ + batch_size * 2, + ldc_, + batch_size)); + return true; +} + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/batched_gemm.h b/lite/backends/cuda/math/batched_gemm.h new file mode 100644 index 0000000000..2b91d3a524 --- /dev/null +++ b/lite/backends/cuda/math/batched_gemm.h @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
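+// Usage sketch (names are illustrative, not defined by this header): the
+// wrapper packs the host-side A/B/C pointer arrays into one device buffer
+// with cudaMemcpyAsync and then issues a single cublasSgemmBatched call,
+// swapping the operand order (B before A, with dims n, m, k) so that
+// row-major buffers map onto column-major cuBLAS.
+//   lite::cuda::math::BatchedGemm<float, float> bgemm;
+//   bgemm.init(false, false, max_batch, &ctx);   // ctx: CUDA context
+//   bgemm.run(1.f, 0.f, a_ptrs, b_ptrs, c_ptrs, m, n, k, batch);
+// a_ptrs/b_ptrs/c_ptrs are host arrays of device pointers, each of size
+// at least batch.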
+ +#pragma once +#include +#include +#include +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +class BatchedGemm { + public: + BatchedGemm() : cu_handle_(nullptr) {} + ~BatchedGemm() { + if (A_ != nullptr) { + cudaFree(A_); + } + } + + bool init(const bool trans_a, + const bool trans_b, + const int max_batch_size, + Context* ctx); + + bool run(const PtypeOut alpha, + const PtypeOut beta, + const PtypeIn* a[], + const PtypeIn* b[], + PtypeOut* c[], + const int m, + const int n, + const int k, + const int batch_size); + + bool run(const PtypeOut alpha, + const PtypeOut beta, + const PtypeIn* a[], + const int m, + const int n, + const int k, + const int batch_size); + + private: + cudaStream_t exe_stream_; + cublasHandle_t cu_handle_; + cublasOperation_t cu_trans_a_; + cublasOperation_t cu_trans_b_; + int m_{-1}; + int n_{-1}; + int k_{-1}; + int lda_{-1}; + int ldb_{-1}; + int ldc_{-1}; + PtypeIn** A_{nullptr}; +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/cudnn_conv.cc b/lite/backends/cuda/math/cudnn_conv.cc index 72ed3951f6..a4f33f467f 100644 --- a/lite/backends/cuda/math/cudnn_conv.cc +++ b/lite/backends/cuda/math/cudnn_conv.cc @@ -31,6 +31,9 @@ bool CudnnConv2D::create(const operators::ConvParam& param, auto o_dims = param.output->dims(); int batch = x_dims[0]; + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int iw = x_dims[3]; // nchw int ih = x_dims[2]; int ic = x_dims[1]; @@ -41,10 +44,10 @@ bool CudnnConv2D::create(const operators::ConvParam& param, int kh = w_dims[2]; int sw = param.strides[1]; int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; + int pw = paddings[2]; + int ph = paddings[0]; + int dw = dilations[1]; + int dh = dilations[0]; CHECK(ic % param.groups == 0) << "The conv input channel shoud be divide group number."; @@ -133,8 +136,8 @@ bool CudnnConv2D::create(const operators::ConvParam& param, this->fwd_algo_ = algo_cache.GetAlgorithm(x_dims.Vectorize(), w_dims.Vectorize(), param.strides, - param.paddings, - param.dilations, + *param.paddings, + *param.dilations, 0, search_func); @@ -311,12 +314,15 @@ bool CudnnConv2DInt8::create(const operators::ConvParam& param, int kw = w_dims[2]; int kh = w_dims[1]; + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int sw = param.strides[1]; int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; + int pw = paddings[2]; + int ph = paddings[0]; + int dw = dilations[1]; + int dh = dilations[0]; std::vector weight_scale = param.weight_scale; float input_scale = param.input_scale; diff --git a/lite/backends/cuda/math/cudnn_pool.cc b/lite/backends/cuda/math/cudnn_pool.cc new file mode 100644 index 0000000000..f970fc326b --- /dev/null +++ b/lite/backends/cuda/math/cudnn_pool.cc @@ -0,0 +1,159 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/math/cudnn_pool.h" +#include "lite/backends/cuda/math/activation.h" +#include "lite/backends/cuda/math/scale.h" +#include "lite/backends/cuda/math/type_trans.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +inline void UpdatePadding(std::vector* paddings, + const bool global_pooling, + const bool adaptive, + const std::vector& data_dims, + const std::vector& strides, + const std::vector& ksize) { + if (paddings->size() == data_dims.size()) { + for (size_t i = 0; i < data_dims.size(); ++i) { + int copy_pad = *(paddings->begin() + 2 * i); + paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); + } + } else { + CHECK(data_dims.size() * 2 == paddings->size()) + << "Paddings size should be the same or twice as the pooling size."; + } + if (global_pooling || adaptive) { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } +} + +inline void UpdateKsize(std::vector* ksize, + const std::vector& data_dims) { + ksize->resize(static_cast(data_dims.size())); + for (size_t i = 0; i < ksize->size(); ++i) { + *(ksize->begin() + i) = static_cast(data_dims[i]); + } +} + +template <> +bool CudnnPool2DNHWC::create( + const operators::PoolParam& param, Context* ctx) { + return true; +} + +template <> +bool CudnnPool2DNHWC::init(const operators::PoolParam& param, + Context* ctx) { + this->stream_ = ctx->exec_stream(); + CUDNN_CHECK(cudnnCreate(&this->handle_)); + CUDNN_CHECK(cudnnSetStream(this->handle_, this->stream_)); + + cudnnCreateTensorDescriptor(&this->input_desc_); + cudnnCreateTensorDescriptor(&this->output_desc_); + cudnnCreatePoolingDescriptor(&this->pooling_desc_); + + return create(param, ctx); +} + +template <> +bool CudnnPool2DNHWC::run( + const operators::PoolParam& param) { + auto x_dims = param.x->dims(); + auto o_dims = param.output->dims(); + int batch = x_dims[0]; + const float* in_data = param.x->data(); + float* out_data = param.output->mutable_data(TARGET(kCUDA)); + + int ih = x_dims[1]; + int iw = x_dims[2]; // nchw + int ic = x_dims[3]; + + int oh = o_dims[1]; + int ow = o_dims[2]; + int oc = o_dims[3]; + + std::vector ksize = param.ksize; + std::vector strides = param.strides; + std::vector paddings = *(param.paddings.get()); + + std::string pooling_type = param.pooling_type; + bool global_pooling = param.global_pooling; + bool exclusive = param.exclusive; + bool adaptive = param.adaptive; + + std::vector data_dims = {ih, iw}; + UpdatePadding(&paddings, global_pooling, adaptive, data_dims, strides, ksize); + + if (data_dims.size() * 2 == paddings.size()) { + for (size_t i = 0; i < data_dims.size(); ++i) { + paddings.erase(paddings.begin() + i + 1); + } + } + + if (global_pooling) { + UpdateKsize(&ksize, data_dims); + } + CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->input_desc_, + CUDNN_TENSOR_NHWC, + CUDNN_DATA_FLOAT, + batch, + ic, + ih, + iw)); + + CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->output_desc_, + CUDNN_TENSOR_NHWC, + CUDNN_DATA_FLOAT, + batch, + oc, + oh, + ow)); + cudnnPoolingMode_t mode; + if (pooling_type == "max") { + mode = CUDNN_POOLING_MAX; + } 
else { + mode = exclusive ? CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING + : CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + } + CUDNN_CHECK(cudnnSetPoolingNdDescriptor(this->pooling_desc_, + mode, + CUDNN_NOT_PROPAGATE_NAN, + ksize.size(), + ksize.data(), + paddings.data(), + strides.data())); + float alpha = 1.0f; + float beta = 0.0f; + CUDNN_CHECK(cudnnPoolingForward(this->handle_, + this->pooling_desc_, + &alpha, + this->input_desc_, + in_data, + &beta, + this->output_desc_, + out_data)); + + return true; +} + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/cudnn_pool.h b/lite/backends/cuda/math/cudnn_pool.h new file mode 100644 index 0000000000..acdc695b50 --- /dev/null +++ b/lite/backends/cuda/math/cudnn_pool.h @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +class CudnnPool2DBase { + public: + CudnnPool2DBase() + : handle_(NULL), + input_desc_(NULL), + output_desc_(NULL), + pooling_desc_(NULL) {} + + ~CudnnPool2DBase() { + if (handle_ != NULL) { + CUDNN_CHECK(cudnnDestroy(handle_)); + } + if (input_desc_) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(input_desc_)); + } + if (output_desc_) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(output_desc_)); + } + if (pooling_desc_) { + cudnnDestroyPoolingDescriptor(pooling_desc_); + } + } + + protected: + cudaStream_t stream_; + cudnnHandle_t handle_; + cudnnTensorDescriptor_t input_desc_; + cudnnTensorDescriptor_t output_desc_; + cudnnPoolingDescriptor_t pooling_desc_; +}; + +template +class CudnnPool2DNHWC : public CudnnPool2DBase { + public: + CudnnPool2DNHWC() : CudnnPool2DBase() {} + virtual ~CudnnPool2DNHWC() = default; + virtual bool init(const operators::PoolParam& param, + Context* ctx); + + virtual bool create(const operators::PoolParam& param, + Context* ctx); + + virtual bool run(const operators::PoolParam& param); +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/elementwise.cu b/lite/backends/cuda/math/elementwise.cu index 57c9ec022a..8f0ebd1f97 100644 --- a/lite/backends/cuda/math/elementwise.cu +++ b/lite/backends/cuda/math/elementwise.cu @@ -13,13 +13,55 @@ // limitations under the License. 
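+// The kernels in this hunk generalize elementwise add into a single
+// templated kernel driven by a BinaryOperation selector (kADD/kMUL/kDIV,
+// declared in utils.h), with y broadcast over the middle extent: for an
+// output laid out as pre x n x post, element tid reads y[tid / post % n].
+// A hedged, illustrative call to the host launcher added here:
+//   lite::cuda::math::elementwise(x_data, y_data, out_data,
+//                                 pre, n, post,
+//                                 lite::cuda::math::BinaryOperation::kADD,
+//                                 stream);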
#include "lite/backends/cuda/math/elementwise.h" -#include "lite/backends/cuda/math/utils.h" namespace paddle { namespace lite { namespace cuda { namespace math { +template +__global__ void elementwise_kernel(const size_t total, + const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < total) { + int idx = tid / post % n; +#if __CUDA_ARCH__ >= 350 + out_data[tid] = binary_calc(__ldg(x_data + tid), __ldg(y_data + idx), type); +#else + out_data[tid] = binary_calc(x_data[tid], y_data[idx], type); +#endif + } +} + +template +__global__ void elementwise_relu_kernel(const size_t total, + const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < total) { + int idx = tid / post % n; + Dtype temp; +#if __CUDA_ARCH__ >= 350 + temp = binary_calc(__ldg(x_data + tid), __ldg(y_data + idx), type); + +#else + temp = binary_calc(x_data[tid], y_data[idx], type); +#endif + out_data[tid] = temp > 0 ? temp : 0; + } +} + template __global__ void elementwise_add_kernel(const size_t total, const Dtype* x_data, @@ -76,6 +118,56 @@ __global__ void elementwise_add_nhwc4_int8_kernel(const size_t total, } } +template +void elementwise(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream) { + int num = pre * n * post; + int thread = 256; + int block = (num + thread - 1) / thread; + elementwise_kernel<<>>( + num, x_data, y_data, out_data, pre, n, post, type); +} + +template +void elementwise_relu(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream) { + int num = pre * n * post; + int thread = 256; + int block = (num + thread - 1) / thread; + elementwise_relu_kernel<<>>( + num, x_data, y_data, out_data, pre, n, post, type); +} + +template void elementwise(const float*, + const float*, + float*, + int, + int, + int, + BinaryOperation, + cudaStream_t); + +template void elementwise_relu(const float*, + const float*, + float*, + int, + int, + int, + BinaryOperation, + cudaStream_t); + template void elementwise_add(int num, const Dtype* x_data, diff --git a/lite/backends/cuda/math/elementwise.h b/lite/backends/cuda/math/elementwise.h index 7fcdf95021..ce45d0544e 100644 --- a/lite/backends/cuda/math/elementwise.h +++ b/lite/backends/cuda/math/elementwise.h @@ -15,12 +15,33 @@ #pragma once #include #include +#include "lite/backends/cuda/math/utils.h" namespace paddle { namespace lite { namespace cuda { namespace math { +template +void elementwise(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream); + +template +void elementwise_relu(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream); + template void elementwise_add(int num, const Dtype* x_data, diff --git a/lite/backends/cuda/math/gemm.cc b/lite/backends/cuda/math/gemm.cc new file mode 100644 index 0000000000..a9f12984aa --- /dev/null +++ b/lite/backends/cuda/math/gemm.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/math/gemm.h" +#include +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template <> +bool Gemm::init(const bool trans_a, + bool trans_b, + const int m, + const int n, + const int k, + Context *ctx) { + if (cu_handle_ == nullptr) { + this->exe_stream_ = ctx->exec_stream(); + CUBLAS_CALL(cublasCreate(&cu_handle_)); + CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_)); + } + lda_ = (!trans_a) ? k : m; + ldb_ = (!trans_b) ? n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N; + return true; +} + +template <> +bool Gemm::init(const bool trans_a, + bool trans_b, + const int m, + const int n, + const int k, + const int lda, + const int ldb, + const int ldc, + Context *ctx) { + if (cu_handle_ == nullptr) { + this->exe_stream_ = ctx->exec_stream(); + CUBLAS_CALL(cublasCreate(&cu_handle_)); + CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_)); + } + m_ = m; + n_ = n; + k_ = k; + lda_ = lda; + ldb_ = ldb; + ldc_ = ldc; + cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N; + return true; +} + +template <> +bool Gemm::run(const float alpha, + const float beta, + const float *a, + const float *b, + float *c, + Context *ctx) { + CUBLAS_CALL(cublasSgemm(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + b, + ldb_, + a, + lda_, + &beta, + c, + ldc_)); + return true; +} + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/gemm.h b/lite/backends/cuda/math/gemm.h new file mode 100644 index 0000000000..12194d54b0 --- /dev/null +++ b/lite/backends/cuda/math/gemm.h @@ -0,0 +1,74 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
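+// Usage sketch (illustrative names only): init() records the shapes and
+// leading dimensions, and run() performs C = A * B on row-major buffers by
+// calling cublasSgemm with the operands swapped and the (n, m, k) order,
+// i.e. it evaluates the column-major product B^T * A^T, whose memory layout
+// is exactly the row-major A * B.
+//   lite::cuda::math::Gemm<float, float> gemm;
+//   gemm.init(false, false, m, n, k, &ctx);   // ctx: CUDA context
+//   gemm.run(1.f, 0.f, d_a, d_b, d_c, &ctx);  // d_*: device pointers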
+ +#pragma once +#include +#include +#include +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +class Gemm { + public: + Gemm() : cu_handle_(nullptr) {} + ~Gemm() {} + bool init(const bool trans_a, + const bool trans_b, + const int m, + const int n, + const int k, + Context* ctx); + bool init(const bool trans_a, + const bool trans_b, + const int m, + const int n, + const int k, + const int lda, + const int ldb, + const int ldc, + Context* ctx); + + bool run(const PtypeOut alpha, + const PtypeOut beta, + const PtypeIn* a, + const PtypeIn* b, + PtypeOut* c, + Context* ctx); + + private: + cudaStream_t exe_stream_; + cublasHandle_t cu_handle_; + cublasOperation_t cu_trans_a_; + cublasOperation_t cu_trans_b_; + int m_{-1}; + int n_{-1}; + int k_{-1}; + int lda_{-1}; + int ldb_{-1}; + int ldc_{-1}; +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/utils.h b/lite/backends/cuda/math/utils.h index b4cd82fd8d..b6aa9c7d16 100644 --- a/lite/backends/cuda/math/utils.h +++ b/lite/backends/cuda/math/utils.h @@ -25,6 +25,24 @@ namespace lite { namespace cuda { namespace math { +enum class BinaryOperation { + kADD = 0, + kMUL = 1, + kDIV = 2, +}; + +template +__device__ T binary_calc(T x, T y, BinaryOperation type); + +template <> +__device__ __forceinline__ float binary_calc(float x, + float y, + BinaryOperation type) { + if (type == BinaryOperation::kADD) return x + y; + if (type == BinaryOperation::kMUL) return x * y; + if (type == BinaryOperation::kDIV) return x / y; +} + template __device__ T from_float(float x); diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp index fd17218d06..23332b422d 100644 --- a/lite/backends/fpga/KD/pes/conv_process.hpp +++ b/lite/backends/fpga/KD/pes/conv_process.hpp @@ -294,10 +294,17 @@ inline void split_filter_num(const ConvParam& c_param) { args.image.channels = input->shape().channel(); args.image.width = input->shape().width(); args.image.height = input->shape().height(); - args.image.pad_width = param.paddings[1]; + auto paddings = *param.padding; + args.image.pad_width = param.paddings[2]; args.image.pad_height = param.paddings[0]; args.output.address = out_address; args.output.scale_address = out_scale_address; + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (!pad_equal) { + LOG(FATA) << "This pad not support ! " << paddings[0] << ", " + << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; + } param.splitParams().push_back(conv_param); } } @@ -372,10 +379,18 @@ inline void split_channel(const ConvParam& c_param) { args.image.channels = conv_param->input.shape().channel(); args.image.width = conv_param->input.shape().width(); args.image.height = conv_param->input.shape().height(); - args.image.pad_width = param.paddings[1]; - args.image.pad_height = param.paddings[0]; + auto paddings = *param.paddings; + args.image.pad_width = paddings[2]; + args.image.pad_height = paddings[0]; + args.output.address = conv_param->output.mutableData(); args.output.scale_address = conv_param->output.scale(); + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (!pad_equal) { + LOG(FATA) << "This pad not support ! 
" << paddings[0] << ", " + << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; + } param.splitParams().push_back(conv_param); } } diff --git a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp old mode 100755 new mode 100644 index 9d7b9b544b..f86806102d --- a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp +++ b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp @@ -61,14 +61,21 @@ class DepthwiseConvPE : public PE { args.image.channels = input->shape().channel(); args.image.height = input->shape().height(); args.image.width = input->shape().width(); - args.image.pad_width = param.paddings[0]; - args.image.pad_height = param.paddings[1]; + auto paddings = *param.paddings; + args.image.pad_width = param.paddings[2]; + args.image.pad_height = param.paddings[0]; args.image.scale_address = input->scale(); args.output.address = output->data(); args.output.scale_address = output->scale(); args.out_width = param.output->shape().width(); args.out_height = param.output->shape().height(); args.sub_conv_num = 1; + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (!pad_equal) { + LOG(FATA) << "This pad not support ! " << paddings[0] << ", " + << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; + } param.args = args; inplace_.relu_enable = param_.relu.enabled; diff --git a/lite/backends/fpga/KD/pes/pooling_pe.hpp b/lite/backends/fpga/KD/pes/pooling_pe.hpp index fd3be1f463..5bb4f5285a 100644 --- a/lite/backends/fpga/KD/pes/pooling_pe.hpp +++ b/lite/backends/fpga/KD/pes/pooling_pe.hpp @@ -45,13 +45,14 @@ class PoolingPE : public PE { PoolingArgs args = {0}; args.mode = param_.type; + auto paddings = *param_.paddings; args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height)); args.image.address = input->data(); args.image.channels = input->shape().channel(); args.image.height = input->shape().height(); args.image.width = input->shape().width(); - args.image.pad_height = param_.paddings[0]; - args.image.pad_width = param_.paddings[1]; + args.image.pad_height = paddings[0]; + args.image.pad_width = paddings[2]; args.image.scale_address = input->scale(); args.output.address = output->mutableData(); args.output.scale_address = output->scale(); @@ -76,12 +77,13 @@ class PoolingPE : public PE { float* image_addr = float_input.mutableData(FP32, input->shape()); float_input.copyFrom(input); float16* data_out = output->data(); + auto paddings = *param_.paddings; int image_height = input->shape().height(); int image_width = input->shape().width(); int image_channels = input->shape().channel(); - int image_pad_h = param_.paddings[0]; - int image_pad_w = param_.paddings[1]; + int image_pad_h = paddings[0]; + int image_pad_w = paddings[2]; int kernel_height = param_.kernelSize[1]; int kernel_width = param_.kernelSize[0]; int kernel_step_h = param_.strides[0]; diff --git a/lite/backends/npu/builder.cc b/lite/backends/npu/builder.cc index ad5bed5be9..954fad8c91 100644 --- a/lite/backends/npu/builder.cc +++ b/lite/backends/npu/builder.cc @@ -142,21 +142,25 @@ ge::TensorPtr CvtTensor(lite::Tensor* in_tensor, int CvtActMode(std::string act_type) { int act_mode = 1; - if (act_type == "sigmod") { + if (act_type == "sigmoid") { act_mode = 0; } else if (act_type == "relu") { act_mode = 1; } else if (act_type == "tanh") { act_mode = 2; + } else if (act_type == "relu_clipped") { + act_mode = 3; } else if (act_type == "elu") { act_mode = 4; + } else if (act_type == "leaky_relu") { + act_mode = 5; } else if (act_type == 
"abs") { act_mode = 6; } else if (act_type == "softsign") { act_mode = 8; } else if (act_type == "softplus") { act_mode = 9; - } else if (act_type == "hardsigmoid") { + } else if (act_type == "hard_sigmoid") { act_mode = 10; } else { // TODO(hong19860320) support more activation mode diff --git a/lite/backends/npu/builder.h b/lite/backends/npu/builder.h index 02f7071a4e..70200354fb 100644 --- a/lite/backends/npu/builder.h +++ b/lite/backends/npu/builder.h @@ -31,117 +31,6 @@ // Extended Ops of HIAI DDK namespace ge { -/** - * Multiply the matrix x1 by the matrix x2 to generate x1 * x2. - * The inputs must be two-dimensional matrices and the inner dimension of "x1" - * (after being transposed if transpose_x1 is true) must match the outer - * dimension of "x2" (after being transposed if transposed_x2 is true). - * x : the first input tensor, must be non const op. - * w : the second input tensor, must be const op. - * bias: the optional bias tensor, must be const op. - * - * y : the output tensor. - * - * has_bias: If true, enable input bias. - */ -REG_OP(MatMul) - .INPUT(x, TensorType({DT_FLOAT})) - .INPUT(w, TensorType({DT_FLOAT})) - .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT})) // bias must be const input - .OUTPUT(y, TensorType({DT_FLOAT})) - .ATTR(has_bias, AttrValue::BOOL{false}) // when has input::bias,set true - .OP_END(); - -/** - * Computes the gradients of convolution with respect to the input. - * - * input_sizes : An integer vector representing the shape of input, - * where input is a 4-D [batch, height, width, channels] tensor. - * filter : the filter tensor, with shape [H , W, filter_channel, - * filter_number], filter_channel must be same as x channel. - * x : The input tensor. - * - * y : The output tensor. - * - * format: 0: NCHW. 1: NHWC - * group : 1: default - * num_output : 0: default, num_output must be equal to - * (filter_channel * group) - * pad : Padding for the beginning and ending along each axis - * stride : Stride along each axis. - * dilation : dilation value along each axis of the filter. - * pad_mode : 0:NOTSET, 5:VALID 6:SAME. defaul value is 0:NOTSET - * bias_term : 0: default - * kernel : The shape of the convolution kernel - */ -REG_OP(Deconvolution) - .INPUT(input_sizes, TensorType({DT_UINT8})) - .INPUT(filter, TensorType({DT_FLOAT})) - .INPUT(x, TensorType({DT_FLOAT})) - .OPTIONAL_INPUT(b, TensorType({DT_FLOAT})) - .OUTPUT(y, TensorType({DT_FLOAT})) - .ATTR(mode, AttrValue::INT{1}) - .ATTR(format, AttrValue::INT{1}) - .ATTR(group, AttrValue::INT{1}) - .ATTR(num_output, AttrValue::INT{0}) - .ATTR(pad, AttrValue::LIST_INT({0, 0, 0, 0})) - .ATTR(stride, AttrValue::LIST_INT({1, 1})) - .ATTR(dilation, AttrValue::LIST_INT({1, 1})) - .ATTR(pad_mode, AttrValue::INT{0}) - .ATTR(bias_term, AttrValue::INT{0}) - .ATTR(kernel, AttrValue::LIST_INT({0, 0})) - .OP_END(); - -/** - * Resize images to size using bilinear interpolation. - * - * x : The tensor of 4-D - * w : A int32 Tensor of 2 elements: [height, width]. - * - * y : the output tensor - * - * align_corners : If true, the centers of the 4 corner pixels of the - * input and output tensors are aligned, preserving the values at the corner - * pixels. - * output_dim_mode : Defaults 2, including 0: zoom_factor , 1: - * shrink_factor, 2: height/width. when output_dim_mode=2, the output-dim is - * controled by the [height, width] of w. - * shrink_factor : shrink factor. - * zoom_factor : zoom factor. - * pad_begin : begin of pad. - * pad_end : end of pad. 
- */ -REG_OP(ResizeBilinear) - .INPUT(x, TensorType({DT_FLOAT, DT_INT32})) - .INPUT(w, TensorType({DT_FLOAT, DT_INT32})) - .OUTPUT(y, TensorType({DT_FLOAT, DT_INT32})) - .ATTR(align_corners, AttrValue::BOOL{false}) - .ATTR(output_dim_mode, AttrValue::INT{2}) - .ATTR(shrink_factor, AttrValue::INT{1}) - .ATTR(zoom_factor, AttrValue::INT{1}) - .ATTR(pad_begin, AttrValue::INT{0}) - .ATTR(pad_end, AttrValue::INT{0}) - .OP_END(); - -/** - * Resize images to size using nearest neighbor interpolation. - * - * image : Resize images to size using nearest neighbor interpolation. - * size : Must be one dimension and two elements - * - * output : the output tensor - * - * align_corners : If true, the centers of the 4 corner pixels of the - * input and output tensors are aligned, preserving the values at the corner - * pixels. Defaults to false - */ -REG_OP(ResizeNearestNeighbor) - .INPUT(image, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL})) - .INPUT(size, TensorType({DT_INT32})) - .OUTPUT(output, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL})) - .ATTR(align_corners, AttrValue::BOOL{false}) - .OP_END(); - /** * Pads a tensor. * diff --git a/lite/backends/opencl/cl_wrapper.cc b/lite/backends/opencl/cl_wrapper.cc index 357ac8c2d6..93e176f9ed 100644 --- a/lite/backends/opencl/cl_wrapper.cc +++ b/lite/backends/opencl/cl_wrapper.cc @@ -75,7 +75,7 @@ void CLWrapper::InitFunctions() { do { \ cl_func##_ = (cl_func##Type)dlsym(handle_, #cl_func); \ if (cl_func##_ == nullptr) { \ - LOG(ERROR) << "Cannot find the " << #cl_func \ + LOG(FATAL) << "Cannot find the " << #cl_func \ << " symbol in libOpenCL.so!"; \ break; \ } \ diff --git a/lite/backends/x86/math/CMakeLists.txt b/lite/backends/x86/math/CMakeLists.txt index 2dea4364d5..a891076323 100644 --- a/lite/backends/x86/math/CMakeLists.txt +++ b/lite/backends/x86/math/CMakeLists.txt @@ -50,7 +50,8 @@ math_library(unpooling) math_library(vol2col) ## math_library(prelu) math_library(tree2col DEPS math_function) - +math_library(sequence_topk_avg_pooling) +math_library(search_fc DEPS blas dynload_mklml) # cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) # cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) # cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) diff --git a/lite/backends/x86/math/beam_search.cc b/lite/backends/x86/math/beam_search.cc index bbe35b4de5..8d61fb3bbb 100644 --- a/lite/backends/x86/math/beam_search.cc +++ b/lite/backends/x86/math/beam_search.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "lite/backends/x86/math/beam_search.h" #include +#include #include #include "lite/fluid/lod.h" diff --git a/lite/backends/x86/math/pooling.cc b/lite/backends/x86/math/pooling.cc index 9da239f9c6..ab6c1edb48 100644 --- a/lite/backends/x86/math/pooling.cc +++ b/lite/backends/x86/math/pooling.cc @@ -49,7 +49,7 @@ class Pool2dFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; @@ -130,7 +130,7 @@ class Pool2dGradFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; @@ -213,7 +213,7 @@ class MaxPool2dGradFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; @@ -629,7 +629,7 @@ class MaxPool2dWithIndexFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; diff --git a/lite/backends/x86/math/search_fc.cc b/lite/backends/x86/math/search_fc.cc new file mode 100644 index 0000000000..56fc363cb4 --- /dev/null +++ b/lite/backends/x86/math/search_fc.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/x86/math/search_fc.h" +#include +#include + +namespace paddle { +namespace lite { +namespace x86 { +namespace math { + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. 
+ */ +template +class SearchFcFunctor { + public: + void operator()(const lite::X86Context& context, + const lite::Tensor& bottom, + const lite::Tensor& w, + const lite::Tensor& b, + lite::Tensor* top, + int out_size) { + int batch = bottom.dims()[0]; + + int _out = w.dims()[0]; // 100 + int _in = w.dims()[1]; // 228 + + lite::DDim dims(std::vector({bottom.dims()[0], out_size})); + + const auto bottom_data = bottom.data(); + auto top_data = top->mutable_data(lite::TargetType::kX86); + const auto weights = w.data(); + auto blas = math::GetBlas(context); + call_gemm(blas, + CblasNoTrans, + CblasTrans, + batch, + _out, + _in, + 1.0f, + bottom_data, + weights, + 0.0f, + top_data); + if (true) { + const auto* bias_data = b.data(); + for (int i = 0; i < batch; ++i) { + // add bias here + sse_eltadd(top_data + i * _out, bias_data, top_data + i * _out, _out); + } + } + } + + // private: +}; + +#define DEFINE_FUNCTOR(type) \ + template class SearchFcFunctor; + +FOR_ALL_TYPES(DEFINE_FUNCTOR); + +} // namespace math +} // namespace x86 +} // namespace lite +} // namespace paddle diff --git a/lite/backends/x86/math/search_fc.h b/lite/backends/x86/math/search_fc.h new file mode 100644 index 0000000000..e415c39602 --- /dev/null +++ b/lite/backends/x86/math/search_fc.h @@ -0,0 +1,184 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "lite/backends/x86/math/blas.h" +#include "lite/backends/x86/mklml.h" +#include "lite/core/context.h" +#include "lite/core/tensor.h" +#include "lite/fluid/data_type.h" + +namespace paddle { +namespace lite { +namespace x86 { +namespace math { + +template +void call_gemm(const BlasT blas, + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const T* B, + const T beta, + T* C) { +#ifndef __NAIVE_GEMM__ + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +#else + naive::gemm((TransA == CblasTrans), + (TransB == CblasTrans), + M, + N, + K, + alpha, + A, + B, + beta, + C); +#endif // !__NAIVE_GEMM__ +} + +// To align with Lego +#ifndef LEGO_USE_FLOAT +#define LEGO_USE_FLOAT +#endif +#ifndef LEGO_SSE +#define LEGO_SSE +#endif + +#if defined(LEGO_USE_FLOAT) + +#define __m256x __m256 +#define __m128x __m128 + +static const unsigned int AVX_STEP_SIZE = 8; +static const unsigned int SSE_STEP_SIZE = 4; +static const unsigned int AVX_CUT_LEN_MASK = 7U; +static const unsigned int SSE_CUT_LEN_MASK = 3U; + +#define _mm256_setzero_px _mm256_setzero_ps +#define _mm256_mul_px _mm256_mul_ps +#define _mm256_add_px _mm256_add_ps +#define _mm256_load_px _mm256_loadu_ps +#define _mm256_hadd_px _mm256_hadd_ps +#define _mm256_permute2f128_px _mm256_permute2f128_ps +#define _mm256_store_px _mm256_storeu_ps +#define _mm256_broadcast_sx _mm256_broadcast_ss +#define _mm256_castpx256_px128 _mm256_castps256_ps128 +#define _mm256_max_px _mm256_max_ps +#define _mm256_sub_px _mm256_sub_ps +#define _mm256_set1_px _mm256_set1_ps +#define _mm256_sqrt_px _mm256_sqrt_ps +#define _mm256_div_px _mm256_div_ps +#define _mm_setzero_px _mm_setzero_ps +#define _mm_add_px _mm_add_ps +#define _mm_mul_px _mm_mul_ps +#define _mm_load_px _mm_loadu_ps +#define _mm_hadd_px _mm_hadd_ps +#define _mm_store_sx _mm_store_ss +#define _mm_store_px _mm_storeu_ps +#define _mm_load1_px _mm_load1_ps +#define _mm_max_px _mm_max_ps +#define _mm_sub_px _mm_sub_ps +#define _mm_set1_px _mm_set1_ps +#define _mm_sqrt_px _mm_sqrt_ps +#define _mm_div_px _mm_div_ps + +#elif defined(LEGO_USE_DOUBLE) + +#define __m256x __m256d +#define __m128x __m128d + +static const unsigned int AVX_STEP_SIZE = 4; +static const unsigned int SSE_STEP_SIZE = 2; +static const unsigned int AVX_CUT_LEN_MASK = 3U; +static const unsigned int SSE_CUT_LEN_MASK = 1U; + +#define _mm256_setzero_px _mm256_setzero_pd +#define _mm256_mul_px _mm256_mul_pd +#define _mm256_add_px _mm256_add_pd +#define _mm256_load_px _mm256_loadu_pd +#define _mm256_hadd_px _mm256_hadd_pd +#define _mm256_permute2f128_px _mm256_permute2f128_pd +#define _mm256_store_px _mm256_storeu_pd +#define _mm256_broadcast_sx _mm256_broadcast_sd +#define _mm256_castpx256_px128 _mm256_castpd256_pd128 +#define _mm256_max_px _mm256_max_pd +#define _mm256_sub_px _mm256_sub_pd +#define _mm256_set1_px _mm256_set1_pd +#define _mm256_sqrt_px _mm256_sqrt_pd +#define _mm256_div_px _mm256_div_pd +#define _mm_setzero_px _mm_setzero_pd +#define _mm_add_px _mm_add_pd +#define _mm_mul_px _mm_mul_pd +#define _mm_load_px _mm_loadu_pd +#define _mm_hadd_px _mm_hadd_pd +#define _mm_store_sx _mm_store_sd +#define _mm_store_px _mm_storeu_pd +#define _mm_load1_px _mm_load1_pd +#define _mm_max_px _mm_max_pd +#define _mm_sub_px _mm_sub_pd +#define _mm_set1_px _mm_set1_pd +#define _mm_sqrt_px _mm_sqrt_pd +#define _mm_div_px _mm_div_pd +#endif + +template +inline void sse_eltadd(const T* x, const T* y, T* z, size_t len) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px( + z + jjj, + _mm256_add_px(_mm256_load_px(x + jjj), _mm256_load_px(y + jjj))); + } +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(z + jjj, + _mm_add_px(_mm_load_px(x + jjj), _mm_load_px(y + jjj))); + } +#endif + for (; jjj < len; jjj++) { + z[jjj] = x[jjj] + 
y[jjj]; + } +} + +template +class SearchFcFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& X, + const lite::Tensor& W, + const lite::Tensor& b, + lite::Tensor* Out, + int out_size); +}; + +} // namespace math +} // namespace x86 +} // namespace lite +} // namespace paddle + +#define FOR_ALL_TYPES(macro) macro(float); diff --git a/lite/backends/x86/math/sequence_topk_avg_pooling.cc b/lite/backends/x86/math/sequence_topk_avg_pooling.cc new file mode 100644 index 0000000000..035a7923c7 --- /dev/null +++ b/lite/backends/x86/math/sequence_topk_avg_pooling.cc @@ -0,0 +1,151 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/x86/math/sequence_topk_avg_pooling.h" +#include +#include + +namespace paddle { +namespace lite { +namespace x86 { +namespace math { + +template +void get_topk_pos(const T* data, int length, int k, int* pos, bool debug) { + size_t real_k = k < length ? k : length; + + std::vector v(data, data + length); + + std::vector topk_pos; + T min_val = -10000000.0; + while (topk_pos.size() < real_k) { + T max_val = min_val; + int max_pos = -1; + for (int i = 0; i < length; ++i) { + if (v[i] > max_val) { + max_pos = i; + max_val = v[i]; + } + } + + assert(max_pos >= 0); + + topk_pos.push_back(max_pos); + v[max_pos] = min_val; + } + + assert(topk_pos.size() > 0); + while (topk_pos.size() < (size_t)k) { + topk_pos.push_back(-1); + } + + for (size_t i = 0; i < topk_pos.size(); ++i) { + pos[i] = topk_pos[i]; + } +} + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. 
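// ---------------------------------------------------------------------------
// A scalar reference sketch of what SearchFcFunctor above computes: W is
// stored as [out_size x in_size], so the GEMM call (CblasNoTrans, CblasTrans)
// yields top = bottom * W^T, and sse_eltadd then adds the bias to each row.
#include <vector>

inline void search_fc_ref(const std::vector<float>& bottom,  // [batch x in]
                          const std::vector<float>& w,       // [out x in]
                          const std::vector<float>& b,       // [out]
                          std::vector<float>* top,           // [batch x out]
                          int batch, int in, int out) {
  top->assign(static_cast<size_t>(batch) * out, 0.f);
  for (int i = 0; i < batch; ++i) {
    for (int o = 0; o < out; ++o) {
      float acc = b[o];
      for (int k = 0; k < in; ++k) {
        acc += bottom[i * in + k] * w[o * in + k];
      }
      (*top)[i * out + o] = acc;
    }
  }
}
// ---------------------------------------------------------------------------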
+ */ +template +class SequenceTopkAvgPoolingFunctor { + public: + void operator()(const lite::Tensor& in, + const lite::Tensor& row, + const lite::Tensor& col, + lite::Tensor* out, + lite::Tensor* pos, + int channel_num, + std::vector topks) { + auto k_num = topks.size(); + auto max_k = topks[topks.size() - 1]; + std::vector vec_pos_shape; + auto in_lod = in.lod()[0]; + auto row_lod = row.lod()[0]; + auto col_lod = col.lod()[0]; + int batch_size = row_lod.size() - 1; + int pos_total_size = row_lod[batch_size] * channel_num * max_k; + vec_pos_shape.push_back(pos_total_size); + lite::DDim dims(vec_pos_shape); + pos->Resize(dims); + auto pos_data = pos->mutable_data(lite::TargetType::kX86); + + int offset = 0; + std::vector vec_out_lod; + vec_out_lod.reserve(batch_size + 1); + for (int i = 0; i <= batch_size; ++i) { + offset = row_lod[i]; + vec_out_lod.push_back(offset); + } + + lite::LoD lod_temp; + lod_temp.push_back(vec_out_lod); + out->set_lod(lod_temp); + + auto in_data = in.data(); + auto out_data = out->mutable_data(lite::TargetType::kX86); + + T* sum_data = new T[max_k]; + for (int i = 0; i < batch_size; ++i) { + int total_size = in_lod[i + 1] - in_lod[i]; + int row_size = row_lod[i + 1] - row_lod[i]; + int col_size = col_lod[i + 1] - col_lod[i]; + + CHECK_EQ(total_size, channel_num * row_size * col_size) + << "size wrong in sequence_topk_avg_pooling_op!"; + + int feature_num = row_size * col_size; + for (int j = 0; j < channel_num; ++j) { + auto input_offset_feature_data = in_data + in_lod[i] + j * feature_num; + + for (int r = 0; r < row_size; ++r) { + auto row_data = input_offset_feature_data + r * col_size; + auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k + + r * channel_num * max_k + j * max_k; + auto out_slice_data = out_data + row_lod[i] * channel_num * k_num + + r * channel_num * k_num + j * k_num; + + get_topk_pos(row_data, col_size, max_k, pos_slice_data); + if (pos_slice_data[0] == -1) { + sum_data[0] = 0.0; + } else { + sum_data[0] = row_data[pos_slice_data[0]]; + } + for (int k = 1; k < max_k; ++k) { + if (pos_slice_data[k] == -1) { + sum_data[k] = sum_data[k - 1]; + } else { + sum_data[k] = sum_data[k - 1] + row_data[pos_slice_data[k]]; + } + } + for (size_t k = 0; k < k_num; ++k) { + out_slice_data[k] = sum_data[topks[k] - 1] / topks[k]; + } + } + } + } + delete[] sum_data; + } +}; + +#define DEFINE_FUNCTOR(type) \ + template class SequenceTopkAvgPoolingFunctor; + +FOR_ALL_TYPES(DEFINE_FUNCTOR); + +} // namespace math +} // namespace x86 +} // namespace lite +} // namespace paddle diff --git a/lite/backends/x86/math/sequence_topk_avg_pooling.h b/lite/backends/x86/math/sequence_topk_avg_pooling.h new file mode 100644 index 0000000000..78d458c4d8 --- /dev/null +++ b/lite/backends/x86/math/sequence_topk_avg_pooling.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
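// ---------------------------------------------------------------------------
// A worked example of the per-row computation in SequenceTopkAvgPoolingFunctor
// above: for one row of col_size scores, the functor emits, for every k in
// `topks`, the average of the k largest scores; positions padded with -1 by
// get_topk_pos simply repeat the previous prefix sum. With
// scores = {0.2, 0.9, 0.5, 0.1} and topks = {1, 3} (so max_k = 3):
//   get_topk_pos -> positions {1, 2, 0}   (values 0.9, 0.5, 0.2)
//   prefix sums  -> {0.9, 1.4, 1.6}
//   outputs      -> top-1 avg = 0.9 / 1 = 0.9, top-3 avg = 1.6 / 3 = 0.533...
// ---------------------------------------------------------------------------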
*/ + +#pragma once +#include +#include "lite/core/context.h" +#include "lite/core/tensor.h" +#include "lite/fluid/data_type.h" + +namespace paddle { +namespace lite { +namespace x86 { +namespace math { +template +void get_topk_pos( + const T* data, int length, int k, int* pos, bool debug = false); + +template +class SequenceTopkAvgPoolingFunctor { + public: + void operator()(const lite::Tensor& X, + const lite::Tensor& ROW, + const lite::Tensor& COLUMN, + lite::Tensor* Out, + lite::Tensor* pos, + int channel_num, + std::vector topks); +}; + +} // namespace math +} // namespace x86 +} // namespace lite +} // namespace paddle + +#define FOR_ALL_TYPES(macro) macro(float); diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index b02ef8fed6..641302cd2d 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -100,7 +100,7 @@ add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc) #----------------------------------------------- NOT CHANGE ----------------------------------------------- lite_cc_library(kernel SRCS kernel.cc DEPS context type_system target_wrapper any op_params tensor - PROFILE_DEPS basic_profiler + PROFILE_DEPS lite_profiler ) lite_cc_library(op SRCS op_lite.cc DEPS scope op_registry target_wrapper kernel cpp_op_desc tensor @@ -114,7 +114,7 @@ lite_cc_library(type_system SRCS type_system.cc DEPS tensor target_wrapper) lite_cc_library(program SRCS program.cc DEPS op kernel model_parser ${ops} ${cpp_wrapper} - PROFILE_DEPS basic_profiler) + PROFILE_DEPS lite_profiler) if (NOT LITE_ON_TINY_PUBLISH) lite_cc_library(optimizer SRCS optimizer.cc DEPS mir_pass_manager model_parser program) diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc index c59c078787..561a508d20 100644 --- a/lite/core/arena/framework.cc +++ b/lite/core/arena/framework.cc @@ -37,6 +37,9 @@ void TestCase::CreateInstruction() { // prepare context (*it)->SetContext(std::move(ctx_)); instruction_.reset(new Instruction(op, std::move(*it))); +#ifdef LITE_WITH_PROFILE + instruction_->set_profiler(new profile::Profiler()); +#endif } void TestCase::PrepareInputsForInstruction() { diff --git a/lite/core/context.h b/lite/core/context.h index 19238f1a9b..5063600d36 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -253,6 +253,13 @@ class Context { std::string name() const { return "CUDAContext"; } + CUDAContext& operator=(const CUDAContext& context) { + this->Init( + context.device_id_, context.exec_stream_id_, context.io_stream_id_); + cublas_fp32_ = const_cast(context).cublas_fp32(); + return *this; + } + private: int device_id_; // overall information @@ -345,7 +352,6 @@ class ContextScheduler { std::unique_ptr NewContext(TargetType target) { std::unique_ptr ctx(new KernelContext); - switch (target) { case TARGET(kHost): kernel_contexts_[TargetType::kHost].As().CopySharedTo( @@ -416,6 +422,7 @@ class ContextScheduler { void InitContext() { kernel_contexts_[Type].As().InitOnce(); } + ContextScheduler() { InitContext(); #ifdef LITE_WITH_X86 diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc index 166c04c000..f5b757ac3c 100644 --- a/lite/core/device_info.cc +++ b/lite/core/device_info.cc @@ -1039,7 +1039,7 @@ int DeviceInfo::Setup() { << ", max freq: " << max_freqs_[i] << ", min freq: " << min_freqs_[i] << ", cluster ID: " << cluster_ids_[core_ids_[i]] - << ", CPU ARCH: A" << archs_[i]; + << ", CPU ARCH: A" << static_cast(archs_[i]); } LOG(INFO) << "L1 DataCache size is: "; for (int i = 0; i < core_num_; ++i) { @@ -1093,7 +1093,7 @@ void 
DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) { RequestPowerRandLowMode(shift_num, thread_num); break; default: - LOG(FATAL) << "Unsupported power mode: " << mode; + LOG(FATAL) << "Unsupported power mode: " << static_cast(mode); break; } if (active_ids_.empty()) { diff --git a/lite/core/kernel.h b/lite/core/kernel.h index 05d7a6b333..86193235a2 100644 --- a/lite/core/kernel.h +++ b/lite/core/kernel.h @@ -31,7 +31,7 @@ #include "lite/utils/replace_stl/stream.h" #ifdef LITE_WITH_PROFILE -#include "lite/core/profile/basic_profiler.h" +#include "lite/core/profile/profiler.h" #endif // LITE_WITH_PROFILE namespace paddle { @@ -58,7 +58,10 @@ class KernelBase { virtual void Run() = 0; #ifdef LITE_WITH_PROFILE - void SetProfileID(uint32_t id) { profile_id_ = id; } + void SetProfiler(profile::Profiler* profiler, int id) { + profiler_ = profiler; + profile_id_ = id; + } #endif void Launch() { @@ -82,10 +85,12 @@ class KernelBase { #endif #ifdef LITE_WITH_PROFILE - if (profile_id_ >= 0) { - profile::ProfileBlock x(profile_id_, "kernel"); - Run(); - } + CHECK(profiler_) << "Profiler pointer of kernel can not be nullptr. " + "When LITE_WITH_PROFILE is defined, please set a " + "Profiler for Instruction."; + profiler_->StartTiming(profile_id_, ctx_.get()); + Run(); + profiler_->StopTiming(profile_id_, ctx_.get()); #else Run(); #endif @@ -175,6 +180,7 @@ class KernelBase { bool is_first_epoch_{true}; #ifdef LITE_WITH_PROFILE + profile::Profiler* profiler_{nullptr}; int profile_id_{-1}; #endif }; diff --git a/lite/core/memory.cc b/lite/core/memory.cc index ec94f69be1..eefada3f99 100644 --- a/lite/core/memory.cc +++ b/lite/core/memory.cc @@ -110,7 +110,7 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { TargetWrapper::MemcpySync( dst, src, size, IoDirection::DtoD); break; -#endif +#endif #ifdef LITE_WITH_OPENCL case TargetType::kOpenCL: TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD); diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc index ff064fb2ee..0d11b47db6 100644 --- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_activation_fuse_pass.cc @@ -47,4 +47,5 @@ void ConvActivationFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_conv_activation_fuse_pass, paddle::lite::mir::ConvActivationFusePass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}) .BindKernel("conv2d"); diff --git a/lite/core/mir/fusion/conv_bn_fuse_pass.cc b/lite/core/mir/fusion/conv_bn_fuse_pass.cc index d9d9c1bbf5..5ab5f8c0a4 100644 --- a/lite/core/mir/fusion/conv_bn_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_bn_fuse_pass.cc @@ -45,4 +45,4 @@ void ConvBNFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_conv_bn_fuse_pass, paddle::lite::mir::ConvBNFusePass) .BindTargets({TARGET(kAny)}) - .ExcludeTargets({TARGET(kX86)}); + .ExcludeTargets({TARGET(kX86), TARGET(kXPU)}); diff --git a/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc b/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc index fd9aadc5d0..b1b492ce03 100644 --- a/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc @@ -46,4 +46,5 @@ void ConvElementwiseFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_conv_elementwise_fuse_pass, paddle::lite::mir::ConvElementwiseFusePass) - .BindTargets({TARGET(kAny)}); + .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}); diff --git 
a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc index af66f5ab66..e4391cd242 100644 --- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc @@ -35,4 +35,5 @@ void ElementwiseAddActivationFusePass::Apply( REGISTER_MIR_PASS(lite_elementwise_add_activation_fuse_pass, paddle::lite::mir::ElementwiseAddActivationFusePass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}) .BindKernel("fusion_elementwise_add_activation"); diff --git a/lite/core/mir/fusion/fc_fuse_pass.cc b/lite/core/mir/fusion/fc_fuse_pass.cc index ed10f06f56..7fc4492192 100644 --- a/lite/core/mir/fusion/fc_fuse_pass.cc +++ b/lite/core/mir/fusion/fc_fuse_pass.cc @@ -33,4 +33,5 @@ void FcFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}) .BindKernel("fc"); diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc index f823f45dc6..da611e4490 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -396,6 +396,8 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, op_desc->SetAttr("input_scale", scale_value); op_desc->SetInput("X", {input_act_node->arg()->name}); IR_NODE_LINK_TO(input_act_node, quantized_node) + auto update_op_desc = *quantized_node->stmt()->mutable_op_info(); + quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places()); // delete nodes and edges std::unordered_set nodes2rm = {input_scale_node, @@ -440,6 +442,8 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, op_desc->SetInput("Y", {input_act_right_node->arg()->name}); IR_NODE_LINK_TO(input_act_left_node, quantized_node) IR_NODE_LINK_TO(input_act_right_node, quantized_node) + auto update_op_desc = *quantized_node->stmt()->mutable_op_info(); + quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places()); // delete nodes and edges std::unordered_set nodes2rm = {input_scale_left_node, diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index 1f2355e8a3..4f41ba4a60 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -255,4 +255,5 @@ void MemoryOptimizePass::Apply(const std::unique_ptr& graph) { } // namespace paddle REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) - .BindTargets({TARGET(kARM)}); + .BindTargets({TARGET(kARM)}) + .ExcludeTargets({TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU)}); diff --git a/lite/core/mir/pass.h b/lite/core/mir/pass.h index 4de0fdbf35..4e8c8be292 100644 --- a/lite/core/mir/pass.h +++ b/lite/core/mir/pass.h @@ -52,34 +52,44 @@ class Pass { // Bind targets. At runtime, there must be one device in the bound targets. void BindTargets(const std::set& targets) { - std::set res; for (const auto& target : targets) { const std::set& universe = ExpandValidTargets(target); std::set_union(bound_targets_.begin(), bound_targets_.end(), universe.begin(), universe.end(), - std::inserter(res, res.begin())); + std::inserter(bound_targets_, bound_targets_.begin())); } - bound_targets_ = res; } // Exclude targets. At runtime, there must be one device in the bound targets. + // Disable the pass if one of the valid devices is in the excluded targets. 
void ExcludeTargets(const std::set& targets) { - std::set res; for (const auto& target : targets) { const std::set& universe = ExpandValidTargets(target); - std::set_difference(bound_targets_.begin(), - bound_targets_.end(), - universe.begin(), - universe.end(), - std::inserter(res, res.begin())); + std::set updated_bound_targets; + std::set_difference( + bound_targets_.begin(), + bound_targets_.end(), + universe.begin(), + universe.end(), + std::inserter(updated_bound_targets, updated_bound_targets.begin())); + bound_targets_ = updated_bound_targets; + std::set_union( + excluded_targets_.begin(), + excluded_targets_.end(), + universe.begin(), + universe.end(), + std::inserter(excluded_targets_, excluded_targets_.begin())); } - bound_targets_ = res; } // Get all bound targets. - const std::set& Targets() const { return bound_targets_; } + const std::set& BoundTargets() const { return bound_targets_; } + // Get all excluded targets. + const std::set& ExcludedTargets() const { + return excluded_targets_; + } // Some passes are only available on qualified kernels and need to be // explicitly declared. @@ -116,6 +126,7 @@ class Pass { std::string name_; std::string doc_; std::set bound_targets_; + std::set excluded_targets_; std::unordered_map> bound_kernels_; }; diff --git a/lite/core/mir/pass_utils.cc b/lite/core/mir/pass_utils.cc index 4f6be2c186..5bddfcbd3c 100644 --- a/lite/core/mir/pass_utils.cc +++ b/lite/core/mir/pass_utils.cc @@ -47,10 +47,34 @@ bool KernelRegistered(const std::string name, const Place& place) { return false; } -bool PassMatchesTarget(const mir::Pass& pass, TargetType target) { - const auto& targets = pass.Targets(); - if (targets.find(TARGET(kAny)) != targets.end()) return true; - return (targets.find(target) != targets.end()); +bool PassMatchesTarget(const mir::Pass& pass, + const std::set& targets) { + // Whether the pass is suitable for targets ? The condition is the + // intersection of targets and pass's bound targets is not empty, besides the + // intersection of targets and pass's excluded targets is empty. The formula + // is as follows: matched = !empty(targets ^ pass.bound_targets) && + // empty(targets ^ pass.excluded_targets), where ^ is intersection operation. + const auto& bound_targets = pass.BoundTargets(); + bool matched = bound_targets.find(TARGET(kAny)) != bound_targets.end(); + std::set inter_bound_targets; + std::set_intersection( + bound_targets.begin(), + bound_targets.end(), + targets.begin(), + targets.end(), + std::inserter(inter_bound_targets, inter_bound_targets.begin())); + matched |= !inter_bound_targets.empty(); + const auto& excluded_targets = pass.ExcludedTargets(); + matched &= excluded_targets.find(TARGET(kAny)) == excluded_targets.end(); + std::set inter_excluded_targets; + std::set_intersection( + excluded_targets.begin(), + excluded_targets.end(), + targets.begin(), + targets.end(), + std::inserter(inter_excluded_targets, inter_excluded_targets.begin())); + matched &= inter_excluded_targets.empty(); + return matched; } bool PassMatchesKernels(const mir::Pass& pass) { diff --git a/lite/core/mir/pass_utils.h b/lite/core/mir/pass_utils.h index 942f64bf31..57e8da5e46 100644 --- a/lite/core/mir/pass_utils.h +++ b/lite/core/mir/pass_utils.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "lite/core/mir/pass.h" @@ -24,7 +25,8 @@ namespace lite { bool KernelRegistered(const std::string name, const Place& place); // Check if the pass hits the hardware target. 
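// ---------------------------------------------------------------------------
// A worked example of the matching rule implemented in PassMatchesTarget()
// above:
//   matched = (kAny in bound_targets OR intersection(bound_targets, targets)
//              is non-empty)
//             AND kAny not in excluded_targets
//             AND intersection(excluded_targets, targets) is empty.
// For instance, lite_fc_fuse_pass binds {kAny} and now excludes {kXPU}
// (after target expansion), so it matches valid targets {kARM} but is skipped
// whenever kXPU appears among the valid targets.
// ---------------------------------------------------------------------------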
-bool PassMatchesTarget(const mir::Pass& pass, TargetType target); +bool PassMatchesTarget(const mir::Pass& pass, + const std::set& targets); // Check if the pass hits all necessary operators. bool PassMatchesKernels(const mir::Pass& pass); diff --git a/lite/core/mir/static_kernel_pick_pass.h b/lite/core/mir/static_kernel_pick_pass.h index 7187ddcef6..cd54e2654c 100644 --- a/lite/core/mir/static_kernel_pick_pass.h +++ b/lite/core/mir/static_kernel_pick_pass.h @@ -48,7 +48,8 @@ class StaticKernelPickPass : public mir::StmtPass { private: // Score the kernel. - size_t KernelGrade(const lite::KernelBase& kernel, + size_t KernelGrade(const lite::mir::Node::Stmt& instruct, + const lite::KernelBase& kernel, const std::vector& places) { CHECK_GT(places.size(), 0) << "valid_places is empty."; float final_score{-1.}; @@ -66,10 +67,11 @@ class StaticKernelPickPass : public mir::StmtPass { // valid_places.size() as default. // where i is the place's index in valid_places array. // score: score is the weighted sum of target、percision and layout - for (int i = 0; i < place_size; ++i) { + for (size_t i = 0; i < place_size; ++i) { const auto& place = places[i]; float weight = static_cast(place_size - i) / place_size; size_t score{}; + // The more important factor comes first if (kernel_pick_factors_.IsTargetConsidered() && (place.target == kernel.target() || kernel.target() == TARGET(kAny) || @@ -82,8 +84,12 @@ class StaticKernelPickPass : public mir::StmtPass { (place.precision == kernel.precision() || kernel.precision() == PRECISION(kAny) || place.precision == PRECISION(kAny))) { - score += kMax / static_cast( - core::KernelPickFactor::Factor::PrecisionFirst); + // score skipped, if kernel is int8, but op is not int8 + if (!(kernel.precision() == PRECISION(kInt8) && + !instruct.op_info()->HasAttr("enable_int8"))) { + score += kMax / static_cast( + core::KernelPickFactor::Factor::PrecisionFirst); + } } VLOG(4) << "[score s2]:" << score; if (kernel_pick_factors_.IsDataLayoutConsidered() && @@ -102,17 +108,17 @@ class StaticKernelPickPass : public mir::StmtPass { VLOG(4) << "[score(final)]:" << final_score; VLOG(4) << "-------- pick summary --------"; - VLOG(4) << " ===> place():" << PrecisionToStr(winner_place.precision) << " " - << DataLayoutToStr(winner_place.layout) << " " + VLOG(4) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) + << " " << DataLayoutToStr(winner_place.layout) << " " << TargetToStr(winner_place.target); VLOG(4) << " ===> kernel.place():" << PrecisionToStr(kernel.place().precision) << " " << DataLayoutToStr(kernel.place().layout) << " " << TargetToStr(kernel.place().target); VLOG(4) << "kernel.op_type():" << kernel.op_type(); - VLOG(4) << "picker tactic " << kernel_pick_factors_; - VLOG(4) << "kernel place " << kernel.place().DebugString(); - VLOG(4) << "picker place " << winner_place.DebugString(); + VLOG(4) << "kernel picker factors:" << kernel_pick_factors_; + VLOG(4) << "kernel place:" << kernel.place().DebugString(); + VLOG(4) << "winner_picker place:" << winner_place.DebugString(); VLOG(4) << "------------------------------"; // The data layout is not considered, for the input and output arguments diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.cc b/lite/core/mir/subgraph/generate_npu_program_pass.cc index c83cd70d82..65c29aa68f 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass.cc +++ b/lite/core/mir/subgraph/generate_npu_program_pass.cc @@ -128,10 +128,10 @@ std::string GenerateNPUProgramPass::BuildNPUGraph( // persistable=true, 
Sothat the model parser can recognize it and save it to // param files if (!lite::npu::BuildModel(inputs, outputs, weight)) { - LOG(WARNING) << "[NPU] Build NPU graph failed (subgraph=" << sub_id << ")"; - throw std::runtime_error("Build NPU graph failed."); + LOG(FATAL) << "[NPU] Build NPU graph failed (subgraph=" << sub_id << ")"; + } else { + LOG(INFO) << "[NPU] Build NPU graph success (subgraph=" << sub_id << ")"; } - LOG(INFO) << "[NPU] Build NPU graph success (subgraph=" << sub_id << ")"; return weight_var_name; } @@ -175,40 +175,19 @@ void GenerateNPUProgramPass::Apply(const std::unique_ptr& graph) { supported_op_types.push_back(i.first); } - try { - int num_subgraph = FuseSubgraph(graph, supported_op_types); - InferOnce(graph); - auto op_nodes_all = ClassifySubgraph(graph); - CHECK_EQ(op_nodes_all.size(), num_subgraph); - int id = 1; - for (auto& op_nodes : op_nodes_all) { - LOG(INFO) << "[NPU] Converting Subgraph " << id; - GenNPUSubgraph(graph, op_nodes.second, id); - LOG(INFO) << "[NPU] After NPU Pass Subgraph " << id << "\n" - << Visualize(graph.get()); - id++; - } - } catch (...) { - LOG(WARNING) << "[NPU] Build NPU graph failed."; - throw std::runtime_error("[NPU] Build NPU graph failed."); - } - - for (auto& item : graph->StmtTopologicalOrder()) { - if (item->IsStmt()) { - auto& stmt = item->AsStmt(); - LOG(INFO) << stmt; - insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); - } + int num_subgraph = FuseSubgraph(graph, supported_op_types); + InferOnce(graph); + auto op_nodes_all = ClassifySubgraph(graph); + CHECK_EQ(op_nodes_all.size(), num_subgraph); + int id = 1; + for (auto& op_nodes : op_nodes_all) { + LOG(INFO) << "[NPU] Converting Subgraph " << id; + GenNPUSubgraph(graph, op_nodes.second, id); + LOG(INFO) << "[NPU] After NPU Pass Subgraph " << id << "\n" + << Visualize(graph.get()); + id++; } } - -std::unique_ptr GenerateNPUProgramPass::GenProgram() { - LOG(INFO) << "[NPU] program insts.size " << insts_.size(); - std::unique_ptr program( - new RuntimeProgram(std::move(insts_))); - return program; -} - } // namespace subgraph } // namespace mir } // namespace lite diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.h b/lite/core/mir/subgraph/generate_npu_program_pass.h index 823ca5f1f6..5b1a98c6ed 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass.h +++ b/lite/core/mir/subgraph/generate_npu_program_pass.h @@ -35,7 +35,6 @@ class GenerateNPUProgramPass : public SubgraphProgramPass { using key2nodes_t = std::map; void Apply(const std::unique_ptr& graph) override; - std::unique_ptr GenProgram(); protected: // nodes2cvt: op nodes to convert @@ -54,9 +53,6 @@ class GenerateNPUProgramPass : public SubgraphProgramPass { void GenNPUSubgraph(const std::unique_ptr& graph, const std::unordered_set& op_nodes, int sub_id); - - private: - std::vector insts_; }; } // namespace subgraph diff --git a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc index 95339d6175..1afb54c692 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc +++ b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc @@ -160,8 +160,8 @@ TEST(NPUSubgraph, compare) { TestModel(FLAGS_model_dir, FLAGS_model_file, FLAGS_params_file, - {lite_api::Place{TARGET(kARM), PRECISION(kFloat)}, - lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}}, + {lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}, + lite_api::Place{TARGET(kARM), PRECISION(kFloat)}}, input_tensor_shape, FLAGS_optimized_model_dir + "/NPU"); // 
verify results diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass.cc b/lite/core/mir/subgraph/generate_xpu_program_pass.cc index 319e1e51fe..4340cb4ee3 100644 --- a/lite/core/mir/subgraph/generate_xpu_program_pass.cc +++ b/lite/core/mir/subgraph/generate_xpu_program_pass.cc @@ -115,10 +115,10 @@ std::string GenerateXPUProgramPass::BuildXPUGraph( graph_ctx.params, &ordered_cvted_var_nodes, weight)) { - LOG(WARNING) << "[XPU] Build XPU graph failed (subgraph=" << sub_id << ")"; - throw std::runtime_error("[XPU] Build XPU graph failed."); + LOG(FATAL) << "[XPU] Build XPU graph failed (subgraph=" << sub_id << ")"; + } else { + LOG(INFO) << "[XPU] Build XPU graph success (subgraph=" << sub_id << ")"; } - LOG(INFO) << "[XPU] Build XPU graph success (subgraph=" << sub_id << ")"; return weight_var_name; } @@ -162,40 +162,19 @@ void GenerateXPUProgramPass::Apply(const std::unique_ptr& graph) { supported_op_types.push_back(i.first); } - try { - int num_subgraph = FuseSubgraph(graph, supported_op_types); - InferOnce(graph); - auto op_nodes_all = ClassifySubgraph(graph); - CHECK_EQ(op_nodes_all.size(), num_subgraph); - int id = 1; - for (auto& op_nodes : op_nodes_all) { - LOG(INFO) << "[XPU] Converting Subgraph " << id; - GenXPUSubgraph(graph, op_nodes.second, id); - LOG(INFO) << "[XPU] After XPU Pass Subgraph " << id << "\n" - << Visualize(graph.get()); - id++; - } - } catch (...) { - LOG(WARNING) << "[XPU] Build XPU graph failed."; - throw std::runtime_error("[XPU] Build XPU graph failed."); - } - - for (auto& item : graph->StmtTopologicalOrder()) { - if (item->IsStmt()) { - auto& stmt = item->AsStmt(); - LOG(INFO) << stmt; - insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); - } + int num_subgraph = FuseSubgraph(graph, supported_op_types); + InferOnce(graph); + auto op_nodes_all = ClassifySubgraph(graph); + CHECK_EQ(op_nodes_all.size(), num_subgraph); + int id = 1; + for (auto& op_nodes : op_nodes_all) { + LOG(INFO) << "[XPU] Converting Subgraph " << id; + GenXPUSubgraph(graph, op_nodes.second, id); + LOG(INFO) << "[XPU] After XPU Pass Subgraph " << id << "\n" + << Visualize(graph.get()); + id++; } } - -std::unique_ptr GenerateXPUProgramPass::GenProgram() { - LOG(INFO) << "[XPU] program insts.size=" << insts_.size(); - std::unique_ptr program( - new RuntimeProgram(std::move(insts_))); - return program; -} - } // namespace subgraph } // namespace mir } // namespace lite diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass.h b/lite/core/mir/subgraph/generate_xpu_program_pass.h index cf121ae950..777642cfb6 100644 --- a/lite/core/mir/subgraph/generate_xpu_program_pass.h +++ b/lite/core/mir/subgraph/generate_xpu_program_pass.h @@ -35,7 +35,6 @@ class GenerateXPUProgramPass : public SubgraphProgramPass { using key2nodes_t = std::map; void Apply(const std::unique_ptr& graph) override; - std::unique_ptr GenProgram(); protected: // nodes2cvt: op nodes to convert @@ -58,9 +57,6 @@ class GenerateXPUProgramPass : public SubgraphProgramPass { void GenXPUSubgraph(const std::unique_ptr& graph, const std::unordered_set& op_nodes, int sub_id); - - private: - std::vector insts_; }; } // namespace subgraph diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc index 9d63dcbb38..b3b7a858f6 100644 --- a/lite/core/mir/type_layout_cast_pass.cc +++ b/lite/core/mir/type_layout_cast_pass.cc @@ -127,24 +127,30 @@ void TypeLayoutTransformPass::AddLayoutInst( for (auto& kernel : kernels) { const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const 
Type* out_arg_ty = kernel->GetOutputDeclType("Out"); -#ifdef LITE_WITH_OPENCL + // layout kernel choose // must ignore [layout check] for layout of kernels's input and output - if (TargetCompatibleTo(*in_arg_ty, from) && - PrecisionCompatibleTo(*in_arg_ty, from) && - DeviceCompatibleTo(*in_arg_ty, from) && - out_arg_ty->layout() == to.layout()) { -#else - if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->layout() == to.layout()) { -#endif + // note: replace LITE_WITH_OPENCL macro with judge input and output target + // of layout_trans + if ((in_arg_ty->target() == TARGET(kOpenCL) || + out_arg_ty->target() == TARGET(kOpenCL)) && // judge OpenCL first + (TargetCompatibleTo(*in_arg_ty, from) && + PrecisionCompatibleTo(*in_arg_ty, from) && + DeviceCompatibleTo(*in_arg_ty, from) && + out_arg_ty->layout() == to.layout())) { + is_found = true; + } else if (TypeCompatible(*in_arg_ty, from) && + out_arg_ty->layout() == to.layout()) { is_found = true; + } + if (is_found) { selected_kernels.emplace_back(std::move(kernel)); // we pick the kernel layout_inst->AsStmt(layout_type, std::move(selected_kernels), layout_op); break; } } + CHECK(is_found) << "Can't find a layout kernel for layout op: " << from << ":" << in->AsArg().name << "->" << to << ":" << inst_node->AsStmt().op_info()->Type(); diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc index 7a32777865..b008faa687 100644 --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -128,10 +128,9 @@ void TypeTargetTransformPass::AddIoCopyInst( VLOG(4) << "out_arg_ty(io_copy kernel output):" << *out_arg_ty; VLOG(4) << "to:" << to << "\n"; -// kernel choose branch for opencl backend -// judge inst's target whether is kOpenCL -// Note: to == *decl_arg_type == in of inst, not output of last inst -#ifdef LITE_WITH_OPENCL + // kernel choose branch for opencl backend + // judge inst's target whether is kOpenCL + // Note: to == *decl_arg_type == in of inst, not output of last inst // ignore [layout check] for layout between [to] and [from] // Because all of origin opencl insts in model, are not default layout // NCHW, @@ -141,25 +140,34 @@ void TypeTargetTransformPass::AddIoCopyInst( // [*decl_arg_type] -> [to]: input of inst, not output of last // [in_arg_ty]: in of io_copy // [out_arg_ty]: out of io_copy - if (TargetCompatibleTo(*in_arg_ty, from) && - PrecisionCompatibleTo(*in_arg_ty, from) && - DeviceCompatibleTo(*in_arg_ty, from) && - TargetCompatibleTo(*out_arg_ty, to)) { - VLOG(4) << "do nothing. 
opencl found"; -#else - if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->target() == to.target()) { -#endif + // + // noto: replace LITE_WITH_OPENCL macro with judge input and output target + // of io_copy + if ((in_arg_ty->target() == TARGET(kOpenCL) || + out_arg_ty->target() == TARGET(kOpenCL)) && // judge OpenCL first + (TargetCompatibleTo(*in_arg_ty, from) && + PrecisionCompatibleTo(*in_arg_ty, from) && + DeviceCompatibleTo(*in_arg_ty, from) && + TargetCompatibleTo(*out_arg_ty, to))) { + VLOG(4) << "picked, opencl found"; + is_found = true; + } else if (TypeCompatible(*in_arg_ty, from) && + out_arg_ty->target() == to.target()) { VLOG(4) << "picked"; is_found = true; + } + + if (is_found) { selected_kernels.emplace_back(std::move(kernel)); // we pick the kernel io_copy_inst->AsStmt( io_copy_type, std::move(selected_kernels), io_copy_op); break; } + VLOG(4) << "not picked"; } + CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from << ":" << in->AsArg().name << " -> " << to << ":" << inst_node->AsStmt().op_info()->Type(); diff --git a/lite/core/mir/variable_place_inference_pass.h b/lite/core/mir/variable_place_inference_pass.h index fe6ecfd66d..3f5d161a56 100644 --- a/lite/core/mir/variable_place_inference_pass.h +++ b/lite/core/mir/variable_place_inference_pass.h @@ -54,40 +54,50 @@ class VariablePlaceInferencePass : public DebugPass { } } - // Set the tye of the weight - void SetWeightType(Node* w, const LiteType& type) { -// TODO(xg) to optimize this -#ifdef LITE_WITH_FPGA - w->AsArg().type = LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); -#endif - -#ifdef LITE_WITH_OPENCL - w->AsArg().type = LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); -#endif - -#ifndef LITE_WITH_FPGA -#ifndef LITE_WITH_OPENCL - w->AsArg().type = LiteType::GetTensorTy( - TARGET(kHost), type.precision(), DATALAYOUT(kNCHW)); -#endif -#endif + // Set the type of the weight + void SetWeightType(Node* w, + const LiteType& type, + const std::map& lite_with_targets) { + VLOG(4) << "type.precision():" << PrecisionRepr(type.precision()); + if (lite_with_targets.at("kFPGA")) { + w->AsArg().type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + } else if (lite_with_targets.at("kOpenCL")) { + w->AsArg().type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + } else { + w->AsArg().type = LiteType::GetTensorTy( + TARGET(kHost), type.precision(), DATALAYOUT(kNCHW)); + } } void InferenceArgumentPlace(SSAGraph* graph) { + auto& valid_places = graph->valid_places(); + auto valid_places_has_target = [&](TargetType t) -> bool { + for (auto& p : valid_places) { + if (p.target == t) { + return true; + } + } + return false; + }; + std::map lite_with_targets{ + {"kOpenCL", valid_places_has_target(TARGET(kOpenCL))}, + {"kFPGA", valid_places_has_target(TARGET(kFPGA))}}; + VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"]; + VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"]; + VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global(); for (auto& x : graph->StmtTopologicalOrder()) { auto& inst = x->AsStmt(); -// The IoCopyOp is a tool operator, it won't support the type inference. 
-// in fpga, we has io_copy+cali+layout tool ops, so we need type inference for -// tool operator -#ifndef LITE_WITH_FPGA -#ifndef LITE_WITH_OPENCL - VLOG(3) << "inst.op_type() == 'io_copy', continue"; - if (inst.op_type() == "io_copy") continue; -#endif -#endif + // The IoCopyOp is a tool operator, it won't support the type inference. + // in fpga, we has io_copy+cali+layout tool ops, so we need type inference + // for + // tool operator + if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) { + VLOG(3) << "inst.op_type() == 'io_copy', continue"; + if (inst.op_type() == "io_copy") continue; + } // deal with inputs VLOG(4) << "Infering op " << inst.op_info()->Repr(); // TODO(zhaolong): Add check if the node's name in op's arguments. @@ -115,7 +125,7 @@ class VariablePlaceInferencePass : public DebugPass { if (!x_in->AsArg().type) { VLOG(4) << "set type " << *type << " " << x_in->AsArg().name; if (x_in->AsArg().is_weight) { - SetWeightType(x_in, *type); + SetWeightType(x_in, *type, lite_with_targets); } else { x_in->AsArg().type = type; } @@ -135,7 +145,7 @@ class VariablePlaceInferencePass : public DebugPass { if (!x_out->AsArg().type) { VLOG(4) << "set type " << *type << " " << x_out->AsArg().name; if (x_out->AsArg().is_weight) { - SetWeightType(x_out, *type); + SetWeightType(x_out, *type, lite_with_targets); } else { x_out->AsArg().type = type; } diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index 1400b25409..887ac3c950 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -118,6 +118,8 @@ KernelRegistry::KernelRegistry() INIT_FOR(kCUDA, kAny, kNCHW); INIT_FOR(kCUDA, kAny, kAny); INIT_FOR(kCUDA, kInt8, kNHWC); + INIT_FOR(kCUDA, kInt64, kNCHW); + INIT_FOR(kCUDA, kInt64, kNHWC); INIT_FOR(kHost, kFloat, kNCHW); INIT_FOR(kHost, kAny, kNCHW); diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index 7ed632d864..d78ae690f9 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -145,6 +145,12 @@ class KernelRegistry final { KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 22c5f19330..38c9d0e29d 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -13,7 +13,9 @@ // limitations under the License. #pragma once +#include #include +#include #include #include #include "lite/core/mir/generate_program_pass.h" @@ -49,23 +51,20 @@ class Optimizer { valid_places_ = valid_places; CHECK(!valid_places.empty()) << "At least one valid_place should be set"; CHECK(!graph_) << "duplicate optimize found"; + graph_.reset(new mir::SSAGraph); graph_->Build(program, valid_places); graph_->SetValidPlaces(valid_places); SpecifyKernelPickTactic(kernel_pick_factor); InitTargetTypeTransformPass(); + if (passes.empty()) { - RunPasses(std::vector{ - { - #if 0 - "lite_quant_dequant_fuse_pass", // + std::vector passes_local{ + {"lite_quant_dequant_fuse_pass", // "lite_conv_elementwise_fuse_pass", // conv-elemwise-bn "lite_conv_bn_fuse_pass", // "lite_conv_elementwise_fuse_pass", // conv-bn-elemwise - // This pass is disabled to force some opencl kernels selected for - // final running, otherwise, they will be fused to ARM fusion - // kernels, and the OpenCL devices will be discarded. 
// TODO(Superjomn) Refine the fusion related design to select fusion // kernels for devices automatically. "lite_conv_activation_fuse_pass", // @@ -74,11 +73,10 @@ class Optimizer { "lite_transpose_softmax_transpose_fuse_pass", // "lite_interpolate_fuse_pass", // "identity_scale_eliminate_pass", // -#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +#if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) "lite_elementwise_add_activation_fuse_pass", // -#endif -#endif - "static_kernel_pick_pass", // pick original kernel from graph +#endif + "static_kernel_pick_pass", // pick original kernel from graph "variable_place_inference_pass", // inference arg/var's // info(target/precision/layout/device) // using kernel info @@ -107,17 +105,12 @@ class Optimizer { "argument_type_display_pass", // "variable_place_inference_pass", // - "argument_type_display_pass", // + "argument_type_display_pass", "runtime_context_assign_pass", - "argument_type_display_pass", // -#if !defined(LITE_WITH_OPENCL) && !defined(LITE_WITH_NPU) && \ - !defined(LITE_WITH_XPU) - // TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in kernel - "memory_optimize_pass", -#endif - "argument_type_display_pass" - }}); + "argument_type_display_pass", + "memory_optimize_pass"}}; + RunPasses(passes_local); } else { RunPasses(passes); } @@ -128,39 +121,13 @@ class Optimizer { // Generate a new program based on the mir graph. std::unique_ptr GenRuntimeProgram() { -#if defined(LITE_WITH_NPU) || defined(LITE_WITH_XPU) - auto target_place = Place{ -#ifdef LITE_WITH_NPU - TARGET(kNPU), -#endif -#ifdef LITE_WITH_XPU - TARGET(kXPU), -#endif - PRECISION(kFloat)}; - if (std::find(valid_places_.begin(), valid_places_.end(), target_place) != - valid_places_.end()) { -#ifdef LITE_WITH_NPU - auto pass = mir::PassManager::Global() - .LookUp( - "generate_npu_program_pass"); -#endif -#ifdef LITE_WITH_XPU - auto pass = mir::PassManager::Global() - .LookUp( - "generate_xpu_program_pass"); -#endif - try { - pass->Apply(graph_); - auto program = pass->GenProgram(); - CHECK(exec_scope_); - program->set_exec_scope(exec_scope_); - return program; - } catch (...) { - LOG(WARNING) << "Build " << TargetToStr(target_place.target) - << " program failed!"; - } - } -#endif + // Extra passes are applied for NPU and XPU, they depends on the shapes + // of input tensors. so GenRuntimeProgram() must be called after the shapes + // of input tensors are determined. 
+ std::vector subgraph_passes{"generate_npu_program_pass", + "generate_xpu_program_pass"}; + RunPasses(subgraph_passes); + auto pass = mir::PassManager::Global().LookUp( "generate_program_pass"); pass->Apply(graph_); @@ -202,14 +169,16 @@ class Optimizer { for (auto& x : passes) { LOG(INFO) << "== Running pass: " << x; mir::Pass* pass = mir::PassManager::Global().LookUp(x); - CHECK(pass) << "Can not find pass: " << x; - bool matched = false; + if (!pass) { + LOG(INFO) << " - Skip " << x << " because the pass isn't found."; + continue; + } + std::set targets; for (const auto& place : valid_places_) { - if (PassMatchesTarget(*pass, place.target)) { - matched = true; - } + targets.insert(place.target); } - matched = matched && PassMatchesKernels(*pass); + bool matched = + PassMatchesTarget(*pass, targets) && PassMatchesKernels(*pass); if (!matched) { LOG(INFO) << " - Skip " << x << " because the target or kernel does not match."; diff --git a/lite/core/profile/CMakeLists.txt b/lite/core/profile/CMakeLists.txt index 54a2390244..b7ddd810af 100644 --- a/lite/core/profile/CMakeLists.txt +++ b/lite/core/profile/CMakeLists.txt @@ -5,4 +5,5 @@ endif() lite_cc_library(basic_profiler SRCS basic_profiler.cc DEPS gflags) lite_cc_test(test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler) - +lite_cc_library(lite_profiler SRCS profiler.cc DEPS context) +lite_cc_test(test_lite_timer SRCS test_timer.cc DEPS lite_profiler) diff --git a/lite/core/profile/profiler.cc b/lite/core/profile/profiler.cc new file mode 100644 index 0000000000..a51b769c8f --- /dev/null +++ b/lite/core/profile/profiler.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
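The RunPasses() hunk above changes two behaviours: a pass that is not registered is now skipped with a log message instead of aborting, and target matching is done against the set of targets collected from valid_places_. Below is a minimal, self-contained sketch of that control flow; the Pass/registry types and the target bindings are stand-ins for illustration, not the real lite::mir classes.

```cpp
// Illustrative analog of the new pass-skipping behaviour in Optimizer::RunPasses():
// unknown passes are skipped with a log line instead of aborting, and a pass runs
// only if it matches at least one of the valid targets.
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

struct Pass {
  std::set<std::string> bound_targets;  // targets this pass is registered for (made up here)
  bool MatchesTargets(const std::set<std::string>& valid) const {
    for (const auto& t : bound_targets)
      if (valid.count(t)) return true;
    return false;
  }
};

int main() {
  std::map<std::string, Pass> registry{
      {"static_kernel_pick_pass", {{"kARM", "kOpenCL"}}},
      {"memory_optimize_pass", {{"kARM"}}}};          // made-up bindings for illustration
  std::set<std::string> valid_targets{"kOpenCL"};     // collected from valid_places_
  std::vector<std::string> passes{"lite_quant_dequant_fuse_pass",
                                  "static_kernel_pick_pass",
                                  "memory_optimize_pass"};
  for (const auto& name : passes) {
    auto it = registry.find(name);
    if (it == registry.end()) {  // pass not built into this binary: skip, don't abort
      std::cout << " - Skip " << name << " because the pass isn't found.\n";
      continue;
    }
    if (!it->second.MatchesTargets(valid_targets)) {  // no valid target matches: skip
      std::cout << " - Skip " << name << " because the target does not match.\n";
      continue;
    }
    std::cout << "== Running pass: " << name << "\n";
  }
}
```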
+ +#include "lite/core/profile/profiler.h" +#include +#include +#include + +namespace paddle { +namespace lite { +namespace profile { + +int Profiler::NewTimer(const OpCharacter& ch) { + StatisUnit unit; + unit.character = ch; + if (ch.target == TargetType::kCUDA) { +#ifdef LITE_WITH_CUDA + unit.timer.reset(new DeviceTimer()); +#else + LOG(ERROR) << "The timer type specified as cuda is uninitialized, so the " + "default x86 timer is used instead."; +#endif + } else { + unit.timer.reset(new DeviceTimer()); + } + units_.push_back(std::move(unit)); + return units_.size() - 1; +} + +void Profiler::StartTiming(const int index, KernelContext* ctx) { + CHECK_LT(index, units_.size()) + << "The timer index in the profiler is out of range."; + units_[index].timer->Start(ctx); +} + +float Profiler::StopTiming(const int index, KernelContext* ctx) { + CHECK_LT(index, units_.size()) + << "The timer index in the profiler is out of range."; + return units_[index].timer->Stop(ctx); +} + +std::string Profiler::Summary(bool concise) { + STL::stringstream ss; + auto cout_title = [&ss](const std::string& title, const std::string& name) { + // clang-format off + ss << "===== " << title << ": " << name << " =====" << std::endl; + ss << std::setw(25) << std::left << "Operator Type" \ + << std::setw(40) << std::left << "Kernel Name" \ + << std::setw(10) << std::left << "Remark" \ + << std::setw(10) << std::left << "Avg (ms)" \ + << std::setw(10) << std::left << "Min (ms)" \ + << std::setw(10) << std::left << "Max (ms)" \ + << std::endl; + // clang-format on + }; + if (concise) { + auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) { + return (c1.target < c2.target) || (c1.op_type < c2.op_type) || + (c1.kernel_name < c2.kernel_name) || (c1.remark < c2.remark); + }; + std::map summary(op_comp); + for (auto& unit : units_) { + auto ch = summary.find(unit.character); + if (ch != summary.end()) { + ch->second.avg += unit.timer->LapTimes().Avg(); + ch->second.min += unit.timer->LapTimes().Min(); + ch->second.max += unit.timer->LapTimes().Max(); + } else { + TimeInfo info({unit.timer->LapTimes().Avg(), + unit.timer->LapTimes().Min(), + unit.timer->LapTimes().Max()}); + summary.insert({unit.character, info}); + } + } + cout_title("Concise Profiler Summary", name_); + for (const auto& item : summary) { + // clang-format off + ss << std::setw(25) << std::left << item.first.op_type \ + << std::setw(40) << std::left << item.first.kernel_name \ + << std::setw(10) << std::left << item.first.remark \ + << std::setw(10) << std::left << item.second.avg \ + << std::setw(10) << std::left << item.second.min \ + << std::setw(10) << std::left << item.second.max \ + << std::endl; + // clang-format on + } + } else { + cout_title("Detailed Profiler Summary", name_); + for (auto& unit : units_) { + // clang-format off + ss << std::setw(25) << std::left << unit.character.op_type \ + << std::setw(40) << std::left << unit.character.kernel_name \ + << std::setw(10) << std::left << unit.character.remark \ + << std::setw(10) << std::left << unit.timer->LapTimes().Avg() \ + << std::setw(10) << std::left << unit.timer->LapTimes().Min() \ + << std::setw(10) << std::left << unit.timer->LapTimes().Max() \ + << std::endl; + // clang-format on + } + } + return ss.str(); +} + +} // namespace profile +} // namespace lite +} // namespace paddle diff --git a/lite/core/profile/profiler.h b/lite/core/profile/profiler.h new file mode 100644 index 0000000000..0fce8167cd --- /dev/null +++ b/lite/core/profile/profiler.h @@ -0,0 +1,59 @@ +// 
Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "lite/core/profile/timer.h" + +namespace paddle { +namespace lite { +namespace profile { + +struct TimeInfo { + float avg; + float min; + float max; +}; + +struct OpCharacter { + TargetType target; + std::string op_type{std::string("N/A")}; + std::string kernel_name{std::string("N/A")}; + std::string remark{std::string("N/A")}; +}; + +struct StatisUnit { + std::unique_ptr timer; + OpCharacter character; +}; + +class Profiler final { + public: + Profiler() = default; + explicit Profiler(const std::string& name) : name_(name) {} + int NewTimer(const OpCharacter& ch); + void StartTiming(const int index, KernelContext* ctx); + float StopTiming(const int index, KernelContext* ctx); + std::string Summary(bool concise = true); + + private: + std::string name_{std::string("N/A")}; + std::vector units_; +}; + +} // namespace profile +} // namespace lite +} // namespace paddle diff --git a/lite/core/profile/test_timer.cc b/lite/core/profile/test_timer.cc new file mode 100644 index 0000000000..6f49698ef4 --- /dev/null +++ b/lite/core/profile/test_timer.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
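One note on the concise summary in profiler.cc above: op_comp chains `<` comparisons with `||`, which is not a strict weak ordering (two OpCharacter values can each compare as "less" than the other), so the std::map grouping can misbehave. A common alternative, sketched here on a stand-in struct rather than the real OpCharacter, is lexicographic comparison via std::tie.

```cpp
// Lexicographic key comparison for grouping profiler records, sketched on a
// stand-in struct. std::tie yields a proper strict weak ordering over
// (target, op_type, kernel_name, remark).
#include <map>
#include <string>
#include <tuple>

struct OpCharacterLike {
  int target;  // stand-in for TargetType
  std::string op_type;
  std::string kernel_name;
  std::string remark;
};

inline bool OpCharLess(const OpCharacterLike& a, const OpCharacterLike& b) {
  return std::tie(a.target, a.op_type, a.kernel_name, a.remark) <
         std::tie(b.target, b.op_type, b.kernel_name, b.remark);
}

int main() {
  std::map<OpCharacterLike, int,
           bool (*)(const OpCharacterLike&, const OpCharacterLike&)>
      summary(OpCharLess);
  summary[{0, "conv2d", "conv_compute", "3x3s1"}] += 1;  // first record creates the key
  summary[{0, "conv2d", "conv_compute", "3x3s1"}] += 1;  // same key groups into one slot
  return summary.size() == 1 ? 0 : 1;
}
```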
+ +#include +#include // NOLINT +#include // NOLINT +#include "lite/core/context.h" +#include "lite/core/profile/profiler.h" +#include "lite/core/profile/timer.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace profile { + +TEST(timer, real_latency) { + Timer timer; + + timer.Start(); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + timer.Stop(); + + timer.Start(); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + timer.Stop(); + + LOG(INFO) << "LapTimes().Avg() = " << timer.LapTimes().Avg(); +} + +#ifdef LITE_WITH_CUDA +TEST(gpu_timer, real_latency) { + DeviceTimer timer; + KernelContext ctx; + cudaStream_t exec_stream; + cudaStreamCreate(&exec_stream); + (&ctx.As())->SetExecStream(exec_stream); + + timer.Start(&ctx); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + timer.Stop(&ctx); + + (&timer)->Start(&ctx); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + timer.Stop(&ctx); + + LOG(INFO) << "LapTimes().Avg() = " << timer.LapTimes().Avg(); +} + +TEST(profiler, real_latency) { + KernelContext ctx; + cudaStream_t exec_stream; + cudaStreamCreate(&exec_stream); + (&ctx.As())->SetExecStream(exec_stream); + + Profiler profiler("name"); + profile::OpCharacter ch; + ch.target = TargetType::kCUDA; + ch.op_type = "operator/1"; + ch.kernel_name = "kernel/1"; + int idx = profiler.NewTimer(ch); + profiler.StartTiming(idx, &ctx); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + profiler.StopTiming(idx, &ctx); + std::cout << profiler.Summary(); +} +#endif + +} // namespace profile +} // namespace lite +} // namespace paddle diff --git a/lite/core/profile/timer.h b/lite/core/profile/timer.h new file mode 100644 index 0000000000..1e86f0d7b9 --- /dev/null +++ b/lite/core/profile/timer.h @@ -0,0 +1,114 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
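For reference, the host-side lap timing that timer.h below implements boils down to std::chrono arithmetic: take start and stop time points, convert the difference to milliseconds, and keep every lap so Avg()/Min()/Max() can be reported. A self-contained sketch of that computation:

```cpp
// Host-side lap timing as done by Timer::Start()/Stop(): microsecond duration
// between two system_clock time points, converted to milliseconds and stored
// per lap so an average can be reported afterwards.
#include <chrono>
#include <iostream>
#include <thread>
#include <vector>

int main() {
  std::vector<float> laps_ms;
  for (int ms : {10, 50}) {
    auto t_start = std::chrono::system_clock::now();
    std::this_thread::sleep_for(std::chrono::milliseconds(ms));
    auto t_stop = std::chrono::system_clock::now();
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(t_stop - t_start);
    laps_ms.push_back(us.count() / 1000.f);  // microseconds -> milliseconds
  }
  float sum = 0.f;
  for (float v : laps_ms) sum += v;
  std::cout << "Avg = " << sum / laps_ms.size() << " ms\n";  // roughly 30 ms here
}
```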
+ +#pragma once +#include +#include // NOLINT +#include +#ifdef LITE_WITH_CUDA +#include "lite/backends/cuda/cuda_utils.h" +#endif +#include "lite/core/context.h" + +namespace paddle { +namespace lite { +namespace profile { + +template +class TimeList { + public: + void Clear() { laps_t_.clear(); } + void Add(T t) { laps_t_.push_back(t); } + T Max() const { return *std::max_element(laps_t_.begin(), laps_t_.end()); } + T Min() const { return *std::min_element(laps_t_.begin(), laps_t_.end()); } + T Sum() const { return std::accumulate(laps_t_.begin(), laps_t_.end(), 0.0); } + size_t Size() const { return laps_t_.size(); } + T Avg() const { + if (!Size()) { + return 0; + } + return Sum() / Size(); + } + const std::list& Raw() const { return laps_t_; } + + private: + std::list laps_t_; +}; + +class Timer { + public: + Timer() = default; + virtual ~Timer() = default; + + void Reset() { laps_t_.Clear(); } + void Start() { t_start_ = std::chrono::system_clock::now(); } + float Stop() { + t_stop_ = std::chrono::system_clock::now(); + auto ts = std::chrono::duration_cast(t_stop_ - + t_start_); + float elapse_ms = 1000.f * static_cast(ts.count()) * + std::chrono::microseconds::period::num / + std::chrono::microseconds::period::den; + this->laps_t_.Add(elapse_ms); + return elapse_ms; + } + virtual void Start(KernelContext* ctx) { return Start(); } + virtual float Stop(KernelContext* ctx) { return Stop(); } + float AvgLapTimeMs() const { return laps_t_.Avg(); } + const TimeList& LapTimes() const { return laps_t_; } + + protected: + std::chrono::time_point t_start_, t_stop_; + TimeList laps_t_; +}; + +template +class DeviceTimer final : public Timer {}; + +#ifdef LITE_WITH_CUDA +template <> +class DeviceTimer final : public Timer { + public: + DeviceTimer() { + CUDA_CALL(cudaEventCreate(&e_start_)); + CUDA_CALL(cudaEventCreate(&e_stop_)); + } + ~DeviceTimer() { + CUDA_CALL(cudaEventDestroy(e_start_)); + CUDA_CALL(cudaEventDestroy(e_stop_)); + } + void Start(KernelContext* ctx) { + cudaStream_t stream; + stream = ctx->As().exec_stream(); + CUDA_CALL(cudaEventRecord(e_start_, stream)); + } + float Stop(KernelContext* ctx) { + cudaStream_t stream; + stream = ctx->As().exec_stream(); + CUDA_CALL(cudaEventRecord(e_stop_, stream)); + CUDA_CALL(cudaEventSynchronize(e_stop_)); + float elapse_ms = 1.f; + CUDA_CALL(cudaEventElapsedTime(&elapse_ms, e_start_, e_stop_)); + this->laps_t_.Add(elapse_ms); + return elapse_ms; + } + + private: + cudaEvent_t e_start_, e_stop_; +}; +#endif + +} // namespace profile +} // namespace lite +} // namespace paddle diff --git a/lite/core/program.cc b/lite/core/program.cc index b60f279c0f..45796a478b 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -122,6 +122,9 @@ void RuntimeProgram::Run() { #endif // LITE_WITH_PRECISION_PROFILE #endif // LITE_WITH_PROFILE } +#ifdef LITE_WITH_PROFILE + LOG(INFO) << "\n" << profiler_.Summary(); +#endif // LITE_WITH_PROFILE } void Program::Build(const cpp::ProgramDesc& prog) { @@ -183,11 +186,6 @@ void Program::PrepareWorkspace(const cpp::ProgramDesc& prog) { void Instruction::Run() { CHECK(op_) << "op null"; CHECK(kernel_) << "kernel null"; -#ifdef LITE_WITH_PROFILE - if (profile_id_ >= 0) { - profile::ProfileBlock x(profile_id_, "instruction"); - } -#endif // LITE_WITH_PROFILE if (first_epoch_) { first_epoch_ = false; CHECK(op_->CheckShape()); diff --git a/lite/core/program.h b/lite/core/program.h index 7a6700da61..1c1e4975c3 100644 --- a/lite/core/program.h +++ b/lite/core/program.h @@ -22,9 +22,6 @@ #include 
"lite/core/op_lite.h" #include "lite/core/op_registry.h" #include "lite/model_parser/cpp/program_desc.h" -#ifdef LITE_WITH_PROFILE -#include "lite/core/profile/basic_profiler.h" -#endif // LITE_WITH_PROFILE namespace paddle { namespace lite { @@ -87,22 +84,7 @@ struct Program { struct Instruction { Instruction(const std::shared_ptr& op, std::unique_ptr&& kernel) - : op_(op), kernel_(std::move(kernel)) { -#ifdef LITE_WITH_PROFILE - if (op_->Type() != "feed" && op_->Type() != "fetch") { - profile_id_ = profile::BasicProfiler::Global() - .NewRcd(kernel_->SerializedKernelType()) - .id(); - kernel_->SetProfileID(profile_id_); - // Set profile custom info - auto& profiler = - *profile::BasicProfiler::Global().mutable_record( - profile_id_); - profiler.SetCustomInfo("op_type", op_->Type()); - profiler.SetCustomInfo("op_info", op_->SerializedOpInfo()); - } -#endif // LITE_WITH_PROFILE - } + : op_(op), kernel_(std::move(kernel)) {} // Run the instruction. void Run(); @@ -113,6 +95,20 @@ struct Instruction { const KernelBase* kernel() const { return kernel_.get(); } KernelBase* mutable_kernel() { return kernel_.get(); } +#ifdef LITE_WITH_PROFILE + void set_profiler(profile::Profiler* profiler) { + profiler_ = profiler; + if (op_->Type() != "feed" && op_->Type() != "fetch") { + profile::OpCharacter ch; + ch.target = kernel()->target(); + ch.op_type = op_->Type(); + ch.kernel_name = kernel()->name(); + profile_id_ = profiler->NewTimer(ch); + kernel_->SetProfiler(profiler_, profile_id_); + } + } +#endif + private: std::shared_ptr op_; std::unique_ptr kernel_; @@ -120,7 +116,7 @@ struct Instruction { bool has_run_{false}; #ifdef LITE_WITH_PROFILE - // for profiler + profile::Profiler* profiler_; int profile_id_{-1}; #endif // LITE_WITH_PROFILE }; @@ -135,6 +131,9 @@ class LITE_API RuntimeProgram { if (instructions_.empty()) { LOG(FATAL) << "no instructions"; } +#ifdef LITE_WITH_PROFILE + set_profiler(); +#endif } void Run(); @@ -159,6 +158,15 @@ class LITE_API RuntimeProgram { RuntimeProgram(const RuntimeProgram&) = delete; std::vector instructions_; lite::Scope* exec_scope_{}; + +#ifdef LITE_WITH_PROFILE + profile::Profiler profiler_; + void set_profiler() { + for (auto i = instructions_.begin(); i != instructions_.end(); ++i) { + i->set_profiler(&profiler_); + } + } +#endif }; } // namespace lite diff --git a/lite/demo/cxx/Makefile.def b/lite/demo/cxx/Makefile.def index 1b5da970e8..cc2e593000 100644 --- a/lite/demo/cxx/Makefile.def +++ b/lite/demo/cxx/Makefile.def @@ -1,26 +1,22 @@ CXX_DEFINES = -DARM_WITH_OMP -DHPPL_STUB_FUNC -DLITE_WITH_ARM -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK \ -DLITE_WITH_LINUX -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DPADDLE_WITH_TESTING -LDFLAGS = -latomic -pthread -ldl +LDFLAGS = -latomic -pthread -ldl -llog -lz SYSROOT_COMPLILE = --sysroot=/opt/android-ndk-r17c/sysroot - -THIRD_PARTY_LIBS = ../../../third_party/gflags/lib/libgflags.a - + SYSTEM_INCLUDES = -I/opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/include \ -I/opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++abi/include \ -I/opt/android-ndk-r17c/sources/android/support/include \ -I/opt/android-ndk-r17c/sysroot/usr/include \ -THIRD_PARTY_INCLUDES = -I../../../third_party/gflags/include - ifeq ($(ARM_ABI), arm8) CC = /opt/android-ndk-r17c/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-g++ - CXX_FLAGS = -funwind-tables -no-canonical-prefixes -D__ANDROID_API__=23 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE + CXX_FLAGS = -funwind-tables -no-canonical-prefixes 
-D__ANDROID_API__=23 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE CXXFLAGS_LINK = $(CXX_FLAGS) -pie -Wl,--gc-sections SYSROOT_LINK = --sysroot=/opt/android-ndk-r17c/platforms/android-24/arch-arm64 SYSTEM_LIBS = /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++_static.a \ /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++abi.a - INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/aarch64-linux-android $(THIRD_PARTY_INCLUDES) + INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/aarch64-linux-android else CC = /opt/android-ndk-r17c/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-g++ CXX_FLAGS = -march=armv7-a -mthumb -mfpu=neon -mfloat-abi=softfp -funwind-tables -no-canonical-prefixes \ @@ -31,5 +27,5 @@ else /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libc++abi.a \ /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libandroid_support.a \ /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libunwind.a - INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/arm-linux-androideabi $(THIRD_PARTY_INCLUDES) + INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/arm-linux-androideabi endif diff --git a/lite/demo/cxx/README.md b/lite/demo/cxx/README.md index ec72c044e3..b7768d763e 100644 --- a/lite/demo/cxx/README.md +++ b/lite/demo/cxx/README.md @@ -1,6 +1,6 @@ # C++ Demo 1. 使用`lite/tools/Dockerfile.mobile`生成docker镜像 -2. 运行并进入docker镜像环境,执行`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/r0.1/inference_lite_lib.android.armv8.tar.gz `下载所需demo环境。(armv7 demo可使用命令`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/r0.1/inference_lite_lib.android.armv7.tar.gz` 进行下载)。 +2. 运行并进入docker镜像环境,执行`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/v2.1.0/inference_lite_lib.android.armv8.tar.gz `下载所需demo环境。(armv7 demo可使用命令`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/v2.1.0/inference_lite_lib.android.armv7.tar.gz` 进行下载)。 3. 解压下载文件`tar zxvf inference_lite_lib.android.armv8.tar.gz ` 4. 
执行以下命令准备模拟器环境 ```shell @@ -27,8 +27,10 @@ tar zxvf mobilenet_v1.tar.gz make adb -s emulator-5554 push mobilenet_v1 /data/local/tmp/ adb -s emulator-5554 push mobilenetv1_full_api /data/local/tmp/ +adb -s emulator-5554 push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_full_api -adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt" +adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt" ``` 运行成功将在控制台输出预测结果的前10个类别的预测概率 @@ -37,6 +39,24 @@ adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_full_api --model_dir=/da cd ../mobile_light make adb -s emulator-5554 push mobilenetv1_light_api /data/local/tmp/ +adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_light_api -adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_light_api --model_dir=/data/local/tmp/mobilenet_v1.opt" +adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/mobilenetv1_light_api /data/local/tmp/mobilenet_v1.opt" ``` + +7. 编译并运行目标检测的demo +```shell +cd ../mobile_detection +wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-ssd.tar.gz +tar zxvf mobilenetv1-ssd.tar.gz +make +adb -s emulator-5554 push mobile_detection /data/local/tmp/ +adb -s emulator-5554 push test.jpg /data/local/tmp/ +adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_detection +adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/mobile_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg" +adb -s emulator-5554 pull /data/local/tmp/test_detection_result.jpg ./ +``` +运行成功将在mobile_detection目录下看到生成的目标检测结果图像: test_detection_result.jpg diff --git a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 new file mode 100644 index 0000000000..784ad73da4 --- /dev/null +++ b/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 @@ -0,0 +1,61 @@ +ARM_ABI = arm7 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \ + 
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +mobile_detection: fetch_opencv mobile_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_detection.o -o mobile_detection $(CXX_LIBS) $(LDFLAGS) + +mobile_detection.o: mobile_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_detection.o -c mobile_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f mobile_detection.o + rm -f mobile_detection diff --git a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 new file mode 100644 index 0000000000..2304b38eff --- /dev/null +++ b/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 @@ -0,0 +1,61 @@ +ARM_ABI = arm8 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # 
+############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +mobile_detection: fetch_opencv mobile_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_detection.o -o mobile_detection $(CXX_LIBS) $(LDFLAGS) + +mobile_detection.o: mobile_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_detection.o -c mobile_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f mobile_detection.o + rm -f mobile_detection diff --git a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 index f795b41d46..8ab8a3b743 100644 --- a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 +++ b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 @@ -5,9 +5,25 @@ include ../Makefile.def LITE_ROOT=../../../ -CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include +THIRD_PARTY_INCLUDES = -I../../../third_party/gflags/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) +THIRD_PARTY_LIBS = ../../../third_party/gflags/lib/libgflags.a + +CXX_INCLUDES = $(INCLUDES) ${THIRD_PARTY_INCLUDES} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = $(THIRD_PARTY_LIBS) -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_full_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_full_bundled.a` + +#CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) mobilenetv1_full_api: mobilenetv1_full_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_full_api.o -o mobilenetv1_full_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 index d0767145b0..c13320603b 100644 --- a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 +++ b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 @@ -5,9 +5,25 @@ include ../Makefile.def LITE_ROOT=../../../ -CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include +THIRD_PARTY_INCLUDES = -I../../../third_party/gflags/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) +THIRD_PARTY_LIBS = ../../../third_party/gflags/lib/libgflags.a + +CXX_INCLUDES = $(INCLUDES) ${THIRD_PARTY_INCLUDES} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = $(THIRD_PARTY_LIBS) -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_full_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_full_bundled.a` + +#CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) mobilenetv1_full_api: mobilenetv1_full_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_full_api.o -o mobilenetv1_full_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 index d235d6e25f..9150ae6e44 100644 --- a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 +++ b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 @@ -7,7 +7,19 @@ LITE_ROOT=../../../ CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) +CXX_LIBS = -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) mobilenetv1_light_api: mobilenetv1_light_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_light_api.o -o mobilenetv1_light_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 index b91aadcef8..7a2dbdd0fc 100644 --- a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 +++ b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 @@ -7,7 +7,19 @@ LITE_ROOT=../../../ CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) +CXX_LIBS = -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) mobilenetv1_light_api: mobilenetv1_light_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_light_api.o -o mobilenetv1_light_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/mobile_detection/mobile_detection.cc b/lite/demo/cxx/mobile_detection/mobile_detection.cc new file mode 100644 index 0000000000..9b8f02aeed --- /dev/null +++ b/lite/demo/cxx/mobile_detection/mobile_detection.cc @@ -0,0 +1,210 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
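The mobile_light makefiles above now link against libpaddle_light_api_shared.so, and mobile_detection.cc below drives the same light API. A minimal sketch of that flow follows; the model path and input shape are placeholder values, and the template arguments are the usual light-API ones rather than something introduced by this patch.

```cpp
// Minimal light-API flow: load an optimized model with MobileConfig, fill the
// input tensor, run, and read the output, as the demos in this patch do.
#include <iostream>
#include "paddle_api.h"  // NOLINT

using namespace paddle::lite_api;  // NOLINT

int main() {
  MobileConfig config;
  config.set_model_dir("/data/local/tmp/mobilenet_v1.opt");  // placeholder path
  auto predictor = CreatePaddlePredictor<MobileConfig>(config);

  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});                 // placeholder shape
  auto* in_data = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) in_data[i] = 1.f;  // dummy input

  predictor->Run();

  auto output = predictor->GetOutput(0);
  const auto* out_data = output->data<float>();
  std::cout << "first score: " << out_data[0] << std::endl;
}
```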
+ +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +struct Object { + int batch_id; + cv::Rect rec; + int class_id; + float prob; +}; + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} + +const char* class_names[] = { + "background", "aeroplane", "bicycle", "bird", "boat", + "bottle", "bus", "car", "cat", "chair", + "cow", "diningtable", "dog", "horse", "motorbike", + "person", "pottedplant", "sheep", "sofa", "train", + "tvmonitor"}; + +// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up +void neon_mean_scale(const float* din, + float* dout, + int size, + const std::vector mean, + const std::vector scale) { + if (mean.size() != 3 || scale.size() != 3) { + std::cerr << "[ERROR] mean or scale size must equal to 3\n"; + exit(1); + } + float32x4_t vmean0 = vdupq_n_f32(mean[0]); + float32x4_t vmean1 = vdupq_n_f32(mean[1]); + float32x4_t vmean2 = vdupq_n_f32(mean[2]); + float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]); + float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]); + float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]); + + float* dout_c0 = dout; + float* dout_c1 = dout + size; + float* dout_c2 = dout + size * 2; + + int i = 0; + for (; i < size - 3; i += 4) { + float32x4x3_t vin3 = vld3q_f32(din); + float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0); + float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1); + float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2); + float32x4_t vs0 = vmulq_f32(vsub0, vscale0); + float32x4_t vs1 = vmulq_f32(vsub1, vscale1); + float32x4_t vs2 = vmulq_f32(vsub2, vscale2); + vst1q_f32(dout_c0, vs0); + vst1q_f32(dout_c1, vs1); + vst1q_f32(dout_c2, vs2); + + din += 12; + dout_c0 += 4; + dout_c1 += 4; + dout_c2 += 4; + } + for (; i < size; i++) { + *(dout_c0++) = (*(din++) - mean[0]) * scale[0]; + *(dout_c0++) = (*(din++) - mean[1]) * scale[1]; + *(dout_c0++) = (*(din++) - mean[2]) * scale[2]; + } +} + +void pre_process(const cv::Mat& img, int width, int height, float* data) { + cv::Mat rgb_img; + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f); + cv::Mat imgf; + rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); + std::vector mean = {0.5f, 0.5f, 0.5f}; + std::vector scale = {0.5f, 0.5f, 0.5f}; + const float* dimg = reinterpret_cast(imgf.data); + neon_mean_scale(dimg, data, width * height, mean, scale); +} + +std::vector detect_object(const float* data, + int count, + float thresh, + cv::Mat& image) { // NOLINT + if (data == nullptr) { + std::cerr << "[ERROR] data can not be nullptr\n"; + exit(1); + } + std::vector rect_out; + for (int iw = 0; iw < count; iw++) { + int oriw = image.cols; + int orih = image.rows; + if (data[1] > thresh && static_cast(data[0]) > 0) { + Object obj; + int x = static_cast(data[2] * oriw); + int y = static_cast(data[3] * orih); + int w = static_cast(data[4] * oriw) - x; + int h = static_cast(data[5] * orih) - y; + cv::Rect rec_clip = + cv::Rect(x, y, w, h) & cv::Rect(0, 0, image.cols, image.rows); + obj.batch_id = 0; + obj.class_id = static_cast(data[0]); + obj.prob = data[1]; + obj.rec = rec_clip; + if (w > 0 && h > 0 && obj.prob <= 1) { + rect_out.push_back(obj); + cv::rectangle(image, rec_clip, cv::Scalar(0, 0, 255), 2, cv::LINE_AA); + std::string str_prob = std::to_string(obj.prob); + std::string text = 
std::string(class_names[obj.class_id]) + ": " + + str_prob.substr(0, str_prob.find(".") + 4); + int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL; + double font_scale = 1.f; + int thickness = 2; + cv::Size text_size = + cv::getTextSize(text, font_face, font_scale, thickness, nullptr); + float new_font_scale = w * 0.35 * font_scale / text_size.width; + text_size = cv::getTextSize( + text, font_face, new_font_scale, thickness, nullptr); + cv::Point origin; + origin.x = x + 10; + origin.y = y + text_size.height + 10; + cv::putText(image, + text, + origin, + font_face, + new_font_scale, + cv::Scalar(0, 255, 255), + thickness, + cv::LINE_AA); + + std::cout << "detection, image size: " << image.cols << ", " + << image.rows + << ", detect object: " << class_names[obj.class_id] + << ", score: " << obj.prob << ", location: x=" << x + << ", y=" << y << ", width=" << w << ", height=" << h + << std::endl; + } + } + data += 6; + } + return rect_out; +} + +void RunModel(std::string model_dir, std::string img_path) { + // 1. Set MobileConfig + MobileConfig config; + config.set_model_dir(model_dir); + + // 2. Create PaddlePredictor by MobileConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + // 3. Prepare input data from image + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + const int in_width = 300; + const int in_height = 300; + input_tensor->Resize({1, 3, in_height, in_width}); + auto* data = input_tensor->mutable_data(); + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + pre_process(img, in_width, in_height, data); + + // 4. Run predictor + predictor->Run(); + + // 5. Get output and post process + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + auto* outptr = output_tensor->data(); + auto shape_out = output_tensor->shape(); + int64_t cnt = 1; + for (auto& i : shape_out) { + cnt *= i; + } + auto rec_out = detect_object(outptr, static_cast(cnt / 6), 0.6f, img); + std::string result_name = + img_path.substr(0, img_path.find(".")) + "_detection_result.jpg"; + cv::imwrite(result_name, img); +} + +int main(int argc, char** argv) { + if (argc < 3) { + std::cerr << "[ERROR] usage: " << argv[0] << " model_dir image_path\n"; + exit(1); + } + std::string model_dir = argv[1]; + std::string img_path = argv[2]; + RunModel(model_dir, img_path); + return 0; +} diff --git a/lite/demo/cxx/mobile_detection/test.jpg b/lite/demo/cxx/mobile_detection/test.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6bb36e136deec6088c7b75215fc35d6231283673 GIT binary patch literal 127499 zcmb4qXHZkm7j}S95_*?vfKWmcklv(2LTCv!6sbaJQUVHC=mJth=pB+kLT?Hv3P_EG z5Q7#kbQs#?I!4Q+Icj1B%b2m_d%or{x8oQFr;KtWo;;Qzb)8v*dL0bemSGcia5 z7Hgos2w-3WGPAIwnXYNcU$9wjkk&)@F`oB61jJ#6nOnj#NKqmod4ZmnJ=6|aI zj{j-!GVuZ~0p3#z(i33GJvn8Yk_g9c9H^0Fe6;Gno#vAzNkt~sH@bSuA3n7tqclBO z2(GtJ8|R%<51F%PKg7u>4Hs|$ZFtp-ij3=T5M&fG9TOeC)Se=V?^FUyPb44Z`1!P5 z+^UOTnEVNse{pb0%OiE)QH2du{?%RN3B2I3rI8NE1vV|~?w4AkHEEbNR8xRgrLdY6 zb>1anE=Sn(QC^`Zf;Ffm{nl}oa5YzTQYp7tW0-{OYcR)=(pRdUE(s^D)9Iqf%Mmli~KN|J)4$E&!jtb27yQW!mG9*l!SJH4LI;p9L>M+lOE@x>L?cj{f%RZNm^q@3u*H$#fUgKR!UkB%53Q(mO2dIhQwz^4e&*$@W47RN zpgu-uC(7DOb)Hh+uTVY)FS#Y8NX&Ea5%8$=VwEbw#+XeqftN8`(+rZP>%fS2agps!TSp|ByY$xTMTSH+3!Hp`Iy_%SDFrj z)89TRPVT>yp`QMU{@Y2|S|-zJzP$Vt@>k$);gD5`d4(dm$;v%E-*GC4Ene^rT1+Jg zHayQFC04-!M@rb~MkiB^N-?WLj(g5N&zeDXF3D7vkgv-9KF_HAupP!h#H17O913L! 
[remaining base85-encoded binary patch data for lite/demo/cxx/mobile_detection/test.jpg (127499 bytes) omitted]
z2Tn2%5AW%*J@p6A42Y?U8__Q+=9OP(#V`0?xAv206Qsxm&9sYbx%Oi>kH>_H2Z>I} zB3Yaah_uK?Jkqoe$D*RtR9R9BnNyAwGHyY%T_D(F;rjMA%S;PJl@CG5lQufc>mZ6d zQ)8w$$oJClAsBY6|K%kqpfXYDAS!khZ~m3Z-@n0!*@ijL5!Bgf#KM_wk5seU`^n=D=RRXTmEr7hvcfY^fa~&aO+#Z z77*IC@2pc=DLtg*X?0cFxv!~wxrzU#AKSTBNV#I1J88hg2j{UwZ3#!gU7Dm23?>Kz@F(_jA-pRv zL9pVy+M)A*fI0XJ9F3Iq^gdqlkR-0_d~%rg#K<4d1l=l<@mWDri2CwS>3)#9*wCQ|L!VlY!P zA%x)x{Ps^}@xTLw;l)qARHR@@+_}|8Ydv#(5JbMs8r}_U{qK4S@t~n6_GreEl!N^6izLwjrtvhBatav1R_=@W8ko?cHj9 zSS_{VR?Aji6BBD3!95<;VbXGyO#Gv#E9k03zha*@Q054s^6z!l zJeKL+-aCHT+-4en74*aJ%Nq`r7R0aL=H^vhw*o3GTq zT;)`LX<|udvA}!LF?WtZS)n-BsjT#lgM#nv;_j2eFXYMg1OY&KI@gD z?G(LO)y?F!+C`{2E16tiY5r-h?#!(~vl2U2bb5OUPbO}6nzLTo+3Xh2#bK6VQ95*Y z&Y)edLQZJghIzeb+&uy4e5|h?Ss0a{W8YgXZ(;{|zJI-WNj+6g%8yAkqtQ=z%&`}p zB#C!_D|rl28XK9PaT$WNbOT_^I^(Qdsw$F9$yxM7%Pw{beY9Ly8#W9r=`CzTQI%iY zLgiPL>kzmIfcRbSEv$TGC%N=EN{kPxhxf{MB*xvz>&-^ zk+D8j%-Lwlrf0@}rEXl&v5?o+w_X;#mMOD>Nz<*2~!H0>;$M0Y9f4y;jx!Fr% zYh)Yv7%P-A22Bi$b88aO)f{3L@rq7WGUky*jlZpZ@|MZ%YQm(0V{y~UO8aO{sXH8A z?g1VMaiII4*d{{7rt_t&aF@5St@%z#hjUsc`glw`r1PP^!^&a~kTKgV{=W6ipSq)#u4 zs_}g=XIRZ|QG&=pjVI1^eacooyqSWOH!cq=s#k`*T!8O6#pKA011ZWW_f%@32lcL1 zi?_aKrmX=f^9VNf&1A%WoTHKa|Y1hzU5KWYDNO;K>X0-RsxY!V)L@6g-0sStl($*%8jZuOJ<>2b7 zFPFzp5X2;6obGt|maPgH>El#yp(?*18gd}JbH%c2un1rmNMPCNN|Qe4s&{6Fc(xr6G(Q>%00#Xo^in$4%6$KWH(aDf> zIiQLC?c8x&ZRh?k##67M+2?&;M}1l+{i#W2@^|wdaHlm^xjCHb*+`~ckcw@m7ED9q z@c#jNKl(34=z{t4kN71HjjC*l&7QWk;j|}DcMLL zmOrOwQxN+%pTjvNZBV2&0fm`{sXy6nRwb!3^diG;I7fhjzp5swtgBb)lCGCe59F_< zOOOSTdyuHzS{b3kA*uCjh)({tMb3+9uwPPtN&sdCK@=Q4P|akqrJ@+tuWmgqADszF z1;}A|xy^+e)KHDBEsR5(=3gCx`0>cXOg~Ke9gmx@vbGx{anWF|WmsK=@>>v_*3PKqpFsUt5#(Cph z-Lw@Cj9DQL)5lDQd48Q*>rC#AD7N6YjeiW}md#poO|ZGG;|(6yB7gLV-)>HU{y-4Y zHj?>+n_bVjOJ463<_Ts*e3QhAYCMAcPI^j!V8nZ0eD+Rk<)*uXmv@9ps;mxK2dt_q zc%I+<$;GR-bnN_MX_@60{rC1$T4nS>xYn{jEM+Od@iZte^TC+pe*hJ*0$XV9$IK+w zO{;m-uJ_#ipz`G)77ms_3PGJNGIA>6q5-)aVXq(XtLh8EPG-ba}KgFYTklNw|zdudq5H7GCBWz zCf#7?Ylz-Ns!eGvGI5Zi6)s!E>cuJ8Y*%3ss~(8^n?o_j_WvaQ|8e`waCcNpAL8m1 z>zirk4(!Uc5UX9Mro~iYSmiYq_B@Yoej(ILvbUA9=9=U@d@)Rd9x#7jCLj;n7B3pB z9xM#h%m&hZ3>RdN$>h4`iw&(TiLTMG(LB#R-7(R3K?Trp7HJzmxX3ltO6pI(NjaSR zNL>oYG?+wqWHM}|M zsop8xAw#smM7q)is1EF|9agkUXB$qJ#In`XH8#@bj+?^oJmw{FUFgHsu{X}UiD#G$ zcV&<+*v=%`W82K_P1p`xU#nW`?=!V?Ge$psWDg=*7nD|KE+G=t?ZB%H@1Y} z&9H8N_{SAGF*Im$<%|*Kpz4TqH|Yir%hc<6#Ja6_Gx&@dzntnq)R=c*^~tH9W>yEN z+Cau%ozt<5tLcNJV)D_Bc##pWi3luJA1^$Z|jDCqCT{?s54}w z4B)5>$Lt23qfIaD7A|mo`B^#W*3T^!w?icTiQj#kU~yT02`eVhOe1FU2`L3Us;x8#x!;zC3B%a)R9#9T_le_%U>a zZZ_pJE&IfE-TTA>9jR;8lwh{+54*wRwdT_nDFU?)BRu^loeZ9{T9dm+m}g^lkaGSf zb@FK75n-aq>xk`!c71b5T#69ruw`bNqOc_ezc$qhU+OO38hhJ7b_A}qsKR+OD^c8j zCf>@@TioE&#(AwDV4@wRW$IMVG%O=cNM`?Xw@A=bYuj|FsEJmj)}x?D0N3^og3FYX zcBnBx?C9x#03veV&8u1`XyjnbKd*PaGX`oXKT zbqyFeTnlcNcm`hj`8x{UC#&a;^uFq{piciKUri*_Os)5rbcVK(daKsjJn)4!x3~f9 zf}EAE?u*tq9I6Xm-wXU0_S0pW)^>vhx;2fTNVvu{b(D~-}_ZJz5|()C)i_gB(6 zV6X8ZDS2nvKvnHaKImX|Ih(Tvqxp^SvqGj3Wv36?jC;%rnqR{;EniP3;i^aYEHi^^ zTjq1*T=DyY6Fjb>$bjS z@bV=F?RkAlyD$fNs$gF*!RqZ?7kw`Ia$D}4wa{qFf}gi*M<3Ec%V+M%F`Pumd;`6l zoo}&Yc#q^$(Gx`_pTh-3ueL2*!bp5V_1w>4V4>xZ>h4&bYE7qrlRB*gV92H&cmEx| z8X=N+>F*L@E4At|CL=BavOPsP@Xkq>{>)DGLbRDeN&j1mIvsmsrOCspNoD%)N>X<+itj#6EqCcNw_8*)t1M|M*0Il^ z^N`Fmtb4D-kJTDC5shsQ;ywb}!LiCHwiokR91h5HS6JxnhQ<^YU6EZ&9gm z$wZ5f&U2b7?!5u10%DUBQZISmbez&CP`4R$4o#<=g@tJGUfph=k5dH8jY@(N=a!mr zULOJH!BU5oee*mKzXt2d8m|7{+Uf3s*qM8m#y1fp;~rf5kfllR>pQMTY%b>>&xii2 zGF?($8-0&E{sRcv<*c%r{6z}xY+Tmf@ox=->EwR>h32M-G(XCH@*eMq)K5%LB(lsd zE~Fg{dk4b9-?`QN*7|G!@%qfS6_^ly&Ef9F0s{e3d+!X+-0@XlEXE#Gq;^NPY9|6I zBs4FR(jsDzuSjm?*H5PKDT{m4lMCUO24JuVC2i|!eBx+1yB%RVPe4a&^ym{oJ;kiu 
zl*6o*?mKkNVZmw5emkWhiTv25w1W9ZIjN5?ag|!jRHU zl+9_PqbKW%HtX(KeP>WFDM7Xq!nDIFGjXfF+~828eZ$CSeZVCzN3Xr8xml!$Ll#4< zs*0WZY1^NQiCZr`Xmk3Ru?#OL@J<#LP4q_|wAXFSN};Ck_apL-t70~tSJSRKfg%X* z9<~q5Gb?cKe(cbcSvHf@3h@c&nvd7N*7{^)N3Wy*FWQ6sSwdMh8{894JxjTe_LcBq zde4-Hv+#Aa@$rKl_^x)LA4B|gbO_Fcv}RRO`KX{ldK_1GOg!Qo`w2@29CH1Y!pZQ`mL78mPPd z?}8mdNHvw}iz)3XQ_cnTuJww>CG=83G<+y)mIs1+(ToUfdySe}DVg0pDbjsG!2K1E zznoj~-MeEjtdq`WxAIFbKMS8#>C0EyBq}17EHi%IwXrpQlqbYb!-P%?T4wPN4>oU#sp8mRH$_f?o9N-r= z_&Og-iV*M39B^&bdBr48ZvQ+NQ_C|`_ro7^2d!(s)8S0%YZNjGtYN$IBCrHMQ|2n@ zolbqJ-r3j2;zW0Gw?f4!44>j!z~#RvSpmWbmU+p?S2AW4;H{k`rnj0EqBBb^-vu^v zm=yqp$SpRs8RkO*6bl%ciA#X-hyb!Pj)mWcrJ5}03E;uD8q$!PBWzY#!~i;MHK8C1{_C}7AXJ(J6j zNgeFWSfj4ABclOk(q(&vX24ztDw^;Zp@|y&QZ^q(iZixKLARod6}3__Z0IZ%S%6y2 zCbCCWN|NGAb|^Z^1P+bDp(+MQ6`wbui;=iHKOuj$tk1LX@@!g7V0kn2)2W8E=#<;? zq&cS=Lh9CPT{Vx?^a=3-Q!?fKib|Th6WP2=D|hSeV!I7PG3#wxL&tp9qEL29Vu2o`@Owr(gMtKg=nR--WSBsx*{G zNL^Bz8I29SMUG$Rzrr4W*h6e^%?7(51OQeGroc9DZk~E#?1K*`CZ8vdP{l^YAZ2Z_ zH_|bT9-M(%h{fqWTPxSmN8Z=uV(nstVImQx^gEUXK@KB@eNE<*Pw>yylotYRYhTAf zPGtLaqwB-^I0>On1Xxm_TH!~P&gSN1l?nXv?_sagq7G<)CS-BTY^5IGh!0WKHOyGL zdHJ~J9oP9ZJgmaB`(s98zgtkvobcMWXgbo?$@20^=zy`O*hSWznt}qiUUggeY9<#~ zJnl(zax(`fuN|?ldp($ScxMij-e)5l* zX)0m&fLpHI7D!norwsVEm*hvdRGj&)q-F^B_TDP*s}~4ztPFKrmSPHRQaH@T7_=f* z^dP%#+{P-l4s@C$k_Nro-G3impHnJxyNDv?&Y24brwk@rVM&Dq@*Un>>GjIezhkmD zG3r=pLEIx7qeU?@7HT|c(JC+e>t=_PJJvd6q<8K5j@`1PWfP&eKNX$hlGMKG{=b0+;CB}PGluf4ieGcWmUMUBC1`u3Z`4A^67Hh$@u&lH| z6$gvN%0qT?tNX(trM^mhsfwZ7>Ypau^Xh}x>4qhbM7n`r#J}2XA#S;$8Z-3yW3a1@ zTo77M6x*Uped(Isol9onermiP>mS)rbqH00L(iaCkE+-O!AhT+ra+Nf z3z=#?ZjQD#1CK1ZTVa!On(}|n+Z3*3dvEjm8DetR^^@;A-y`6k>Sj9ek$J9Lv=k}H zpL6rD`XMA=6f>(5j$U3jDMiTI^u?Ds@8u!ssb_Q#{%XHTGN9Dhu{Q-G)mjgz71tQh zAiIB!7SCR#GX)k1n8&(iCL?)|I9|O+UjC+aB*E_krE9K(4@Bz0&JstYZp|-%ak2vf z(3;@CCZaiM53N_+mf7iv^7A36xp&4NgJzTKvZny=8TlCd0`WHhy z>)8?$SjDF-T0-ui$1SG;#X3_iT-Ln#IvVL5BDPOjk3IXkAzJVUJbs>B0NIB1RvOao zynhhRTdki9g;ViD&!*{OjQoj^c_>mpGWXW}CATag`FoDCqZf$CW*R{RQNt-d=>Aw& zDPXnfNzACf#JxMK_y&65^o(EJCM+0!d>pjn5hvF*1gi3DpufK9AElNfU&1`fUqGjs z;it1}ie>wQ_S@mnkaux?WncVYjiUy^on*0QY;{~2MD--KA?+j8>hBR>y|lQ~0D5%$ z*??}j#UxnxK95Hd0ivrMny9PKrlYiL*D?C&XO*b$-cJ>gNZd^)0427E8bZzPZ9R+E zE}`GFq zg=f)6H~1nr3Mrw4>G|i4T)2%RQnKKH5wT@GRsi(=irmg5(jgsBExM}u!KO;jhd#1~ zu(+`xs`d+4)}Gqz6dBgdwIDiwCTGtS=aL7+eX-nh8x@2%yyW}SQX-Yzd-`T((8~l~?rKx@*gZoBZ<1?p$kKNpXqq`?eN|>}-CQx&X zz0d6xKGb-WRuEb$#aKx60kxe?Ik^%ne?l+2%G=LI#HuA7k3Q?}v5)0;(H>_DKOO`- zG1$-hbht?3h%jbXhxVuGck8D1v^_{^ncnZ|qm(g;O#^GT^_ng_UfC;MRQ|lm3mnZ9 zm!H1vaSa3P5#A2fDO2b1O=X3dtj!>rhnOm?kf%pGazZ-aWmy zePTx`V+IA&e0R}K%C<^UmdUV=`G=!M0o{b0VxTOYY=xeQEEQ$LMmSGl@-&r)%Z6+> zpKB;H6dG6VsHkf?WvVA@kXJY_8Shy0g8Idm!jN!Z&*G zM{n0F9@4k|o?z0u`QuN|Z7uk*`on1SNv}41=Y=JwWz9&pZ0V5A@)$FE05m>eqM6Us zHLZ2|)jV9869 zbqa|}@!NZ$wP)>;=K*?c?fY9dw3!zoQ;c&HVWAi=23XWa)4Sz9iCt8nr=Nb%*ZV@G z!A5FY#RdGx+da(7bvUKR)V6PIltx+zRUUYErQ=xKH!1mT(JS@N6G^RFpn2o$6MI6~ zC&o4JH)$22Po8yGt)-5L8%5m#7O{L|b2;!@C=`$T%{(@eT(={=Q59=lpE6BWpMv<3 zG3B7jL}^2SGwme5?te;#5hI>#t2LWfy_-7jb<=(C!rZLZqK$8VYzL>B-Je=u{tb`4 ztMhYl!n?kYphZbDk%ojecwCAYreSnHggq9Xu^mUJ zd0$}0#=UJtR<*2!Rq{=sK_moL43p>87a!upX)h1Q;-5dsNexhiBgsU~!J9dR(3~5r zkCAk7HFwBpXW32Bkd)Sp?@s-t)%+8biE}czuQ0d|4;xHTa-x@W=gOE2kq8-LvRe6= zpXNK@dMS{>tfJ7RrSnY|+ZUeug7&teiUecfNc;nyFGpoXOKs#j-qT0g#zigqHMdzd z6aVZ6UIZyJbB~wA(qG%-*!O`)rmCIA-Upud#}rwk;Cj=v zuA1@N3+1>szIck#Ow96o6dv~Q3G?2)`XA_=OMv$KZeA=rcBVT9!c?nH0u`__96UDz zkwxvagn`d*4itC4{O;^-@5Dc@T?bUl0B_$SEpl6_a{l5a$3Uz^?LyxMkE!P|jLJiB zX_`OZQE3}xY1si70e9mO$6}v`U6-0d##|`=un!jR+eLl*X5wQki9Gz!7VXJSrx&n| zUHY*V{OWuD0eOcG(AxGJ<;Gnd(|na>^2Y=M@kzX(TYqU-rRHH~auzwOqDf0-;v4K; 
zDY$Pc^F6?ud4oQyaeGHAgfLeO6`9jSKnbr?A?sAooQc~VQr!=f9VQYx=LS~DYjL?G z+n~d`Vu8Z(t4621ZTDvKr>*XlW=_+zwzZaS?S+UEGCtFIftuthtG^u;J6wrsj6)e( zm7hp=U9;oG6~ z&)P5i;dA4>&TyNBMy8QHumgaD&%Px*j|dZ3p{I2TB>y&HQ|{B|O8wnLyuckqUoOhT zq0UcqhYo|z$J9q9wcz>zB@mz;U5`FWKi_wzQA=gOB2Z`s@*Q);?llmYHPDdk>)TvQ zyM7KTbx7Gx06E#oDNadjt{IYx=JaGGIRk+=*t0KPcd_fUVF${ihZXFVKJi?Tl@5FB z#j0njB)7ERile=2VgPn}>f76Dt1?#qY`3y&6-oBo|_ zV@zIj{Xy;6@ryNv*NKijDO@VoGC#Gc|N7?h;LDr~V#0bl5h-wkrD2E$lMiKEm3LiP zH#ZXWX=vfrXqDZ1{U3myk}j79`7-5HcV##d`q(X!4sw#Xj8J;`Y>0K(qg@H9KiO!L z3aD!df#ad1=yXThRXBB~Xk>`JWYNnb{_D%8fOTr~Yt4Lq1C*Q6 zl5D0N=u5rbvx5|-acDvO3_LEMZlXQ7?IeNb19wRnx?*!3S}3HP|EIRtD7fv0n-j$^ zi{7%()hZ6%U{7hEh$;E}xrSWE*M#b=kj3iwCYX54P*%4N$Ol*I>n3}L4w~c{uXIFw zt5hPLLj)S&BwNr(!R(LhobG}wXx3!`?c2w1JDODQrlP~KTpU*QETaOUMgCAwUcQA$ zG~a%HoqkrneAwGDy8(ycKtRC(PhQ_zQql6JjLu*q3?T{7}uSl znsBqFOVCTRh4+W)-0SvxKR=su{o_Dq>$D%hJ+2HJte^eb28$V?>Dwn4E=RkGHppQT z3V(R3r8OC~AK<>Z6{&Ug_1{bzd=ZM6(|Q%UyI%TTOrYlpk|(yQjttvZwHJsM6H_%* z#$3Af{Trd>euZA!cT{hm5&iRrE;I*Ty2sg@w&yvbpBV}-#a;cwW`$$AyBs~eyLkJk z2v?30DUV>gCV=R@-1u;^xK&%h#I=8Z%77i!+R9UZ$(HlsIci1Zo zzA|x2Y95^VqC)gt(a3yFgt>;E)5p@(GLO?e>1_6GO$VKgd%X69_hy}q{pZ9 zrB~=2AuTP7x2eKq6mV2+qz9*U!XAX!nu;oYU=O136uWdT88IdH1_+e4%`ng&`-J0Z z!s4S&Vr7kH2_D^q{{XBTzIngYDv1DKah<+N)O(bih#!|`$`@>y*E9=P{Nq<=K_T(Cf*Z3-agXH3HS4)>=9g~#h{n0*2hCdo2O4;fpvzfKU9Di_7Sro0yHq($C-umqTK_v4@(b9Oth-6@L~N2 z&eV$PrWS{i&UaZo)4vr|-A45H92n1R+Om|(nGM~SEzN7ZmRr~gnbg)bdP5&O@EkuE zy4PB~z%!g8P&hcpBBYItVf8?C%CkUJM~8Ug}F)>V&2hT z8C?FleiSgS+Zgx5E{N@;wTfEyUzvP^wrcuGR(nLye8^gLL4ty~rUOlt z3+;n=fly;^hyYT$T<#66ZZ5RKg7^Ey6j$cQK7Y;odtcsVe;R!n{HSz3x@VTuvv&b$ z_tJDwtgWh4DOhO}`7yubWtp2Y&`%a4c~o(Z(-3El!f&t_4SP~_7_rczFkv2^s^$ti zv+<|0f8H^TW&hfaiXsT9OC??jI6w- zu4RmLjtS*dk)(BQK_9n6TNn%(J3Of%b}(JTa2pj_Oy6U)A5~w#s>hItRf; zx^#z}p|a&DOGd%;`*`*J@E^|Uq^9?ZGi5Ek+NIZ2qHs{|5WZ}$t$TkNg4yBreyMTie<5w5x_;GicC%XPLX)MzhJD-uYxZbGyoBg^?bD>})9~_0#FNlj`FFAGfIpxdf8yU6YPb2|0Q|nAZESaQ*^e zvzNYDrWT-S#^~4%L3bd!q}IfK(BkG+)t!yz`rhA^->+{bdY+T%yPks|D6dc+{A@H2 z6Wvs$iGL^i9+hRybJ=X_K6Cn7e7@o2LKrHVxsIZTYO4Yb<2-bW8eUXK7wVoMejStWUTC9`sn)A)I- zQwj=GFflq8RVb|sgIb@!vgLYKv%*%hEh0dW2hcj~Z8WIM8uIDM4Jd4ksv9?CvXWp4 zU4EFAwSD;H9tOJvY<+Z^)O?M2A-dw=iMO^+fB8i^!_Zg;>f4x+jlZ2VBE@Jq)OzdR zEpDNh`NT})DoQ9#S$KH{aNw4Mv`2c!<4;2My*^&ItZHQBZlTS|u7jqhXR^ORv?fq& z9R7AwqIpcZX9#}*{nt)psb}+agK}_W?h38!lVFhRfP>gSuNO-*$M-|M8yl7ds#GI6 ziKKh8yEQE9!(ZGUy}~l7d_+5{RvSDE?uEH&Bp({n`O%)$GhRl>N67gGE6EwUaD3`h zO}4i;{{cy8$fcUy&TK+y^e+ccWn1!5E@(2HY!%xSC!HGIP23tW( z8`k9%C6SV0JE=R}_xPg}p;n(GG-@&i)~r6t0qrd3G|x&the062DDXUD$ZiB+2dz>6 zxqayRytr%|C6qNM@h*N9S*-F4q0xoKkj>nc)SJQR+DZFefW^2 z?3afpb#NWt{WG;Vzs2|*4bHy z!`*d8W{EP3>{W?4=PDxOoU-?dNTQ;JWJPwFWhE4e?9uN&-{0Tw^?E;F&*$^?dOlyz z=i}iXM>?T*wYro{cVXu1L~lHDdRi91;s0jDP`Rslb3F!UG#MLdc(BU$?Aer(URPzGE(mJH6by1!xWamX{q-vm09;*;Ykmw zaRc&7c21Kda9Q2sw6OH^gQRB_Rp@UV>Bi`)_y!1tD9Lz0`*6ZZIpLc&UO7j)qJ!~v zw<`cmx)0YdExbo0@MhUesq&Q@amW(7WX)wspmm#8S4J>EILS-fkZBev#c7ns&L$d# zi>BOU9U9~8g2;t)G}uVNCDkoq6X)Nj3w3)s#*MF>{*8p~RdHXN2_vQ2kGdEo2xw$x zn)>tkH4Lo6O9Kbvtv+mX!JK@&kjm;ic_&WKb%eBn2eImk` zz%=oGdQ#Gn;MJtymsm~xNc&{#$39m@eh4L=+JXfpEmbOckHIB53GiN>BzBIZ%#zlI zn(Ix2H_`>3ZzMV`Jl$Vz9HLjz!gA5{V%Cr0L&-Xwcj=*nh@LVkjm~JqVXYZsvN*LV zfqV93iYdH@Z`+(jIg&v)F@KeJQZe<@XSKguFRq8_+pLT-_asKozA4O4*XCkFy}Beq zyP6HtV`xe!4iswnbXlhy;8DyDSVNgPYC0T9Jckan2o=FhMy8~Nu3)Xa&t3jb8bK(R z`cc#0ci&$4K=3W9aPia%gExM%?X5j))3|QkTWW_lv!X8=k)?y6-Ml!Wthd@q52=x0 zY%;=Xr*uQ)hWs;tr#tHl^oclExUcQVhnDh7`U5WEIQyvNSL&`z?;@gHjjePn)mJ+Brxb5Uv9LH+a3 z;lZDxq*MO7sf!Kb&6^yd*;m5=CS_5CNr|7k>FY~kSGVgu69mXVvN-{NCY}rf{I`RMwPHscDM& zySy4S0RA*D0Y;&=#_kn5In;zK)yLleX 
z^I;c>Pn`u_-=2M78qw7h)5f^Gc*gE#ad5E|I)~RZIp_{A9&f~_dOGm7AWn;jE@cg! zgQWr#?WU3U zAkzkCE?nU#RrkBgadj3K&o09l;cKx zxN>fEJ7T|%x6F|D0*4n)-@27Z0+DP&=`%p7^8meDsUk965$7P!V`jt_QFyMdL0waK zhjIGaM~45kAC_VIIXKJNaz9b!S9r`%r3t+HnPUGR4c0%I1sD!qGx(JnC%4v$%aKDs z?q_Fr=zCYH(!W{sd3)*yDt1nC)U zU}~1li$6Cdz9Y<;oNbe`ptCR7+xbS`zZ^3>*Ud=hwuaG+3d(mFCGR~=dx|0ot)qZ^ zKFlu3Q}rHi8W~fd?QqF!-S)4Wv)o+oud?SICBIZnm!JJ|3w(@aQFs1Ar&nlS5S6zz zzBPb_oolzt<+ko#5$!a|;n#dHfTNv#H#uhB zoG7{IDm(bhby;4(j>~()`MB}md(Auto#LXaBa@~MR|X%kp5>oGJQq^1(~(WTT|Jz# z&c9GS;hpL9x%Ok%8Po@>FF}$TUNBro&MZbOVieyi3nbz$;0vJ6huN;mL7yHDaOB&xL96Z3oj*9 zSfS7gdU}Eaf~5Pv@!lT z2#VR>auyPp%S0&@K)|(l@)h=2grQzZ(m66`QAHff>M_^IqsBR5yQX-%}tmE z;Z|^HWqn(tbI}-5+EnlPX+Pz27jJMLuhM;I^|O0Pt(ULfKk(CFqhHXsb9vt_^y=56 zyJMSWB+SpoRHySP%q2_Cs1MC{rKH8cL`t$5ErE0*GKnQEIL|i|NlYsvi&jsreI*UY zSj1lFE-mt3X6`KIghH>Kedu=(D1v5`loq0yHC^QyBpVWN(Y0(D$z!XRrgk2-r;&8F z?;iimX)dDWJ-*VDCv~T^^iq0e)tVr!rgBx7`)eKmrHoG zS1+Zfdg9zt3%kWF8%~~uM~D~aKVSHLGwdYAjsLDC_p&IU@R04sc~UJmCx~`;uUb!arhs%!#A`WAtZTLL zepJR#kfe~aq}<)-9FMAoI1(jf$G#{}ygugXv$!Y#*qJVVtip-ZB$yP9nb+|@C^NQV zvP`U;R#q(m{4rVJ~;2)CR+egAxr zrcLphPU&07ZxuDbI+h5R#9Mqeytw-qyQ zxSGz=<3BM{{38SDXI<#C{=`;~8EEJX7D>5rhS}O|n8$2np>oTUU3I(fmx?@6?X+-` zP*J|aJ0Kox4Ko(#w#wiu3reB8kyr=x+u_B5EL?Kv`n3~xioW%nE<@8vLnoR;YCBueL<1OoC4{B zN?n6B1ufjW6}L$AS1)YW&+aPBCCK^P(E7tI9-(bWA)!o!#x!YmltvvMxAco*Db_|uj3bzg3={aFj5BW7hU+E z9$gjyO$B3nUvx;a4Z4I*&1Wv9ln5ySlC?mvp$-&E%%ttG+j6c|^~mPARL*DO<-&=| z8rVE5Q|+^(DLP|3+Qwva)7`R5j-l=2=3?w_BrGaJj2(2FyQo(es*X)p_4v)}`yjpa zI=hUg@Q40NGrOVCdvCYPn=-Y|{_VbNKFl)f%<;syy5T!*^tQY8(3x{0I=8vYEw<0p z*J=gl+c?)R^|V$EU46K8@;6KR3Z|x1RD0AV6E^<+WBtt)9}#lX`crksG9hLXrKsU% z&8v85$(EK=rYzyY8S}xV87EKkm*;yzJszD~cLpjs=d2A=%A+1VDUWQo_x!==3!%iH zF&$i5@u@-}va6c`zsHRVnA(9#Y5I!_pIeKFZrA8ww^ge;LQc0q=v+rHG&wX~@+7ep zGaxB9a#|m@SP3$<*-ueT4{(8SoTf21QR8q3_)fJ;zkl}G3NGm5T#IMY*V$oVr)x2! z_Tir%*%b|rt$qe}5AB}@X_D!mhaV!s>_%Yd)+?v2NH--AlD%q-vaV7sql_;67bj#U zftTY8bmUGW|7@Ypy<6=9_}416o?S9IzxmRA>$`|(>fBZ0kTlLfh6=-DDQbUYsTv8S zt6VT9n1pNz`WTeY3He=gxc{PeskX6<0+W4>emR`hg~4(5acDP5XBU3Ql0F*MPRRZ0 zt{YLeaw%pp>z269Au=ZW$54|*Mv8vWLPL4uv;pfptDZ7+HWYe4REXt5Y1OzY$L&G- zi8DpdZauLZc{uKAG)}j>)!Ab4y)KO?woVd~r5YObHO=^hj=Q8BN1vb%C_$P&FEQ(t z1V6hNOwxs6f9^Sz1{mK;HT^LqTv2>KHyezRP! zUv15ju=)jEX|HzlGX8lQZiQ1&Q&I216Z`A!Cy((vUyo>F!=AktnYdhbRnUmh+b3)O zV59uRe5pQIqzKcMT8bqid4m{T?VdoA4cgxthaL<3BRe$6GLnVzkIij>-7^nxDav^y zCmhZqaw=Du-p)$|l$3Mhc{B*@>}-tV8nnx{7Fa9%q-21Hf(QzXfW(or%tGmB2Z<;v zd~PE-w~-+2TE}E0_gFv{+LQ#mu0SncpOleO8!E$qh%gO9p?2FW`-gUIt+c9}tX(0m zr12}>3s4D0KK*Vwx6KebHB9FNwVM2%TIWAzU3xv9w@A(*t}d>equT50{u`VsP~Cg( zgsetEr_<1}FAyzf{@cuk6z}KRDZEc_oPJ9^v*%;sa_YC2OjDJlZ;QiWmP(;o-+G#7 zghR34l4{j1{4ed)(Q2vP%bT~5nQFgk)t$wy_S#rC7^N7oZLMa9zxAEI zt;2&6%G>ZCdt<)+TelLuQgYu7y65;w>yEuxw^1#(t2(%&)4IGAnsq9+mh`+=N`s0# zj&Ib=e^qXKvUzX_F&1`Gks(CZN_Z6Jc-3#1ipz8iH8w$32A{Z4&fRReT4?b@jgMvK zVo^A!H^sIWIL3GJFZ8xmxEZ)CoKmrT323eT(b?TM^7>9DF7bX;3D~18uy*rg-BRp) zO_%_kxcXF{Svn6~wNpgX3l$lkLIpF)#D2~_@ZxhjZB)+VS7+3 ze(KuIW3feH5=|)@T_D%_jUHjw*+wnp;{iqKkAnpFPAoN9UJ2D{+2+=FVJ0=%%Ae;% zb6bk9-ToMzZ&3E`P9A1~CtXA+@yYqE*s*GEKW>M*1SY12w-+cpjC4Vs|5~WR>r0(^ zgoL?%!44g8WKGGoI@>zW0>`Gf$LVA7SsZp(&-zUMK{poE)Q9FmkMHM-9~tf|M*agb zs~)kH+&JDCE>1bL%2LW!KDf8Xb|8;4I7~|Z2Otkqk7XAM4xSZ1fNU|*HS?p3{zmSp zjhKJX6hGqoRA?0~v8z-VO&N$`l8Tl71G)`7Ph3nLTE23IV0qMen0j>U=x*_k0i!~d zryZS=M+mEvgrXhseW{2|#ngh5C{YF3-?j4M%}a#jnI%Jk9i!+U6Kgm111koHpoStR z#N`K~R}6~0Lz4<04-NTY*Bz((6<;si6cazS?*2U7T&lda?(2!0SFcqSL>A4k&j_}r zgZ&vL`80?TafeJno;_E{o&rNjpx)q`X2F8<(B zZ#;ifMZLVfI_jZ@Op;=!{Dzg^OJ|-Bi&MO<^?3{* zy6WmY5LbD;*S2hs<5ekqHUhe2(I18sz4%eOLbRGYyZY+j-uwq@>$1A$Rd$gR=Tz1! 
zzZ<~Py1!Xq6H2UMd=NC90L?j%bwVLP{HX9$LVK@;3tR z$ADA=bISaAF`Yn#tAq83wp92NcF`RM zYA59rd3!3&H91z~pPG9o6XfCyOhESXEKUhUTybLj?@XRNuk~)J>-c_tU}W3z&MN_n zM4wmvS1%M-7%GcAe{^+e1Trh&4)ls_cDhiNa$Uk?T9 z%j>O_{|>}9-i#W(Ptm<)=lFHxS%X)#uo-WRW5s9yd_H#cI7k^QS!^=KFtggskn@x*!L zeU)1MMtW807xlj6orrGO``z^Ig_`fgrda1%W{&K*-8Eqr z9a3cwa)vjN&pNQ6+n?@DvQa_bvFW=9YPVrlp-^OLr7&5NX%sIDN4Nwug)e2*ya`(H zMGenU7NJm$xYmC?KK*VHc0H1RO7mCY!n59?S(B;?K;rVtu5|sfM-lg`(WNIHPhb9N zn!xETV8*GQx#`#-`pk5+3pT(fr=^j$EPSHZ{@Pw?NBX-b*Lr#0Og1jj?K89^zoeDW zZ#Xt4GBX-44!9LMFzweia70-FZ+ryNEWUHO?kI}29)`uGD}CYr~T`% z;v~ENGiD+^&sD9gn{5CRR){;pAKE%FkUPARQr6I3RD%@ilzVkMy=kROuDFJA8fyFL zOYAJwX2wz4KiS60ek*7wc)%cQ-Vj>zknlTVV3P|IC32$25L(P1b++4n(}=4)yUHb; z5l^|Zo~jIwK#)_|EkVF4c>oT<$69BVA%lQN>mdl@V~J-Pc=d zKeu9rXnEJ_t7vx`n9NG&vJP=Tyu)Vt7S%zg=E=)c+19focF;84%41mD^h*Cw8Dhgo zduUssrloA@c_FSMvVTdUHRGPY4Ky%^*>O8WBJzP5bXnv`JJ{ouw7HZlIu|I_LRppL z1Z7&RVqx2S&^pF#!O&vslE+9{O$}`2LmryW6cY~KC!&>iYt7Dib@jpAlpgnE0SL{o znZ@Dyy$f-s2|f|cP4_p;jABF5h%)_SlS|kvPi&wRmh?%N(c60ao~uV?L8)#bTMI94 zP)e4J$B;RLt?yVf>RDt;&1AoW(6q-?|2%8Gd)DjB(-78Aj2&i8cuYgC&KA|aeRt^j zE+>D6{;Ipw)%xVXfldx=*o_e}&#v)7+ib%b!E&(eW3@~rZiv){kyn1opStxB#tr5i z&VgAetjzP>i;rxx{5)oV3a3|(WH+qcsO$P3kZfU~rQPkO_}~sch@(t6iB)2nuGKf>nEkwm&5!5wXBDrFeW%H zkExMLc^zuy5l4l6>S=~2ej(bD#O#gnY{APma*~W6UN9BNuB%VVnh)m(x6z_-xAS zX~z_c#T(^r+c`AM0pgk{+I;?tLvo>w|7>Xmf`4b$FIv!v z$Lz4o6hDQC&a?V|(Z$Pthskh@F(%&pi4m}u6OvKm9xKW5$Hsrb*xL``~H9u6JDXUVpz(2yuWGS@Xr+I!hUYgOJmOglmnF(m(rWaax0 zWlr6{!>cxJBmhUmdJ0daW9~fKDOB#hlAj38o^rC-I7Wphf*p zaRblVSc)HehoJ^K&mZ*(4Gg^l`8H*QMDnYbETtFr-+B4weY)n^vi7B}1WI*dYnmnX zN7D}nAK6CAWD!SO^=G;M#|vtsf_6S(x#q;3k(0?v6;8nJ1I}!qD`{i78~B@od%NOmK^rDV5Ut@JCQg5}?VFjs&(>j8M-ZVAr$4LzfJ&8LyC~BtgZ!fI+Upi^UO0Fsel)^*qnybRIktXT-5U4@>|Q$<*fZXIoKx(@Y4^5|x%pYq*SxZ6 zb_51V?iA{BKQ#E?n~7k>;`-|1ewg^q(|_AUQUDN}U;sP5lFGjSF!3v-+X`0G;Bm$4NgbDJr*l7r*`Lzy1$sz|S4( z9@@H(Igfr`7+?zLyyf59OXn1x+Cah@j%<+=7tlEoPnMGpssWzrv{CD$M^PzdC*rsy zS^T2YI}LUwqW2%)viS!*uw)w6Kj81X%8NTypKEih2+qfM?S}pViJN@yD#h0e4UQ%- zL96eUE&sON{0uFXFZ>6{q(uM8oVWf5Jks1Hh14+P$XF)Xu7Q(TX~>2wY4KgRI5U!z zj9m?3hM!97F{tR|PpEjNXfwrYBnKPENJfiHd9Tt9lHyd3M*cBpjG5<|Ux)q!PP%yP z=Nv!shbXL!*-&n*etk7 z^_MG<|L2&>YV@+nnX%(xR$4yss6VU4|A0oNAw-6AUBl(S<2v$6&zlWqI8DpzCMHF~ z26&iQz|nT9Z}QN|`|~*d?MovjpENt}_(_UX?v+y)FFfA55##5y{SUC*tGF7We3f^u zI!;Awm8q7+<{vB`t}V&EbtB6)J-pLjd9=>IDMnpB9b(w@D7&*fZNu*8zT)yQw@M8) z_||aWk;mae_f!YWs!RF<;2isgmVfA>`=;GsgNe0*0!9BZIJ$dy=KS{#i!*cGUu5!J z6rUW#Glh!%yDh$nV&1})RG##gsVR*HUw9Adf?y?ppRErs;4R;=0lu5I7Z+|hrc<+R)>vAik1DY@AebzW=!q3aQ@KB-)rqPA_Y zOV@9|I16=z8R~pl-60LE13$vYOri!@N0(+e6yrOjQO^xmpOkvtE;U3 z9UjL5ILokMarY9#WkDeN8FM7!>qJdavy9}R{DW=-@`amUlqC3Px|htJs!ARh%^564 z59dT6n1@`v3&;1wH>LL6^n5ZDpWFHeQXTA$2sxWdmqH|eG;+SVonp4bu+*Qu!jypl z*{nL~6P9%5iTGBu)^gfEV2}Ch(#+JwxS8#)6;lTdhWPdzkQ_+=roMHq=%3n2tCvwyk)R#wiLHn<5^RenCX0FS0q+^mgyqa zqL+=2%rMahZ4)ZSM7o_w}<`#!}&}1 z^!z7NCC&{zpWkwp-2(CtnKJtCf6km4O(i}(l2i@Y3i{8`%H%S>=xzrVr+#%H{dPZ- zm>c~MFgq&wHvj3+bR)(zdY^HWksES>KG}3>{rs`mVV8}gB!dm6fk|bXSuvNCGt8+@ zX$;Axz9@(qe>&y!4@l$c*kNkjVc#K%q~F<;)^ene`!Qwq4+u6O+a>cUAqSM}!k}Hw zsy8LyOp4lOb7G_I?8~3A78-1KYW6BE7kQp^D$Yb>(g&g$@xHmC;R78<9^bz6njNM5 z1DFnBxF#GWvd^V&>-f<8y13L}vIi$jx&j~lKQF>!>3-kgp;)2ly8-5e{7B>T>}(!||5 z*{P&sn=_}%BiB7epABB zy~l@h=e4eEzI{B-73fnFp(tUDhcwvG61dce9b-Jng0 zIsehic1btdzRPDZ9+LDD(U7D6bgGd#S(>{V3pbMtwhcDr4}-2OIZf-LXPXSxeJ||n z=R`A|mbLq@TTJ`Uz=>oTY)bxZJCx66l!oTdcM$&p??yfLpX{X_cfbD+_`SpY@gGbh ziT=;8b{Wf;Gv{wSy3Oq2|9?f?8kprn@n1OH3>NzZk-YdgZ*KC8Kd9M%9I;V z=IeX^a}00fGw6);q3ASL%d;`x{$HGm=1t6DFxBRN&tItvcUIXt#m=+wyMsiqXuEN$t%>rGG4yFV}B3WP(8P z%z81Wi?)9fC%v;Te%NPl&~^0kVQC1~+F9~V5)yI{*{9DO^?z@beME3~+v4X>mw_+# 
z2jqX1XHk;1_@v@Ic@cn{E5@wg29HoRs_(nl+F1i9c1%YO+=`95b&A&o6E%4;DI1QXE zT^f=a@m zGXEWZ#rL?=)+XVvXu-=`gx;&0w`j3Q_zaMl(N~mOEivIjm`=IUD>hZFNHJKb18Y!C zm~`jICq#!D>RjS<={ze?0A+@VE<*MT?*QN(7&O9zpw;4k90qzmgboTgC2LH0JH{5{ zx*bsI1#Uan{5rW3&`3om!d=D@cc^}Ucv6%$tO4%v5HHxQsNOHO=vPQ)9>brNO?ut3 zJ}tmV%i0eKHjSV)L&V{=_0t7xUO*ZE5H*Rf2*;*eyAHn#f}Wx<&3Tb=BoPmjU5p`) zGaGE_V@|upn0Xp_TmXDF1>i~g7LJ2d8-0y`{mTHa06j;N7 z{;w=_izWnGAf`bb#3(`ea-pLax&aFi28X6-HSaSDy5>W4H)s$zblo&WwjeVVl1;j? zQ#0vw4QBQM#kGzy0#O8~eD|bV;x<_O9}GJA-r#`=QX~a3ae^4v&u_ZU%m&4)%tX_9 zF!=c%GW0^M*&S66q3z^_H>*S{&pGO^4%DA10P{I@E_6#7GuzLF^o_WFuC}vT^Tm_U zei#xm3Y_B2{epr{I?INXfRJ2=+$kKavmR6hgffDxsoks+bT~NG@O%zRM|V(_CF(*l zI&r!}fP3TXGN`GW)@qg`46)<49qJ)U)pxlv0`!JbZ&(cCQpPl&c8S3=E=+6dW;3i@ zBDDqMaiio!fV~S5oh8BgUiIsrEjZO9_3xqnQAM#|ADT0f4@^bINWKua#e)e`ZO!KB zR5V?>$qQ&!5$)HIz$p1oB`4oQfTbm=(UrH#{rkz=3CZt9WMXt07yOo~%0S+Rm+q}8 znZT$1mX+ZY_7`!C=J5rB0J4Vx9zPT7f^RoaEB57@oNq(L*9tyO5c{A8H|a@4XyP>E zRKb2@%0PVLx)uTKteq#Nn+i?i316iMzRb3aAYBKaRGro-k5yjs`aJ~$I#IAW8P<@3 zOn#x)eCvY~_&me?ZV8o{@h$Q!;h8TM3a9v$1^727SQrJNh2Vy<*bHzq7keZ19xh(l zoYpA?Xd+W7k~o+hN=jp@oChO~Sf+VF1)=lXwMX%$ zGj4vf_ofN_0*NIeAgS6#4Bm2{BADh2>TGolHYwQoYSS|1`YA#jAD5)>1_xj0f(Gl} z(z8K8X2HJxF2xhdzs0~_XvbFNQ~CtZgL-0&2C%W5hBJV73R##=wI!`91zbW?ynW-4 z)8mV*$q9jTZxQFTCcl2kjx6NC1VpRS%2?*$usSf-rDx(PML&AvM2Xr0dZ$#P=u@MR zlr{=af$?jh3)qgKv$xxz6KuyNKe3DPu+zYcvC`nRocdW}=V3p9K8<@SW~k*IEBRhT zS`i9*J69(dgFbh0iveM!UVFX~wPbvq`!fWY6B4>lfjcjn7q23kuWyirQ`v@E04O>2 zyNkjWBLDqbhCYwJ3arx$`XHVsHTgVwp-=y;2hfsZ*kTFfe5gIe0EPrLGA%*xHDd#% z0Hhki=f{?bd)y06gDjhu#O8Mted7GYc05j*m0fa)<=9kj!^EVkNWPYqHp~w#+S}No zAqDVg3cP7+i%i4udKmr+jB3Rq{exb{XB)X_JP$O}yPE}i4%5`&|0A-N`CJ95Hox^_ z(?Y18t_NkV8cOZNxm6>;Ejr8P5C2~aN5ciT`H5aFpCIyLVgjz&Xye< zA)z72YKr-$=2>#NP3R>9y_0OrOA!E*Exz&5m4Bt^x&904G8^pq20I0 zGvgzja9i~Tx+gH)SsfhXj`XnH#DAtEJNIeIXNtW#WSaw5-Td7o43mIB5h1z~NH;r1 zFb1(J1uBXR_S@btVa0QTg2ARns0hacMiPF0Ix?*VWimaPUlhnxii{t17qDKE^}#Gd z(_#a$)Ew@DOT+8=zK)ko-J!@VMkE}0CX=4f0P6~i69m2G}&n0I`Y!rSjl zj-{Ccb6QDV@7*v16PtJ-4YNT-m$*`N-CxIBd`{D#Vx`$*iJEYfOM4^_YnmpN=V`I} zGu|c1rj^wey#lKu(Iz)W;|rC92}B({P^o)A79o8@x`2H zDei;1aIW?TnWA+b-M5~<8y*vJ@yH|6rrGeqVo^s#fO5R#8_n>=hdD6UUXj;$ge6;( z!c$&k4q^Ww1q>jCP{F9h_t)Yx(6ctil-Hf>c*6&AnHoR=+zb4}OHv96WBoFJS&z9w ztSaybkn|B?*mTHl!C%pm)DFS@8HF17wZ6L&{Fd==-V)))L}c9Wd_HbLc4UwFt6K+Y zJ;DjcI3i)1d6!$1c^4M=z)}5wUTdY8PO{|;oX5(dgjG=6Ubn$3<)3(Wr`Mp z&TOdz~ua%5r(BnJGcgTjCy1_b; z{Ty6JST--xyg$!8FY>t?H275B51}4VDENiH4jF@BJsHpfUIM`-1AhbJ^l<8B%VZC5 zBJSZBmtq@sd2^beHz8`jWpt=(Yff;pf%NW?l;ASw5`1 zK_RqxbTS}9yr%-%tpTT=-Em$dpp-o&|6s>$2EgE;Tw4iS7Vi3Z$bNZkyY`wnJVGoo z^6pnSw<`J@f^CCv(ST^7-Ns%o-g{~imdt6y^S7;S9MXnJiU@vmNYoQ|r zz>ir9zSzV;?vPXGGq^JIqy$-Ei5EU@8-e0~k6Lcwq}Ae{rg`nl5a#1PxqX$11H)9a zJHCyXtfhVELrwv3c3nabxvV*MN%K4ZSW3`n=8FMM?;q4CrbR)x#9D{2yb4HMpK=_S z2)%U~ac;Z~6wVYo(k)!T81-J?1peX!{gh)WW$>~TAW?T6!k_29n1zP3MqWPabPT%N zYVOOE(?wU9Sfuc3P(3aaNzz9M&GmH#lTXOy1)8=7Q}IoLL+aONwJf0cL>{auf?^&s z0Unf+@P3K>N&>E>wRK45ql;iHgC@oFd%j;mDmfByVBr~Q1~^`mM=PW69txQ=0mm|M zxy?xfec$F8Zcr*#X=DF8FySJ6)b;#+0VKqGH8IvTZIU2`p(~&AlLCZsv6>ml-#)1E zpv-r5Q6ZKy(prg(9Mtr}muB%O8Q`ti2S@^q28U_>Qq0NF)XH4@uJQ>gMA6=-0fN31 zboQm>MkWBchF=Hk&|(RSJMl)eHI*3O!HVXCIQpLore6WW6zlWh^nwNNW_RCHpuHQi zCna}Fgk!fCi(OJivB31LRu!n3wyK1W6@%Bq%|Zv|J$Yg6NBsVt1Qefw3eLL@=Pplm zOHKS(MmFs1mOx+956D$VrcdW>+nEKuD*A21vBdQ#&pHAvHw> zfN2V0Jj%IuRCxl;WG){g?|g>~@NeP}#tG!O#GWZN!x+}c$8j+IV1;hKy4lOf7Vy9V zMFI+OL3!#@Ca7A87;lUpkE`IXxM=~N1zv+9-%I?zG5Qr{L~z^ajs(Ew7D0`3Y736E zEJ|kwv)5By9Q7;K*-_~EwL7p-d`wq;IW#^~B@SJ{&Po;T%07zA>BkEn`?Ht=$3nLg zipB^0#-^JI!d_y~V9YmNX=Ex-!UU`t*(|_T7dVPx?sCyg>^5GM?z_4hJW`Ww*K{$+ 
z(;Hs=lHzE4U*lQ@H#j%zygurH&jr31s~@sy%+gHpk8DGE)}dp=HTuF2`i}vuu!M66 zeQ!KOLmvZ>!c!sx#K}cautn50!(aVeqhz-qU9y1FxZAEsU=AJR)m8ib6qzd?rv@f! z>2v5rq2{s?;roZF&8V;5aT;3E26gOD0xy3BZj^?zk~7nMO5KP8tNQWHF#vH5vVl{c zg{>{Fj2}=Kh}}|X}<2g$Q3pH|q zA&kPAvh6hB%I`h&;Hjq>Gc;l* z*nW;drv(u`leB5|=ClAS=@Gz#qLA&>ZNrM1%X}RbAOCdz69-O=tM^$b2=H|?yHAhR zc&S0x{!3=(#Sjr7-4v$ozAg~E&;>WAHUA~V{sT_D8ecNxkDq1~!BnDcA)QFN*6SW9 z+T?c2{6%Zl`Sys34ML2HP>TiwOgBU$zJ`-rs52t#h@Mwa5qP-y%Qo}gm<9d)VjKmX za>_E`)psO&T%#!E>7i8@t8|`UG@pG&y~%vr@hq25U|{ae8*543J{4KsQ{1n2#Jnxr zB()MQeO#vaYUK`UgH_k}84C-bBp564?_nA)LGX3{$4xU{4`Hz&u{aRqDaK3|A|Tnp zsKZ2}gBdX)`CUNSji;JILL5Im?@AnUJ>%0cHf75ZARb5|G`6*#4Vuw-d9;M* z@lMjgOwqZOUezSlnZf;eZ07GwoD3w`B8fBo-k@#$o`=K@JGXA2Q zfzn%>meOz^E}1!ZD;fN5;Z`C(PsWq zi(>Z81L)EaguM8bL;YeJxw3m}OyC0#uOa}F)rj#~r+^0yeLHC6eA|7GiSOCBm@8>R zmq7lAu`&Mxr=0(^Ac_MR1=;CW5^eNf8g(DSr5{C+wHaUPIU@kDJ_0EoeG0X zYt>=L&@eg@;{DgikO!sf3g|ep%pPPKExh+Q4QR3&5Do@jniPvj0cZSxAFBIEiSg@j z#fanY(5Wf2;ZkV-yO2<`HHiUI$Demx?#u_vTO8F)bMUNaz?9^dezo4Sj6fw1iLd&TJ3 z)LDh-*9Y`+cEQvP9w^W5BUbKbJv}#Xt=%q<`39c*Jt9Wb-M%e{AwtDW@nZz&%WFbO zoeD2LrgkIoZpWi^{knl>2$V;IwX?`Y7=jSkZ^WUwzD7qc=0j$&X5cPR3^Ud=>6Uoj ze2&v?qZK%L8E^P9W3QNkA~viD;dIG;$+V(8!@N%7>@93uim$Q+P|%VZ5?AMy4Vl#` zhs-wCzNS$iA})l@CLpHX_^{X-Wg||>J6M;4ZGe-h%8QTK}RIKL|8RE zwJSloF||cm_n?&3=T28w+gjwy&D9J}FxJ9SJTN&k9_@6vQ${^?EMy8|`tBc~q%ya; zTb7V;F7ZYY@>)hi->(PL9x<`r{Nhcl_uJZ!qYcTkc%jT+h@Kb)s@CDJ2d5Te1iwSw zK!^m38)RdDef(NR3py}hf#Ii9=Dldo?Xm}+d8B!uZQB*Q`V51Y%#owqm9JQBOz1oEezOoOUAGQ8@^7G<* zv8%wgXq`W>>fRN6@-W2bs5b=2p(2y}S&hiZZv^}T{F zLVDqq?v1P38?PRTpSueRejjML`gm!t>QR2^_DO~apr9IntXK*3g5$YN7>T>p&+V`%!l)3NyWM!LQghl&=O*kKe&GHV`sbT;{M5f znGty||JyzW-IW*|k+($Y>cKRmq0$r#ZBm`+sjHUSq7r#&zhoNr zrX%Jsx(Ec)+GMrX1Z%B?wv7@~$1$TMm_V!`5VCEKoLf#*Ciym&~UZ9wsljhTz6GJQW z=!*kp`_gQL-Ht9~6nWI$Aw7D${($;ohymE>MM3jgbk!?d>OUR6mNu+v6W&|tLjf>17N z{s};wq(q$2vyo_y93fi`TZW`VQVBT< zIm!_+M>P@Ks4yW(cTsYsjOLig)#1MHlB;t2{r1=X++&Z==e_6a{k&cPj_NtS|7ie5 zas|ErNoyQu_@*CZbl!arTnsgC79e<>psFaFdGll^XugLCc!($R{s(wL1j_x=Hiu*} zR2K*HMy^(tZ?VwO3(xL`!7F(ANN9sa7XVV^F+^&RJ=6T!KL@G|Da86C4W@h1el^#N znRwG)(E4)O#+#v>lAiGQAQ0CA&qd0>i0OkO6O4OQmIXU+B3t7?*#O1jsFeqU3nKu+kWasfkI- znZ{Z38)nG5g`YzW4GfTc=&C}Z*WQBfTjIv#X{mTZ1ne9jh?j&!vog{IK6NtjLGT3r zFyY-}!0|GWfOjRV(#>Q&IYU)6=|e0WFUtC(==X;c8XV7g?d%OJT_Up#3hx;&grcyG z1=CMV6s6=~guLq^l zi~+|}S-*8vAE5B239&z#WufgoDcsPZ`M)**%pxk*T6L7xmy4H(OdY&%I4mCc7y#V~ z1}_BJS6e!X;Pp(Ci-XyRTUY`ah!dA3J0FAg#|>3j`x=hvtDgQiK?0~*hQcirK!z%N zF_Qk$zI4D$`OrZNr{5pmSm2P%_BJrIesYUrWcQ*ghB^Q&Lh2gdMBMMRUjU`abijQ@ z=t>yZNI;V)@-VACJDK?|Ig$h*{I8sypOnDXFeutdSwL&NInE5BD%RwEenTilnQ0K= zlsH0T8OnNm68A}!B@Vd^X?^9)MwHS_@uziKT0nlX;%Y{tz)eMTr2Klmf@{0y9|tJc zJK7`mQt{nx1KT`O?!=oQGdVaEio*57>w-YA0Dt!)6bTxPg;;i~!lHy(vI$mC<4w!I z5s~oRpaT)YDPBm1jg+urn+0`GFeyz+j+wqx=Iw)PVnNyXoSZ}!u!sW#);R}n=E2<| z{b(y0)n`ZQh~-JbgDOiQ2r_~b63Uc59FMWE6mL}oWXa``8AwE*MmVt515GW_^;3W2 z&+xi-JmA_<%DG|_f>-2q!iT?^1Ap?~idrQ4qYJm~vS+XC&AvMLx9Y~%9J9ojPgPx# z#wl;QE);!u?Td#*OK(m7L_hs2^)9dKX=nj(4%qA6IeVp4@Kr|nJ_i-6cTB16n(aLc z?&(a4=;#o`$~nzLbv2{rud(TA#~``GWO?Z5q;u4>1a`Y5*U8y4Zq=(|t>@CG{Y?iM z>GBnXTH}RD{<6r1v!o1-~sNlW~^1 zFa4PQgguvP_M+EzY^sLL9`jR(qG_Oak=z>9zJ*t%W~k{#QB>;N+C+Out^${D1*}$D zZg0wUwd$CUl(g)f5ZwIFJK6rF2Fc&2)&9xMf0uu=>rT~*?|2e5gJSCATV9&%p?$jz zyH{CwlP)DRk$6459zQG9<8-|hE62ret?3=4oEoyQ>xE|;qME-ycF2sl;{2v)=KOI5 zzh#L%_&!G9{im`goW}^gW!a^|w-*Z6OkDK5~TX)>|QS4KLtR!ew&e(KDcV(Rq znwzrsL0Ng4S04pCFet7ClI`ke96AhVH7Zz9qr8{#EA{}{7n^)d^xXii+o1ym7HoEK z2xL|@JF^U10f7ZEptS`QQ;z?&BPJ+gjD#BWu*AcU8vhuCl_s`*bvp!LdSXuEIvG@y zc!wFZn}eD}^CA5H^te7DAf73Q^;lU#jCurF*Q#4kN?;N4z_M5SSe!|5<2PtPz9Qcu 
z{E#=JB|_rr0W5B-N!D!ncjhM+Y{_FYKg3=h><$XC!Z)18FSeY=exFmtjZf^Pjsi`h zJE;vI1O%tvoIrsuTW!Vfx{Q7B~bX02+TJ;Gm$z=3Xno&}zu|HwF z@T4rIxJfn_FfXx4ofa#_C@dlmXI-?T7n`P{>QkppYhv#n0V?@%Th&`g5UG!Bs;d1$ z<)e~4ycU;~TRU#3ziL_;i{ z78k&00sUjMOb_F-?h@>SAf|zZr}%t5ihyh1lX5ljdbX=1f|CIRh--Yb7xBLWh<>zN zuS~X%s#nr@j}{E8IE+3Z>tdVmhCt8?S;O4ytNkJkZtlni4AETphiG6bqnms2{=Jp`vlDTePSv> zkR0>$p| zMuy`+#&|Mrrcfe?!4Ao}--wL;#|!?m$nb|-42}jeQK30OyPZEmm~yH9JpN!~hrXV% zhP*1V3WN*~#*E-|#>yJZ?YVXoVHZ@NgO&$3+qCdmU^5B>XlOWSUB8*Z1dLl8fYb7& z6#YU4n8OaNmy7Efl8s!1Obl%lG{x^xO;VJ({>5LGAxv3c(}00~MOaZulJSNHppD=6 z%kwy}PPVB(s(*>GeI89mT{IpB&MW4v84NzvuI5;F!T(e!|EIo)M|di05^ffC-KMLXqje-6Q-5au?c)k%2tm8OzdR{ zwk>x@KuN;S*l!BZB;&2G0aO_Q8PXo}B#w(we;bvw@%I44yIUb$)11u3Dwc4XfpWH& ztXJdZQ2Z%8!a(Djd4>n1(Eq3O@Kc1(9~IAi@A)vFQ6}txy_- zf{iCW4j)Ag$5u&(c$+4i+bA?kHy)cvy#5l~XjMfv$$LUvBI}^mPHRC?^3^RTwUR38 zAE&*?OPX>bhPT%8F(9_3edmo&55-xuCX!6l&J;+BeKRN<0OAo~XdwV(#z$CgKcy}v zqqXUfx_|+oUf}jbCw?+rWT&BRjyP5^;QKYM-}(MG?~{|k0{>S6)O;45E()rKZiSyb z&a;ycB(N9P^e+G0_2E&ffzkp}V|O|nns3BEf12I4sgclg;qu2E9jV{`TthPp z+;_0w5l_aYAf&jWPN+>oXa4tcA~DTr>%{&TdzI%_V9g;Iq)^r=CI>l-I&SHWtDq`lP8WgP+J}|K)w)Ye`1KMV_ml?R^o}}&z$LZ ztBQzAdM3Px#-YE53%|sC7rws%Buvz|&$GR#bt#H*_(scge*>?05BEdB{l}iHU=6uA z{T1zoOgS!ls!1n7Mjd zsPiT?yo#JOIpntMuOB~WN#vBYE>g8iwx|xN{#}&&F}~>EKsofruts1-HN`+$_sWlp zSEF_nSH6S>D@~s1#t= z!jTFdkL;D#oX7axO|M!u@Y305d+Y`IoMzn|@A+pH0ps17rw7Uwb{XR83zH*bZndXN z`xQ@BZfxFDZ(q;9B6ss6rbDc-KlSH}F3Wc=Dux;wdG`v&nlF^!3)W%=tSJ{r)eLZZ zVyL;JGow;%QK-g$N4%xj2v#y0WHJimIjAdu7a|+~lxsuKqXr^bU2^mN`qsK8kY3i< zxU!TOjPs6?^(q`8P&8PJWSG0VW4vsm!gvB$dWRMV6`VfGMj`e8tmnhbB@cKsfq$q` zLrqIWy5=k!I!--J0t4~{nLt_Kw={D930p866QRx=rMmR$W3WmDGCCT%2o`IT)x<&^ zJEI=G%U3DO-D(HpS$Oas2%-QN0w?J?H4zB3EJuCsMM2v%(+7?EtM*Kh=XO>RJ~Hr1 zJerV%QW$Rry##d=k@_<#w@#7uo5~1iCON4bPu0Y7&>=w{NXft`q?hXdfY<^g4l7v6 z@CM~tKbEsMur6tFyLQ;sh`zkAk=PC?0_8M^Kwf7Gdc|A!cM^l@TR2m2r5nsC21Vs1 zq{H|-mz-4%&WWd4cu(h}tQa(MyVT>Xsf=t5Q{x1KRJZ|j3+#K-#bqh>4J>6uKVp7t z>41mVcp6u$GzG=4qZB*de*9@X9zvC;mET01vD!I@8Er~RM(0kB5aOXry2fNlrdP76 z*wpfbmbr(q(+k5?Lg?t*4otpD7BHlkeM+ zkZFc_Fkp(3s~*|!$Vuj@TA&+h&j28TtJo&FB#sAefX!J5gQV^-)b}(NmJ)zuNZMnIe1B=S zZi~pe>LnO=Aiy&w=b(yEp9~h1A_CPkAruZ{Y%5kw;ONjf{;&N{SOT83BS`Yhcag`x z1@EG$!6j_Q_|jI5(*x@j@xd{*hKXmWt{_u5s8UTTpUed#fgG@DDeiV9YQ2yG4H31W z(#RkvDx#&1o;Fef{_#tEg%0{mik1=|Vd8WC2RJmJ0{rcN**~B*v>G>f;&YpA;^9zv zSu>MUg_Mt+BwO_#atT3jMmrP&orG6~AsV9xWeUj0v91*PziULMa~qLY|CEoek|5kX7eCOy&fxFn78>ZMG?omY^7`_3d(c z9R2BF8cuV68e_}mL1|y42{G=Ud{jr3l*wWDEBfa`72R^rj$b|NGTqon*xzGn74B5? 
zIkZ+EQm=n8Ar|rO-ji8NUf0)0r5&Pi{od@ywhLNzPe^wq=5Y19qkGQm)=pK7Z4KGD z&+So}_u#~IKsn;jS~Z{f>z6xUQ{G=V9E5jIxNuwzk~ zOD^TC0r5WA^mx4kByWL-Bx3*B0ucY6V)0}>@~w}CD#%dk3oV(PNn`(@m?ps*Km7_L zn=^7e(<7-Afq2Sp(xGVz{kWE!$=3vffcdfUOH1 z=49x_g{Ia++@JgpxZG&*q1a&5Lmo6|>-6-=Hzlu4sd}NryV)zJ9jy<;e(RGEoZcU2 zJYPR|Eqc&U`DB#78LIJSdSUEu8aKaC;u%B0SO%$_h$fls*WUYH`=4Irez@xScHNe+ z(^f8p1;RLEi$nb#Bw)8!=8{Y4Cnh_ntn@#Gl%KF#_MxzIVb14h*v}c#$V)tASgtI$ zI2b^V@Ek0TtN7Sd(cb#4i2}p&a^j3rDSG}>(mCe-{Y(nf9FUAW5Kz=dhxXcU-`eH> z^rA&uRAYzB&NI?eaBMOPS@|N6^FrD4a{YKh>|P!1(ylmYIX>js@7^)Jbe7@{$E&Vb z8HiQ59mVU&#gFQM{qWeVv(g@EXv$U5KQMR4vEiookBQib2<=w)k;O6+mX^oOwLjY* zzq;3uNkh%-u$lRnOy{`?aM#E`=<+zeOCD9~YyGD;8iKsXUTa1LDFC)F{f;8APZHH1 zyfS`2yLfC-(p6gAU)%cd5{19O+&?^_^z_*kg}Px-wP1Jd!$QxMyb% ze0Y2%JH&Niq0_yr-7wg)MpL+9@{o=rmzj7)9)@tCyvW>h=KrRj>c7hyT7bQ|58?d)}`xf znkZQcG))IGvywPp%Foj|2`JOA2tXrshpDPE;$n(^zC{03GV_}AU2z0TuQmxH1nsmD zyWz8epX5fR*@fAHDlPc1uH}Wf(WF+5c%SO+r~c|d)hL4kZb#1%lm8v5EV(-#UMKZN z5`j$yq&kY>Vs8*j3L9JLGIeAFI%i}+nWt!FYG0bEks6Zai^6gpUqlvyq>ym%Qqmg{ z-yqXd<)mj|we!rhE)=3!5YYIeR!g%8`cIfR@-{i~i=Px>SlR<=^dn)v+6L8_hd1Xo?bun%SVeHzIOM?=arS}kO|jd8(#j1CXb_E0QW-@EN5c(>t#BYnj+C0QGV~qW~Y%CTTeGBZNTU=AL9kl9?+{E4YaCC zu@Hx&6g7Q3Y13Q0q8~rIO~*b9ua?C#AJL1k2bLVY|6YFbGaECFXJw*iKX?}Xe0o94 zb82 z{Pn-|kSb#y{d)D_`ZsM@t+706<;uYR=dRXsW6hs`{Ol<%Ju7(SNI+N2E>49%&aI*O z?o+LUFOVFyKa&9tAie6Ua{R5`GCoVfGMkAzNSpK)rRply`d*Ldzm1BI1a|UkQ-b!a zzz2ti;iYGqFxdVio=eYV&#pYZR%lz`)L%#S$9|3Yovg}H@yo+af3kLU20?rKTLdzA zd#u0}*`!}v3$rphz^KgwZ4}BZR5k!yU5)vwgVp2~Oj-p)f-0_n9bd3S0UCxl(fI;i z8OkYSy}?3ja}D_$+DX8;?4{7OU$+1S0}VM%Ab%M1ueu;@Te+J?!MhAPzHZV;x-1Xs z&Hy-&g}yj}22!`hEA5N0Ko4r4A2Xp9B;<8p+iC`_$fFfU@rRs8;_B>^*}wIBw3Mk{ z(c8XIgofEBm(XQmLBL63mZOVSsduaDM7MhFI165zrj2(nkOut?+1AO4Yb@|kFUI(b zi;mr4g^yo<813`x^#pgqHLC!WN@=cX~$h zWm%op~4qfT0Pd8{5goFNv0agsWZxHoot3t)+c{tNqGR zvHJUpI?9$jyV?Gi-==4z+t+ZN;v-hE*~|@lD)@(~i;Lh;Z4B>(i_mZWicMMaQJE1g z$yU2f&dNDDEBf+Ouavs5gcr81?sKNgatM(+IvTsT>Gs|*8NR!>^nO}*@9mqY;RCNf z%jPofzteke|J`$wB5P%R;qHf5mv82FdV|OQyn2H9p)B*c*5B!s8{fL1apOhkV$<{| z%ZB+yZ2ZC4|6aEhJU?}0vV!gTInV0jvE#F>fr>cGRj1>xRC_Sq(h*fj@xKqPr&M1=Kn@P5;$VPDb}!7AEFY8dRRD+s^C#;3VZg>>WDO z{Iz}Kz|xYA6hH44owR>r_k}Z`j|S-th!?wN$NH$yi&yqJAW%CDHosYJ-+Vo-{iEQ@ z&eN5zq46+zwc=7a4l?HAhldYdJ}R31@8dp)_w;JDS^2%+FZ(|@DS%G`>|+sc?-PY- z-}1a67k+BpsB-JQ>!n1^4EZ#aF>Fvm>U_kj+-PgALm~6aiKPq^|E7|U}KEP|*$cUm4Bu_!$ zo)!4iCM7L^R$h2Wp>OroShFdcYU_T;m!onEE|$+B0CHS3U%W!4!TAP~krLWtnhumn z8|EDaD8MqoixJ)&h!uVi)*iW$B*&EEkDu29vJ7mnBGZgCP!_edPUxlAeBRL5EzW^b z&&b9rVP)rc0!ZDM#t(1U{wPrw5y5E=^tjSOQWNGiR%!N!4Dv-?#6YzFH6>N>Q=ryG zP;`vPj&m?qe;G3_69W!1Td?!Z<&QU@?%rrh`sJg|XWCrKn`Xk(3_@ei3bmjz6NarE zww?i^X{$2{))A4D{|DTz2wn~e%rOewB$=5L3pwku#Sw8iPdUpd{so`pvfTKAl6^#vt&Q zt8HtjyBYR2DCsGXmk5tAS*6ih|@)g0NrO8j3slu z@>HGVshO$gZ0W;Yz5y*^BN9JB73qJH{zyEeAKJy#)2LV8CGLs~GT#L**2Cu=h&xL*9Lcf+dTec+D{zZB|JKC^d>z_ z8hdvBkMpcbRChmRN4j92d7s&i!r^k~^5c_t?^Y!EX{7Cb`6_QN_3Z(7^B@deA{$?N zwO8-{E4|1(JKXAGb&@oaOYiZf~&)(;B*LTfd&q??s{S}=yS35d(1vQ#(SFXlNTC@tY zTQ%B| z0+1L6)%_uKGIUV)&r#nW!AS;}0=MH%oxf=8s8HhkQff}GCn3_LQ~~~K>eM@?BDwc^ z9mHLNYrd7?e!Hj%+AdK)C*Sy^<`2~zG%-MuTV*JMkO8Tp{s}w;pfjWA@~DdolQK-h zpBYOeZ0h|9}%0uvSSr{@2WCE{2; zmcpjBA~p=bx~0pq;vET>fb&q(K9yg4#+g%Fua(z*i;*7Q(VjZ-g-y@!0@bKJ*8j2| z4Jf@pT!a((*H+F%IIcIQ75rKd^?1k-{koyNv-0oQRdeMSY4G7ysKjs`<6dTCWK{>E z^iTQMeyN)(-|y@=JOZYpO^#ju`Wb-QFI&C&(Y5y_>>;n7IpKN#Qlh23;cO=qt{Y^= zS|6}`*m3yLvC#+ptI+hw{j>?uxJ3qHP|#~VM_)c$@%CPXMM!AB?+UQt@ff!s;swd< z2N4~e>hCVV7HE}31($uVp3D_v{7PFyF%mDOeM&E*c+xwcDqk|e@5(j*d;d$c|M6Hm zUA5U-t7g{&alQNYxL+ae9Ue~=9dqlXucf7zvmZO{yH{Z(k$Ncp#>?J){ElZ+T^ggX z16^IIsP)h;x0$V;#xm4td^A8@_|Dj8PL#W$7#b3=xS<*vo>llO3 
z6_=6mfio^|r@P$aUEyYU&z~#1+7|Z(=$IUhfUuHCZ{J<{lFN*|Ty?I%AZElie|qYS zh|qocj7A7ca4-8iES@{{QX9d`FNud@=nUK#vW-DSI z(6b#L9lKp1p)@9ovxO_1KJ@U4QrO*+2IHg?(WWywOXtgr%AQ}0?^uZ6FALu~E?(+- zcvku;w-0@BD{(;CO-H}bw77gX-nQkX>a(_M`AW z4$m3Om6?1WnA#TgUFX~G`0ucm)wNf|o)eb$!#JMd7WP8c`fJ-6rE|7}`@`ky+lI1# ze2zSK({3XOQ9&S&z0voPxM>>&uz}h?8uQ=GDRT2`yImEMkul63edydfR#^E1%}nz(7^MMcW>5>=61lN< zPt^q+u=Qi@06=x%CJ2OaX^PhiwoLzpl=Mt#8|WbGF$s9rA#TK#)x0NUKsm=WE-om~ zKNEaVO9=|b6(!Kmwm9(_Z(_+-Dn;Lz5T ze053oDSQJWX<@_yP3lX(n5+#X^Uy71d*pXcW5m{BK*2Ur^=$bq_ih5&Fj+bF7-7&e zl{i11<09diPzN>ut+HTItskzT=U^Zd({Zveo|2}6DeMq$L&M=^2jlW+i4(<$`3P(e z#*MQ0ZUXzcF$zJ3bpg2+=)1QCIiq8Z1}R1&rDB%%r>PlK#_qJwUybq0hbWjq3<`Dq zYCMIB9RzVXARk~QROw+=o%!?f%hsh_>ok@Hk#x$i2Mb0RTReKRQB-G}*MaoKe=(Ii zBx+Fw5+MQcc=xbMyMTW;$<_c@Nw+_AgNRg$D6UzdK@C9L5yQRW9nvjsXJrdNG1zoe zeP=?XVr5(+AGq68f(~&8%)Jr0J2@IjO}u8eez^D-Bx(*klHaGP%f_FH*mYQ13C~IQ zLIO>ya%7Mf$2gfm=z-}Q1R{>O@Zb*En9!FU$3%>x1 z=7!QUQKL@9FI*^C<1s?;vfTg^$|Qdf5&a=WSfeqlS7Lo32xsvt^Tcj!udbj)lw1HV386FqPE4KFRS&v5xyi}go?)2>2JS|T(8Ud5x=CtP zz6qgd`K^zu_WPiWzNUx+E^*-g-07Td6l?qe%cO#5&%O#$4zU2)m$u$)+U&S1OOVV3 z;YABR#Pdk;?P2mrjY%$1zn#nrC2=M0K^$H#@z8Ewar-eWqZ^qq^Eq&i)Kg7AGL9SPR39fSzjYpwi)62=OR+iKWgF8KDNRb zG8n2>APl&cU!kTvK)Yg~=W!|cdfscDEW)#;Y1on^TyEF%B>C_r=@8EIicGY{ilgZL zN|V`31p{cNX-Z@f+clsiB4eD}t)ZYHd%ch=W4Oe?ixrI1=h5);(Dx<)?a#rWa7Zwx zF4DkJBPW_ImT=C{t1pKNQH37By1+}ID~NoGN@F%C!Rw26ga3i-d!r<>2GG0zf=+@j zB;G&i6>f}(?fYvteERWOKV?Sl!m(Rd?{c7va@@BR+<0G&YM!PJLV7>3`s8_!3IDFw zPzpCzGdOL~m_tlhnJk|E{;{V-PCv%#%7X*XT?alTh}gL7FZInGzVfJ;@ov2JFHhc5 zJTKv9o(K5|*Q9~+i<3^{(z4#1WQv<4uRS*K@teXQis&7aUsznO0+sRTqr}AQ9stFg znpkN*EV%@))#^vU;6NV+t5}0zaWc_sTN!2P-PTQW%TxdW0X#)TJO47Nu?{R5OiYQX zut(D3MYTn!Sri%O7$qv>D2*c+5Q(iohg{3b#7^=$pC__7fP%!MH6v3Ro3Q^0^&2AX z%k%*J6wJzN+{IL)k%&;v0_%q$!4#N8h*|kJkQ8+FTK7{osDh&L)MR3rQe?%SjSMmA z=hWGKSFWnR)h++?yZXbh;bB)sE#bi5s@wf_9lFvelQSn@6=+>idhsEb`kq}Cz4spX zngki^wu)|!=oJwe<(h_)2&=sDTZ|?-@h|>m=asH{|Mm@@o6tFu#0H>c-kZBWAo}|R z30EG|_P!j*{1|KfIWMqA&H71b0{mjIfy}IEY@nyG*7Nyc)$-N z>C5`(&6IA7aO$*PYVo^oc_ppIx`w+R>7_joYuOW34hoJ~j~uyJW3v=Esu3R*e*E(N zeP=7{S}j6Xw6^>cmT#>)S(9IJ1m1Xxzp0rJP7t%^ zt@{O%XY0>@b58y0%Twl2kM?@)9V+LxZtIpN$rLET$jGW&8I)kL;zjn+V^smV?6U#fG} zUTlY{Jgh6W)jaRd8TQV74r!rh9kM**N-%lc_fNYL3PY)iegJA^$2T10+{-DBDq;{M zeNU=TIpl5D>AB-UBU|YOQ-6OL8PCZ;1&tD&(}w2>{0cz1^`6wP`|?iz(vE+E{cf$y zslRs>#MC#dzTRyN#ncWW#@B>mK%`-Xk3yi9lZ#z?yEnLwlH+1K~o393vXH{ z1aWW;)@{F%H56Kx*g7(TXgq?JkgiaYQEt=?)`0EKGYJ?=vF$I~0oU+tpyz#@$&d4N zCq-J5$iE2yZ?g*tVv3vnW|G0}Hd3o<+1AD83{+4CJ)zMZe9_w=bd(t^PHbPg=1i4b z_As2z4aB=a7eEP~wF+xkCA@=UJD%af5|0aA(=5eEK!@c43Y2#m>kj=C!NE^hn1%+A z-j#N5U{DT$ls%W#j@*WTq{{s{+WArfw@^q<468{jUt%ReJ3l#iSYHJGP(Uh=-PWqp zx;!}elwVm~nf&ipRv(iTntE7LNlLiTa^OIhXEfP?vCqcglGRzR-{$t_0`stlgh+oF z%<&Y~YJ=^IoNIG6`1wjv(P;x*upXg5VvPuu1;iB~c}5@6;dK4MZ8%Jrki3;KMR3gN zy%tIDhnNUcorM?kdkv93<80&ko1==QXXiz63^n9Zb>rvHv$

%#q+cm(D{E0CANy z%`)TJ<81rJ(DkKU!m6ma02&-)Aq!*Dr4)aoneD?KUX%8n@`fafBLHhM$QCXRGtE@> zD~n$U^uUIz(tA7bik0s=GqG@NM7|QfaYhx9&$i!O%3)*E^}7i`N`Lw|^Kj>$Ty@e^ z=DWIRhKnSalcKPG;VogzkW{kB8}D*<5^Yuh=W&wW09XcQ^nXBs zAE=C|Qh-7R>l3j3g=CYdaWsH~qGdflRCl595lMVUJkXsvcX=58LC_4g%)j3Wz-CzL zZ_73#S0BcQ=wcG<^Zyz88Z$JyDIaY=3#AA8k`l!6Q}ea<0ASDm7<2=G2KO#mT^0)` zv3OpL0|A9xY9vR@@tpvpuqhf?eSv6POmW;JXU6DwuFyY0mL6|aE_86|#AjJBoz33h zIkWL(YTNAf)?ex^InHsCF?sG>fwbUc##0eMO`qZdhSEJiPz*Q8hl5y>(j@kjw zK!MN(LGa;AKFIXNMn0A+ZffN$5ZovKP1Q4O`JgsU>L-$Q1p%C`Ste^s*r|6piI3LT z;7_%ZN>2&Vy$+1lK3x0V?x5<36)o{zFHEy%g)G5)J2vcy>8PoK+=O8PKUi^Wg>3FR z#wz!E0Y!(7ro*Vz6>REElW5JCzS_|;lv(|&)_UP@)ZS49*8w_Uvw<#+pr@}ebCcv% z%O;HGyfX(DM$b8_)K`nj@KI2?6yBlqMHw~V?S~iH9_HkUQ;*yL82L?+N~vRFW(G6Z zm}JH~sf)ST`So1^ehw2ppXKcQY6>B*pU&F;j5!umIUIJv==C5~GMT%7^-iv{-JQ48 zo3_4l*G=)IXKdMR{kt;ut9QeXe_{-g#8R}J@-s>Jlo-F^_zr#pu}TXhP+e4u#sitW zdN!P?>?a#pO^6S|MNkohao$4u!?oKtTd@D}vWV3nL;}4g7!AT8EJfmqIW8(iNXCJ# zG?+I#Maspch>|15aYX<1J!{4(luG6VRDZnyd@~*hpmdTfd4* z>j6a0;yyu)(3IB48QFLoN|o@a+oOgr`%F!i(SVBzFfA-d)uLALxjhR5N00mV{zys} zPFz))to!tJ^;LMEF66?Wn$fWPe^?ry?>tWZiJvgRm}gElr3s=IZbVpVKD1%IF8q(l zg({yr{L=a>#^w^|&Nm&M)NB>A_N|I?%=IVDw=bUirxM>h^9s~gFnr#3UOI@F8@WGO z20u}gR&*n`Lds7LAI3T{GkY*MT#+eSn)1oB>~`G`p;gQ-?+b56Et+^Q^rR*Nlv;zB zl1m_cxEsPxGhddp|JH4#dVa*=^kJ7PacFFw6_=ot)#+$@rdjQDlj?9h+y=9l`&Ajch6;lEqvYMays z4tHMkZ$C{EYZ5Jn7&Jl#8;jrJk(GG6Hc!qe9*LTq(@pg^*Se4;g-?*H+`Qvs@ov}< zx@<)y27WF zYK8JdfTdmxukwn@$->C&oZ>{azVs3ZQ^HbbD%#izyI~te}MC`j?_6RU`r4GP(VRc@`0n_-~Om4(Lzwf-hb4N_m_# zSr3%cRhV^Wn?a`g&qf>qUfL5kxn zlZ@hJC8gD<^;pp2v1)4B9$vx}FKOolPjL)#H`7_M0^FYMn3;mH6&ekdWOC1uE^`Qz zQ>_5?V7@X!Zp5gOQ=)DmRTnXSPG4WpI#&le__&Lg2Ti>MmI*{F5AJDga;%7}f6ru| zfZF8@JsLIuUG-M7fZ8vkuyaXwZs=*`c_2L(@n1GBw5^aq<>w3CO0yo}Nv zUX55<95Ms>lc(x%z%Myj7%$dUM^y?gt&HCSDov7W^SB=HRAPGLYXmdYAEipqH(OHR^O`o{Qu!{>0_(9|_!Z8$R4yw2rDt}fH(D^{%C%Mpx55LQy7aF@jq0vc{+ z5JnBpbrgay$N&!?D?CX_KTEFqVb+x8d(AQc6~kX*ic-Dr--?)+9H!Xs2wPb~s3QK5 z2bPXfGm_FUe(#T{P4MC%37`JbA&%J1-3lbbzHi5$;Dxd>=e^Idt(%%(*)OKnUe&KZ z(xE%-(NkBd5;)a*tPtO1K|dZ&D_7}!43#uZGff<>P4y8awY|_%Wn_-H6_p%^ar~xX z?+~8GS=iA9NkYZJ9Cy64@TlwD{gQ+fu1gG9&xPiHOiaR@ZG{JRMEu=;M#pB67)`qj z#w^wg;rCxBXUQ8JlKMha&Etm9VG*^0m{|kkj{$?D!rPDhATXK7_cPvnFKr-~Mt=)h zz;f`!{|?5%27cd``*gUJm1VdR>-g+w_QBy(ZCZ3=Y!l``T`Wd}t*fQo%O`a~z*u2| z0JjGMgHjm>ETVrOB^_^DbQ^1+2l$Z;tYY>*o<5sth`z7Fi*eA`ZmgPosrAV=<%C+kkpw>)` z>a}CL_GUs7g_r~pCCS}&rx*IJGuSXM^Q2$FG%N%=49*yY>0{M;j(8WU-3S;mc+vGl zQk-H@KR?UNya}d2mh)-~A`&c6$bdh89VFemmk#Vm0~XVEg2)4l0GuuBD@*NnPY7&& z`&a$3uRd{NI(@HvWv&T4o>?w$peI%|R&~TpxgVYm$XFv^d?JK44n5<%_9N`JA3hCO z7a9jZ$+9wCi3YTH%Cj^(ytU>(&oS94d|FS|1er#*UO(REjx+=i+>9_ zecE}v&6EEh!33) zp1!0sq<@)Y-9$39d;bSSH_`{9leX^Ie=ds*d60iuwmpIqaiVwF&bxzES5W)=`93J6 z`8}=a3t?II`JP`TG;9n89mh|dDQ6GExtGrAPc6~J1A3_-7=23?XhDZY7(6IE_3&it z6IbO+NE+tf;)&94lqYWPV1MY@uIo8syQBZejPoXWnO&6$=@#xWXdvq`~+_e8}UXSND=lkDJgk?Xx87#|4!{|jlIsWZ? 
z)Q2ylb)prszpvh8Jb3o#-5fRtW$cnrV4NK1*>4L7?{tkVc@nyPuD|ke?0QzV>#MZC z@+qAow?ocK`9XUDuBF_vQ5xt)#41@%e`0{>tL^kkt#~h=j@|X~^kX&EoDXN>Uhb80 z3ad!3*nS@Rz;Exq8x`h3zNKuk=4GX;QY%L~DqjsF?mQ@Z;J_pMN{P=)?B4voP#yI5 z{ki*wf96+T!Q4Y9U;8~51u-XW$B*B)Cq1!(r~lcyV@N(?h@DflK37&3+G}Fgsaf&c z^yn_|1##E4dyAsK@?1`JdgVMEj&g2QlOJu*av4K#Qq7wP*FS`yd6g*5&bY z{M@6jzn$p1a-mgP{ZYf9T95CQ16N!F_J)Sv3@{nF^hHD7v+JYof3h&`lo7Z1X%60c z5_|2s<0E0a6VcquaaXrta#G8a(7`XA0CrGZh_>@7O)@6$ws2wq04`D=aaWWoF*Vqe z;gvaO09H&0(KT-Top);0I|>yzAf;901H}@*99cB^2-<$)rC7>pjBO<%z@3q>p#q>ByuEFL zEkM%EUzpB~$1^~L39NA(liGV8_-#Og@kO_4wAo+qov#Npd!#aposeLPP3x#fe{bk3-=;GU}4;s{V;nbf3s=rk#E z+-Dt6!AH1<|7Adjpy3n1Qr}>3|JlE^%)mYUkc=c_aZD=0ZXtCN#}|@&R!oH{p`$lW zV^db4&IF&Cgmpr{J95v;*eFsOZ#;}3qGGw!cnVD5#s&MCm!qP4g%D!(idszpkd)w= zd{z{lJa~O^SoDDkT=p9CWvk@yS?k#A1oz}q3Wq2uL!9*ocuufqFlJrzT;M-5O7bW+ zJv}mGi4HTYotpo90RGn^qhJvm#>-a=PF}24H&2pTnn&Jn=jzU~2~sPvkzBT!l}x>S z^?{3->A^y6QSC*)q$X}L2h>9fu`g^R~zB`k56&kkjKi~xoNQ(Tu< zN&*XmO6z`H@}W`R>KhhleSZk>KJ2NN0=H;4v~EbV-xPkCA^P*y5=N*_FA z)JF#rR1L)^b5dzs>v`0GWVER;0WK~@=yw@cj?CvF`|^;xQ&>y~-5B!_UO*hdtA7b8 zr8tl)9b4srD3#WSXk(K5CoCv=7Lr&E5|v()MQp)`oKpCZU+X|K1bg?NH-jpwR49@q zlCY_h?o6%?Zq+hwI|*wMxFJ>&QV>^xB*{F>%<}z6tSs%KF9?C+=RFzS5y7(NsP&LLM*;-K zLpt5sCr@?^?*T@LX0Ltf`0diRyY|#g&tIzfxHHU8%BF&ACKy14O)-+s{1?V2+3bH7 zM5zT^yk0)EfN1?NZ<4H)3$S*4PKo2FQw|GIoZ32~&tdu6ib`X+-6+|>3g1?;-vTPu z0Srboese`TUCx~6^ZvV<9F|3hKBi=bx}+Xb`V{O=MGq5pdOb<^w4d}I-GA$O%f88R z#+T@CZ{y^pQe2N6szaD1u@wI}-Fj*B18LuaIlCil-zoF=L+qz}KLrmzzg%`B9{am3 z{=I}>FQZ$qDme|QFt?H7Su$gn}b*~+c7b$@1 zF3GAMlU*~jh0v^b4beN-pI#=j=V~wVBo=Ho+}*_ z>LfJ}DV3C_)k7>rg-`5(64X#$&-u3}uyII_rd=0I5Q~E5v|~}$_hvN(>Byjfu{!&& zUsRw&AFZxf3o51>bJRh@!r-PBpH{1*8^(UK-W!hv|CU5|vEKiB$Ffi%CoDhI)~(E4 z54lkozX!r5#hx+M_MLRN#jCBwW6rK?llw(w$niM6H+~m&eHD3Qv#$(~kcpQS+jbm( ze8N_|7J0& z1VpW?tBrGj{Lv?$&kSqtd&Qi2_f1z)z7eC9ZSRo#529IgsVCq=*?ohmS5*&dx(baE zANyo?Zs~uyLfr2zff^f{uVVVCoD>Ke-2Wqfxa1Jno4z>lkXgD`WU^uR#FrRUI@mO= z+zsh&8``la;>X*IZW6ST9jDFRlp)FVTIcY>FP&W%Ki${k@ar=Mr%lsV<0>%GUHXOQ z7ZdjpO^+A_C<1Yz$e5U9bJyB;`LS9bUYj`^lV)&WD*b z5^~BRrzwZ6h8&Yr&WD^&A*V5l$YD}*7IKP0&ZLp^hn&mV93mw)&BtUVHK?cjUFL3S#tFypX;id z%tL++5Bd5OO|WwR<0*%M&X`n{7Y0AKD6#Qz1RRyO3%eb>z47l{mH0b=eyOzdTaP~3 z!{~mf#E@^6_WL`(1;_R<^ZkF>%SXQ7DJy0#K=uOU!cQV!_i9zXAL0f!tHNsaV0L?s z=-z4F+o$wpmFQ2%Ix4!*0}cufAU^5kJnHht?d2XHgj1)PbGy& z(ole2_fr%~hoeK}r&Pcn&wt{1sNODJB&b75Vg={$WeFXL(AK-7sGRu$3wBxu^k~AS z29gk3) z8zv9W|4RVF_Gv8+G)LgA8Vf}%^XiAXY831B_!v82T0qw46LD_+yv1)Qem$GZSO z$*uzdYxueE0iZ>TEJkR5jN#%FJ0Hm4Cp*JYl{-itMJ@yvA!jPUBN5(6JzA#~3v&0ruRFAY_UkOr_%* z_TL3RUIH1eFw6q@j36fI{@S~)BK$!3RVZLDq3snj^?gO#8jr!c^5Nj5?;M1r;A>7s z*}Q4IzX^ScLfLj=6#z6aEUoXWRGL8alGC;>ARTi6-n~YUTxzN>VFRXolxXJ;tYoPS zfSO^Aoq%>aEp7@6MhwE=(<8k6dgBvfUm}PK?z!Hdtjhy1jUiIyOJxd3=JVQwIu_NrtaRL3r=bXzff}{1(y*l zK?<%ouGK4Aq1(*xR+zeCKub26)GhEx>i3Fnn*XPC!VA=*9x! 
zb^%x=MM_#Qu1i;ubt_@W7wQ62$LrBQL`yw-E0OM)N_#|0@(WTbC#dv_Oz5|gL{GM> z&hIaCaBPQFv5CJdK(G_~6Qu=110E+ssxJH0Q9>CqwRa_$P;yDKn#F777y`~GptDU+ zFE|xQxAdSxkeVQim8l9@w|S3ldVTBbz%0^?Z}^0=`-v~}t@TqSW-2A~rn~U~*O-l6 zvH{yuCK%GSr3N7())KdSv2jhlL~0OA!*wz8&eV~4`*;EW25_muk^_n*ywAIPCDt;O zdb`gR(Dsf?0r~QF{*UMf3hABd8hH0!H=g1yo9oVb*q8LrL}CWRZ{I2|9{i_`<7Cl z12(#^c|!?_4{9lv1>>9dFTW`@uj>NS%s*V=yjt5jrJqPVMVgN&$K2iBXzXHnfA;!s zD=(^B!&Ai>xqxUzS+<_k7h>Svt+$h}N?u<3+JHj4PCDLr(-Q=kUt}NT25;6aEMKaa zRK!7&?(P=bVYk%c2e0rcQ|5xDIu79=A3xqW=2RWv^;VNmO#pG^kaXKl-XrQ)nNj}% zGYNN;$d96HGbSN6n%q&BOm10~pQ}rdM8l6+6auaj8BSf|g@C@y?&=EP#Q%UZqA$xr z9_n(60c6QX@DUumFPjZ z#MWli&Sj&22CTemXQiLIODtc`P}om=7zXKAX#oGaSH z!SX@ss9gVQEdH^2figKjWbBK?L-q}~po?C3@WK=@mMBS&{O{GqerCxyCj-{9t+sv3 z=HX>ofq8>;n>`(5^YQXP@6Hef3ujK;YMTYV=FKEmj{N8lNy6hd=|TcMTvq1wjZ}v) z@qfx1@jH^znGLs0TN84!&q|P0VZ$BHV-w!!XnU$o#Yo{c1KaC=1$5OtJ9_tgp`Bse zBP0O}jsgxiZf8P)jNre2w?tErk>MzkNG G2`0%VDM!=8^}+02 zHDUNEvY|D{-wTDaGTA}XvImDF7cI2}yL;99|8}Z)si=YH28Vhl`I+a<0Qt}xW`WOR z0>EwOXIOdpw+-F3N3c9B-BekedLGLdnQ$2C>sWK`HiE=GByTE2SZVHM8hHV;O7ht} z%dWptUW_Uuj9QNX&Y#F$cCb+p?{?y1lqdjd3-`p@yanS zZN{PNf$=(#vfbC$0_6S}rD3n!f5LcH9S+gm*(RVm(7U)k7&_Hh(`0@uW>&~$+3*{! zUneog#Y|Sw@Ut9QFF|3J5{pHI^(p;9lJ5RVx+Mj2Orzm1^(vY$S(XY3b$VEC(}|~Ib2~^@U$36vQbxo$QKYNJy8I4jEh%QZ)4JM>^pT;;Ek+_ zd2p=}HYWk3y5s?Y9~x5sRtHOY)p*?B*w3YliwxbvPa`%my9>^#&Z5t$t#o#Vrw+Z- zW=60R`e@1hxO5fyu&-FowlL4GKg0fULx4QEl4Gpzbwf!Seu*pT;#F^X-GuC6=3*_; zD8Zr*ph}L|3-Rj$=7dUXC#C&u0rD*IH{I&g$Q*{kQs=8tyx6WHN7Zz>}jO+(LEPGUl*ku{H z-19RYNR4-);&@w!u;9dUEUn{*PaGUT>#jW~Rp=3|=;?@dHfd5O^f%v-*TwiPiBV|+ zpICp0pJB5*VA|VuOhQPoY4g=Z9bCWiYe@fbO`mLxmQiJ9hLNhb8>+kinFb{q zO|nCHlXd;%_^NqRVg2Vkk6Y?qrzAhq_=n?{IIxK0`NgaxWIU+J|1f$h#KR}RQxs9H zTaECEG*74esVFTTdwoBnqC<%nP+_x>agEGfi2I|J?BB=_4@7Ne_n_{(|8g1Q>=LrK z_yDP#*Sh(+VEkmLrrq>XSGWX{xiyT*yliAnTak?@W?X<=DskxTg-=!ffizSk^GkGnXamocwS|Y&?}Ng_!-)P z6=hXrGcY|X%-IhCC&1v-#NwC$3?67Z_QX?Z1>sm5PObArzTlm?ZItC^9 zAl)9AWLIbQ;o7(p^uI7*vW3Z}aTAlDXuogXR=z1`wsD4KgDhEXtlYl#(k^h=XP8YE z_rWo0B4~|sed8d*>-ag6X(3lg9?{XyT^zTZm20D`z$7h^NISF^vWumBB?oLzDS8d zmb-C3vNG;Jvvr?jdD{U?F`+5pgL%cB-0$<~s_-srGf{`*MRE`!gp7muxpTNq z*#++`Z8r;?|$$Q09mbHJR0`SK!=3Yo9y19Kj6~dmb^@AKr z3AVxT;DGi21L(Yc!q+CB0NTFTaihKcOMr0R4IZlyPzVLhWqB=QxqDPap||i!LqQr+ zweKWD@Hkm=Gu7rng_n9xhPkONkHfz^ldcmr*?raV$EQr=3CQH7q?8azl)W_QpWmKU z*@W8sApNTg==x)U?GKBqmF5*TOHdA@!_(1%xtG>_Fk`i^cS7T3LLF)l=ee(nQjuvUsL z_5Fa};O+J&B^o$N&Dv7Aeq<*T_AOnP_CMK4{nyY0&HSJu*;l@LDf8|$HAFLVb5goB zo}!Jeyc|T(Ia^Bkdw5~uHFOpm4~9ex6nuu@p^S9OQE4`hh}HDo31#?L9fo z2TR~9RFNEJiK_`V#*4XR{OZ2=M5B*~3ax4gEFmMMt&;#iK-sIQsiKj5M`4;>g|S2lnX%w>bPXZ^f-l%k_1%R zmL#z3y7gmBqA4w&cHWzc)!TzETz!VDB%tRwB-28gRY7Ko@K16QqVNWaUtU8Z;Nm5d38eL>aI`EdAd-6KuY_2V_j*cx9Ke5cO`&_e#&|inmLEbo9+K=!zI`->Pw6J|y};A1M(%1u+bmzM{9TxhH+I z9ny4UxLJhr5;N7wq7{MzinnFqs|k|HE5B-8>KSX()F;>d)>+Po6o7?M2SMisiNhf` zYKk>fvvU_KXh_pmVl}->RAISztZQdgG6R)$J$f4{HZQtH$+6B!-`|S+j5(s4x!*E) zbuXdD5keEwml|v1N9d}}it1#&xJGydh^NBS^Ofyb=T&$qN=L!B=HASZLafMM)qTGl z_jj)qMT*inXdmWI&t#AT5_pq%94sFu^JnV;oWz@ci(}RQyO5OWxubzNE1pqTci4$FP}xo_V4mK)gmM3 zXBCFA%Mp1^+Qlgq4o=oS$v8PTz?p4*8#?+)F_&qvscvR)>j?(%B`cuz+3suT+pFa1 zPgwT&Sd6LHWm`6{>YZA0UW)iQZy~=FV0!kfc#NWGBZ zK^edDnXnjnj!3uJ*RtpZ?#JIU=xcs;(vRwl7=vBCi%%|w+Yq@B*uj)}r>EXa;VuIb zuQ3cNmthJub44fQ^Mf}|sRXn&N5o-?U;~pAW-?J}T5+P^sezQwoO$V0E5T$PMogLr zMj0&b_%c3cQdH_sCMJ#(6Dh>u*x`{kVTYa84nsSCx6A~WM_7m63S$ln@(geM3=r-7 za#=qKpm+RG`Z(7Yqy!>ZLZy=jvDMR!Q6yy)y4BR!2>_7P;M>9<&|5c79s<(crb# ziZJ(bn;2x`o2Tbv+d+P6_{19-uE#^~x=&7dIC|YgihX5;%7rg0tWVn(q470f>RGCL zer(LuS|IJn`Knb)6A$Z@uioN#T<%ka57XX!;Iu#nS&@T_d#${^lCzjGmniiiA%=S- 
zL&MxK!00hwv&!`ll%(np?H})QD-k$5KQzDThub_%*}Hd(+*nq@d~vGoT1bCkI-&d( z_Dw%nHeGcwN=l8o_vMe}!t$Z4N{-MSz*@On?)1*-nM+(t>M%0K{CV0A4^Amz-&C-b z6oZY|8o-A+-2*Gk?wdB(a@II9Eo!$Z^ODQVJ{eWct}RL$NK2iRda3n}EWa@A@4r5} zN?U)dL2x_*zP9~@YgYm)3&_ulDzuY3SJ0!scO3C-;lRCtpdgWd#`2bIHCdN0oBON) z8t@J0y|FUNVqW`>QW!4ztN;3mc}Ma=lLgH#h9kxWKS0&%YvVw`QUZnBWLhl&A#*|*2Qnd&R0l6>AtKP2HB1L z82Z%VYvrdM3?X3DJg?ieI$f_0gn`k?h+u489w0%pAS^y_JnUZ2ZV|UaHpy@QS&Do2 z6#|GZ@Kf}@&BGX5NG434kd$GS-9XDp-UI(F@`^Peg*@Y(nYrKHX)*@6<38%A;1G_- zrldCT%B6i^)%lMH%CXx}23Q08vZf_siy3(bwD<9UYs15_QxDcBwMWEGVByQ~g1SwE`y-OuBg*k{F=73N!U-*_bIM(T|>2dtbO^>2@)GA&x1R(k%4 z*{i(jE*`4Y9;oQO8@dkKU>UcqQL{KV`kmrk#UPf(hpnP$EE#SFK* z9;21rH?d$yI?bwl5*XY3nGwST3iPB4=V1)a#1yzO8ROhPckyn2k9?L>C4OyfUR;L$ zm;=cRTe zk7qoY1Go+~%UyV1_^%8t6wOqHjCBJ@3*jl3kGK7v8Fn9UcVFx8L=FmZyJm z8Ur#LmpxB;Ztyef)R`@E#?JP(&Y=j!&j-oHlwd7gQ!9wu=|cw1e_Tz-FWvzbp_ zKqn7_ba^+lHZc_AOS65Hp50IKZPkm9Kf2(l2p_Z)Ju3)h@Iw-h4lg9J6yny~K}=J3 zRVWk(Q@L^wGp3{&hecq8lG&2W_}2H$J1e9g;aC!|sNbYce4QA;oZ5+5gSRcN#g+WQ ztk=2|^pYaK*^#{fLZK2ED5*2KsE^{K7O(dst%H1zz?)c-4dA@4LUMg+U~{={eAwTqp{X@M>6>7)L5FuHfMa5+FalhIPCX1Abb zY6+R*lF*hmXk2;M?=GCFqm#u}<$I!nb(#T5o~d{VnM+C(D!1-Q^Alz-zz91q`v*eG zW4%~PbbGZ`rTfpRe8et-V61Zn1x zwz-$%S-t-QEL89baaOAD=_ubJnFbq2=H;r4x$&^}YuXCeY#pCZ`NtmDwv147@ z=BI2wHL_gcRTl-h+AvdBaIF94dHMp8gzZWPZ&q!(rADH|66)e?Sh4hNeoKMr{zmnJ zkMR&jLD-xp2{l#f>!&cYshWlsv$o`s9UV;XFlwfQNZ?ciLihQpz37yRBH^nxYZsLy zoPWmCWfvAsj-`GBwmV&Tc>i{`2Y+IQ^6FQWtaO_lblS9++t(pZfgDk*>&}g`d)0A8 zMSdRSd4IUDjMAEF=Oy_eJ`7f2xNtJFW;rvh zx72BM+!gis$EH&dFG;Xw<+7Q@q>{1Dnbvjb@=fti(Pl<(NVEO=X4&6w&eo~Z@c`?x z#j$!L(W9ic6_wpMUpb}Qmt@dgYO5J9uim~^puE}g30N&eS=7FeCsWo5E`R>Q#{1%0 z73@)cH8wF51*5+zocY|IzQ4`baC|ccwbJ-~u6U3f_F(t+L$9krf$xV5avwwD(T8$j z74F~NJb0T3&D{LKqpq9HX|KP>ySEwI9~?=q+-st%!jdg7XV@C74E3G6O5CxyUKBs{ zJTHf@-qXC|?vdXz!zS`1qXn)!L)OG`9U6lUVGz@-B@p`2-+Wr>U%%&1Cr2~ z)4Nu5Xh&>nNB?PgrVRbr#oy`tj_%<}N_2!@qW2|$n_{_gi-v1mGFje537O1~lp7p? 
zMl1&Jh~|J@yJw$Lmp+}ew=Fbyfc$z~{6)P`pFHd=;z}N$ z32|#l*KhKsETONp>Rs*l=iucj!}y*UKKWAtxzZtE{HO{VBcn8b$5B_v3mHCdTDYXc zWf&fEAt^)$-W$gL3S`bKZLUtYOT)G%223l*0JqGkloY(7~oEm=CiJ*1xfyuA(icAAB2P6Esf$!LrNQNc#O)j4`6RKi?z$~ z{F*i@3iQZ{Pl$u{^acls%z1K1Xpu-fAbv`VR7)(y&Zf=%V`6T2K}!BSh7f9~Ci^K5 ze!LHmhVaI?EM5|dMuO;bN#%l(yu;1{NKNG6+be3LJ*2K+_1{GDW5-(ku1sxw461%e z4(hEWey=Oy_ILw9>)*AXxWPZd$c)>@0A{^qw6j+XzVQn?N4D-Gx|r{yGGO&ZOjjJ5 zXKF2UWy`_`0)`rfm};1{#(OwmG`RbjXxc8EtXM@FBVI|UIWu2b)}1LXs!6hq+9Mzc z$bq=twr)01=bSnKz5>!d`FD+vrb1t`I2YVPR5ZyyuNZ8=dSyvcb$j5l&K5K$fUR8F z4WO&xH^&@uH`r+NC&5p3f<$u*fOB?Bx2h?WrBLh7>8~Dswx8kBAmpC9eub4EKmXk6)8KX#zM%_p>v25C%)!ksjeX)90RScgsl45i^dE zG0;MAKVJUTD|SvI%Vv?4VXqHK_XYm+@Gt(f-U5)w{|bn5Y6RO6MB?L_o(pdjDsqYQ zqoKD+kglu}CL}FS>&#OuJaA|YaH>7JUV4#hn>O3%J*Ru;_Y$ zz`3SJRmc`3q+yd5B6y%enBMwWEc5>KGco5*VtJ<)#3b_m}P#Gwm2j&qc6RtGn+QMe1$MGj+lE9(9$QqqV ze(8s-Nq3nYUM-?wp|jPZZRNsOt1%pz0>|hu=7l=YU8Ou#w~l^r4nIQ4g904;u(-0d zG-$w3Mtg3-*^(;rcg7!g5dLA(ht~x~P_N}DLT8C3s%q=&8NT0zN?n`B>Z+r<{|DeJ z`#13X^<8P+`l@N860j~5Woj2za;}5{XH)hF>@OC%`|<_S?g!WR%xu{~&v>A8U3hkA7q?p|qnfjOcBaTT@3ZQS=N$^Ly$eL^sf5Z@+)jrcKUAMw|0ezs z#l4-)Bv9E92LJKoDf*zFZ`bCG`=jyZeiey*i*HX0@CkjuyCuf6y{a&g{{buoEc?6) z+TqzY92Wv_VTcqy9_$0l+d}QgY`~i_caC&BSWabao1ENO#mESDW=#9!tgN#gy21N< zukiW1yB9;NBv9o8mPc&MZBtZ3ET&uLC;0JvB~eB5s z=fwW2Sjb9uv2l=SN*S=|suTQd{$Qu>1rgVIVt?D~eweOzfULO7ed7NBejhj)RHr=o zm|U|yF7Yu>18wFO?>&?;zELmx_+ygO?AmLqZ&jRO8|v@q+2I+``yPs~2{{zml(+pJ z8}(;xp&J-~5_li)U^U*Ja5&9U0%1KySA6!U^B$c{UDoD90fcvQGQl7Y&{P*WpQsEzq$Mr+#YypUv<+nSI#otA) zE{;ad)ajz@P)!jT>4xQsWoLp+iY(X>e^Ols6T^>8g7rTN=8cWk8A3?Nzrj}4yK+$+ zwlKeMmg|-y^|N;hgu-feHGEB`jk&MC-LA7LT((2)a82#q+WdJtsipp2GhH~R|1VqX zcYPze0C+HB;LF7R{YTjq?!T2^-vxtTE>MmuZ6+?5VXca9r|p2wX>nGWmf2p<Fd5+)fg(Ct5D{sL#fjd*BZV6IQ~i1V0Cr;Oa3@*vnMlfI0pQ53rWbJ#?@+7KmJbJF9 zvCI&Ff{pVL3k3ijzY?zRu<}7{m;eYk=S@(5uo$N7%eh4)gsPAjdSO!6t0<+@(q`!S z{G$BZGh^d`4TO*uS=8f7A}L*6pk~iPD&>({8Wejvf3HyZ8*O|27u>1xX4yu0mP1ie zX0=Lf{^9)~o7+vqW(tqi8PBqTXR!&s$Qd1{pF%*50+z=b_t-2q1TQ5TbyTk!#_+;@ zyZingLw;}8Z@r5aA@}||y;1Ee2V_M44-jkajyQtkEdm9LP5N+g?804Gx2GtYaFl^rJGb z>;U(aigp*K06F|wqzA@J9{>wkW77d90>{!FeD3Pf&?WjD#&X{DJKD)dT|dgP-mTT_4{#wyR1szulRKT@Kmj0ikgHf zGxX#p>Bjq^UjWLX$A$kegf4K`6&Z#S|F+cYw&Jt#jA8>kuWEZ_p8E_|K%MbbzWJBk z%(4Rj!mYbmQd%*?ik{L4o~OJ)NBS zj3AL{xWxz7l@kEY9UlEx#1r$|5dIv-mGHRZ47%iz;bR|Vv97unTW=C6kt_UpSUo!C z`n#Dw_<=VU5gW-}5lmt=bW*xw1g%@Yt5&n}_kus`qkM$X`lOTl=7~H|6vmwGx{bYV zp$#T?%V%k0)c>x&bi;Wm(NM>dP+v@dFczFE+1lIQFqtCH4LuLtt5ymuDBimr&H z_KW~k(fEI5N|43T~#FnF`S%!sQ%5$H>!(i;c0i$(v3$Y&VecHNvHCG_`F${1GOCFd!ZL z5iJZ%A-y)4s2*3Tt~<@JddF;3qtm`x$BSKo1InbmK|~tkU-`4PZX|ImXkI7zaf>+&mRu-d=&$H67CaX2JEFRl% ztskI@ql*B)t^~IK2Jc1p-}fr00o*wr0d3 zIp}eT!Ew zrd|{lin#Gyt?|?1^tB#hFE5B0#{>Rq__)rbS|zMD=vg10xAAA8YjQ~4hf=X*j^(~s z&Mx5?1IWFtO)F)J9~y{uoCI6z8$9i%&ygIz9-Y5RaL3`Hs|zUj`94pw*PvbaL^2iM z`-yw}SId*k80Q7FFly_9@x z761w;072Nl0_?7A{CP7Dms=#~$1hyPdG}RS>4_}vXQ{Crm>NTSkbb>(D6B7_|Cn{# z(nC$4iGT2TUWCf4JW*y0+=91^A0&;rzuznT?Dcf=j*{rz*;$3fWKD7N?=AF?Th{hf zUgQqVQrt#h=jKeQWu@I_ za-dx@7lAB+zy~%Pi2oU*gR~N=DB5$;Fiu1 zI2-Ld?>5JhQaa3TknR2x1W6>Sn(oWAp4zjJ!Rp%Ysg8j^Bo?SXZr<8oRohGT`X2Qs zNNQIIwlpM&{^$@+iz;2Cx&0ik^Y#D&tTQ;TZG};oWUR+v`)`)G`|;jWkD5pn$uGw& zjtkZGZyg(10mEYA$Hly5n<6G2wi^M}ti2Zup85+C1Mz&`+AkgIJIi!9)RsezUo&gb z6-acyx`lr^(^}BfH_LF?62zwWx9xCU`zOu-u6L!ED&Jt6pL#7u?m3T|N9N_cXkz^p zSg~0b1CP-!wz)VBT$#M|Fjua8BCoghHHkMH*&FooYWZfFxrgZK$5bZ4ozL&{<%=fY zIWO9KTrz5vrV9@u2lj?suYG@b=T^Zf%YCZnv8SB7u-=8N${#ge^N~frCur{Vzb)C4SHycKV zeP1p;e^w>meAI?6?e-6cz-50x>4a7MEV+8!cC1sa-{$>=lHWZZtX~u!J^mk{Z64Nl z;neV@MFgK!iSy5stL{9S&#mOtT^qe2!1$JAqTN2g-O0QvKp&na`;ndC>QE?tXF 
[... GIT binary patch data for lite/demo/cxx/mobile_detection/test.jpg (new file, 127499 bytes) omitted ...]
literal 0
HcmV?d00001

diff --git a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
index 5ac041b2cc..0c9da1a764 100644
--- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
+++ b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
@@ -13,12 +13,10 @@
 // limitations under the License.
 #include 
-#include 
+#include 
 #include 
-#include "paddle_api.h" // NOLINT
-#include "paddle_use_kernels.h" // NOLINT
-#include "paddle_use_ops.h" // NOLINT
-#include "paddle_use_passes.h" // NOLINT
+#include "paddle_api.h" // NOLINT
+#include "paddle_use_passes.h" // NOLINT
 using namespace paddle::lite_api; // NOLINT
@@ -32,11 +30,21 @@ int64_t ShapeProduction(const shape_t& shape) {
   return res;
 }
+// 0. Enable OpenCL, if needed
+// Enable the `DEMO_WITH_OPENCL` macro below if you need to run on the GPU (OpenCL)
+// #define DEMO_WITH_OPENCL
 void RunModel() {
   // 1. Set CxxConfig
   CxxConfig config;
   config.set_model_dir(FLAGS_model_dir);
+#ifdef DEMO_WITH_OPENCL
+  std::vector valid_places{
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)},
+      Place{TARGET(kARM), PRECISION(kFloat)}};
+#else
   std::vector valid_places{Place{TARGET(kARM), PRECISION(kFloat)}};
+#endif
   if (FLAGS_prefer_int8_kernel) {
     valid_places.insert(valid_places.begin(),
                         Place{TARGET(kARM), PRECISION(kInt8)});
@@ -68,14 +76,22 @@ void RunModel() {
   // 6. Get output
   std::unique_ptr output_tensor(
       std::move(predictor->GetOutput(0)));
-  printf("Output dim: %d\n", output_tensor->shape()[1]);
+  std::cout << "Output shape " << output_tensor->shape()[1] << std::endl;
   for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
-    printf("Output[%d]: %f\n", i, output_tensor->data()[i]);
+    std::cout << "Output[" << i << "]: " << output_tensor->data()[i]
+              << std::endl;
   }
 }
 int main(int argc, char** argv) {
   google::ParseCommandLineFlags(&argc, &argv, true);
+  if (FLAGS_model_dir == "" || FLAGS_optimized_model_dir == "") {
+    std::cerr << "[ERROR] usage: " << argv[0]
+              << " --model_dir="
+              << " --optimized_model_dir= "
+              << " --prefer_int8_kernel=[true|false]\n";
+    exit(1);
+  }
   RunModel();
   return 0;
 }
diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
index e1833814ca..c40e3d5e9a 100644
--- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
+++ b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
@@ -12,27 +12,22 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include 
-#include 
+#include 
 #include 
-#include "paddle_api.h" // NOLINT
-#include "paddle_use_kernels.h" // NOLINT
-#include "paddle_use_ops.h" // NOLINT
+#include "paddle_api.h" // NOLINT
 using namespace paddle::lite_api; // NOLINT
-DEFINE_string(model_dir, "", "Model dir path.");
-
 int64_t ShapeProduction(const shape_t& shape) {
   int64_t res = 1;
   for (auto i : shape) res *= i;
   return res;
 }
-void RunModel() {
+void RunModel(std::string model_dir) {
   // 1. Set MobileConfig
   MobileConfig config;
-  config.set_model_dir(FLAGS_model_dir);
+  config.set_model_dir(model_dir);
   // 2. Create PaddlePredictor by MobileConfig
   std::shared_ptr predictor =
@@ -52,14 +47,19 @@ void RunModel() {
   // 5. Get output
   std::unique_ptr output_tensor(
       std::move(predictor->GetOutput(0)));
-  printf("Output dim: %d\n", output_tensor->shape()[1]);
+  std::cout << "Output shape " << output_tensor->shape()[1] << std::endl;
   for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
-    printf("Output[%d]: %f\n", i, output_tensor->data()[i]);
+    std::cout << "Output[" << i << "]: " << output_tensor->data()[i]
+              << std::endl;
   }
 }
 int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
-  RunModel();
+  if (argc < 2) {
+    std::cerr << "[ERROR] usage: ./" << argv[0] << " naive_buffer_model_dir\n";
+    exit(1);
+  }
+  std::string model_dir = argv[1];
+  RunModel(model_dir);
   return 0;
 }
diff --git a/lite/gen_code/CMakeLists.txt b/lite/gen_code/CMakeLists.txt
index 56c70cf1e1..40c9541554 100644
--- a/lite/gen_code/CMakeLists.txt
+++ b/lite/gen_code/CMakeLists.txt
@@ -18,7 +18,6 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc
     XPU_DEPS ${xpu_kernels}
     CL_DEPS ${opencl_kernels}
     FPGA_DEPS ${fpga_kernels}
-    BM_DEPS ${bm_kernels}
     EXCLUDE_COMPILE_DEPS "ON"
     ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -47,7 +46,6 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co
     XPU_DEPS ${xpu_kernels}
     CL_DEPS ${opencl_kernels}
     FPGA_DEPS ${fpga_kernels}
-    BM_DEPS ${bm_kernels}
     EXCLUDE_COMPILE_DEPS "ON"
     )
diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt
index 8949602cab..0c8866eaf8 100644
--- a/lite/kernels/arm/CMakeLists.txt
+++ b/lite/kernels/arm/CMakeLists.txt
@@ -1,3 +1,5 @@
+
+# 1. basic kernels for basic models
 # for conv op
 add_kernel(conv_depthwise ARM basic SRCS conv_depthwise.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(conv_direct ARM basic SRCS conv_direct.cc DEPS ${lite_kernel_deps} math_arm)
@@ -14,51 +16,58 @@ add_kernel(scale_compute_arm ARM basic SRCS scale_compute.cc DEPS ${lite_kernel_
 add_kernel(softmax_compute_arm ARM basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(batch_norm_compute_arm ARM basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(elementwise_compute_arm ARM basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(lrn_compute_arm ARM basic SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(decode_bboxes_compute_arm ARM basic SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm)
+
 add_kernel(pool_compute_arm ARM basic SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(split_compute_arm ARM basic SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(concat_compute_arm ARM basic SRCS concat_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(pad2d_compute_arm ARM basic SRCS pad2d_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(prior_box_compute_arm ARM basic SRCS prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(density_prior_box_compute_arm ARM basic SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(negative_compute_arm ARM basic SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(crop_compute_arm ARM basic SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(calib_compute_arm ARM basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(transpose_compute_arm ARM basic SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(power_compute_arm ARM basic SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(yolo_box_compute_arm ARM basic SRCS yolo_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(shuffle_channel_compute_arm ARM basic SRCS shuffle_channel_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(argmax_compute_arm ARM basic SRCS argmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(axpy_compute_arm ARM basic SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(conv_transpose_compute_arm ARM basic SRCS conv_transpose_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(norm_compute_arm ARM basic SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(interpolate_compute_arm ARM basic SRCS interpolate_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(box_coder_compute_arm ARM basic SRCS box_coder_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(shape_compute_arm ARM basic SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(slice_compute_arm ARM basic SRCS slice_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(cast_compute_arm ARM basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(squeeze_compute_arm ARM basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(unsqueeze_compute_arm ARM extra SRCS unsqueeze_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(unsqueeze_compute_arm ARM basic SRCS unsqueeze_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(expand_compute_arm ARM basic SRCS expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(reduce_max_compute_arm ARM basic SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(sequence_expand_compute_arm ARM basic SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(im2sequence_compute_arm ARM basic SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(sequence_pool_compute_arm ARM basic SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(reduce_mean_compute_arm ARM basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(stack_compute_arm ARM basic SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(affine_channel_compute_arm ARM basic SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(range_compute_arm ARM basic SRCS range_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(layout_compute_arm ARM basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} math_arm)
+
+## 2. other basic kernels: basic kernels that are not used in basic models
+add_kernel(negative_compute_arm ARM extra SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(crop_compute_arm ARM extra SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(power_compute_arm ARM extra SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(norm_compute_arm ARM extra SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(assign_compute_arm ARM extra SRCS assign_compute.cc DEPS ${lite_kernel_deps} math_arm)
+
+## 3. extra kernels
+add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(decode_bboxes_compute_arm ARM extra SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(density_prior_box_compute_arm ARM extra SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(axpy_compute_arm ARM extra SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(shape_compute_arm ARM extra SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(reduce_max_compute_arm ARM extra SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(sequence_expand_compute_arm ARM extra SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(im2sequence_compute_arm ARM extra SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(sequence_pool_compute_arm ARM extra SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(layer_norm_compute_arm ARM extra SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(gather_compute_arm ARM extra SRCS gather_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(reduce_mean_compute_arm ARM extra SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(stack_compute_arm ARM extra SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(assign_compute_arm ARM extra SRCS assign_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(affine_channel_compute_arm ARM extra SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(anchor_generator_compute_arm ARM extra SRCS anchor_generator_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(generate_proposals_compute_arm ARM extra SRCS generate_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(roi_align_compute_arm ARM extra SRCS roi_align_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(box_clip_compute_arm ARM extra SRCS box_clip_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(range_compute_arm ARM extra SRCS range_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(assign_value_compute_arm ARM extra SRCS assign_value_compute.cc DEPS ${lite_kernel_deps} math_arm)
+
 # for OCR specific
 add_kernel(gru_unit_compute_arm ARM extra SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(gru_compute_arm ARM extra SRCS gru_compute.cc DEPS ${lite_kernel_deps} math_arm)
@@ -74,7 +83,7 @@ add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite
 add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(read_from_array_compute_arm ARM extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(fill_constant_compute_arm ARM extra SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(is_empty_compute_arm ARM extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps} math_arm)
@@ -90,18 +99,17 @@ lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_comput
 lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
 lite_cc_test(test_batch_norm_compute_arm SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_arm)
 lite_cc_test(test_elementwise_compute_arm SRCS elementwise_compute_test.cc DEPS elementwise_compute_arm)
-lite_cc_test(test_lrn_compute_arm SRCS lrn_compute_test.cc DEPS lrn_compute_arm)
-lite_cc_test(test_decode_bboxes_compute_arm SRCS decode_bboxes_compute_test.cc DEPS decode_bboxes_compute_arm)
 lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm)
 lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm)
 lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm)
 lite_cc_test(test_concat_compute_arm SRCS concat_compute_test.cc DEPS concat_compute_arm)
-lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm)
 lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm COMPILE_LEVEL extra)
 lite_cc_test(test_argmax_compute_arm SRCS argmax_compute_test.cc DEPS argmax_compute_arm)
-lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm)
-lite_cc_test(test_conv_transpose_compute_arm SRCS conv_transpose_compute_test.cc DEPS conv_transpose_compute_arm)
-
+lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm)
 if(LITE_BUILD_EXTRA)
+  lite_cc_test(test_lrn_compute_arm SRCS lrn_compute_test.cc DEPS lrn_compute_arm)
+  lite_cc_test(test_decode_bboxes_compute_arm SRCS decode_bboxes_compute_test.cc DEPS decode_bboxes_compute_arm)
+  lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm)
   lite_cc_test(test_layer_norm_compute_arm SRCS layer_norm_compute_test.cc DEPS layer_norm_compute_arm)
+  lite_cc_test(test_lookup_table_compute_arm SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_arm)
 endif()
diff --git
a/lite/kernels/arm/conv_compute.cc b/lite/kernels/arm/conv_compute.cc index ebb96e21d5..69e507ba34 100644 --- a/lite/kernels/arm/conv_compute.cc +++ b/lite/kernels/arm/conv_compute.cc @@ -32,13 +32,18 @@ void ConvCompute::PrepareForRun() { auto w_dims = param.filter->dims(); auto& ctx = this->ctx_->template As(); + auto paddings = *param.paddings; + auto dilations = *param.dilations; int ic = w_dims[1] * param.groups; int oc = w_dims[0]; int kh = w_dims[2]; // oihw int kw = w_dims[3]; - int pad = param.paddings[0]; + int pad = paddings[0]; int stride = param.strides[0]; + int threads = ctx.threads(); + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); int chin = param.x->dims()[1]; int hin = param.x->dims()[2]; int win = param.x->dims()[3]; @@ -46,22 +51,28 @@ void ConvCompute::PrepareForRun() { int hout = param.output->dims()[2]; int wout = param.output->dims()[3]; - bool kps_equal = (param.paddings[0] == param.paddings[1]) && - (param.strides[0] == param.strides[1]) && (kw == kh); - bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); + bool pads_all_equal = (pads_equal && paddings[0] == paddings[2]); + + bool kps_equal = (param.strides[0] == param.strides[1]) && (kw == kh); + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); bool flag_dw_3x3 = (kw == 3 && kh == 3 && (stride == 1 || stride == 2)); - bool flag_dw_5x5 = - (kw == 5 && stride == 1) || (kw == 5 && stride == 2 && pad == 2); + bool flag_dw_5x5 = pads_all_equal && ((kw == 5 && stride == 1) || + (kw == 5 && stride == 2 && pad == 2)); bool flag_dw = flag_dw_3x3 || flag_dw_5x5; /// select conv impl - if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) { + if (param.groups == ic && ic == oc && kps_equal && pads_equal && + no_dilation && flag_dw) { /// dw conv impl impl_ = new DepthwiseConv; VLOG(3) << "invoking dw conv"; } else if (param.groups == 1 && kw == 3 && stride == 1 && kps_equal && no_dilation) { - if (ic >= 32 && oc >= 32 && hout > 16 && wout > 16) { + bool use_winograd = + (threads == 1 && oc >= 4 && ic >= 4 && hout >= 6 && wout >= 6 && + pads_equal) || + (oc >= 32 && ic >= 32 && hout >= 16 && wout >= 16 && pads_equal); + if (use_winograd) { /// winograd conv impl impl_ = new WinogradConv; VLOG(3) << "invoking winograd conv"; @@ -92,22 +103,29 @@ void ConvCompute::PrepareForRun() { auto& ctx = this->ctx_->template As(); + auto paddings = *param.paddings; + auto dilations = *param.dilations; + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); int ic = param.groups * w_dims[1]; int oc = w_dims[0]; int kh = w_dims[2]; // oihw int kw = w_dims[3]; - int ph = param.paddings[1]; - int pw = param.paddings[0]; + int ph = paddings[0]; + int pw = paddings[2]; int sh = param.strides[1]; int sw = param.strides[0]; + bool pads_all_equal = (pads_equal && paddings[0] == paddings[2]); bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh); - bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); - bool flag_dw_3x3 = (kw == 3 && kh == 3) && (sw == 1 || sw == 2); - bool flag_dw_5x5 = (kw == 5 && sw == 1); + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); + bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2)); + bool flag_dw_5x5 = pads_all_equal && + ((kw == 5 && sw == 1) || (kw == 5 && sw == 2 && pw == 2)); bool flag_dw = flag_dw_3x3 || flag_dw_5x5; - if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) { + if (param.groups == ic && ic == oc && 
kps_equal && pads_equal && + no_dilation && flag_dw) { impl_ = new DepthwiseConv; VLOG(3) << "Run DepthwiseConv Int8"; } else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) && @@ -130,23 +148,30 @@ void ConvCompute::PrepareForRun() { auto w_dims = param.filter->dims(); auto& ctx = this->ctx_->template As(); + auto paddings = *param.paddings; + auto dilations = *param.dilations; + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); int ic = w_dims[1] * param.groups; int oc = w_dims[0]; int kh = w_dims[2]; // oihw int kw = w_dims[3]; - int ph = param.paddings[1]; - int pw = param.paddings[0]; + int ph = paddings[0]; + int pw = paddings[2]; int sh = param.strides[1]; int sw = param.strides[0]; + bool pads_all_equal = (pads_equal && paddings[0] == paddings[2]); bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh); - bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); - bool flag_dw_3x3 = (kw == 3 && kh == 3) && (sw == 1 || sw == 2); - bool flag_dw_5x5 = (kw == 5 && sw == 1); + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); + bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2)); + bool flag_dw_5x5 = pads_all_equal && + ((kw == 5 && sw == 1) || (kw == 5 && sw == 2 && pw == 2)); bool flag_dw = flag_dw_3x3 || flag_dw_5x5; - if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) { + if (param.groups == ic && ic == oc && kps_equal && pads_equal && + no_dilation && flag_dw) { impl_ = new DepthwiseConv; VLOG(3) << "Run DepthwiseConv Int8"; } else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) && @@ -194,7 +219,7 @@ REGISTER_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, ConvFp32, def) REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, ConvInt8_Int8, int8_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", @@ -203,7 +228,7 @@ REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, ConvInt8_Int8, int8_out) REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, ConvInt8_Fp32, fp32_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", @@ -213,7 +238,7 @@ REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, ConvInt8_Fp32, fp32_out) REGISTER_LITE_KERNEL( depthwise_conv2d, kARM, kInt8, kNCHW, ConvInt8_Int8, int8_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", @@ -223,7 +248,7 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( depthwise_conv2d, kARM, kInt8, kNCHW, ConvInt8_Fp32, fp32_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindInput("Filter", 
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", diff --git a/lite/kernels/arm/conv_depthwise.cc b/lite/kernels/arm/conv_depthwise.cc index 6a20d607e3..e2eaef51dd 100644 --- a/lite/kernels/arm/conv_depthwise.cc +++ b/lite/kernels/arm/conv_depthwise.cc @@ -31,19 +31,28 @@ void DepthwiseConv::PrepareForRun() { // select dw conv kernel if (kw == 3) { VLOG(5) << "invoke 3x3 dw conv fp32"; - // trans weights - constexpr int cblock = 4; - auto oc = w_dims[0]; - auto kh = w_dims[2]; - auto cround = ROUNDUP(oc, cblock); - weights_.Resize({cround, 1, kh, kw}); - // auto w_data = weights_.mutable_data(); - // auto w_data_in = param.filter->data(); - // lite::arm::math::conv_trans_weights_numc( - // w_data_in, w_data, oc, 1, cblock, kh * kw); - impl_ = lite::arm::math::conv_depthwise_3x3_fp32; - flag_trans_weights_ = false; - // flag_trans_weights_ = true; + auto paddings = *param.paddings; + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + + if (pads_equal && paddings[0] == paddings[2] && + (paddings[0] == 0 || paddings[0] == 1)) { + impl_ = lite::arm::math::conv_depthwise_3x3_fp32; + flag_trans_weights_ = false; + } else { + // trans weights + constexpr int cblock = 4; + auto oc = w_dims[0]; + auto kh = w_dims[2]; + auto cround = ROUNDUP(oc, cblock); + weights_.Resize({cround, 1, kh, kw}); + auto w_data = weights_.mutable_data(); + auto w_data_in = param.filter->data(); + lite::arm::math::conv_trans_weights_numc( + w_data_in, w_data, oc, 1, cblock, kh * kw); + impl_ = lite::arm::math::conv_depthwise_3x3_fp32; + flag_trans_weights_ = true; + } } else if (kw == 5) { VLOG(5) << "invoke 5x5 dw conv fp32"; impl_ = lite::arm::math::conv_depthwise_5x5_fp32; diff --git a/lite/kernels/arm/conv_gemmlike.h b/lite/kernels/arm/conv_gemmlike.h index e00b8de6f4..5e59eb8d17 100644 --- a/lite/kernels/arm/conv_gemmlike.h +++ b/lite/kernels/arm/conv_gemmlike.h @@ -52,12 +52,19 @@ class GemmLikeConv : public KernelLite { int oc = o_dims[1]; int kw = w_dims[3]; int kh = w_dims[2]; + + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int sw = param.strides[1]; int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; + int pw = paddings[2]; + int ph = paddings[0]; + int dw = dilations[1]; + int dh = dilations[0]; + + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); int m = oc / param.groups; int k = ic * kh * kw / param.groups; @@ -66,7 +73,7 @@ class GemmLikeConv : public KernelLite { bool kps_equal = (pw == ph) && (sw == sh) && (kw == kh); bool ks_equal = (sw == sh) && (kw == kh); //! select conv gemmlike kernel - if (kw == 1 && sw == 1 && pw == 0 && kps_equal) { + if (kw == 1 && sw == 1 && pw == 0 && kps_equal && pads_equal) { //! 
1x1s1p0 gemmlike conv flag_1x1gemm_ = true; } else { diff --git a/lite/kernels/arm/conv_transpose_compute.cc b/lite/kernels/arm/conv_transpose_compute.cc index 5a18499c85..5c58b29713 100644 --- a/lite/kernels/arm/conv_transpose_compute.cc +++ b/lite/kernels/arm/conv_transpose_compute.cc @@ -76,19 +76,28 @@ void Conv2DTransposeCompute::Run() { bool fuse_relu = param.fuse_relu; bool flag_bias = (param.bias != nullptr); + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int m = chout * kw * kh / group; int n = hin * win; int k = chin / group; + + bool pads_equal = + (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]); + int group_size_in = win * hin * chin / group; int group_size_out = wout * hout * chout / group; int group_size_coldata = m * n; + + bool pads_all_qual = pads_equal && (paddings[0] == paddings[2]); int hblock = lite::arm::math::get_hblock(&ctx); int m_roundup = hblock * ((m + hblock - 1) / hblock); int group_size_weights = ((m_roundup * k + 15) / 16) * 16; bool flag_1x1s1p1 = (kw == 1) && (kh == 1) && (param.strides[0] == 1) && - (param.strides[1] == 1) && (param.paddings[0] == 0) && - (param.paddings[1] == 0) && (param.dilations[0] == 1) && - (param.dilations[1] == 1); + (param.strides[1] == 1) && pads_all_qual && + (paddings[0] == 0) && (dilations[0] == 1) && + (dilations[1] == 1); ctx.ExtendWorkspace(sizeof(float) * group * m * n); auto din = param.x->data(); @@ -129,12 +138,14 @@ void Conv2DTransposeCompute::Run() { wout, kh, kw, - param.paddings[0], - param.paddings[1], + paddings[0], + paddings[1], + paddings[2], + paddings[3], param.strides[0], param.strides[1], - param.dilations[0], - param.dilations[1], + dilations[0], + dilations[1], dout_batch); } if (flag_bias) { diff --git a/lite/kernels/arm/conv_winograd.cc b/lite/kernels/arm/conv_winograd.cc index d1b8d8a48e..d02cabf277 100644 --- a/lite/kernels/arm/conv_winograd.cc +++ b/lite/kernels/arm/conv_winograd.cc @@ -26,6 +26,7 @@ template <> void WinogradConv::ReInitWhenNeeded() { auto& param = this->Param(); auto& ctx = this->ctx_->template As(); + int threads = ctx.threads(); auto x_dims = param.x->dims(); auto w_dims = param.filter->dims(); @@ -36,77 +37,97 @@ void WinogradConv::ReInitWhenNeeded() { } int ic = x_dims[1]; - int ow = o_dims[3]; - int oh = o_dims[2]; + int ih = x_dims[2]; + int iw = x_dims[3]; int oc = o_dims[1]; - int tile_w = (ow + 5) / 6; - int tile_h = (oh + 5) / 6; - int size_tile = tile_h * tile_w; - int size_trans_channel = 8 * 8 * size_tile; - int max_ch = ic > oc ? 
ic : oc; - - const int n_wino = size_tile; - workspace_size_ = (size_trans_channel * max_ch * 2 + n_wino) * sizeof(float); + int oh = o_dims[2]; + int ow = o_dims[3]; + int tile_block = 8; +#ifdef __aarch64__ + tile_block = 16; +#endif + int parallel_threads = + (((ow + 5) / 6) * ((oh + 5) / 6) + tile_block - 1) / tile_block; + if (threads <= 2 && parallel_threads >= threads) { + if (last_kernel_is_c4_ == 1) { + return; + } + last_kernel_is_c4_ = 1; + auto pad = *(param.paddings); + int pad_h = pad[0]; + int pad_w = pad[2]; + int oc_pad = (oc + 3) / 4 * 4; + int ic_pad = (ic + 3) / 4 * 4; + const int new_input_size = + (ic + 3) / 4 * 4 * (ih + pad_h * 2) * (iw + pad_w * 2); + const int temp_size = + (tile_block * ((ic + 3) / 4 + (oc + 3) / 4) * 256 + 512) * threads; + ctx.ExtendWorkspace((temp_size + new_input_size) * sizeof(float)); + + weights_.Resize({1, 1, 1, 64 * oc_pad * ic_pad}); + ctx.ExtendWorkspace((temp_size + new_input_size) * sizeof(float)); + void* trans_tmp_ptr = malloc(sizeof(float) * 8 * 8 * oc * ic); + auto weights_data_ = weights_.mutable_data(); + lite::arm::math::weight_trans_c4( + weights_data_, param.filter->data(), ic, oc, trans_tmp_ptr); + free(trans_tmp_ptr); + } else { + if (last_kernel_is_c4_ == 0) { + return; + } + last_kernel_is_c4_ = 0; + int tile_w = (ow + 5) / 6; + int tile_h = (oh + 5) / 6; + + int size_tile = tile_h * tile_w; + int size_trans_channel = 8 * 8 * size_tile; + int max_ch = ic > oc ? ic : oc; + + const int n_wino = size_tile; + ctx.ExtendWorkspace((size_trans_channel * max_ch * 2 + n_wino) * + sizeof(float)); + + const int m_wino = oc; + int hblock = lite::arm::math::get_hblock(&ctx); + int m_round = hblock * ((m_wino + hblock - 1) / hblock); + weights_.Resize({1, 1, 1, 8 * 8 * m_round * ic}); + ctx.ExtendWorkspace((size_trans_channel * max_ch * 2 + n_wino) * + sizeof(float)); + auto weights_wino = + static_cast(malloc(sizeof(float) * 8 * 8 * oc * ic)); + void* trans_tmp_ptr = malloc(sizeof(float) * 8 * 8 * oc * ic); + lite::arm::math::winograd_transform_weights( + weights_wino, param.filter->data(), oc, ic, trans_tmp_ptr); + auto weights_trans = weights_.mutable_data(); + for (int i = 0; i < 64; ++i) { + float* packed_weights = weights_trans + i * m_round * ic; + const float* weights_wino_ptr = weights_wino + i * oc * ic; + lite::arm::math::prepackA(packed_weights, + weights_wino_ptr, + 1.f, + ic, + 0, + m_wino, + 0, + ic, + false, + &ctx); + } + free(trans_tmp_ptr); + free(weights_wino); + } last_shape_ = x_dims; } template <> void WinogradConv::PrepareForRun() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - last_shape_ = x_dims; - - int ic = x_dims[1]; - int ow = o_dims[3]; - int oh = o_dims[2]; - int oc = o_dims[1]; - int tile_w = (ow + 5) / 6; - int tile_h = (oh + 5) / 6; - int size_tile = tile_h * tile_w; - int size_trans_channel = 8 * 8 * size_tile; - int max_ch = ic > oc ? 
ic : oc; - - const int m_wino = oc; - const int n_wino = size_tile; - int hblock = lite::arm::math::get_hblock(&ctx); - int m_round = hblock * ((m_wino + hblock - 1) / hblock); - weights_.Resize({1, 1, 1, 8 * 8 * m_round * ic}); - workspace_size_ = (size_trans_channel * max_ch * 2 + n_wino) * sizeof(float); - auto weights_wino = - static_cast(malloc(sizeof(float) * 8 * 8 * oc * ic)); - void* trans_tmp_ptr = malloc(sizeof(float) * 8 * 8 * oc * ic); - lite::arm::math::winograd_transform_weights( - weights_wino, param.filter->data(), oc, ic, trans_tmp_ptr); - auto weights_trans = weights_.mutable_data(); - for (int i = 0; i < 64; ++i) { - float* packed_weights = weights_trans + i * m_round * ic; - const float* weights_wino_ptr = weights_wino + i * oc * ic; - lite::arm::math::prepackA(packed_weights, - weights_wino_ptr, - 1.f, - ic, - 0, - m_wino, - 0, - ic, - false, - &ctx); - } - free(trans_tmp_ptr); - free(weights_wino); + ReInitWhenNeeded(); } template <> void WinogradConv::Run() { auto& param = this->Param(); auto& ctx = this->ctx_->template As(); - // extend workspace - ctx.ExtendWorkspace(workspace_size_); - const auto* i_data = param.x->data(); const auto* w_data = weights_.data(); const auto* b_data = param.bias ? param.bias->data() : nullptr; @@ -124,8 +145,42 @@ void WinogradConv::Run() { int ow = o_dims[3]; int oc = o_dims[1]; - lite::arm::math::conv_winograd3x3( - i_data, o_data, bs, oc, oh, ow, ic, ih, iw, w_data, b_data, param, &ctx); + int tile_block = 8; +#ifdef __aarch64__ + tile_block = 16; +#endif + int threads = ctx.threads(); + int parallel_threads = + (((ow + 5) / 6) * ((oh + 5) / 6) + tile_block - 1) / tile_block; + if (threads <= 2 && parallel_threads >= threads) { + lite::arm::math::conv_compute_6x6_3x3(i_data, + o_data, + bs, + oc, + oh, + ow, + ic, + ih, + iw, + w_data, + b_data, + param, + &ctx); + } else { + lite::arm::math::conv_winograd3x3(i_data, + o_data, + bs, + oc, + oh, + ow, + ic, + ih, + iw, + w_data, + b_data, + param, + &ctx); + } } } // namespace arm diff --git a/lite/kernels/arm/conv_winograd.h b/lite/kernels/arm/conv_winograd.h index 33f0edc017..40ea54b291 100644 --- a/lite/kernels/arm/conv_winograd.h +++ b/lite/kernels/arm/conv_winograd.h @@ -40,6 +40,7 @@ class WinogradConv : public KernelLite { Tensor weights_; DDim last_shape_; int workspace_size_{0}; + int last_kernel_is_c4_{-1}; }; } // namespace arm diff --git a/lite/kernels/arm/fc_compute.cc b/lite/kernels/arm/fc_compute.cc index 1983c73318..525eca269b 100644 --- a/lite/kernels/arm/fc_compute.cc +++ b/lite/kernels/arm/fc_compute.cc @@ -127,7 +127,8 @@ void FcCompute::Run() { k_, param.bias != nullptr, b_data, - false); + false, + &ctx); } } } diff --git a/lite/kernels/arm/fill_constant_compute.cc b/lite/kernels/arm/fill_constant_compute.cc index 0b1911abf4..05d43dddec 100644 --- a/lite/kernels/arm/fill_constant_compute.cc +++ b/lite/kernels/arm/fill_constant_compute.cc @@ -25,6 +25,38 @@ class FillConstantCompute : public KernelLite { public: using param_t = operators::FillConstantParam; + inline DDimLite GetShape(const param_t& param) { + // 1. shape is a Tensor + if (param.shape_tensor != nullptr) { + auto* shape_tensor = param.shape_tensor; + auto* shape_data = shape_tensor->data(); + auto vec_shape = + std::vector(shape_data, shape_data + shape_tensor->numel()); + return DDimLite(vec_shape); + } + + // 2. 
shape is a list/tuple containing Tensor + auto shape_tensor_list = param.shape_tensor_list; + if (shape_tensor_list.size() > 0) { + std::vector vec_shape; + for (size_t i = 0; i < shape_tensor_list.size(); ++i) { + auto tensor = shape_tensor_list[i]; + vec_shape.push_back(*tensor->data()); + } + return DDimLite(vec_shape); + } + + // 3. shape is a list/tuple without containing Tensor + auto vec_shape = param.shape; + return DDimLite(vec_shape); + } + + void PrepareForRun() override { + auto& param = *param_.get_mutable(); + auto outdims = GetShape(param); + param.Out->Resize(outdims); + } + void Run() override { auto& param = *param_.get_mutable(); auto& context = ctx_->As(); @@ -107,6 +139,11 @@ REGISTER_LITE_KERNEL(fill_constant, kNCHW, paddle::lite::kernels::arm::FillConstantCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("ShapeTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("ShapeTensorList", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); REGISTER_LITE_KERNEL( diff --git a/lite/kernels/arm/interpolate_compute.cc b/lite/kernels/arm/interpolate_compute.cc index a26777826d..0398dabeae 100644 --- a/lite/kernels/arm/interpolate_compute.cc +++ b/lite/kernels/arm/interpolate_compute.cc @@ -28,6 +28,8 @@ void BilinearInterpCompute::Run() { auto& param = Param(); lite::Tensor* X = param.X; lite::Tensor* OutSize = param.OutSize; + auto SizeTensor = param.SizeTensor; + auto Scale = param.Scale; lite::Tensor* Out = param.Out; float scale = param.scale; int out_w = param.out_w; @@ -36,11 +38,12 @@ void BilinearInterpCompute::Run() { std::string interp_method = "Bilinear"; lite::arm::math::interpolate(X, OutSize, + SizeTensor, + Scale, Out, out_h, out_w, scale, - scale, align_corners, interp_method); } @@ -49,6 +52,8 @@ void NearestInterpCompute::Run() { auto& param = Param(); lite::Tensor* X = param.X; lite::Tensor* OutSize = param.OutSize; + auto SizeTensor = param.SizeTensor; + auto Scale = param.Scale; lite::Tensor* Out = param.Out; float scale = param.scale; int out_w = param.out_w; @@ -57,11 +62,12 @@ void NearestInterpCompute::Run() { std::string interp_method = "Nearest"; lite::arm::math::interpolate(X, OutSize, + SizeTensor, + Scale, Out, out_h, out_w, scale, - scale, align_corners, interp_method); } @@ -79,6 +85,8 @@ REGISTER_LITE_KERNEL(bilinear_interp, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("OutSize", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("SizeTensor", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); @@ -90,5 +98,7 @@ REGISTER_LITE_KERNEL(nearest_interp, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("OutSize", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("SizeTensor", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); diff --git a/lite/kernels/arm/layout_compute.cc b/lite/kernels/arm/layout_compute.cc new file mode 100644 index 0000000000..bc52c5ea3e --- /dev/null +++ b/lite/kernels/arm/layout_compute.cc @@ -0,0 +1,179 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/arm/layout_compute.h" +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +#define NCHWTONHWC(type) \ + auto& param = this->template Param(); \ + auto input = param.x->template data(); \ + auto input_dim = param.x->dims(); \ + CHECK(input_dim.size() == 4) \ + << "NCHW to NHWC should guarantee that the input dims should be 4"; \ + int n = input_dim[0]; \ + int c = input_dim[1]; \ + int h = input_dim[2]; \ + int w = input_dim[3]; \ + param.y->Resize({n, h, w, c}); \ + auto output = param.y->template mutable_data(TARGET(kARM)); \ + if (c == 1) { \ + memcpy(output, input, sizeof(type) * n * h * w); \ + return; \ + } \ + lite::arm::math::NCHW2NHWC(n, c, h * w, input, output); + +#define NHWCTONCHW(type) \ + auto& param = this->template Param(); \ + auto input = param.x->template data(); \ + auto input_dim = param.x->dims(); \ + CHECK(input_dim.size() == 4) \ + << "NHWC to NCHW should guarantee that the input dims should be 4"; \ + int n = input_dim[0]; \ + int h = input_dim[1]; \ + int w = input_dim[2]; \ + int c = input_dim[3]; \ + param.y->Resize({n, c, h, w}); \ + auto output = param.y->template mutable_data(TARGET(kARM)); \ + if (c == 1) { \ + memcpy(output, input, sizeof(type) * n * h * w); \ + return; \ + } \ + lite::arm::math::NHWC2NCHW(n, c, h * w, input, output); + +template <> +void NCHWToNHWCCompute::Run() { + NCHWTONHWC(float); +} + +template <> +void NCHWToNHWCCompute::Run() { + NCHWTONHWC(int8_t); +} + +template <> +void NHWCToNCHWCompute::Run() { + NHWCTONCHW(float); +} + +template <> +void NHWCToNCHWCompute::Run() { + NHWCTONCHW(int8_t); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +typedef paddle::lite::kernels::arm::NCHWToNHWCCompute + NCHW_fp32; +typedef paddle::lite::kernels::arm::NCHWToNHWCCompute + NCHW_int8; +typedef paddle::lite::kernels::arm::NHWCToNCHWCompute + NHWC_fp32; +typedef paddle::lite::kernels::arm::NHWCToNCHWCompute + NHWC_int8; + +REGISTER_LITE_KERNEL(layout, kARM, kFloat, kNCHW, NCHW_fp32, nchw2nhwc) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout, kARM, kFloat, kNCHW, NHWC_fp32, nhwc2nchw) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout, kARM, kInt8, kNCHW, NCHW_int8, int8_nchw2nhwc) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout, kARM, kInt8, 
kNCHW, NHWC_int8, int8_nhwc2nchw) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, NCHW_fp32, nchw2nhwc) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, NHWC_fp32, nhwc2nchw) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, NCHW_int8, int8_nchw2nhwc) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, NHWC_int8, int8_nhwc2nchw) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/arm/layout_compute.h b/lite/kernels/arm/layout_compute.h new file mode 100644 index 0000000000..13b8621029 --- /dev/null +++ b/lite/kernels/arm/layout_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
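For reference, the index mapping that the NCHW2NHWC call in the layout macros above performs can be written as a naive scalar loop. This is only an illustrative sketch (the real lite::arm::math routine is NEON-optimized and the macros also take the memcpy fast path when c == 1):

void nchw_to_nhwc_ref(int n, int c, int hw, const float* in, float* out) {
  // Same argument order as the NCHW2NHWC(n, c, h * w, input, output) call above.
  for (int b = 0; b < n; ++b) {
    const float* in_b = in + b * c * hw;
    float* out_b = out + b * c * hw;
    for (int ch = 0; ch < c; ++ch) {
      for (int s = 0; s < hw; ++s) {
        // NCHW element (ch, s) lands at NHWC position (s, ch).
        out_b[s * c + ch] = in_b[ch * hw + s];
      }
    }
  }
}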
+ +#pragma once + +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { +template +class NCHWToNHWCCompute : public KernelLite { + public: + using param_t = operators::LayoutParam; + void Run() override; + virtual ~NCHWToNHWCCompute() = default; +}; + +template +class NHWCToNCHWCompute : public KernelLite { + public: + using param_t = operators::LayoutParam; + void Run() override; + virtual ~NHWCToNCHWCompute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/arm/lookup_table_compute.cc b/lite/kernels/arm/lookup_table_compute.cc index fa7e2c0c3a..ba58b378f4 100644 --- a/lite/kernels/arm/lookup_table_compute.cc +++ b/lite/kernels/arm/lookup_table_compute.cc @@ -28,7 +28,6 @@ namespace arm { void LookupTableCompute::Run() { auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); // inputs auto w = param.W; auto ids = param.Ids; @@ -37,7 +36,7 @@ void LookupTableCompute::Run() { auto table_dim = w->dims(); int64_t ids_numel = ids->numel(); - auto ids_data = ids->data(); + auto ids_data = ids->data(); int64_t row_number = table_dim[0]; int64_t row_width = table_dim[1]; @@ -76,3 +75,14 @@ REGISTER_LITE_KERNEL(lookup_table, .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); + +REGISTER_LITE_KERNEL(lookup_table_v2, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::LookupTableCompute, + def) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/lookup_table_compute_test.cc b/lite/kernels/arm/lookup_table_compute_test.cc new file mode 100644 index 0000000000..78748edf39 --- /dev/null +++ b/lite/kernels/arm/lookup_table_compute_test.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/arm/lookup_table_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void lookup_table_compute_ref(const operators::LookupTableParam ¶m) { + auto *ids_t = param.Ids; + auto *output_t = param.Out; + int64_t padding_idx = param.padding_idx; + auto *ids = ids_t->data(); + int64_t ids_numel = ids_t->dims().production(); + + auto *table_t = param.W; + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + + auto *table = table_t->data(); + auto *output = output_t->mutable_data(); + memset(output, 0, output_t->dims().production() * sizeof(float)); + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != -1 && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(float)); + } else { + CHECK_LT(ids[i], row_number); + CHECK_GE(ids[i], 0); + memcpy(output + i * row_width, + table + ids[i] * row_width, + row_width * sizeof(float)); + } + } +} + +TEST(lookup_table_arm, retrieve_op) { + auto lookup_table = + KernelRegistry::Global().Create( + "lookup_table"); + ASSERT_FALSE(lookup_table.empty()); + ASSERT_TRUE(lookup_table.front()); +} + +TEST(lookup_table_arm, init) { + LookupTableCompute lookup_table; + ASSERT_EQ(lookup_table.precision(), PRECISION(kFloat)); + ASSERT_EQ(lookup_table.target(), TARGET(kARM)); +} + +TEST(lookup_table_arm, compute) { + LookupTableCompute lookup_table; + operators::LookupTableParam param; + lite::Tensor w, ids, out, out_ref; + int64_t padding_idx = -1; + + auto w_dim = DDim(std::vector({4, 5})); + auto ids_dim = DDim(std::vector({3, 2})); + auto out_dim = DDim(std::vector({3, 2, 5})); + + w.Resize(w_dim); + ids.Resize(ids_dim); + out.Resize(out_dim); + out_ref.Resize(out_dim); + + auto *w_data = w.mutable_data(); + auto *ids_data = ids.mutable_data(); + auto *out_data = out.mutable_data(); + auto *out_ref_data = out_ref.mutable_data(); + + int w_num = w_dim.production(); + for (int i = 0; i < w_num; i++) { + w_data[i] = static_cast(i + 1) / (w_num + 1); + } + int ids_num = ids_dim.production(); + for (int i = 0; i < ids_num; i++) { + ids_data[i] = i % 4; + } + int out_num = out_dim.production(); + + param.W = &w; + param.Ids = &ids; + param.Out = &out; + lookup_table.SetParam(param); + lookup_table.Run(); + param.Out = &out_ref; + lookup_table_compute_ref(param); + for (int i = 0; i < out_num; i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(lookup_table, kARM, kFloat, kNCHW, def); diff --git a/lite/kernels/arm/lrn_compute.cc b/lite/kernels/arm/lrn_compute.cc index 18e6654282..0476b1e6bd 100644 --- a/lite/kernels/arm/lrn_compute.cc +++ b/lite/kernels/arm/lrn_compute.cc @@ -31,16 +31,16 @@ void LrnCompute::Run() { int channel = x_dims[1]; int h = x_dims[2]; int w = x_dims[3]; - const int local_size = param.local_size; + const int n = param.n; const float alpha = param.alpha; const float beta = param.beta; const float k = param.k; if (param.norm_region == "AcrossChannels") { lite::arm::math::compute_across_channels( - x_data, out_data, num, channel, h, w, local_size, alpha, beta, k); + x_data, out_data, num, channel, h, w, n, alpha, beta, k); } else { lite::arm::math::compute_within_channels( - x_data, out_data, num, channel, h, w, local_size, alpha, beta, k); + x_data, out_data, num, channel, h, w, n, alpha, beta, k); } } @@ -53,4 +53,5 @@ 
REGISTER_LITE_KERNEL( lrn, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::LrnCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("MidOut", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); diff --git a/lite/kernels/arm/lrn_compute_test.cc b/lite/kernels/arm/lrn_compute_test.cc index 8e03000615..e7030d0042 100644 --- a/lite/kernels/arm/lrn_compute_test.cc +++ b/lite/kernels/arm/lrn_compute_test.cc @@ -91,7 +91,7 @@ void lrn_compute_ref(const operators::LrnParam& param) { const dtype* x_data = param.X->data(); dtype* out_data = param.Out->mutable_data(); auto x_dims = param.X->dims(); - int local_size = param.local_size; + int local_size = param.n; float alpha = param.alpha; float beta = param.beta; float k = param.k; @@ -171,7 +171,7 @@ TEST(lrn_arm, compute) { } param.X = &x; param.Out = &output; - param.local_size = local_size; + param.n = local_size; param.alpha = alpha; param.beta = beta; param.k = k; diff --git a/lite/kernels/arm/matmul_compute.cc b/lite/kernels/arm/matmul_compute.cc index 29be34d0c2..d00a5bdc06 100644 --- a/lite/kernels/arm/matmul_compute.cc +++ b/lite/kernels/arm/matmul_compute.cc @@ -232,7 +232,7 @@ void MatMulCompute::Run() { int ldc = n_; if (n_ == 1) { lite::arm::math::sgemv( - x_data, y_data, o_data, false, m_, k_, false, nullptr, false); + x_data, y_data, o_data, false, m_, k_, false, nullptr, false, &ctx); if (fabsf(alpha - 1.f) > 1e-8f) { for (size_t i = 0; i < param.Out->dims().production(); ++i) { o_data[i] *= alpha; diff --git a/lite/kernels/arm/mul_compute.cc b/lite/kernels/arm/mul_compute.cc index fa43b6cf8e..debe9e907c 100644 --- a/lite/kernels/arm/mul_compute.cc +++ b/lite/kernels/arm/mul_compute.cc @@ -48,14 +48,13 @@ void MulCompute::Run() { CHECK_EQ(x_w, y_h) << "x_w must be equal with y_h"; k_ = x_w; - + auto& ctx = this->ctx_->template As(); if (n_ == 1) { lite::arm::math::sgemv( - x_data, y_data, o_data, false, m_, k_, false, nullptr, false); + x_data, y_data, o_data, false, m_, k_, false, nullptr, false, &ctx); } else { constexpr bool is_tranposed_y = false; - auto& ctx = this->ctx_->template As(); int hblock = lite::arm::math::get_hblock(&ctx); int m_round = hblock * ((m_ + hblock - 1) / hblock); ctx.ExtendWorkspace(m_round * k_ * sizeof(float)); diff --git a/lite/kernels/arm/pool_compute.cc b/lite/kernels/arm/pool_compute.cc index 9f02a462a5..c9f0fed478 100644 --- a/lite/kernels/arm/pool_compute.cc +++ b/lite/kernels/arm/pool_compute.cc @@ -38,7 +38,7 @@ void PoolCompute::Run() { std::vector& ksize = param.ksize; std::vector& strides = param.strides; - std::vector& paddings = param.paddings; + std::vector& paddings = *param.paddings; std::string& pooling_type = param.pooling_type; bool global_pooling = param.global_pooling; @@ -48,12 +48,15 @@ void PoolCompute::Run() { bool use_quantizer = param.use_quantizer; std::string& data_format = param.data_format; - bool kps_equal = (ksize[0] == ksize[1]) && (strides[0] == strides[1]) && - (paddings[0] == paddings[1]); + bool pads_equal = + (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]); + bool kps_equal = (ksize[0] == ksize[1]) && (strides[0] == strides[1]) && + (paddings[0] == paddings[2]); if (global_pooling) { for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; + paddings[2 * i] = 0; + paddings[2 * i + 1] = 0; ksize[i] = static_cast(in_dims[i + 2]); } if (pooling_type == "max") { @@ -80,7 +83,8 @@ void PoolCompute::Run() { return; } } else { - if (ksize[0] == 2 && strides[0] == 2 && 
paddings[0] == 0 && kps_equal) { + if (ksize[0] == 2 && strides[0] == 2 && paddings[0] == 0 && pads_equal && + kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling2x2s2_max(din, dout, @@ -106,7 +110,7 @@ void PoolCompute::Run() { return; } } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 && - kps_equal) { + pads_equal && kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling3x3s1p1_max(din, dout, @@ -132,7 +136,7 @@ void PoolCompute::Run() { return; } } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 0 && - kps_equal) { + pads_equal && kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling3x3s1p0_max(din, dout, @@ -158,7 +162,7 @@ void PoolCompute::Run() { return; } } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 0 && - kps_equal) { + pads_equal && kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling3x3s2p0_max(din, dout, @@ -184,7 +188,7 @@ void PoolCompute::Run() { return; } } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 1 && - kps_equal) { + pads_equal && kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling3x3s2p1_max(din, dout, diff --git a/lite/kernels/arm/pool_compute_test.cc b/lite/kernels/arm/pool_compute_test.cc index 79e5332172..7ed8a142dd 100644 --- a/lite/kernels/arm/pool_compute_test.cc +++ b/lite/kernels/arm/pool_compute_test.cc @@ -15,6 +15,7 @@ #include "lite/kernels/arm/pool_compute.h" #include #include +#include #include #include #include "lite/backends/arm/math/funcs.h" @@ -25,14 +26,21 @@ namespace lite { namespace kernels { namespace arm { -int PoolOutputSize( - int input_size, int filter_size, int padding, int stride, bool ceil_mode) { +int PoolOutputSize(int input_size, + int filter_size, + int pad_left, + int pad_right, + int stride, + bool ceil_mode) { int output_size; if (!ceil_mode) { - output_size = (input_size - filter_size + 2 * padding) / stride + 1; + output_size = + (input_size - filter_size + pad_left + pad_right) / stride + 1; } else { output_size = - (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; + (input_size - filter_size + pad_left + pad_right + stride - 1) / + stride + + 1; } return output_size; } @@ -40,10 +48,12 @@ int PoolOutputSize( std::vector compute_output_shape(operators::PoolParam* param_) { const auto x_dims = param_->x->dims(); std::vector& ksize = param_->ksize; + auto paddings = *param_->paddings; if (param_->global_pooling) { ksize.resize(static_cast(x_dims.size()) - 2); for (size_t i = 0; i < ksize.size(); ++i) { - param_->paddings[i] = 0; + paddings[2 * i] = 0; + paddings[2 * i + 1] = 0; ksize[i] = static_cast(x_dims[i + 2]); } } @@ -56,7 +66,8 @@ std::vector compute_output_shape(operators::PoolParam* param_) { for (size_t i = 0; i < param_->ksize.size(); ++i) { output_shape.push_back(PoolOutputSize(x_dims[i + 2], param_->ksize[i], - param_->paddings[i], + paddings[2 * i], + paddings[2 * i + 1], param_->strides[i], param_->ceil_mode)); } @@ -73,7 +84,7 @@ void pool_compute_ref(const operators::PoolParam& param) { std::vector ksize = param.ksize; std::vector strides = param.strides; - std::vector paddings = param.paddings; + std::vector paddings = *param.paddings; std::string pooling_type = param.pooling_type; bool global_pooling = param.global_pooling; @@ -99,7 +110,7 @@ void pool_compute_ref(const operators::PoolParam& param) { int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int size_channel_in = win * hin; 
int size_channel_out = wout * hout; if (global_pooling) { @@ -178,18 +189,22 @@ void pool_compute_ref(const operators::PoolParam& param) { int bh = kernel_h; int bw = kernel_w; if (ew == win) { - bw = sw + kernel_w >= win + pad_w ? win + pad_w - : sw + kernel_w; + bw = (sw + kernel_w) >= (win + paddings[3]) + ? (win + paddings[3]) + : (sw + kernel_w); bw -= sw; - if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) { + if ((sw - pad_w) < 0 && + (sw + kernel_w) > (win + paddings[3])) { bw += pad_w; } } if (eh == hin) { - bh = sh + kernel_h >= hin + pad_h ? hin + pad_h - : sh + kernel_h; + bh = (sh + kernel_h) >= (hin + paddings[1]) + ? (hin + paddings[1]) + : (sh + kernel_h); bh -= sh; - if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) { + if ((sh - pad_h) < 0 && + (sh + kernel_h) > (hin + paddings[1])) { bh += pad_h; } } @@ -225,75 +240,92 @@ TEST(pool_arm, compute) { for (auto exclusive : {true, false}) { for (auto ksize : {2, 3}) { for (auto stride : {1, 2}) { - for (auto pad : {0, 1}) { - for (auto n : {1, 2}) { - for (auto c : {1, 3}) { + for (auto pad_left : {0, 1}) { + for (auto pad_right : {0, 1}) { + for (auto pad_top : {0, 1}) { + for (auto pad_bottom : {0, 1}) { + for (auto n : {1, 2}) { + for (auto c : {1, 3}) { #if 1 - for (auto h : {2, 3, 4, 11}) { - for (auto w : {2, 3, 4, 11}) { + for (auto h : {2, 3, 4, 11}) { + for (auto w : {2, 3, 4, 11}) { #else - for (int h = 2; h < 25; h++) { - for (int w = 2; w < 25; w++) { + for (int h = 2; h < 25; h++) { + for (int w = 2; w < 25; w++) { #endif - VLOG(3) << "n:" << n << " c:" << c << " h:" << h - << " w:" << w << " ksize:" << ksize - << " stride:" << stride << " pad:" << pad - << " exclusive:" << exclusive - << " global_pooling:" << global_pooling - << " ceil_mode: " << ceil_mode - << " pooling_type:" << pooling_type; + VLOG(3) << "n:" << n << " c:" << c << " h:" << h + << " w:" << w << " ksize:" << ksize + << " stride:" << stride + << " pad_left:" << pad_left + << " pad_right:" << pad_right + << " pad_top:" << pad_top + << " pad_bottom:" << pad_bottom + << " exclusive:" << exclusive + << " global_pooling:" << global_pooling + << " ceil_mode: " << ceil_mode + << " pooling_type:" << pooling_type; - // init x, output - x.Resize(DDim(std::vector({n, c, h, w}))); - auto* x_data = x.mutable_data(); - for (int i = 0; i < x.dims().production(); ++i) { - float sign = i % 3 == 0 ? -0.03 : 0.05f; - x_data[i] = sign * (i % 128); - } + // init x, output + x.Resize( + DDim(std::vector({n, c, h, w}))); + auto* x_data = x.mutable_data(); + for (int i = 0; i < x.dims().production(); ++i) { + float sign = i % 3 == 0 ? 
-0.03 : 0.05f; + x_data[i] = sign * (i % 128); + } - // fill param - param.x = &x; - param.output = &output; - param.pooling_type = pooling_type; - if (global_pooling) { - param.ksize = {h, w}; - } else { - param.ksize = {ksize, ksize}; - } - param.global_pooling = global_pooling; - param.strides = {stride, stride}; - param.paddings = {pad, pad}; - param.exclusive = exclusive; - param.ceil_mode = ceil_mode; - param.adaptive = false; - param.use_quantizer = false; + // fill param + param.x = &x; + param.output = &output; + param.pooling_type = pooling_type; + if (global_pooling) { + param.ksize = {h, w}; + } else { + param.ksize = {ksize, ksize}; + } + param.global_pooling = global_pooling; + param.strides = {stride, stride}; + std::vector paddings = { + pad_top, pad_bottom, pad_left, pad_right}; + param.exclusive = exclusive; + param.paddings = + std::make_shared>(paddings); + param.ceil_mode = ceil_mode; + param.adaptive = false; + param.use_quantizer = false; - const std::vector& output_shape = - compute_output_shape(¶m); - output.Resize(DDim(output_shape)); - output_ref.Resize(DDim(output_shape)); + const std::vector& output_shape = + compute_output_shape(¶m); + output.Resize(DDim(output_shape)); + output_ref.Resize(DDim(output_shape)); - auto* output_data = output.mutable_data(); - auto* output_ref_data = - output_ref.mutable_data(); - for (int i = 0; i < output.dims().production(); ++i) { - output_data[i] = -2; - output_ref_data[i] = -2; - } + auto* output_data = output.mutable_data(); + auto* output_ref_data = + output_ref.mutable_data(); + for (int i = 0; i < output.dims().production(); + ++i) { + output_data[i] = -2; + output_ref_data[i] = -2; + } - // compute - pool.SetParam(param); - pool.Run(); + // compute + pool.SetParam(param); + pool.Run(); - // compute ref - param.output = &output_ref; - pool_compute_ref(param); + // compute ref + param.output = &output_ref; + pool_compute_ref(param); - // compare - for (int i = 0; i < output.dims().production(); i++) { - EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-4); + // compare + for (int i = 0; i < output.dims().production(); + i++) { + EXPECT_NEAR( + output_data[i], output_ref_data[i], 1e-4); + } + VLOG(3) << "compare pass"; + } + } } - VLOG(3) << "compare pass"; } } } diff --git a/lite/kernels/arm/split_compute.cc b/lite/kernels/arm/split_compute.cc index 27606e2d76..2a0c52e7fc 100644 --- a/lite/kernels/arm/split_compute.cc +++ b/lite/kernels/arm/split_compute.cc @@ -42,5 +42,9 @@ void SplitCompute::Run() { REGISTER_LITE_KERNEL( split, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::SplitCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("SectionsTensorList", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); diff --git a/lite/kernels/cuda/CMakeLists.txt b/lite/kernels/cuda/CMakeLists.txt index b33fc8f6bb..4bf1cbf521 100644 --- a/lite/kernels/cuda/CMakeLists.txt +++ b/lite/kernels/cuda/CMakeLists.txt @@ -5,24 +5,39 @@ endif() message(STATUS "compile with lite CUDA kernels") add_kernel(mul_compute_cuda CUDA basic SRCS mul_compute.cc DEPS ${lite_kernel_deps} context) +add_kernel(search_group_padding_compute_cuda CUDA basic SRCS search_group_padding_compute.cu DEPS ${lite_kernel_deps}) add_kernel(io_copy_compute_cuda CUDA basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps}) add_kernel(leaky_relu_compute_cuda CUDA basic SRCS 
leaky_relu_compute.cu DEPS ${lite_kernel_deps}) add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps}) add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_pool_compute_cuda CUDA extra SRCS sequence_pool_compute.cu DEPS ${lite_kernel_deps}) add_kernel(transpose_compute_cuda CUDA basic SRCS transpose_compute.cu DEPS ${lite_kernel_deps} ${math_cuda} cuda_transpose) add_kernel(nearest_interp_compute_cuda CUDA basic SRCS nearest_interp_compute.cu DEPS ${lite_kernel_deps}) add_kernel(conv2d_cuda CUDA basic SRCS conv_compute.cc DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(concat_compute_cuda CUDA basic SRCS concat_compute.cu DEPS ${lite_kernel_deps}) -add_kernel(elementwise_add_compute_cuda CUDA basic SRCS elementwise_add_compute.cu DEPS ${lite_kernel_deps} cuda_elementwise) +add_kernel(elementwise_compute_cuda CUDA basic SRCS elementwise_compute.cu DEPS ${lite_kernel_deps} cuda_elementwise) add_kernel(calib_compute_cuda CUDA basic SRCS calib_compute.cu DEPS ${lite_kernel_deps}) add_kernel(layout_compute_cuda CUDA basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} cuda_transpose) add_kernel(feed_compute_cuda CUDA basic SRCS feed_compute.cc DEPS ${lite_kernel_deps}) add_kernel(scale_compute_cuda CUDA basic SRCS scale_compute.cc DEPS ${lite_kernel_deps} cuda_scale) add_kernel(dropout_compute_cuda CUDA basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} cuda_scale) add_kernel(softmax_compute_cuda CUDA basic SRCS softmax_compute.cu DEPS ${lite_kernel_deps}) -add_kernel(pool_compute_cuda CUDA basic SRCS pool_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(pool_compute_cuda CUDA basic SRCS pool_compute.cu DEPS +${lite_kernel_deps} cudnn_pool) add_kernel(bilinear_interp_compute_cuda CUDA basic SRCS bilinear_interp_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(search_seq_depadding_compute_cuda CUDA extra SRCS search_seq_depadding_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(search_grnn_compute_cuda CUDA extra SRCS search_grnn_compute.cu DEPS ${lite_kernel_deps} cuda_gemm) +add_kernel(sequence_reverse_compute_cuda CUDA basic SRCS sequence_reverse_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_concat_compute_cuda CUDA basic SRCS sequence_concat_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_arithmetic_compute_cuda CUDA basic SRCS sequence_arithmetic_compute.cu DEPS ${lite_kernel_deps}) add_kernel(lookup_table_compute_cuda CUDA extra SRCS lookup_table_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(attention_padding_mask_compute_cuda CUDA extra SRCS attention_padding_mask_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(search_fc_compute_cuda CUDA basic SRCS search_fc_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(sequence_topk_avg_pooling_compute_cuda CUDA basic SRCS sequence_topk_avg_pooling_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(match_matrix_tensor_compute_cuda CUDA extra SRCS match_matrix_tensor_compute.cu DEPS ${lite_kernel_deps} cuda_gemm) +add_kernel(search_aligned_mat_mul_compute_cuda CUDA extra SRCS search_aligned_mat_mul_compute.cc DEPS ${lite_kernel_deps} cuda_batched_gemm) +add_kernel(search_seq_fc_compute_cuda CUDA extra SRCS search_seq_fc_compute.cu DEPS ${lite_kernel_deps} cuda_gemm) +add_kernel(var_conv_2d_compute_cuda CUDA basic SRCS var_conv_2d_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_compute_cuda) nv_test(conv2d_cuda_test SRCS 
conv_compute_test.cc DEPS conv2d_cuda) @@ -31,13 +46,28 @@ nv_test(leaky_relu_compute_cuda_test SRCS leaky_relu_compute_test.cc DEPS leaky_ nv_test(relu_compute_cuda_test SRCS relu_compute_test.cc DEPS relu_compute_cuda) nv_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_compute_cuda) nv_test(transpose_compute_cuda_test SRCS transpose_compute_test.cc DEPS transpose_compute_cuda) +nv_test(search_group_padding_compute_cuda_test SRCS search_group_padding_compute_test.cc DEPS search_group_padding_compute_cuda) nv_test(concat_compute_cuda_test SRCS concat_compute_test.cc DEPS concat_compute_cuda) -nv_test(elementwise_add_compute_cuda_test SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_cuda) +nv_test(elementwise_compute_cuda_test SRCS elementwise_compute_test.cc DEPS elementwise_compute_cuda) nv_test(softmax_compute_cuda_test SRCS softmax_compute_test.cc DEPS softmax_compute_cuda) #nv_test(layout_cuda_test SRCS layout_compute_test.cc DEPS layout_compute_cuda) -nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda) +nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda) nv_test(dropout_compute_cuda_test SRCS dropout_compute_test.cc DEPS dropout_compute_cuda ) nv_test(bilinear_interp_compute_cuda_test SRCS bilinear_interp_compute_test.cc DEPS bilinear_interp_compute_cuda) +nv_test(pool_compute_cuda_test SRCS pool_compute_test.cc DEPS pool_compute_cuda) +nv_test(sequence_reverse_compute_cuda_test SRCS sequence_reverse_compute_test.cc DEPS sequence_reverse_compute_cuda) +nv_test(sequence_concat_compute_cuda_test SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_cuda) +nv_test(attention_padding_mask_compute_cuda_test SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_cuda) +nv_test(sequence_arithmetic_compute_cuda_test SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_cuda) +nv_test(search_fc_test SRCS search_fc_compute_test.cc DEPS search_fc_compute_cuda sequence_topk_avg_pooling_compute_cuda) +nv_test(var_conv_2d_compute_cuda_test SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_cuda) + if(LITE_BUILD_EXTRA) + nv_test(search_seq_depadding_compute_cuda_test SRCS search_seq_depadding_compute_test.cc DEPS search_seq_depadding_compute_cuda) + nv_test(match_matrix_tensor_compute_cuda_test SRCS match_matrix_tensor_compute_test.cc DEPS match_matrix_tensor_compute_cuda) + nv_test(search_grnn_compute_cuda_test SRCS search_grnn_compute_test.cc DEPS search_grnn_compute_cuda) + nv_test(sequence_pool_compute_cuda_test SRCS sequence_pool_compute_test.cc DEPS sequence_pool_compute_cuda) nv_test(lookup_table_compute_cuda_test SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_cuda) + nv_test(search_aligned_mat_mul_compute_cuda_test SRCS search_aligned_mat_mul_compute_test.cc DEPS search_aligned_mat_mul_compute_cuda) + nv_test(search_seq_fc_compute_cuda_test SRCS search_seq_fc_compute_test.cc DEPS search_seq_fc_compute_cuda) endif() diff --git a/lite/kernels/cuda/attention_padding_mask_compute.cu b/lite/kernels/cuda/attention_padding_mask_compute.cu new file mode 100644 index 0000000000..fac73b1adc --- /dev/null +++ b/lite/kernels/cuda/attention_padding_mask_compute.cu @@ -0,0 +1,162 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/attention_padding_mask_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +#define CUDA_NUM_THREADS 256 + +inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void ker_attention_padding_mask(T* out_data, + const T* attn_data, + const int* src_offset, + const int attn_seq_num, + const int attn_seq_len, + const int src_seq_num, + const int src_seq_len, + const T* pad_begin_data, + const T mask, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int src_word_id = tid % src_seq_len; + int tmp_tid = tid / src_seq_len; + int attn_seq_id = tmp_tid / attn_seq_len; + int attn_word_id = tmp_tid % attn_seq_len; + int src_seq_id = attn_seq_id % src_seq_num; + int cur_len = src_offset[src_seq_id + 1] - src_offset[src_seq_id]; + + int k = static_cast(pad_begin_data[src_seq_id]); + if (k < cur_len && + tid >= src_seq_len * (attn_seq_len * attn_seq_id + attn_word_id) + k && + tid < src_seq_len * (attn_seq_len * attn_seq_id + attn_word_id) + + cur_len) { + out_data[tid] = mask; + } else { + out_data[tid] = attn_data[tid]; + } + } +} + +void AttentionPaddingMaskCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + auto attn = param.X; + auto src = param.Y; + const int count = attn->numel(); + auto attn_offset = attn->lod()[0]; + auto src_offset = src->lod()[0]; + const int attn_seq_num = attn_offset.size() - 1; + const int attn_seq_len = attn_offset[1]; + const int src_seq_num = src_offset.size() - 1; + const int src_seq_len = count / attn->dims()[0]; + + auto out = param.Out; + out->Resize(attn->dims()); + out->set_lod(attn->lod()); + + auto attn_data = attn->data(); + auto out_data = out->mutable_data(TARGET(kCUDA)); + + std::vector src_cpu(src->numel(), 0); + TargetWrapperCuda::MemcpyAsync(src_cpu.data(), + src->data(), + sizeof(float) * src->numel(), + IoDirection::DtoH, + stream); + cudaStreamSynchronize(stream); + + std::vector pad_begin(src_seq_num, 0); + auto src_len = static_cast(src->lod()[0][1]); + int _pad_id = param.pad_id; + for (int i = 0; i < src_seq_num; ++i) { + const auto* src_data = src_cpu.data() + src_len * i; + int index = src_len - 1; + for (; index >= 0 && _pad_id == static_cast(src_data[index]); + --index) { + } + pad_begin[i] = static_cast(index + 1); + } + + param.pad_begin->Resize({static_cast(src_seq_num)}); + auto pad_begin_cuda_data = + param.pad_begin->mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemcpyAsync(pad_begin_cuda_data, + pad_begin.data(), + sizeof(float) * src_seq_num, + IoDirection::HtoD, + stream); + + std::vector src_offset_cpu(src_offset.size(), 0); + for (int i = 0; i < src_offset.size(); i++) { + src_offset_cpu[i] = src_offset[i]; + } + + src_offset_cuda.Resize({static_cast(src_offset.size())}); + auto 
src_offset_cuda_data = src_offset_cuda.mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemcpyAsync(src_offset_cuda_data, + src_offset_cpu.data(), + sizeof(int) * src_offset.size(), + IoDirection::HtoD, + stream); + + ker_attention_padding_mask< + float><<>>( + out_data, + attn_data, + src_offset_cuda_data, + attn_seq_num, + attn_seq_len, + src_seq_num, + src_seq_len, + pad_begin_cuda_data, + param.mask, + count); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_attention_padding_mask, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::AttentionPaddingMaskCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("pad_begin", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/attention_padding_mask_compute.h b/lite/kernels/cuda/attention_padding_mask_compute.h new file mode 100644 index 0000000000..57d8c269a1 --- /dev/null +++ b/lite/kernels/cuda/attention_padding_mask_compute.h @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class AttentionPaddingMaskCompute + : public KernelLite { + public: + using param_t = operators::AttentionPaddingMaskParam; + + void Run() override; + virtual ~AttentionPaddingMaskCompute() = default; + + private: + lite::Tensor src_offset_cuda; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/attention_padding_mask_compute_test.cc b/lite/kernels/cuda/attention_padding_mask_compute_test.cc new file mode 100644 index 0000000000..d11858350d --- /dev/null +++ b/lite/kernels/cuda/attention_padding_mask_compute_test.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
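For context on the launch configuration used by the kernel above: CUDA_NUM_THREADS is fixed at 256, CUDA_GET_BLOCKS(count) rounds the block count up so every element is covered, and CUDA_KERNEL_LOOP strides by blockDim.x * gridDim.x so the kernel remains correct even if fewer blocks are launched than elements exist. A small host-side sketch of the block-count arithmetic (illustrative only, mirroring the macros above):

#include <cassert>

constexpr int kCudaNumThreads = 256;  // mirrors CUDA_NUM_THREADS

inline int GetBlocks(int n) { return (n + kCudaNumThreads - 1) / kCudaNumThreads; }

int main() {
  assert(GetBlocks(1) == 1);     // a single element still needs one block
  assert(GetBlocks(256) == 1);   // exactly one full block
  assert(GetBlocks(257) == 2);   // one spilled element requires a second block
  assert(GetBlocks(1000) == 4);  // 1000 elements -> 4 blocks of 256 threads
  return 0;
}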
+ +#include "lite/kernels/cuda/attention_padding_mask_compute.h" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +void attention_padding_mask_ref( + const Tensor& x, + const Tensor& y, + Tensor* out, + Tensor* pad_begin, + const operators::AttentionPaddingMaskParam& param) { + auto attn_offset = x.lod()[0]; + auto src_offset = y.lod()[0]; + int attn_seq_num = attn_offset.size() - 1; + int src_seq_num = src_offset.size() - 1; + int attn_seq_len = attn_offset[1]; + int src_seq_len = x.dims()[1]; + CHECK_EQ(attn_seq_num % src_seq_num, 0); + + auto count = x.numel(); + auto attn_data = x.data(); + out->Resize(x.dims()); + out->set_lod(x.lod()); + auto out_data = out->mutable_data(); + memcpy(out_data, attn_data, count * sizeof(float)); + + for (int i = 0; i < attn_seq_num; ++i) { + for (int j = 0; j < attn_seq_len; ++j) { + auto tmp_out_data = out_data + src_seq_len * (attn_seq_len * i + j); + int src_seq_idx = i % src_seq_num; + int cur_len = src_offset[src_seq_idx + 1] - src_offset[src_seq_idx]; + for (int k = cur_len; k < src_seq_len; k++) { + tmp_out_data[k] = param.mask; + } + } + } +} + +void prepare_input(Tensor* x, const LoD& lod, int64_t dim2rd) { + std::vector x_dims{static_cast(lod[0].back()), dim2rd}; + x->Resize(x_dims); + x->set_lod(lod); + auto x_data = x->mutable_data(); + auto x_num = x->numel(); + for (int i = 0; i < x_num; i++) { + x_data[i] = (i - x_num) * 1.1; + } +} + +int get_max_len(const LoD& lod) { + int max_len = 0; + auto offset = lod[0]; + for (int i = 0; i < offset.size() - 1; i++) { + int cur_len = offset[i + 1] - offset[i]; + max_len = max_len < cur_len ? cur_len : max_len; + } + return max_len; +} + +TEST(attention_padding_mask_cuda, run_test) { + lite::Tensor x, y, x_cpu, y_cpu; + lite::Tensor out, pad_begin, out_cpu, out_ref, pad_begin_ref; + + LoD x_lod{{0, 3, 6, 9, 12}}, y_lod{{0, 4, 6}}; + prepare_input(&x_cpu, x_lod, get_max_len(y_lod)); + prepare_input(&y_cpu, y_lod, 1); + + x.Resize(x_cpu.dims()); + x.set_lod(x_cpu.lod()); + auto x_cpu_data = x_cpu.mutable_data(); + x.Assign(x_cpu_data, x_cpu.dims()); + + y.Resize(y_cpu.dims()); + y.set_lod(y_cpu.lod()); + + operators::AttentionPaddingMaskParam param; + param.X = &x; + param.Y = &y; + param.pad_id = 12800001; + param.mask = -90000000.f; + param.Out = &out; + param.pad_begin = &pad_begin; + + std::unique_ptr ctx(new KernelContext); + auto context = ctx->As(); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + AttentionPaddingMaskCompute attention_padding_mask_kernel; + attention_padding_mask_kernel.SetParam(param); + attention_padding_mask_kernel.SetContext(std::move(ctx)); + attention_padding_mask_kernel.Run(); + cudaDeviceSynchronize(); + + auto out_data = out.mutable_data(TARGET(kCUDA)); + out_cpu.Resize(out.dims()); + auto out_cpu_data = out_cpu.mutable_data(); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + + attention_padding_mask_ref(x_cpu, y_cpu, &out_ref, &pad_begin_ref, param); + auto out_ref_data = out_ref.data(); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/bilinear_interp_compute.cu b/lite/kernels/cuda/bilinear_interp_compute.cu index 7e1dbaf228..00b1457938 100644 --- a/lite/kernels/cuda/bilinear_interp_compute.cu +++ 
b/lite/kernels/cuda/bilinear_interp_compute.cu @@ -11,6 +11,7 @@ limitations under the License. */ #pragma once #include +#include "lite/backends/cuda/target_wrapper.h" #include "lite/core/op_registry.h" #include "lite/kernels/cuda/bilinear_interp_compute.h" @@ -20,6 +21,43 @@ namespace kernels { namespace cuda { using Tensor = lite::Tensor; +inline std::vector get_new_shape( + std::vector list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + lite::Tensor temp; + auto temp_data = temp.mutable_data(); + auto tensor_data = tensor->data(); + cudaMemcpy(temp_data, + tensor_data, + tensor->dims().production() * sizeof(float), + cudaMemcpyDeviceToHost); + + vec_new_shape.push_back(static_cast(*temp_data)); + } + + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + auto cpu_starts_tensor_data = cpu_starts_tensor.mutable_data(); + cudaMemcpy(cpu_starts_tensor_data, + new_data, + new_data_tensor->dims().production() * sizeof(T), + cudaMemcpyDeviceToHost); + + auto new_data_ = cpu_starts_tensor.data(); + vec_new_data = std::vector( + new_data_, new_data_ + new_data_tensor->dims().production()); + return vec_new_data; +} + template __global__ void BilinearInterp(const T* in, const size_t in_img_h, @@ -103,23 +141,35 @@ void BilinearInterpCompute::Run() { int out_w = param.out_w; float scale = param.scale; bool align_corners = param.align_corners; - if (scale > 0) { - out_h = static_cast(in_h * scale); - out_w = static_cast(in_w * scale); - } - if (out_size != nullptr) { - Tensor sizes; - float* size_data = sizes.mutable_data(); - float* outsize_data = out_size->mutable_data(TARGET(kCUDA)); - cudaMemcpy( - size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost); - out_h = static_cast(size_data[0]); - out_w = static_cast(size_data[1]); + auto list_new_shape_tensor = param.SizeTensor; + if (list_new_shape_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_shape_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } else { + auto scale_tensor = param.Scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) { + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); + } + if (out_size != nullptr) { + lite::Tensor sizes; + float* size_data = sizes.mutable_data(); + float* outsize_data = out_size->mutable_data(TARGET(kCUDA)); + cudaMemcpy( + size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost); + out_h = static_cast(size_data[0]); + out_w = static_cast(size_data[1]); + } } auto output_data = output->mutable_data(TARGET(kCUDA)); - if (in_h == out_h && in_w == out_w) { cudaMemcpy(output_data, input_data, @@ -188,6 +238,14 @@ REGISTER_LITE_KERNEL(bilinear_interp, {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW))}) + .BindInput("SizeTensor", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Scale", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat), diff --git a/lite/kernels/cuda/bilinear_interp_compute_test.cc b/lite/kernels/cuda/bilinear_interp_compute_test.cc index 
e7e8143150..e93f5b1f3e 100644 --- a/lite/kernels/cuda/bilinear_interp_compute_test.cc +++ b/lite/kernels/cuda/bilinear_interp_compute_test.cc @@ -16,6 +16,7 @@ #include #include #include +#include namespace paddle { namespace lite { @@ -98,6 +99,116 @@ TEST(bilinear_interp, normal) { } } +TEST(bilinear_interp, update) { + BilinearInterpCompute bilinear_interp_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::InterpolateParam param; + + std::vector size_tensor(2); + std::vector size_tensor_cpu(2), size_tensor_ref(2); + Tensor x, input_scale, osz, out; + Tensor x_cpu, input_scale_cpu, osz_cpu, out_cpu; + Tensor x_ref, input_scale_ref, osz_ref, out_ref; + + int n = 1, c = 1, in_h = 3, in_w = 3; + int out_h = 6, out_w = 6; + float scale = 2.0; + + param.out_h = out_h; + param.out_w = out_w; + param.scale = scale; + param.align_corners = false; + param.align_mode = 0; + + x.Resize({n, c, in_h, in_w}); + size_tensor[0].Resize({1}); + size_tensor[1].Resize({1}); + input_scale.Resize({1}); + osz.Resize({2}); + out.Resize({n, c, out_h, out_w}); + + x_cpu.Resize({n, c, in_h, in_w}); + size_tensor_cpu[0].Resize({1}); + size_tensor_cpu[1].Resize({1}); + input_scale_cpu.Resize({1}); + osz_cpu.Resize({2}); + out_cpu.Resize({n, c, out_h, out_w}); + + x_ref.Resize({n, c, in_h, in_w}); + size_tensor_ref[0].Resize({1}); + size_tensor_ref[1].Resize({1}); + input_scale_ref.Resize({1}); + osz_ref.Resize({2}); + out_ref.Resize({n, c, out_h, out_w}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + float* x_cpu_data = x_cpu.mutable_data(); + float* size_tensor0_cpu_data = size_tensor_cpu[0].mutable_data(); + float* size_tensor1_cpu_data = size_tensor_cpu[1].mutable_data(); + float* input_scale_cpu_data = input_scale_cpu.mutable_data(); + float* osz_cpu_data = osz_cpu.mutable_data(); + float* out_cpu_data = out_cpu.mutable_data(); + + float* x_ref_data = x_ref.mutable_data(); + float* size_tensor0_ref_data = size_tensor_ref[0].mutable_data(); + float* size_tensor1_ref_data = size_tensor_ref[1].mutable_data(); + float* input_scale_ref_data = input_scale_ref.mutable_data(); + float* osz_ref_data = osz_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + + osz_cpu_data[0] = out_h; + osz_cpu_data[1] = out_w; + size_tensor0_cpu_data[0] = out_h; + size_tensor1_cpu_data[0] = out_w; + input_scale_cpu_data[0] = scale; + osz_ref_data[0] = out_h; + osz_ref_data[1] = out_w; + size_tensor0_ref_data[0] = out_h; + size_tensor1_ref_data[0] = out_w; + input_scale_ref_data[0] = scale; + + x.Assign(x_cpu_data, x_cpu.dims()); + size_tensor[0].Assign( + size_tensor0_cpu_data, size_tensor[0].dims()); + size_tensor[1].Assign( + size_tensor1_cpu_data, size_tensor[1].dims()); + input_scale.Assign(input_scale_cpu_data, + input_scale.dims()); + osz.Assign(osz_cpu_data, osz_cpu.dims()); + + param.X = &x; + param.SizeTensor.emplace_back( + reinterpret_cast(&size_tensor[0])); + param.SizeTensor.emplace_back( + reinterpret_cast(&size_tensor[1])); + param.Scale = &input_scale; + param.OutSize = &osz; + param.Out = &out; + + bilinear_interp_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + bilinear_interp_kernel.SetContext(std::move(ctx)); + bilinear_interp_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + for (int i = 0; i < out.numel(); i++) { + LOG(INFO) << 
out_cpu_data[i]; + } +} + } // namespace cuda } // namespace kernels } // namespace lite diff --git a/lite/kernels/cuda/calib_compute_cuda_test.cc b/lite/kernels/cuda/calib_compute_cuda_test.cc index 8703d8730a..fdb47f7dd3 100644 --- a/lite/kernels/cuda/calib_compute_cuda_test.cc +++ b/lite/kernels/cuda/calib_compute_cuda_test.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "lite/kernels/cuda/calib_compute.h" #include #include #include @@ -58,12 +59,7 @@ void calib_ref(const operators::CalibParam& param, bool to_float = true) { } TEST(calib_cuda, int8_to_fp32) { - LOG(INFO) << "to get kernel ..."; - auto kernels = KernelRegistry::Global().Create( - "calib", TARGET(kCUDA), PRECISION(kInt8), DATALAYOUT(kNCHW)); - ASSERT_FALSE(kernels.empty()); - auto calib = std::move(*std::next(kernels.begin(), 1)); - LOG(INFO) << "get kernel: " << calib->doc(); + CalibComputeInt8ToFp32 calib; const int n = 64, c = 32, h = 18, w = 18; Tensor x; Tensor x_cpu; @@ -87,14 +83,14 @@ TEST(calib_cuda, int8_to_fp32) { cudaStream_t stream; cudaStreamCreate(&stream); context.SetExecStream(stream); - calib->SetContext(std::move(ctx)); + calib.SetContext(std::move(ctx)); operators::CalibParam param; param.scale = 0.013f; param.input = &x; param.output = &output; - calib->SetParam(param); - calib->Launch(); + calib.SetParam(param); + calib.Launch(); cudaDeviceSynchronize(); // invoking ref implementation and compare results param.input = &x_cpu; @@ -113,12 +109,7 @@ TEST(calib_cuda, int8_to_fp32) { } TEST(calib_cuda, fp32_to_int8) { - LOG(INFO) << "to get kernel ..."; - auto kernels = KernelRegistry::Global().Create( - "calib", TARGET(kCUDA), PRECISION(kInt8), DATALAYOUT(kNCHW)); - ASSERT_FALSE(kernels.empty()); - auto calib = std::move(kernels.front()); - LOG(INFO) << "get kernel: " << calib->doc(); + CalibComputeFp32ToInt8 calib; const int n = 64, c = 32, h = 18, w = 18; Tensor x; Tensor x_cpu; @@ -142,14 +133,14 @@ TEST(calib_cuda, fp32_to_int8) { cudaStream_t stream; cudaStreamCreate(&stream); context.SetExecStream(stream); - calib->SetContext(std::move(ctx)); + calib.SetContext(std::move(ctx)); operators::CalibParam param; param.scale = 0.013f; param.input = &x; param.output = &output; - calib->SetParam(param); - calib->Launch(); + calib.SetParam(param); + calib.Launch(); cudaDeviceSynchronize(); // invoking ref implementation and compare results param.input = &x_cpu; diff --git a/lite/kernels/cuda/concat_compute.cu b/lite/kernels/cuda/concat_compute.cu index 9ec6936672..72d0af459b 100644 --- a/lite/kernels/cuda/concat_compute.cu +++ b/lite/kernels/cuda/concat_compute.cu @@ -51,9 +51,9 @@ void ConcatCompute::Run() { Tensor* output = param.output; auto* output_data = output->mutable_data(TARGET(kCUDA)); int axis = param.axis; - auto* axis_tensor = param.axis_tensor; + Tensor* axis_tensor = param.axis_tensor; if (axis_tensor != nullptr) { - auto* axis_tensor_data = axis_tensor->data(); + const int* axis_tensor_data = axis_tensor->data(); axis = axis_tensor_data[0]; } int inner_size = 1; diff --git a/lite/kernels/cuda/conv_compute.cc b/lite/kernels/cuda/conv_compute.cc index eea81602dd..468ed0cbd0 100644 --- a/lite/kernels/cuda/conv_compute.cc +++ b/lite/kernels/cuda/conv_compute.cc @@ -21,10 +21,14 @@ namespace lite { namespace kernels { namespace cuda { -inline int ConvOutputSize( - int input_size, int filter_size, int dilation, int padding, int stride) { +inline int ConvOutputSize(int input_size, + int filter_size, + int 
dilation, + int pad_left, + int pad_right, + int stride) { const int dkernel = dilation * (filter_size - 1) + 1; - int output_size = (input_size + 2 * padding - dkernel) / stride + 1; + int output_size = (input_size + pad_left + pad_right - dkernel) / stride + 1; CHECK_GT_OR_FALSE(output_size, 0); return output_size; @@ -50,11 +54,15 @@ void ConvComputeInt8::PrepareForRun() { const auto filter_dims = param.filter->dims(); std::vector output_shape({in_dims[0]}); + auto paddings = *param.paddings; + auto dilations = *param.dilations; + for (size_t i = 0; i < param.strides.size(); ++i) { output_shape.push_back(ConvOutputSize(in_dims[i + 1], filter_dims[i + 1], - param.dilations[i], - param.paddings[i], + dilations[i], + paddings[2 * i], + paddings[2 * i + 1], param.strides[i])); } output_shape.push_back(filter_dims[0]); @@ -71,12 +79,15 @@ void ConvComputeInt8::Run() { const auto in_dims = param.x->dims(); const auto filter_dims = param.filter->dims(); std::vector output_shape({in_dims[0]}); + auto paddings = *param.paddings; + auto dilations = *param.dilations; for (size_t i = 0; i < param.strides.size(); ++i) { output_shape.push_back(ConvOutputSize(in_dims[i + 1], filter_dims[i + 1], - param.dilations[i], - param.paddings[i], + dilations[i], + paddings[2 * i], + paddings[2 * i + 1], param.strides[i])); } output_shape.push_back(filter_dims[0]); diff --git a/lite/kernels/cuda/conv_compute_test.cc b/lite/kernels/cuda/conv_compute_test.cc index 05175a0deb..2ebd7e33ba 100644 --- a/lite/kernels/cuda/conv_compute_test.cc +++ b/lite/kernels/cuda/conv_compute_test.cc @@ -41,7 +41,10 @@ TEST(conv_compute, fp32) { act_param.Leaky_relu_alpha = 0.1; operators::ConvParam param; param.activation_param = act_param; - param.paddings = {1, 1}; + std::vector pads = {1, 1, 1, 1}; + std::vector dilations = {1, 1, 1, 1}; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilations); param.groups = 1; Tensor x, filter, bias, y, x_cpu, filter_cpu, bias_cpu, y_cpu; @@ -148,6 +151,10 @@ TEST(conv_compute, int8) { bias.Assign(bias_cpu_data, filter_cpu.dims()); + std::vector pads = {0, 0, 0, 0}; + std::vector dilations = {1, 1, 1, 1}; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilations); param.x = &x; param.filter = &filter; param.output = &y; @@ -202,12 +209,10 @@ TEST(conv_compute, int8_int8_out) { std::cout << "input" << std::endl; for (int i = 0; i < x_cpu.numel(); i++) { x_cpu_data[i] = static_cast(random(-36, 36)); - std::cout << float(x_cpu_data[i]) << std::endl; } std::cout << "filter" << std::endl; for (int i = 0; i < filter_cpu.numel(); i++) { filter_cpu_data[i] = static_cast(random(-10, 10)); - std::cout << float(filter_cpu_data[i]) << std::endl; } for (int i = 0; i < bias_cpu.numel(); i++) { bias_cpu_data[i] = i + 1.0; @@ -220,6 +225,10 @@ TEST(conv_compute, int8_int8_out) { bias.Assign(bias_cpu_data, filter_cpu.dims()); + std::vector pads = {0, 0, 0, 0}; + std::vector dilations = {1, 1, 1, 1}; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilations); param.x = &x; param.filter = &filter; param.output = &y; diff --git a/lite/kernels/cuda/elementwise_compute.cu b/lite/kernels/cuda/elementwise_compute.cu new file mode 100644 index 0000000000..64759f86f5 --- /dev/null +++ b/lite/kernels/cuda/elementwise_compute.cu @@ -0,0 +1,318 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "lite/backends/cuda/math/elementwise.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/elementwise_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +inline DDim trim_trailing_singular_dims(const DDim& dims) { + // Remove trailing dimensions of size 1 for y + auto actual_dims_size = dims.size(); + for (; actual_dims_size != 0; --actual_dims_size) { + if (dims[actual_dims_size - 1] != 1) break; + } + + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; + } + if (trim_dims.size() == 0) { + return DDim(); + } + return DDim(trim_dims); +} + +inline bool is_broadcast(const DDim& x_dims, + const DDim& y_dims, + int axis, + int* pre, + int* n, + int* post) { + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + DDim y_dim_trim = trim_trailing_singular_dims(y_dims); + axis = (y_dim_trim.size() == 0) ? x_dims.size() : axis; + if (x_dims.size() == y_dim_trim.size()) { + return false; + } + *pre = 1; + *n = 1; + *post = 1; + for (int i = 0; i < axis; ++i) { + (*pre) *= x_dims[i]; + } + for (int i = 0; i < y_dim_trim.size(); ++i) { + CHECK_EQ(x_dims[i + axis], y_dim_trim[i]) + << "Broadcast dimension mismatch."; + (*n) *= y_dim_trim[i]; + } + for (int i = axis + y_dim_trim.size(); i < x_dims.size(); ++i) { + (*post) *= x_dims[i]; + } + return true; +} + +#define ELEMENTWISE_COMPUTE(OP, WITH_RELU) \ + auto& param = this->Param(); \ + auto& ctx = this->ctx_->template As(); \ + auto stream = ctx.exec_stream(); \ + const lite::Tensor* x = param.X; \ + const lite::Tensor* y = param.Y; \ + lite::Tensor* out = param.Out; \ + int axis = param.axis; \ + auto* x_data = x->data(); \ + auto* y_data = y->data(); \ + auto out_data = out->mutable_data(TARGET(kCUDA)); \ + int pixel_num = x->numel(); \ + int pre = 1; \ + int n = pixel_num; \ + int post = 1; \ + if (WITH_RELU) { \ + if (is_broadcast(x->dims(), y->dims(), axis, &pre, &n, &post)) { \ + lite::cuda::math::elementwise_relu( \ + x_data, y_data, out_data, pre, n, post, OP, stream); \ + } else { \ + lite::cuda::math::elementwise_relu( \ + x_data, y_data, out_data, 1, pixel_num, 1, OP, stream); \ + } \ + } else { \ + if (is_broadcast(x->dims(), y->dims(), axis, &pre, &n, &post)) { \ + lite::cuda::math::elementwise( \ + x_data, y_data, out_data, pre, n, post, OP, stream); \ + } else { \ + lite::cuda::math::elementwise( \ + x_data, y_data, out_data, 1, pixel_num, 1, OP, stream); \ + } \ + } + +#define ELEMENTWISE_COMPUTE_NHWC(OP, WITH_RELU) \ + std::map pos_map = {{0, 0}, {1, 3}, {2, 1}, {3, 2}}; \ + auto& param = this->Param(); \ + auto& ctx = this->ctx_->template As(); \ + auto stream = ctx.exec_stream(); \ + const lite::Tensor* x = param.X; \ + const lite::Tensor* y = param.Y; \ + lite::Tensor* out = param.Out; \ + int axis = param.axis; \ + if (axis < 0) axis = x->dims().size() - y->dims().size(); \ + CHECK(axis >= 0) << "invalid axis of 
elementwise op"; \ + axis = pos_map[axis]; \ + auto* x_data = x->data(); \ + auto* y_data = y->data(); \ + auto out_data = out->mutable_data(TARGET(kCUDA)); \ + int pixel_num = x->numel(); \ + int pre = 1; \ + int n = pixel_num; \ + int post = 1; \ + if (WITH_RELU) { \ + if (is_broadcast(x->dims(), y->dims(), axis, &pre, &n, &post)) { \ + lite::cuda::math::elementwise_relu( \ + x_data, y_data, out_data, pre, n, post, OP, stream); \ + } else { \ + lite::cuda::math::elementwise_relu( \ + x_data, y_data, out_data, 1, pixel_num, 1, OP, stream); \ + } \ + } else { \ + if (is_broadcast(x->dims(), y->dims(), axis, &pre, &n, &post)) { \ + lite::cuda::math::elementwise( \ + x_data, y_data, out_data, pre, n, post, OP, stream); \ + } else { \ + lite::cuda::math::elementwise( \ + x_data, y_data, out_data, 1, pixel_num, 1, OP, stream); \ + } \ + } + +void ElementwiseAddCompute::Run() { + ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kADD, false) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseAddComputeNHWC::Run() { + ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kADD, false) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseMulCompute::Run() { + ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kMUL, false) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseMulComputeNHWC::Run() { + ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kMUL, false) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseAddReluCompute::Run() { + ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kADD, true) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseAddReluComputeNHWC::Run() { + ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kADD, true) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseMulReluCompute::Run() { + ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kMUL, true) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseMulReluComputeNHWC::Run() { + ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kMUL, true) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(elementwise_add, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::ElementwiseAddCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_add, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::ElementwiseAddComputeNHWC, + nhwc_format) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + 
+REGISTER_LITE_KERNEL(elementwise_mul, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::ElementwiseMulCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_mul, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::ElementwiseMulComputeNHWC, + nhwc_format) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fusion_elementwise_add_activation, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::ElementwiseAddReluCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fusion_elementwise_add_activation, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::ElementwiseAddReluComputeNHWC, + nhwc_format) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::ElementwiseMulReluCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::ElementwiseMulReluComputeNHWC, + nhwc_format) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/cuda/elementwise_compute.h b/lite/kernels/cuda/elementwise_compute.h new file mode 100644 index 0000000000..986a4db227 --- /dev/null +++ b/lite/kernels/cuda/elementwise_compute.h @@ -0,0 +1,98 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class ElementwiseAddCompute + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + void Run() override; + virtual ~ElementwiseAddCompute() = default; +}; + +class ElementwiseAddComputeNHWC + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + void Run() override; + virtual ~ElementwiseAddComputeNHWC() = default; +}; + +class ElementwiseMulCompute + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + void Run() override; + virtual ~ElementwiseMulCompute() = default; +}; + +class ElementwiseMulComputeNHWC + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + void Run() override; + virtual ~ElementwiseMulComputeNHWC() = default; +}; + +class ElementwiseAddReluCompute + : public KernelLite { + public: + using param_t = operators::FusionElementwiseActivationParam; + + void Run() override; + virtual ~ElementwiseAddReluCompute() = default; +}; + +class ElementwiseAddReluComputeNHWC + : public KernelLite { + public: + using param_t = operators::FusionElementwiseActivationParam; + + void Run() override; + virtual ~ElementwiseAddReluComputeNHWC() = default; +}; + +class ElementwiseMulReluCompute + : public KernelLite { + public: + using param_t = operators::FusionElementwiseActivationParam; + + void Run() override; + virtual ~ElementwiseMulReluCompute() = default; +}; + +class ElementwiseMulReluComputeNHWC + : public KernelLite { + public: + using param_t = operators::FusionElementwiseActivationParam; + + void Run() override; + virtual ~ElementwiseMulReluComputeNHWC() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/elementwise_compute_test.cc b/lite/kernels/cuda/elementwise_compute_test.cc new file mode 100644 index 0000000000..9fd0b7754f --- /dev/null +++ b/lite/kernels/cuda/elementwise_compute_test.cc @@ -0,0 +1,252 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/cuda/elementwise_compute.h" +#include +#include +#include +#include "lite/api/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +using Tensor = lite::Tensor; + +static void ElementwiseAddRef(float* x, float* y, float* out, int num) { + for (int i = 0; i < num; ++i) { + out[i] = x[i] + y[i]; + } +} + +static void ElementwiseBroadcastRef( + float* x, float* y, float* out, int pre, int n, int post) { + for (int i = 0; i < pre * n * post; ++i) { + int idx = (i / post) % n; + out[i] = x[i] + y[idx]; + } +} + +TEST(elementwise_add, normal) { + ElementwiseAddCompute elementwise_add_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::ElementwiseParam param; + Tensor x, y, out; + Tensor x_cpu, y_cpu, out_cpu; + Tensor x_ref, y_ref, out_ref; + + const int n = 1; + const int c = 3; + const int h = 2000; + const int w = 2000; + + x.Resize({n, c, h, w}); + y.Resize({n, c, h, w}); + out.Resize({n, c, h, w}); + x_cpu.Resize({n, c, h, w}); + y_cpu.Resize({n, c, h, w}); + out_cpu.Resize({n, c, h, w}); + x_ref.Resize({n, c, h, w}); + y_ref.Resize({n, c, h, w}); + out_ref.Resize({n, c, h, w}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* y_cpu_data = y_cpu.mutable_data(); + auto* out_cpu_data = out_cpu.mutable_data(); + + auto* x_ref_data = x_ref.mutable_data(); + auto* y_ref_data = y_ref.mutable_data(); + auto* out_ref_data = out_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + for (int i = 0; i < y_cpu.numel(); ++i) { + y_cpu_data[i] = i - 5.0; + y_ref_data[i] = i - 5.0; + } + + x.Assign(x_cpu_data, x_cpu.dims()); + y.Assign(y_cpu_data, y_cpu.dims()); + + param.X = &x; + param.Y = &y; + param.Out = &out; + elementwise_add_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + elementwise_add_kernel.SetContext(std::move(ctx)); + elementwise_add_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + ElementwiseAddRef(x_ref_data, y_ref_data, out_ref_data, out.numel()); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(elementwise_add, bias) { + ElementwiseAddCompute elementwise_add_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::ElementwiseParam param; + Tensor x, y, out; + Tensor x_cpu, y_cpu, out_cpu; + Tensor x_ref, y_ref, out_ref; + + const int n = 1; + const int c = 3; + const int h = 2000; + const int w = 2000; + + x.Resize({n, c, h, w}); + y.Resize({c, 1, 1}); + out.Resize({n, c, h, w}); + x_cpu.Resize({n, c, h, w}); + y_cpu.Resize({c, 1, 1}); + out_cpu.Resize({n, c, h, w}); + x_ref.Resize({n, c, h, w}); + y_ref.Resize({c, 1, 1}); + out_ref.Resize({n, c, h, w}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* y_cpu_data = y_cpu.mutable_data(); + auto* out_cpu_data = out_cpu.mutable_data(); + + auto* x_ref_data = x_ref.mutable_data(); + auto* y_ref_data = y_ref.mutable_data(); + auto* out_ref_data = out_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + for (int i = 0; i < y_cpu.numel(); ++i) { + y_cpu_data[i] = i - 5.0; + y_ref_data[i] = i - 5.0; + } + + x.Assign(x_cpu_data, 
x_cpu.dims()); + y.Assign(y_cpu_data, y_cpu.dims()); + + param.X = &x; + param.Y = &y; + param.Out = &out; + param.axis = -1; + elementwise_add_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + elementwise_add_kernel.SetContext(std::move(ctx)); + elementwise_add_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + ElementwiseBroadcastRef(x_ref_data, y_ref_data, out_ref_data, n, c, h * w); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(elementwise_add_nhwc, bias) { + ElementwiseAddComputeNHWC elementwise_add_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::ElementwiseParam param; + Tensor x, y, out; + Tensor x_cpu, y_cpu, out_cpu; + Tensor x_ref, y_ref, out_ref; + + const int n = 1; + const int c = 3; + const int h = 2000; + const int w = 2000; + + x.Resize({n, h, w, c}); + y.Resize({c, 1, 1}); + out.Resize({n, h, w, c}); + x_cpu.Resize({n, h, w, c}); + y_cpu.Resize({c, 1, 1}); + out_cpu.Resize({n, h, w, c}); + x_ref.Resize({n, h, w, c}); + y_ref.Resize({c, 1, 1}); + out_ref.Resize({n, h, w, c}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* y_cpu_data = y_cpu.mutable_data(); + auto* out_cpu_data = out_cpu.mutable_data(); + + auto* x_ref_data = x_ref.mutable_data(); + auto* y_ref_data = y_ref.mutable_data(); + auto* out_ref_data = out_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + for (int i = 0; i < y_cpu.numel(); ++i) { + y_cpu_data[i] = i - 5.0; + y_ref_data[i] = i - 5.0; + } + + x.Assign(x_cpu_data, x_cpu.dims()); + y.Assign(y_cpu_data, y_cpu.dims()); + + param.X = &x; + param.Y = &y; + param.Out = &out; + param.axis = -1; + elementwise_add_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + elementwise_add_kernel.SetContext(std::move(ctx)); + elementwise_add_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + ElementwiseBroadcastRef( + x_ref_data, y_ref_data, out_ref_data, n * h * w, c, 1); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/feed_compute.cc b/lite/kernels/cuda/feed_compute.cc index cffa8a573d..e54c5b9b03 100644 --- a/lite/kernels/cuda/feed_compute.cc +++ b/lite/kernels/cuda/feed_compute.cc @@ -20,21 +20,22 @@ namespace lite { namespace kernels { namespace cuda { -void FeedCompute::Run() { - auto& param = this->Param(); +template +void FeedCompute::Run() { + auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); auto stream = ctx.exec_stream(); VLOG(4) << "feed_list.size: " << param.feed_list->size(); const lite::Tensor& feed_item = (*param.feed_list)[param.col]; int num = static_cast(feed_item.numel()); - auto input = feed_item.data(); + auto input = feed_item.data(); param.out->Resize(feed_item.dims()); - auto output = param.out->mutable_data(TARGET(kCUDA)); + auto output = param.out->template mutable_data(TARGET(kCUDA)); VLOG(4) << "col: " << param.col << " num:" << num; TargetW::MemcpyAsync( - output, input, num * sizeof(float), 
IoDirection::HtoD, stream); + output, input, num * sizeof(T), IoDirection::HtoD, stream); } } // namespace cuda @@ -42,8 +43,13 @@ void FeedCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL( - feed, kCUDA, kFloat, kNCHW, paddle::lite::kernels::cuda::FeedCompute, nchw) +typedef paddle::lite::kernels::cuda::FeedCompute + FeedFp32; + +typedef paddle::lite::kernels::cuda::FeedCompute + FeedInt64; + +REGISTER_LITE_KERNEL(feed, kCUDA, kFloat, kNCHW, FeedFp32, nchw) .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat), @@ -54,8 +60,7 @@ REGISTER_LITE_KERNEL( DATALAYOUT(kNCHW))}) .Finalize(); -REGISTER_LITE_KERNEL( - feed, kCUDA, kFloat, kNHWC, paddle::lite::kernels::cuda::FeedCompute, nhwc) +REGISTER_LITE_KERNEL(feed, kCUDA, kFloat, kNHWC, FeedFp32, nhwc) .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat), @@ -65,3 +70,25 @@ REGISTER_LITE_KERNEL( PRECISION(kFloat), DATALAYOUT(kNHWC))}) .Finalize(); + +REGISTER_LITE_KERNEL(feed, kCUDA, kInt64, kNCHW, FeedInt64, nchw) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt64), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kInt64), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL(feed, kCUDA, kInt64, kNHWC, FeedInt64, nhwc) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt64), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kInt64), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/cuda/feed_compute.h b/lite/kernels/cuda/feed_compute.h index 0510404b2b..9c42dcc1ca 100644 --- a/lite/kernels/cuda/feed_compute.h +++ b/lite/kernels/cuda/feed_compute.h @@ -20,7 +20,8 @@ namespace lite { namespace kernels { namespace cuda { -class FeedCompute : public KernelLite { +template +class FeedCompute : public KernelLite { public: using param_t = operators::FeedParam; using TargetW = TargetWrapper; diff --git a/lite/kernels/cuda/layout_compute.cc b/lite/kernels/cuda/layout_compute.cc index e2d0ae4f2e..6b56d9e1de 100644 --- a/lite/kernels/cuda/layout_compute.cc +++ b/lite/kernels/cuda/layout_compute.cc @@ -13,6 +13,7 @@ // limitations under the License. 
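// Editor's note (not part of this patch): FeedCompute is now templated on the
// element type and registered above for both kFloat and kInt64 precisions;
// the kInt64 variant lets integer id tensors (e.g. the Ids consumed by the
// lookup_table / lookup_table_v2 kernels registered later in this patch with
// PRECISION(kInt64)) be fed to the device without an extra cast.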
#include "lite/kernels/cuda/layout_compute.h" +#include #include "lite/backends/cuda/math/transpose.h" #include "lite/core/op_registry.h" @@ -21,11 +22,32 @@ namespace lite { namespace kernels { namespace cuda { +inline DDim trim_singular_dims(const DDim& dims) { + auto actual_dims_size = dims.size(); + for (; actual_dims_size != 0; --actual_dims_size) { + if (dims[actual_dims_size - 1] != 1) break; + } + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; + } + if (trim_dims.size() == 0) { + return DDim(); + } + return DDim(trim_dims); +} + #define NCHWTONHWC(type) \ auto& param = this->template Param(); \ auto& ctx = this->ctx_->template As(); \ auto input = param.x->template data(); \ auto input_dim = param.x->dims(); \ + DDim input_trim_dim = trim_singular_dims(input_dim); \ + if (input_trim_dim.size() == 1) { \ + param.y->CopyDataFrom(*param.x); \ + return; \ + } \ CHECK(input_dim.size() == 4) \ << "NCHW to NHWC should guarantee that the input dims should be 4"; \ int n = input_dim[0]; \ @@ -41,6 +63,11 @@ namespace cuda { auto& ctx = this->ctx_->template As(); \ auto input = param.x->template data(); \ auto input_dim = param.x->dims(); \ + DDim input_trim_dim = trim_singular_dims(input_dim); \ + if (input_trim_dim.size() == 1) { \ + param.y->CopyDataFrom(*param.x); \ + return; \ + } \ CHECK(input_dim.size() == 4) \ << "NHWC to NCHW should guarantee that the input dims should be 4"; \ int n = input_dim[0]; \ diff --git a/lite/kernels/cuda/lookup_table_compute.cu b/lite/kernels/cuda/lookup_table_compute.cu index 34b6de0e10..3c3bb952ca 100644 --- a/lite/kernels/cuda/lookup_table_compute.cu +++ b/lite/kernels/cuda/lookup_table_compute.cu @@ -98,3 +98,14 @@ REGISTER_LITE_KERNEL(lookup_table, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))}) .Finalize(); +REGISTER_LITE_KERNEL(lookup_table_v2, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::LookupTableCompute, + def) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))}) + .Finalize(); diff --git a/lite/kernels/cuda/match_matrix_tensor_compute.cu b/lite/kernels/cuda/match_matrix_tensor_compute.cu new file mode 100644 index 0000000000..f89b9c9578 --- /dev/null +++ b/lite/kernels/cuda/match_matrix_tensor_compute.cu @@ -0,0 +1,145 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/match_matrix_tensor_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +using Tensor = lite::Tensor; + +void MatchMatrixTensorCompute::PrepareForRun() { + gemm_impl_.reset(new lite::cuda::math::Gemm); +} + +void MatchMatrixTensorCompute::Run() { + CHECK(ctx_) << "running context should be set first"; + auto& param = this->Param(); + auto& context = this->ctx_->template As(); + + auto* x = param.x; + auto* w = param.w; + auto* y = param.y; + auto* out = param.out; + auto* tmp = param.tmp; + int dim_t = param.dim_t; + int dim_in = x->dims()[1]; + + const auto& offset_l = x->lod()[0]; + const auto& offset_r = y->lod()[0]; + + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + top_size += dim_t * len_l * len_r; + top_offset.push_back(top_size); + } + + auto* bottom_l_data = x->data(); + auto* bottom_r_data = y->data(); + auto* t_data = w->data(); + auto* out_data = out->mutable_data(TARGET(kCUDA)); + auto* bottom_l_trans_data = tmp->mutable_data(TARGET(kCUDA)); + + gemm_impl_->init( + false, false, x->dims()[0], dim_t * dim_in, dim_in, &context); + gemm_impl_->run( + 1.0f, 0.0f, bottom_l_data, t_data, bottom_l_trans_data, &context); + + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + for (int t = 0; t < dim_t; t++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + auto* top_data = out_data + top_offset[b] + t * len_l * len_r; + const auto* l_t_data = + bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in; + const auto* r_data = bottom_r_data + offset_r[b] * dim_in; + + gemm_impl_->init(false, + true, + len_l, + len_r, + dim_in, + dim_t * dim_in, + dim_in, + len_r, + &context); + gemm_impl_->run(1.0f, 0.0f, l_t_data, r_data, top_data, &context); + } + } + + int batch_size = x->lod()[0].size() - 1; + int lod_lv1_size = batch_size * dim_t; + int lod_lv2_size = x->lod()[0].back() * dim_t; + std::vector out_lod0(batch_size + 1, 0); + std::vector out_lod1(lod_lv1_size + 1, 0); + std::vector out_lod2(lod_lv2_size + 1, 0); + for (int i = 0; i < batch_size; i++) { + out_lod0[i + 1] = out_lod0[i] + dim_t; + int len_l = offset_l[i + 1] - offset_l[i]; + + for (int j = 0; j < dim_t; j++) { + out_lod1[i * dim_t + j + 1] = out_lod1[i * dim_t + j] + len_l; + int len_r = offset_r[i + 1] - offset_r[i]; + + for (int k = 0; k < len_l; k++) { + out_lod2[offset_l[i] * dim_t + j * len_l + k + 1] = + out_lod2[offset_l[i] * dim_t + j * len_l + k] + len_r; + } + } + } + + LoD out_lod; + out_lod.push_back(top_offset); + out_lod.push_back(offset_l); + out_lod.push_back(offset_r); + out->set_lod(out_lod); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(match_matrix_tensor, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::MatchMatrixTensorCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("W", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Tmp", + 
{LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/match_matrix_tensor_compute.h b/lite/kernels/cuda/match_matrix_tensor_compute.h new file mode 100644 index 0000000000..09db326ff3 --- /dev/null +++ b/lite/kernels/cuda/match_matrix_tensor_compute.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/backends/cuda/blas.h" +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class MatchMatrixTensorCompute + : public KernelLite { + public: + using param_t = operators::MatchMatrixTensorParam; + + void PrepareForRun() override; + void Run() override; + virtual ~MatchMatrixTensorCompute() = default; + + private: + std::unique_ptr> gemm_impl_; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/match_matrix_tensor_compute_test.cc b/lite/kernels/cuda/match_matrix_tensor_compute_test.cc new file mode 100644 index 0000000000..ce0ae2a7a8 --- /dev/null +++ b/lite/kernels/cuda/match_matrix_tensor_compute_test.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
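// Editor's note (worked example, not part of this patch): the sizes hard-coded
// in the test below follow from the top_offset computation in
// MatchMatrixTensorCompute::Run -- each batch b contributes
// dim_t * len_l * len_r scores. With x lod {0, 2, 5}, y lod {0, 3, 4} and
// dim_t = 2 that is 2*2*3 + 2*3*1 = 12 + 6 = 18, hence out.Resize({18, 1});
// tmp holds the x * w projection of ix * dim_t * h = 5 * 2 * 2 = 20 floats,
// hence tmp.Resize({20, 1}).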
+ +#include "lite/kernels/cuda/match_matrix_tensor_compute.h" +#include +#include +#include +#include +#include "lite/api/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +using Tensor = lite::Tensor; + +TEST(match_matrix_tensor, normal) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + MatchMatrixTensorCompute kernel; + operators::MatchMatrixTensorParam param; + + // prepare ins and outs tensor in gpu, including size and lod + int ix = 5, iy = 4, h = 2, dim_t = 2; + Tensor x, w, y, out, tmp; + x.Resize({ix, h}); + w.Resize({h, dim_t, h}); + y.Resize({iy, h}); + out.Resize({18, 1}); + tmp.Resize({20, 1}); + LoD x_lod{}; + x_lod.push_back({0, 2, 5}); + x.set_lod(x_lod); + LoD y_lod{}; + y_lod.push_back({0, 3, 4}); + y.set_lod(y_lod); + + // init ins tensor in cpu + Tensor x_cpu, w_cpu, y_cpu, out_cpu, tmp_cpu; + x_cpu.Resize({ix, h}); + w_cpu.Resize({h, dim_t, h}); + y_cpu.Resize({iy, h}); + out_cpu.Resize({18, 1}); + tmp_cpu.Resize({20, 1}); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* w_cpu_data = w_cpu.mutable_data(); + auto* y_cpu_data = y_cpu.mutable_data(); + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = static_cast(i); + } + for (int i = 0; i < w_cpu.numel(); ++i) { + w_cpu_data[i] = static_cast(i); + } + for (int i = 0; i < y_cpu.numel(); ++i) { + y_cpu_data[i] = static_cast(i); + } + + // cpu tensor data assigin to gpu tensor + x.Assign(x_cpu_data, x_cpu.dims()); + w.Assign(w_cpu_data, w_cpu.dims()); + y.Assign(y_cpu_data, y_cpu.dims()); + + param.x = &x; + param.w = &w; + param.y = &y; + param.dim_t = dim_t; + param.out = &out; + param.tmp = &tmp; + kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + kernel.SetContext(std::move(ctx)); + kernel.Launch(); + cudaDeviceSynchronize(); + + auto* out_cpu_data = out_cpu.mutable_data(); + auto* out_data = out.mutable_data(TARGET(kCUDA)); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + std::vector ref_results = {5, + 23, + 41, + 17, + 75, + 133, + 7, + 33, + 59, + 27, + 125, + 223, + 323, + 455, + 587, + 557, + 793, + 1029}; + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], ref_results[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/mul_compute_test.cc b/lite/kernels/cuda/mul_compute_test.cc index d1c1d63e7d..f521a12e2d 100644 --- a/lite/kernels/cuda/mul_compute_test.cc +++ b/lite/kernels/cuda/mul_compute_test.cc @@ -16,6 +16,7 @@ #include #include #include +#include "lite/backends/cuda/blas.h" namespace paddle { namespace lite { @@ -26,6 +27,7 @@ TEST(mul_compute, normal) { MulCompute mul_kernel; std::unique_ptr ctx(new KernelContext); auto& context = ctx->As(); + context.InitOnce(); Tensor x, y, out, x_cpu, y_cpu, out_cpu; int x_h = 2, x_w_y_h = 3, y_w = 4; diff --git a/lite/kernels/cuda/nearest_interp_compute.cu b/lite/kernels/cuda/nearest_interp_compute.cu index 1a614e0656..adae034a1d 100644 --- a/lite/kernels/cuda/nearest_interp_compute.cu +++ b/lite/kernels/cuda/nearest_interp_compute.cu @@ -11,6 +11,7 @@ limitations under the License. 
*/ #pragma once #include +#include "lite/backends/cuda/target_wrapper.h" #include "lite/core/op_registry.h" #include "lite/kernels/cuda/nearest_interp_compute.h" @@ -20,6 +21,43 @@ namespace kernels { namespace cuda { using Tensor = lite::Tensor; +inline std::vector get_new_shape( + std::vector list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + lite::Tensor temp; + auto temp_data = temp.mutable_data(); + auto tensor_data = tensor->data(); + cudaMemcpy(temp_data, + tensor_data, + tensor->dims().production() * sizeof(float), + cudaMemcpyDeviceToHost); + + vec_new_shape.push_back(static_cast(*temp_data)); + } + + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + auto cpu_starts_tensor_data = cpu_starts_tensor.mutable_data(); + cudaMemcpy(cpu_starts_tensor_data, + new_data, + new_data_tensor->dims().production() * sizeof(T), + cudaMemcpyDeviceToHost); + + auto new_data_ = cpu_starts_tensor.data(); + vec_new_data = std::vector( + new_data_, new_data_ + new_data_tensor->dims().production()); + return vec_new_data; +} + __global__ void KeNearestNeighborInterp(const float* in, const size_t in_img_h, const size_t in_img_w, @@ -79,19 +117,34 @@ void NearestInterpCompute::Run() { int out_w = param.out_w; float scale = param.scale; bool align_corners = param.align_corners; - if (scale > 0) { - out_h = static_cast(in_h * scale); - out_w = static_cast(in_w * scale); - } - - if (out_size != nullptr) { - Tensor sizes; - float* size_data = sizes.mutable_data(); - float* outsize_data = out_size->mutable_data(TARGET(kCUDA)); - cudaMemcpy( - size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost); - out_h = static_cast(size_data[0]); - out_w = static_cast(size_data[1]); + auto align_mode = param.align_mode; + + auto list_new_shape_tensor = param.SizeTensor; + if (list_new_shape_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_shape_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } else { + auto scale_tensor = param.Scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) { + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); + } + + if (out_size != nullptr) { + lite::Tensor sizes; + float* size_data = sizes.mutable_data(); + float* outsize_data = out_size->mutable_data(TARGET(kCUDA)); + cudaMemcpy( + size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost); + out_h = static_cast(size_data[0]); + out_w = static_cast(size_data[1]); + } } auto output_data = output->mutable_data(TARGET(kCUDA)); @@ -162,6 +215,14 @@ REGISTER_LITE_KERNEL(nearest_interp, {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW))}) + .BindInput("SizeTensor", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Scale", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat), diff --git a/lite/kernels/cuda/nearest_interp_compute_test.cc b/lite/kernels/cuda/nearest_interp_compute_test.cc index 85032016d6..ad2ef9294e 100644 --- a/lite/kernels/cuda/nearest_interp_compute_test.cc +++ 
b/lite/kernels/cuda/nearest_interp_compute_test.cc @@ -16,6 +16,7 @@ #include #include #include +#include namespace paddle { namespace lite { @@ -143,6 +144,116 @@ TEST(nearest_interp, normal) { } } +TEST(nearest_interp, update) { + NearestInterpCompute nearest_interp_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::InterpolateParam param; + + std::vector size_tensor(2); + std::vector size_tensor_cpu(2), size_tensor_ref(2); + Tensor x, input_scale, osz, out; + Tensor x_cpu, input_scale_cpu, osz_cpu, out_cpu; + Tensor x_ref, input_scale_ref, osz_ref, out_ref; + + int n = 1, c = 3, in_h = 40, in_w = 40; + int out_h = 80, out_w = 80; + float scale = 2.0; + + param.out_h = out_h; + param.out_w = out_w; + param.scale = scale; + param.align_corners = false; + param.align_mode = 0; + + x.Resize({n, c, in_h, in_w}); + size_tensor[0].Resize({1}); + size_tensor[1].Resize({1}); + input_scale.Resize({1}); + osz.Resize({2}); + out.Resize({n, c, out_h, out_w}); + + x_cpu.Resize({n, c, in_h, in_w}); + size_tensor_cpu[0].Resize({1}); + size_tensor_cpu[1].Resize({1}); + input_scale_cpu.Resize({1}); + osz_cpu.Resize({2}); + out_cpu.Resize({n, c, out_h, out_w}); + + x_ref.Resize({n, c, in_h, in_w}); + size_tensor_ref[0].Resize({1}); + size_tensor_ref[1].Resize({1}); + input_scale_ref.Resize({1}); + osz_ref.Resize({2}); + out_ref.Resize({n, c, out_h, out_w}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + float* x_cpu_data = x_cpu.mutable_data(); + float* size_tensor0_cpu_data = size_tensor_cpu[0].mutable_data(); + float* size_tensor1_cpu_data = size_tensor_cpu[1].mutable_data(); + float* input_scale_cpu_data = input_scale_cpu.mutable_data(); + float* osz_cpu_data = osz_cpu.mutable_data(); + float* out_cpu_data = out_cpu.mutable_data(); + + float* x_ref_data = x_ref.mutable_data(); + float* size_tensor0_ref_data = size_tensor_ref[0].mutable_data(); + float* size_tensor1_ref_data = size_tensor_ref[1].mutable_data(); + float* input_scale_ref_data = input_scale_ref.mutable_data(); + float* osz_ref_data = osz_ref.mutable_data(); + float* out_ref_data = out_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + osz_cpu_data[0] = out_h; + osz_cpu_data[1] = out_w; + size_tensor0_cpu_data[0] = out_h; + size_tensor1_cpu_data[0] = out_w; + input_scale_cpu_data[0] = scale; + osz_ref_data[0] = out_h; + osz_ref_data[1] = out_w; + size_tensor0_ref_data[0] = out_h; + size_tensor1_ref_data[0] = out_w; + input_scale_ref_data[0] = scale; + + x.Assign(x_cpu_data, x_cpu.dims()); + size_tensor[0].Assign( + size_tensor0_cpu_data, size_tensor[0].dims()); + size_tensor[1].Assign( + size_tensor1_cpu_data, size_tensor[1].dims()); + input_scale.Assign(input_scale_cpu_data, + input_scale.dims()); + osz.Assign(osz_cpu_data, osz_cpu.dims()); + + param.X = &x; + param.SizeTensor.emplace_back( + reinterpret_cast(&size_tensor[0])); + param.SizeTensor.emplace_back( + reinterpret_cast(&size_tensor[1])); + param.Scale = &input_scale; + param.OutSize = &osz; + param.Out = &out; + nearest_interp_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + nearest_interp_kernel.SetContext(std::move(ctx)); + nearest_interp_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + NearestInterpRef(&x_ref, &out_ref, false); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], 
out_ref_data[i], 1e-5); + } +} + } // namespace cuda } // namespace kernels } // namespace lite diff --git a/lite/kernels/cuda/pool_compute.cu b/lite/kernels/cuda/pool_compute.cu index a2483a2c75..d7e3739ddb 100644 --- a/lite/kernels/cuda/pool_compute.cu +++ b/lite/kernels/cuda/pool_compute.cu @@ -256,6 +256,7 @@ void PoolCompute::Run() { bool adaptive = param.adaptive; auto x_dims = param.x->dims(); auto out_dims = param.output->dims(); + auto paddings = *param.paddings; const int in_h = x_dims[2]; const int in_w = x_dims[3]; const int out_h = out_dims[2]; @@ -266,8 +267,8 @@ void PoolCompute::Run() { const int win_w = param.ksize[1]; const int stride_h = param.strides[0]; const int stride_w = param.strides[1]; - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; const int total_threads = out_dims.production(); const int threads = 512; const int blocks = (total_threads + threads - 1) / threads; @@ -357,6 +358,61 @@ void PoolCompute::Run() { if (error != cudaSuccess) LOG(FATAL) << cudaGetErrorString(error); } +inline int PoolOutputSize( + int input_size, int filter_size, int padding, int stride, bool ceil_mode) { + int output_size; + if (!ceil_mode) { + output_size = (input_size - filter_size + 2 * padding) / stride + 1; + } else { + output_size = + (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; + } + return output_size; +} + +void PoolComputeNHWC::PrepareForRun() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + pool_impl_.reset(new lite::cuda::math::CudnnPool2DNHWC); + pool_impl_->init(param, &ctx); +} + +void PoolComputeNHWC::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + const auto x_dims = param.x->dims(); + std::vector& ksize = param.ksize; + if (param.global_pooling) { + ksize.resize(static_cast(x_dims.size()) - 2); + for (size_t i = 0; i < ksize.size(); ++i) { + (*param.paddings)[i] = 0; + ksize[i] = static_cast(x_dims[i + 1]); + } + } + + std::vector output_shape({x_dims[0]}); + if (param.adaptive) { + output_shape.insert( + output_shape.end(), param.ksize.begin(), param.ksize.end()); + } else { + for (size_t i = 0; i < param.ksize.size(); ++i) { + output_shape.push_back(PoolOutputSize(x_dims[i + 1], + param.ksize[i], + (*param.paddings)[i], + param.strides[i], + param.ceil_mode)); + } + } + output_shape.push_back(x_dims[3]); + param.output->Resize(lite::DDim(output_shape)); + + pool_impl_->run(param); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(FATAL) << cudaGetErrorString(error); +} + } // namespace cuda } // namespace kernels } // namespace lite @@ -373,3 +429,19 @@ REGISTER_LITE_KERNEL( PRECISION(kFloat), DATALAYOUT(kNCHW))}) .Finalize(); + +REGISTER_LITE_KERNEL(pool2d, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::PoolComputeNHWC, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/cuda/pool_compute.h b/lite/kernels/cuda/pool_compute.h index 55b346bfaf..5c3a1bc2b9 100644 --- a/lite/kernels/cuda/pool_compute.h +++ b/lite/kernels/cuda/pool_compute.h @@ -13,6 +13,9 @@ // limitations under the License. 
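// Editor's note (worked example, not part of this patch): PoolOutputSize in
// pool_compute.cu computes, without ceil_mode,
//   out = (in - k + 2 * pad) / stride + 1,
// and with ceil_mode it adds (stride - 1) to the numerator before dividing.
// For the NHWC test configuration added later in this patch (in = 8, k = 3,
// pad = 1, stride = 3, ceil_mode = false) that is (8 - 3 + 2) / 3 + 1 = 3, so
// an 8x8 feature map pools down to 3x3; PoolComputeNHWC::Run then appends the
// channel count last, producing an {n, 3, 3, c} output shape.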
#pragma once +#include +#include +#include "lite/backends/cuda/math/cudnn_pool.h" #include "lite/core/kernel.h" namespace paddle { @@ -29,6 +32,20 @@ class PoolCompute virtual ~PoolCompute() = default; }; +class PoolComputeNHWC + : public KernelLite { + public: + using param_t = operators::PoolParam; + + void PrepareForRun() override; + void Run() override; + virtual ~PoolComputeNHWC() = default; + + private: + std::unique_ptr> + pool_impl_; +}; + } // namespace cuda } // namespace kernels } // namespace lite diff --git a/lite/kernels/cuda/pool_compute_test.cc b/lite/kernels/cuda/pool_compute_test.cc index fe6ff92c0c..0e5aeec8c0 100644 --- a/lite/kernels/cuda/pool_compute_test.cc +++ b/lite/kernels/cuda/pool_compute_test.cc @@ -27,42 +27,123 @@ namespace cuda { using Tensor = lite::Tensor; using DDim = lite::DDim; -static int PoolOutputSize( - int input_size, int filter_size, int padding, int stride, bool ceil_mode) { +#define IN(n, c, h, w) \ + input_data[w + h * input_w + c * input_h * input_w + \ + n * input_c * input_h * input_w] +#define OUT(n, c, h, w) \ + output_data[w + h * output_w + c * output_h * output_w + \ + n * output_c * output_h * output_w] + +template +void nchw2nhwc_ref(lite::Tensor* input, lite::Tensor* output) { + auto* input_data = input->data(); + auto* output_data = output->mutable_data(); + + int input_n = input->dims()[0]; + int input_c = input->dims()[1]; + int input_h = input->dims()[2]; + int input_w = input->dims()[3]; + int output_c = output->dims()[1]; + int output_h = output->dims()[2]; + int output_w = output->dims()[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + OUT(n, h, w, c) = IN(n, c, h, w); + } + } + } + } +} + +#undef IN +#undef OUT + +#define IN(n, h, w, c) \ + input_data[c + w * input_c + h * input_w * input_c + \ + n * input_h * input_w * input_c] +#define OUT(n, h, w, c) \ + output_data[c + w * output_c + h * output_w * output_c + \ + n * output_h * output_w * output_c] + +template +void nhwc2nchw_ref(lite::Tensor* input, lite::Tensor* output) { + auto* input_data = input->data(); + auto* output_data = output->mutable_data(); + + int input_n = input->dims()[0]; + int input_h = input->dims()[1]; + int input_w = input->dims()[2]; + int input_c = input->dims()[3]; + int output_h = output->dims()[1]; + int output_w = output->dims()[2]; + int output_c = output->dims()[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + OUT(n, c, h, w) = IN(n, h, w, c); + } + } + } + } +} + +static int PoolOutputSize(int input_size, + int filter_size, + int pad_left, + int pad_right, + int stride, + bool ceil_mode) { int output_size; if (!ceil_mode) { - output_size = (input_size - filter_size + 2 * padding) / stride + 1; + output_size = + (input_size - filter_size + pad_left + pad_right) / stride + 1; } else { output_size = - (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; + (input_size - filter_size + pad_left + pad_right + stride - 1) / + stride + + 1; } return output_size; } -static std::vector compute_output_shape(operators::PoolParam* param_) { +static std::vector compute_output_shape(operators::PoolParam* param_, + bool is_nchw) { + int axis = 2; + if (!is_nchw) axis = 1; const auto x_dims = param_->x->dims(); std::vector& ksize = param_->ksize; if (param_->global_pooling) { ksize.resize(static_cast(x_dims.size()) - 2); + 
auto paddings = *param_->paddings; for (size_t i = 0; i < ksize.size(); ++i) { - param_->paddings[i] = 0; + paddings[2 * i] = 0; + paddings[2 * i + 1] = 0; ksize[i] = static_cast(x_dims[i + 2]); } } - std::vector output_shape({x_dims[0], x_dims[1]}); + std::vector output_shape({x_dims[0]}); + if (is_nchw) output_shape.push_back(x_dims[1]); if (param_->adaptive) { output_shape.insert( output_shape.end(), param_->ksize.begin(), param_->ksize.end()); } else { + auto paddings = *param_->paddings; for (size_t i = 0; i < param_->ksize.size(); ++i) { - output_shape.push_back(PoolOutputSize(x_dims[i + 2], + output_shape.push_back(PoolOutputSize(x_dims[i + axis], param_->ksize[i], - param_->paddings[i], + paddings[2 * i], + paddings[2 * i + 1], param_->strides[i], param_->ceil_mode)); } } + if (!is_nchw) output_shape.push_back(x_dims[3]); return output_shape; } @@ -75,7 +156,7 @@ static void pool_compute_ref(const operators::PoolParam& param) { std::vector ksize = param.ksize; std::vector strides = param.strides; - std::vector paddings = param.paddings; + std::vector paddings = *param.paddings; std::string pooling_type = param.pooling_type; bool global_pooling = param.global_pooling; @@ -99,7 +180,7 @@ static void pool_compute_ref(const operators::PoolParam& param) { int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; if (global_pooling == true) { for (int n = 0; n < in_n; ++n) { @@ -195,15 +276,15 @@ TEST(pool_cuda, compute) { for (auto pad : {0, 1}) { for (auto n : {1, 2}) { for (auto c : {1, 3}) { - for (auto h : {2, 3, 4, 11}) { - for (auto w : {2, 3, 4, 11}) { - VLOG(3) << "n:" << n << " c:" << c << " h:" << h - << " w:" << w << " ksize:" << ksize - << " stride:" << stride << " pad:" << pad - << " exclusive:" << exclusive - << " global_pooling:" << global_pooling - << " ceil_mode: " << ceil_mode - << " pooling_type:" << pooling_type; + for (auto h : {3}) { + for (auto w : {3}) { + LOG(INFO) << "n:" << n << " c:" << c << " h:" << h + << " w:" << w << " ksize:" << ksize + << " stride:" << stride << " pad:" << pad + << " exclusive:" << exclusive + << " global_pooling:" << global_pooling + << " ceil_mode: " << ceil_mode + << " pooling_type:" << pooling_type; // init x, output x.Resize(DDim(std::vector({n, c, h, w}))); @@ -226,14 +307,16 @@ TEST(pool_cuda, compute) { } param.global_pooling = global_pooling; param.strides = {stride, stride}; - param.paddings = {pad, pad}; + std::vector paddings = {pad, pad, pad, pad}; + param.paddings = + std::make_shared>(paddings); param.exclusive = exclusive; param.ceil_mode = ceil_mode; param.adaptive = false; param.use_quantizer = false; const std::vector& output_shape = - compute_output_shape(¶m); + compute_output_shape(¶m, true); if (output_shape[2] * output_shape[3] == 0) continue; output.Resize(DDim(output_shape)); output_ref.Resize(DDim(output_shape)); @@ -277,6 +360,131 @@ TEST(pool_cuda, compute) { } } } + +TEST(pool_cuda, nhwc) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + PoolComputeNHWC pool; + operators::PoolParam param; + pool.SetContext(std::move(ctx)); + + lite::Tensor x, temp; + lite::Tensor x_cpu; + lite::Tensor output; + lite::Tensor output_cpu, output_temp; + lite::Tensor output_ref; + for (auto pooling_type : {"max", "avg"}) { + for (auto ceil_mode : {false}) { + for (auto global_pooling : {true, false}) { + for (auto exclusive : {false, true}) { + 
for (auto ksize : {3}) { + for (auto stride : {3}) { + for (auto pad : {1}) { + for (auto n : {1}) { + for (auto c : {3}) { + for (auto h : {8}) { + for (auto w : {8}) { + LOG(INFO) << "n:" << n << " c:" << c << " h:" << h + << " w:" << w << " ksize:" << ksize + << " stride:" << stride << " pad:" << pad + << " exclusive:" << exclusive + << " global_pooling:" << global_pooling + << " ceil_mode: " << ceil_mode + << " pooling_type:" << pooling_type; + + // init x, output + x.Resize(DDim(std::vector({n, h, w, c}))); + temp.Resize(DDim(std::vector({n, h, w, c}))); + x_cpu.Resize(DDim(std::vector({n, c, h, w}))); + + auto* x_cpu_data = x_cpu.mutable_data(); + for (int i = 0; i < x_cpu.dims().production(); ++i) { + float sign = i % 3 == 0 ? -0.03 : 0.05f; + x_cpu_data[i] = sign * (i % 128); + } + + nchw2nhwc_ref(&x_cpu, &temp); + auto* temp_cpu_data = temp.mutable_data(); + + x.Assign(temp_cpu_data, + temp.dims()); + // fill param + param.x = &x; + param.output = &output; + param.pooling_type = pooling_type; + if (global_pooling) { + param.ksize = {h, w}; + } else { + param.ksize = {ksize, ksize}; + } + param.global_pooling = global_pooling; + param.strides = {stride, stride}; + std::vector paddings = {pad, pad, pad, pad}; + param.paddings = + std::make_shared>(paddings); + param.exclusive = exclusive; + param.ceil_mode = ceil_mode; + param.adaptive = false; + param.use_quantizer = false; + + const std::vector& output_shape = + compute_output_shape(¶m, false); + if (output_shape[2] * output_shape[3] == 0) continue; + output.Resize(DDim(output_shape)); + output_temp.Resize(DDim(output_shape)); + output_cpu.Resize(DDim(output_shape)); + + auto* output_data = + output.mutable_data(TARGET(kCUDA)); + auto* output_cpu_data = + output_cpu.mutable_data(); + + // compute + pool.SetParam(param); + pool.Launch(); + + // compute ref + param.x = &x_cpu; + // nchw + const std::vector& output_shape_ref = + compute_output_shape(¶m, true); + + output_ref.Resize(DDim(output_shape_ref)); + // auto* output_ref_data = + // output_ref.mutable_data(); + param.output = &output_ref; + pool_compute_ref(param); + nchw2nhwc_ref(&output_ref, &output_temp); + auto* output_temp_data = + output_temp.mutable_data(); + + cudaDeviceSynchronize(); + CopySync(output_cpu_data, + output_data, + sizeof(float) * output.numel(), + IoDirection::DtoH); + // compare + for (int i = 0; i < output.dims().production(); i++) { + EXPECT_NEAR( + output_cpu_data[i], output_temp_data[i], 1e-4); + } + VLOG(3) << "compare pass"; + } + } + } + } + } + } + } + } + } + } + } +} } // namespace cuda } // namespace kernels } // namespace lite diff --git a/lite/kernels/cuda/search_aligned_mat_mul_compute.cc b/lite/kernels/cuda/search_aligned_mat_mul_compute.cc new file mode 100644 index 0000000000..ddefb608dd --- /dev/null +++ b/lite/kernels/cuda/search_aligned_mat_mul_compute.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
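The BatchedGemm wrapper used by this kernel (see the header below) takes a flat pointer table of A, B and C operands; it presumably maps onto a pointer-array batched GEMM such as cuBLAS's cublasSgemmBatched. A minimal, hypothetical sketch of that call for the row-major, non-transposed case (names here are illustrative only):

#include <cublas_v2.h>
// Computes C[i] = alpha * A[i] * B[i] for i in [0, batch); each C is row-major M x N.
inline cublasStatus_t batched_sgemm_rowmajor_sketch(
    cublasHandle_t handle, int M, int N, int K, float alpha,
    const float* const A[], const float* const B[], float* const C[],
    int batch) {
  const float beta = 0.f;
  // cuBLAS is column-major, so row-major C = A * B is issued as C^T = B^T * A^T:
  // the operand arrays are swapped and m/n are exchanged.
  return cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                            N, M, K, &alpha,
                            B, N, A, K, &beta, C, N, batch);
}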
+ +#include "lite/kernels/cuda/search_aligned_mat_mul_compute.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda {} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_aligned_mat_mul, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchAlignedMatMulCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("_a_addr", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("_b_addr", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("_c_addr", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_aligned_mat_mul_compute.h b/lite/kernels/cuda/search_aligned_mat_mul_compute.h new file mode 100644 index 0000000000..b1c4552d9c --- /dev/null +++ b/lite/kernels/cuda/search_aligned_mat_mul_compute.h @@ -0,0 +1,103 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/backends/cuda/math/batched_gemm.h" +#include "lite/core/context.h" +#include "lite/core/kernel.h" +#include "lite/core/types.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchAlignedMatMulCompute + : public KernelLite { + public: + using param_t = operators::MatMulParam; + + void PrepareForRun() override { + auto& param = this->Param(); + CHECK(ctx_) << "running context should be set first"; + auto& cuda_ctx = ctx_->template As(); + bool x_transpose = param.transpose_X; + bool y_transpose = param.transpose_Y; + int seq_num = param.X->lod()[0].size() - 1; + batched_gemm_impl_.reset(new lite::cuda::math::BatchedGemm); + CHECK( + batched_gemm_impl_->init(x_transpose, y_transpose, seq_num, &cuda_ctx)); + A_ = static_cast(malloc(3 * seq_num * sizeof(float*))); + CHECK(A_); + } + + void Run() override { + auto& param = this->Param(); + auto x = param.X; + auto y = param.Y; + auto out = param.Out; + bool x_transpose = param.transpose_X; + bool y_transpose = param.transpose_Y; + float alpha = param.alpha; + const auto& x_dims = x->dims(); + const auto& y_dims = y->dims(); + const auto& x_lod = x->lod(); + const auto& y_lod = y->lod(); + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose ? x_inner_size : x_batch_size; + int N = y_transpose ? y_batch_size : y_inner_size; + int X_K = x_transpose ? x_batch_size : x_inner_size; + int Y_K = y_transpose ? 
y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + int K = X_K; + + auto x_data = x->data(); + auto y_data = y->data(); + auto out_data = out->mutable_data(TARGET(kCUDA)); + auto x_stride = x_batch_size * x_inner_size; + auto y_stride = y_batch_size * y_inner_size; + auto out_stride = M * N; + for (int seq = 0; seq < seq_num; seq++) { + A_[seq] = const_cast(x_data) + seq * x_stride; + A_[seq + seq_num] = const_cast(y_data) + seq * y_stride; + A_[seq + seq_num * 2] = out_data + seq * out_stride; + } + batched_gemm_impl_->run( + alpha, 0.0f, const_cast(A_), M, N, K, seq_num); + } + + ~SearchAlignedMatMulCompute() { + if (A_ != nullptr) { + free(A_); + } + } + + private: + std::unique_ptr> + batched_gemm_impl_; + float** A_{nullptr}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_aligned_mat_mul_compute_test.cc b/lite/kernels/cuda/search_aligned_mat_mul_compute_test.cc new file mode 100644 index 0000000000..f08333b310 --- /dev/null +++ b/lite/kernels/cuda/search_aligned_mat_mul_compute_test.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_aligned_mat_mul_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void search_aligned_mat_mul_compute_ref(const operators::MatMulParam& param) { + auto x = param.X; + auto y = param.Y; + auto out = param.Out; + bool x_transpose = param.transpose_X; + bool y_transpose = param.transpose_Y; + T alpha = static_cast(param.alpha); + const auto x_dims = x->dims(); + const auto y_dims = y->dims(); + const auto& x_lod = x->lod(); + const auto& y_lod = y->lod(); + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose ? x_inner_size : x_batch_size; + int N = y_transpose ? y_batch_size : y_inner_size; + int X_K = x_transpose ? x_batch_size : x_inner_size; + int Y_K = y_transpose ? y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + int K = X_K; + int lda = x_transpose ? M : K; + int ldb = y_transpose ? 
K : N; + int ldc = N; + int x_stride = x_batch_size * x_inner_size; + int y_stride = y_batch_size * y_inner_size; + int out_stride = M * N; + auto x_data = x->data(); + auto y_data = y->data(); + auto out_data = out->mutable_data(); + + for (int seq = 0; seq < seq_num; seq++) { + auto a = x_data + seq * x_stride; + auto b = y_data + seq * y_stride; + auto c = out_data + seq * out_stride; + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + auto sum = static_cast(0); + for (int l = 0; l < K; l++) { + T av; + T bv; + if (x_transpose) { + av = a[l * lda + i]; + } else { + av = a[i * lda + l]; + } + if (y_transpose) { + bv = b[j * ldb + l]; + } else { + bv = b[l * ldb + j]; + } + sum += av * bv; + } + c[i * ldc + j] = alpha * sum; + } + } + } +} + +TEST(search_aligned_mat_mul_compute, normal) { + Env::Init(); + for (int seq_num : {1, 2}) { + for (int x_batch_size : {1, 3}) { + for (int x_inner_size : {1, 5}) { + for (int out_inner_size : {1, 4}) { + for (bool x_transpose : {true, false}) { + for (bool y_transpose : {true, false}) { + for (float alpha : {1., 2.}) { + // infer x_dims and y_dims + int y_batch_size; + int y_inner_size; + int out_batch_size; + if (x_transpose) { + if (y_transpose) { + y_batch_size = out_inner_size; + y_inner_size = x_batch_size; + out_batch_size = x_inner_size; + } else { + y_batch_size = x_batch_size; + y_inner_size = out_inner_size; + out_batch_size = x_inner_size; + } + } else { + if (y_transpose) { + y_batch_size = out_inner_size; + y_inner_size = x_inner_size; + out_batch_size = x_batch_size; + } else { + y_batch_size = x_inner_size; + y_inner_size = out_inner_size; + out_batch_size = x_batch_size; + } + } + std::vector x_lod_0(seq_num + 1); + std::vector y_lod_0(seq_num + 1); + std::vector out_lod_0(seq_num + 1); + x_lod_0[0] = 0; + y_lod_0[0] = 0; + out_lod_0[0] = 0; + for (int i = 0; i < seq_num; i++) { + x_lod_0[i + 1] = x_lod_0[i] + x_batch_size; + y_lod_0[i + 1] = y_lod_0[i] + y_batch_size; + out_lod_0[i + 1] = out_lod_0[i] + out_batch_size; + } + LoD x_lod; + LoD y_lod; + LoD out_lod; + x_lod.push_back(x_lod_0); + y_lod.push_back(y_lod_0); + out_lod.push_back(out_lod_0); + DDim x_dims({static_cast(x_lod_0.back()), + static_cast(x_inner_size)}); + DDim y_dims({static_cast(y_lod_0.back()), + static_cast(y_inner_size)}); + DDim out_dims({static_cast(out_lod_0.back()), + static_cast(out_inner_size)}); + // prepare input&output tensors + Tensor x_dev, x_host, y_dev, y_host, out_dev, out_host, out_ref; + x_host.Resize(x_dims); + y_host.Resize(y_dims); + out_host.Resize(out_dims); + x_dev.Resize(x_dims); + y_dev.Resize(y_dims); + out_dev.Resize(out_dims); + out_ref.Resize(out_dims); + x_host.set_lod(x_lod); + y_host.set_lod(y_lod); + out_host.set_lod(out_lod); + x_dev.set_lod(x_lod); + y_dev.set_lod(y_lod); + out_dev.set_lod(out_lod); + out_ref.set_lod(out_lod); + auto out_dev_data = out_dev.mutable_data(TARGET(kCUDA)); + auto x_host_data = x_host.mutable_data(); + auto y_host_data = y_host.mutable_data(); + auto out_host_data = out_host.mutable_data(); + auto out_ref_data = out_ref.mutable_data(); + for (int i = 0; i < x_host.dims().production(); i++) { + x_host_data[i] = i * 0.125f; + } + for (int i = 0; i < y_host.dims().production(); i++) { + y_host_data[i] = i * 0.5f; + } + x_dev.Assign(x_host_data, + x_host.dims()); + y_dev.Assign(y_host_data, + y_host.dims()); + // prepare cuda context, initialize param, and run kernel + operators::MatMulParam param; + param.X = &x_dev; + param.Y = &y_dev; + param.Out = &out_dev; + param.alpha = alpha; + 
param.transpose_X = x_transpose; + param.transpose_Y = y_transpose; + std::unique_ptr ctx(new KernelContext); + auto& cuda_ctx = ctx->As(); + cuda_ctx.InitOnce(); + int dev_id = TargetWrapper::GetCurDevice(); + cuda_ctx.Init(dev_id); + SearchAlignedMatMulCompute search_aligned_mat_mul; + search_aligned_mat_mul.SetParam(param); + search_aligned_mat_mul.SetContext(std::move(ctx)); + search_aligned_mat_mul.Launch(); + cudaDeviceSynchronize(); + CopySync( + out_host_data, + out_dev_data, + sizeof(float) * out_dev.dims().production(), + IoDirection::DtoH); + // run reference + param.X = &x_host; + param.Y = &y_host; + param.Out = &out_ref; + search_aligned_mat_mul_compute_ref(param); + // verify result + for (int i = 0; i < out_ref.dims().production(); i++) { + EXPECT_NEAR(out_host_data[i], out_ref_data[i], 1e-5); + } + } + } + } + } + } + } + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_fc_compute.cu b/lite/kernels/cuda/search_fc_compute.cu new file mode 100644 index 0000000000..591e2474a4 --- /dev/null +++ b/lite/kernels/cuda/search_fc_compute.cu @@ -0,0 +1,170 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_fc_compute.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +template +static void anakin_NV_gemv(cublasHandle_t handle, + const bool TransA, + const int M, + const int N, + const T alpha, + const T* A, + const T* x, + const T beta, + T* y); +template <> +void anakin_NV_gemv(cublasHandle_t handle, + const bool TransA, + const int M, + const int N, + const float alpha, + const float* A, + const float* x, + const float beta, + float* y) { + cublasOperation_t cuTransA = (TransA == false) ? CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK( + cublasSgemv(handle, cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); +} +template +static void anakin_NV_gemm(cublasHandle_t handle, + const bool TransA, + const bool TransB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const T* B, + const T beta, + T* C); + +template <> +void anakin_NV_gemm(cublasHandle_t handle, + const bool TransA, + const bool TransB, + const int M, + const int N, + const int K, + const float alpha, + const float* A, + const float* B, + const float beta, + float* C) { + // Note that cublas follows fortran order. + int lda = (!TransA /* == CblasNoTrans*/) ? K : M; + int ldb = (!TransB /* == CblasNoTrans*/) ? N : K; + cublasOperation_t cuTransA = + (!TransA /* == CblasNoTrans*/) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (!TransB /* == CblasNoTrans*/) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + CUBLAS_CHECK(cublasSgemm(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + N)); +} + +template <> +void anakin_NV_gemm(cublasHandle_t handle, + const bool TransA, + const bool TransB, + const int M, + const int N, + const int K, + const char alpha, + const char* A, + const char* B, + const char beta, + char* C) { + LOG(FATAL) << "int8 gemm is not implemented"; +} + +template +static __global__ void add_bias(int n, + int output_size, + const T* bias, + T* dout) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int bias_index = index % output_size; + if (index < n) { + dout[index] = dout[index] + bias[bias_index]; + } +} + +template +void SearchFcCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + const Tensor* x_tensor = param.X; + param.Out->Resize({x_tensor->dims()[0], param.out_size}); + _M = x_tensor->dims().count(0, 1); + _K = x_tensor->dims().count(1, x_tensor->numel()); + _N = param.out_size; + const T* din = x_tensor->data(); + Tensor* out_tensor = param.Out; + T* dout = out_tensor->mutable_data(TARGET(kCUDA)); + const Tensor* w_tensor = param.W; + const T* weight = w_tensor->data(); + const Tensor* b_tensor = param.b; + const T* bias = b_tensor->data(); + cublasCreate(&_handle); + if (_M == 1 && _K > 50000) { + anakin_NV_gemv(_handle, false, _N, _K, (T)1, weight, din, (T)0, dout); + } else { + anakin_NV_gemm(_handle, + false, + !_flag_trans_weights, + _M, + _N, + _K, + (T)1, + din, + weight, + (T)0, + dout); + } + int total_size = _M * _N; + add_bias<<>>( + total_size, _N, bias, dout); +} +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_fc, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_fc_compute.h b/lite/kernels/cuda/search_fc_compute.h new file mode 100644 index 0000000000..db09362734 --- /dev/null +++ b/lite/kernels/cuda/search_fc_compute.h @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
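The operand swap in anakin_NV_gemm above (passing cuTransB, cuTransA and the dimensions as N, M, K) is the standard way to evaluate a row-major product with column-major cuBLAS: C = A * B is issued as C^T = B^T * A^T. A plain reference of the row-major computation it is meant to match, for illustration only:

void gemm_rowmajor_ref_sketch(int M, int N, int K, float alpha,
                              const float* A, const float* B, float beta,
                              float* C) {
  // C (M x N, row-major) = alpha * A (M x K) * B (K x N) + beta * C
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float acc = 0.f;
      for (int l = 0; l < K; ++l) acc += A[i * K + l] * B[l * N + j];
      C[i * N + j] = alpha * acc + beta * C[i * N + j];
    }
  }
}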
+ +#pragma once +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +const int CUDA_NUM_THREADS = 512; +inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} +inline int CUDA_GET_BLOCKS(const int N, const int base) { + return (N + base - 1) / base; +} + +template +class SearchFcCompute : public KernelLite { + public: + using param_t = operators::SearchFcParam; + void Run() override; + virtual ~SearchFcCompute() = default; + + private: + bool _flag_trans_weights{false}; + int _M; + int _K; + int _N; + cublasHandle_t _handle; + bool _is_continue_buf{true}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_fc_compute_test.cc b/lite/kernels/cuda/search_fc_compute_test.cc new file mode 100644 index 0000000000..f06028fbe1 --- /dev/null +++ b/lite/kernels/cuda/search_fc_compute_test.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_fc_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +void fc_cpu_base(const lite::Tensor* X, + const lite::Tensor* W, + const lite::Tensor* b, + int out_size, + lite::Tensor* Out) { + const float* data_in = X->data(); + const float* bias = b->data(); + const float* weights = W->data(); + float* data_out = Out->mutable_data(); + int out_rows = X->dims()[0]; + int in_cols = X->numel() / out_rows; + int out_cols = W->numel() / in_cols; + int index_out; + + for (int i = 0; i < out_rows; i++) { + for (int j = 0; j < out_cols; j++) { + index_out = i * out_cols + j; + data_out[index_out] = bias ? 
bias[j] : 0; + + for (int k = 0; k < in_cols; k++) { + data_out[index_out] += + data_in[i * in_cols + k] * weights[j * in_cols + k]; + } + } + } +} + +TEST(search_fc, normal) { + SearchFcCompute search_fc_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + operators::SearchFcParam param; + lite::Tensor X, X_gpu, W, W_gpu, b, b_gpu; + lite::Tensor Out, Out_cpu, out_ref; + std::vector x_shape{1, 4}; + X.Resize(lite::DDim(x_shape)); + std::vector w_shape{3, 4}; + W.Resize(lite::DDim(w_shape)); + std::vector b_shape{3}; + b.Resize(lite::DDim(b_shape)); + std::vector out_shape{1, 4}; + Out.Resize(lite::DDim(out_shape)); + out_ref.Resize(lite::DDim(out_shape)); + auto x_data = X.mutable_data(); + auto w_data = W.mutable_data(); + auto b_data = b.mutable_data(); + auto out_data_ref = out_ref.mutable_data(); + for (int64_t i = 0; i < X.dims().production(); i++) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < W.dims().production(); i++) { + w_data[i] = static_cast(i); + } + for (int64_t i = 0; i < b.dims().production(); i++) { + b_data[i] = static_cast(i); + } + X_gpu.Assign(x_data, X.dims()); + W_gpu.Assign(w_data, W.dims()); + b_gpu.Assign(b_data, b.dims()); + param.X = &X_gpu; + param.W = &W_gpu; + param.b = &b_gpu; + param.out_size = 4; + param.Out = &Out; + search_fc_kernel.SetParam(param); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + search_fc_kernel.SetContext(std::move(ctx)); + search_fc_kernel.Run(); + fc_cpu_base(&X, &W, &b, 4, &out_ref); + cudaDeviceSynchronize(); + const float* out_data = Out.data(); + float* out_cpu_data = Out_cpu.mutable_data(); + CopySync( + out_cpu_data, out_data, sizeof(float) * Out.numel(), IoDirection::DtoH); + for (int i = 0; i < Out.numel(); ++i) { + EXPECT_NEAR(out_cpu_data[i], out_data_ref[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_grnn_compute.cu b/lite/kernels/cuda/search_grnn_compute.cu new file mode 100644 index 0000000000..468b66e568 --- /dev/null +++ b/lite/kernels/cuda/search_grnn_compute.cu @@ -0,0 +1,351 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_grnn_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +using Tensor = lite::Tensor; + +template +T sigmoid(T z) { + return 1 / (1 + std::exp(-z)); +} + +template +__global__ void PreComputeKernel( + const int num, const T* w_x_e, const T* wz_x_e, T* tilde, T* z, T* hidden) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < num) { + tilde[index] = std::tanh(w_x_e[index]); + z[index] = 1 / (1 + std::exp(-wz_x_e[index])); + hidden[index] = (1. 
- z[index]) * tilde[index]; + } +} + +template +__global__ void PostComputeKernel(const int start, + const int end, + const int cap_h, + const int w_tm1, + const T* wr_x_e, + const T* ur_x_h, + const T* wz_x_e, + const T* uz_x_h, + const T* w_x_e, + const T* u_x_h, + T* r, + T* z, + T* tilde, + T* hidden) { + int j = start + blockIdx.x * blockDim.x + threadIdx.x; + if (j < end) { + r[j] = 1 / (1 + std::exp(-(wr_x_e[j] + ur_x_h[j]))); + z[j] = 1 / (1 + std::exp(-(wz_x_e[j] + uz_x_h[j]))); + tilde[j] = std::tanh(w_x_e[j] + r[j] * u_x_h[j]); + hidden[j] = z[j] * hidden[j - cap_h * w_tm1] + (1.0 - z[j]) * tilde[j]; + } +} + +void SearchGrnnCompute::PrepareForRun() { + gemm_impl_.reset(new lite::cuda::math::Gemm); +} + +void SearchGrnnCompute::PrepareLayout(const Tensor* input_blob) { + auto& param = this->Param(); + auto& context = this->ctx_->template As(); + auto cuda_stream = context.exec_stream(); + + auto* _input = input_blob; + int dim0 = _input->dims()[0]; + int dim1 = 1; + if (_input->dims().size() > 1) { + dim1 = _input->dims()[1]; + } + int batch = _input->lod()[0].size() - 1; + auto& offset = _input->lod()[0]; + + idx_sorted_by_width_cpu = std::make_shared(); + idx_sorted_by_width_cpu->Resize({batch}); + int* idx_sorted_by_width_cpu_data = + idx_sorted_by_width_cpu->mutable_data(); + + Tensor _width; + _width.Resize({batch}); + int* width_data = _width.mutable_data(); + // sort sequence by width (descending) and find the largest width in the + // batch + for (int i = 0; i < batch; i++) { + width_data[i] = offset[i + 1] - offset[i]; + idx_sorted_by_width_cpu_data[i] = i; + } + std::sort(idx_sorted_by_width_cpu_data, + idx_sorted_by_width_cpu_data + batch, + [&_width](int a, int b) { + return _width.data()[a] > _width.data()[b]; + }); + int max_width = width_data[idx_sorted_by_width_cpu_data[0]]; + + // start of reorganizing the input + std::vector new_offset; + new_offset.resize(max_width + 1); + new_offset[0] = 0; + int j = batch - 1; + int last_width = 0; + int sub_row = 0; + int sub_col = 0; + + for (int i = 1; i <= max_width;) { + for (int k = j; k >= 0; --k) { + if (width_data[idx_sorted_by_width_cpu_data[k]] > last_width) { + sub_row = width_data[idx_sorted_by_width_cpu_data[k]] - last_width; + sub_col = k + 1; + for (int s = 0; s < sub_row; s++) { + new_offset[i] = new_offset[i - 1] + sub_col; + i++; + } + // move on + last_width = width_data[idx_sorted_by_width_cpu_data[k]]; + j = k - 1; + break; + } + } + } + + // copying to the reorganized buffer + auto* _layout_input = new Tensor(); + auto* _layout_input_gpu = param.layout_input; + if (_input->dims().size() == 1) { + // _layout_input.reshape_batch_sequence({dim0}, new_offset); + LOG(FATAL) << "_input->dims().size() = 1, error."; + } else { + // _layout_input.reshape_batch_sequence({dim0, dim1}, new_offset); + LoD new_lod; + new_lod.push_back(new_offset); + _layout_input->set_lod(new_lod); + _layout_input->Resize({dim0, dim1}); + _layout_input_gpu->set_lod(new_lod); + _layout_input_gpu->Resize({dim0, dim1}); + } + + auto* new_emb = _layout_input->mutable_data(); + auto* input_cpu = new Tensor(); + input_cpu->Resize(_input->dims()); + auto* input_cpu_data = input_cpu->mutable_data(); + TargetW::MemcpyAsync(input_cpu_data, + _input->data(), + _input->numel() * sizeof(float), + IoDirection::DtoH, + cuda_stream); + for (int i = 0; i < max_width; i++) { + int w = new_offset[i + 1] - new_offset[i]; + auto* emb_start = new_emb + dim1 * new_offset[i]; + for (int j = 0; j < w; ++j) { + memcpy(emb_start + dim1 * j, + input_cpu_data + 
dim1 * offset[idx_sorted_by_width_cpu_data[j]] + + dim1 * i, + dim1 * sizeof(float)); + } + } + + auto* _layout_input_gpu_data = + _layout_input_gpu->mutable_data(TARGET(kCUDA)); + TargetW::MemcpyAsync(_layout_input_gpu_data, + new_emb, + _layout_input->numel() * sizeof(float), + IoDirection::HtoD, + cuda_stream); + delete _layout_input; + delete input_cpu; +} + +void SearchGrnnCompute::CopyBack(float* from, float* to, int step) { + auto& param = this->Param(); + auto& context = this->ctx_->template As(); + auto stream = context.exec_stream(); + auto* _input = param.x; + auto* _layout_input = param.layout_input; + + const auto& offset = _input->lod()[0]; + const auto& new_offset = _layout_input->lod()[0]; + const auto* idx_sorted_by_width_cpu_data = + idx_sorted_by_width_cpu->data(); + for (size_t i = 0; i < _layout_input->lod()[0].size() - 1; ++i) { + int w = new_offset[i + 1] - new_offset[i]; + for (int j = 0; j < w; j++) { + TargetW::MemcpyAsync( + to + step * (offset[idx_sorted_by_width_cpu_data[j]] + i), + from + (new_offset[i] + j) * step, + step * sizeof(float), + IoDirection::DtoD, + stream); + } + } +} + +void SearchGrnnCompute::Run() { + CHECK(ctx_) << "running context should be set first"; + auto& param = this->Param(); + auto& context = this->ctx_->template As(); + auto stream = context.exec_stream(); + + auto* bottom = param.x; + auto* wi = param.wi; + auto* wh = param.wh; + auto* top = param.out; + auto* _buffer = param.tmp_buffer; + int _cap_h = param.num_hidden; + int _cap_e = param.num_input; + + int _cap_l = bottom->dims()[0]; + int batch = bottom->lod()[0].size() - 1; + + const auto& offset = bottom->lod()[0]; + LoD top_lod; + top_lod.push_back(offset); + top->set_lod(top_lod); + std::vector top_dims_vec{_cap_l, _cap_h}; + top->Resize(top_dims_vec); + auto* top_hidden = top->mutable_data(TARGET(kCUDA)); + + const auto* dense_e2h = wi->data(); + const auto* dense_h2h = wh->data(); + + const auto* e2h = dense_e2h; + const auto* e2hr = dense_e2h + 1 * _cap_e * _cap_h; + const auto* e2hz = dense_e2h + 2 * _cap_e * _cap_h; + const auto* h2h = dense_h2h; + const auto* h2hr = dense_h2h + 1 * _cap_h * _cap_h; + const auto* h2hz = dense_h2h + 2 * _cap_h * _cap_h; + + PrepareLayout(bottom); + + auto* _layout_input = param.layout_input; + auto* new_emb = _layout_input->data(); + const auto& new_offset = _layout_input->lod()[0]; + int max_width = _layout_input->lod()[0].size() - 1; + + // this buffer is used for book keeping info which will be used in bp + // buffer also needed in bp, so make it larger + _buffer->Resize({20, _cap_l, _cap_h}); + auto* buffer_data = _buffer->mutable_data(TARGET(kCUDA)); + auto* w_x_e = buffer_data + 0 * _cap_l * _cap_h; + auto* wr_x_e = buffer_data + 1 * _cap_l * _cap_h; + auto* wz_x_e = buffer_data + 2 * _cap_l * _cap_h; + auto* u_x_h = buffer_data + 3 * _cap_l * _cap_h; + auto* ur_x_h = buffer_data + 4 * _cap_l * _cap_h; + auto* uz_x_h = buffer_data + 5 * _cap_l * _cap_h; + auto* r = buffer_data + 6 * _cap_l * _cap_h; + auto* z = buffer_data + 7 * _cap_l * _cap_h; + auto* tilde = buffer_data + 8 * _cap_l * _cap_h; + // the internal hidden + auto* hidden = buffer_data + 19 * _cap_l * _cap_h; + + gemm_impl_->init(false, true, _cap_l, _cap_h, _cap_e, &context); + gemm_impl_->run(1.0f, 0.0f, new_emb, e2h, w_x_e, &context); + gemm_impl_->init(false, true, _cap_l, _cap_h, _cap_e, &context); + gemm_impl_->run(1.0f, 0.0f, new_emb, e2hr, wr_x_e, &context); + gemm_impl_->init(false, true, _cap_l, _cap_h, _cap_e, &context); + gemm_impl_->run(1.0f, 0.0f, 
new_emb, e2hz, wz_x_e, &context); + + // precompute hidden0 + int num = batch * _cap_h; + int threads = 512; + int blocks = (num + threads - 1) / threads; + PreComputeKernel<<>>( + num, w_x_e, wz_x_e, tilde, z, hidden); + + // recurrence + for (int i = 1; i < max_width; i++) { + int w_tm1 = new_offset[i] - new_offset[i - 1]; + int w = new_offset[i + 1] - new_offset[i]; + + // precompute hidden i-1 to hidden i + auto* htm1 = hidden + new_offset[i - 1] * _cap_h; + + gemm_impl_->init(false, true, w, _cap_h, _cap_h, &context); + gemm_impl_->run( + 1.0f, 0.0f, htm1, h2h, u_x_h + new_offset[i] * _cap_h, &context); + gemm_impl_->init(false, true, w, _cap_h, _cap_h, &context); + gemm_impl_->run( + 1.0f, 0.0f, htm1, h2hr, ur_x_h + new_offset[i] * _cap_h, &context); + gemm_impl_->init(false, true, w, _cap_h, _cap_h, &context); + gemm_impl_->run( + 1.0f, 0.0f, htm1, h2hz, uz_x_h + new_offset[i] * _cap_h, &context); + + // compute the gate and hidden + int start = new_offset[i] * _cap_h; + int end = (new_offset[i] + w) * _cap_h; + PostComputeKernel<<>>(start, + end, + _cap_h, + w_tm1, + wr_x_e, + ur_x_h, + wz_x_e, + uz_x_h, + w_x_e, + u_x_h, + r, + z, + tilde, + hidden); + } + + CopyBack(hidden, top_hidden, _cap_h); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_grnn, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchGrnnCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Wi", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Wh", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("tmp_buffer", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("idx_sorted_by_width", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("layout_input", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_grnn_compute.h b/lite/kernels/cuda/search_grnn_compute.h new file mode 100644 index 0000000000..73d84635d0 --- /dev/null +++ b/lite/kernels/cuda/search_grnn_compute.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
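PreComputeKernel and PostComputeKernel above implement a GRU-style cell element-wise: the first time step uses only the input projections, and later steps combine them with the projected previous hidden state. A scalar sketch of the recurrence they compute (the w*_x / u*_h arguments stand for the precomputed input and hidden projections):

#include <cmath>
inline float grnn_cell_sketch(float w_x, float wr_x, float wz_x,
                              float u_h, float ur_h, float uz_h,
                              float h_prev) {
  auto sigmoid = [](float v) { return 1.f / (1.f + std::exp(-v)); };
  float r = sigmoid(wr_x + ur_h);          // reset gate
  float z = sigmoid(wz_x + uz_h);          // update gate
  float tilde = std::tanh(w_x + r * u_h);  // candidate hidden state
  return z * h_prev + (1.f - z) * tilde;   // new hidden state
}
// First step (no h_prev): tilde = tanh(w_x), z = sigmoid(wz_x), h = (1 - z) * tilde.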
+ +#pragma once +#include +#include "lite/backends/cuda/blas.h" +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchGrnnCompute + : public KernelLite { + public: + using param_t = operators::SearchGrnnParam; + using TargetW = TargetWrapper; + + void PrepareForRun() override; + void Run() override; + virtual ~SearchGrnnCompute() = default; + + private: + std::shared_ptr idx_sorted_by_width_cpu; + std::unique_ptr> gemm_impl_; + void PrepareLayout(const Tensor* input); + void CopyBack(float* from, float* to, int step); +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_grnn_compute_test.cc b/lite/kernels/cuda/search_grnn_compute_test.cc new file mode 100644 index 0000000000..08b96e1f1e --- /dev/null +++ b/lite/kernels/cuda/search_grnn_compute_test.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_grnn_compute.h" +#include +#include +#include +#include +#include "lite/api/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +using Tensor = lite::Tensor; + +TEST(search_grnn, normal) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + SearchGrnnCompute kernel; + operators::SearchGrnnParam param; + + int num_input = 6; + int num_hidden = 6; + int num_batch = 3; + Tensor x, wi, wh, out, idx_sorted_by_width, layout_input, tmp_buffer; + x.Resize({num_batch, num_input}); + wi.Resize({3, num_hidden, num_input}); + wh.Resize({3, num_hidden, num_hidden}); + LoD x_lod{}; + x_lod.push_back({0, 1, 3}); + x.set_lod(x_lod); + + Tensor x_cpu, wi_cpu, wh_cpu, out_cpu, layout_input_cpu, tmp_buffer_cpu; + x_cpu.Resize({num_batch, num_input}); + wi_cpu.Resize({3, num_hidden, num_input}); + wh_cpu.Resize({3, num_hidden, num_hidden}); + out_cpu.Resize({num_batch, num_hidden}); + layout_input_cpu.Resize({num_batch, num_input}); + tmp_buffer_cpu.Resize({20, num_batch, num_hidden}); + auto* x_cpu_data = x_cpu.mutable_data(); + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = static_cast(i); + } + auto* wi_cpu_data = wi_cpu.mutable_data(); + for (int i = 0; i < wi_cpu.numel(); ++i) { + wi_cpu_data[i] = static_cast(i); + } + auto* wh_cpu_data = wh_cpu.mutable_data(); + for (int i = 0; i < wh_cpu.numel(); ++i) { + wh_cpu_data[i] = static_cast(i); + } + + x.Assign(x_cpu_data, x_cpu.dims()); + wi.Assign(wi_cpu_data, wi_cpu.dims()); + wh.Assign(wh_cpu_data, wh_cpu.dims()); + + param.x = &x; + param.wi = &wi; + param.wh = &wh; + param.out = &out; + param.idx_sorted_by_width = &idx_sorted_by_width; + param.layout_input = &layout_input; + param.tmp_buffer = &tmp_buffer; + param.num_input = num_input; + param.num_hidden = num_hidden; + kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + 
context.SetExecStream(stream); + kernel.SetContext(std::move(ctx)); + kernel.Launch(); + cudaDeviceSynchronize(); + + auto* out_cpu_data = out_cpu.mutable_data(); + auto* out_data = out.mutable_data(TARGET(kCUDA)); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + LOG(INFO) << "out_data:"; + for (int i = 0; i < out.numel(); i++) { + // EXPECT_NEAR(out_cpu_data[i], ref_results[i], 1e-5); + LOG(INFO) << out_cpu_data[i]; + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_group_padding_compute.cu b/lite/kernels/cuda/search_group_padding_compute.cu new file mode 100644 index 0000000000..697e53dbb6 --- /dev/null +++ b/lite/kernels/cuda/search_group_padding_compute.cu @@ -0,0 +1,164 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_group_padding_compute.h" + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +using Tensor = lite::Tensor; + +template +__global__ void ker_search_group_padding(Dtype* out_emb_padding_data, + Dtype* out_padding_data, + const Dtype* in_data, + const uint64_t* offset, + const int seq_num, + const int max_len, + const int emb_size, + const Dtype pad_id, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % emb_size; + int word_id = tid / emb_size; + int seq_id = word_id / max_len; + int word_id_in_seq = word_id % max_len; + int cur_len = offset[seq_id + 1] - offset[seq_id]; + if (word_id_in_seq < cur_len) { + out_emb_padding_data[tid] = + in_data[(offset[seq_id] + word_id_in_seq) * emb_size + emb_id]; + } else { + out_emb_padding_data[tid] = 0.f; + if (emb_id == 0) { + out_padding_data[word_id] = pad_id; + } + } + } +} + +void SearchGroupPaddingCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto cuda_stream = ctx.exec_stream(); + + const Tensor* x = param.x; + Tensor* out_emb_padding = param.out_emb_padding; + Tensor* out_new = param.out_new; + Tensor* out_padding = param.out_padding; + const float pad_id = static_cast(param.pad_id); + const float* in_data = x->data(); + const auto& in_seq_offset = x->lod()[0]; + int batch = in_seq_offset.size() - 1; + int max_seq = 0; + for (int i = 0; i < batch; ++i) { + if (in_seq_offset[i + 1] - in_seq_offset[i] > max_seq) { + max_seq = in_seq_offset[i + 1] - in_seq_offset[i]; + } + } + std::vector new_offset; + new_offset.resize(batch + 1); + for (int i = 0; i < batch + 1; ++i) { + new_offset[i] = i * max_seq; + } + std::vector x_dims = x->dims().Vectorize(); + LoD out_emb_padding_lod; + out_emb_padding_lod.push_back(new_offset); + out_emb_padding->set_lod(out_emb_padding_lod); + out_emb_padding->Resize({batch * 
max_seq, x_dims[1]}); + float* out_emb_padding_data = + out_emb_padding->mutable_data(TARGET(kCUDA)); + + LoD out_new_lod; + out_new_lod.push_back(in_seq_offset); + out_new->set_lod(out_new_lod); + out_new->Resize({x_dims[0], 1}); + float* out_new_data = out_new->mutable_data(TARGET(kCUDA)); + + LoD out_padding_lod; + out_padding_lod.push_back(new_offset); + out_padding->set_lod(out_padding_lod); + out_padding->Resize({batch * max_seq, 1}); + float* out_padding_data = out_padding->mutable_data(TARGET(kCUDA)); + + const int count = out_emb_padding->numel(); + const auto& out_emb_padding_seq_offset = out_emb_padding->lod()[0]; + int max_len = out_emb_padding_seq_offset[1]; + int seq_num = out_emb_padding_seq_offset.size() - 1; + int emb_size = x->dims()[1]; + _in_seq_offset.Resize({seq_num + 1, 1, 1, 1}); + uint64_t* offset_data = _in_seq_offset.mutable_data(TARGET(kCUDA)); + + TargetWrapperCuda::MemcpyAsync(offset_data, + in_seq_offset.data(), + sizeof(uint64_t) * in_seq_offset.size(), + IoDirection::HtoD, + cuda_stream); + + TargetWrapperCuda::MemsetSync( + out_new_data, 0, out_new->dims()[0] * out_new->dims()[1] * sizeof(float)); + TargetWrapperCuda::MemsetSync( + out_padding_data, + 0, + out_padding->dims()[0] * out_padding->dims()[1] * sizeof(float)); + + ker_search_group_padding< + float><<>>( + out_emb_padding_data, + out_padding_data, + in_data, + offset_data, + seq_num, + max_len, + emb_size, + pad_id, + count); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_group_padding, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchGroupPaddingCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out_emb_padding", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out_new", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out_padding", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_group_padding_compute.h b/lite/kernels/cuda/search_group_padding_compute.h new file mode 100644 index 0000000000..88391e6d65 --- /dev/null +++ b/lite/kernels/cuda/search_group_padding_compute.h @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
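For reference, a compact CPU sketch of what ker_search_group_padding above produces, assuming one LoD level `offset` over the rows of the input (emb_size columns per row): every sequence is right-padded with zeros to the longest length, and out_padding marks padded word slots with pad_id.

#include <cstdint>
#include <vector>
void group_padding_ref_sketch(const std::vector<float>& in,
                              const std::vector<uint64_t>& offset,
                              int emb_size, float pad_id,
                              std::vector<float>* out_emb_padding,
                              std::vector<float>* out_padding) {
  int seq_num = static_cast<int>(offset.size()) - 1;
  int max_len = 0;
  for (int i = 0; i < seq_num; ++i) {
    int len = static_cast<int>(offset[i + 1] - offset[i]);
    if (len > max_len) max_len = len;
  }
  out_emb_padding->assign(seq_num * max_len * emb_size, 0.f);
  out_padding->assign(seq_num * max_len, 0.f);
  for (int s = 0; s < seq_num; ++s) {
    int len = static_cast<int>(offset[s + 1] - offset[s]);
    for (int w = 0; w < len; ++w) {
      const float* src = &in[(offset[s] + w) * emb_size];
      float* dst = &(*out_emb_padding)[(s * max_len + w) * emb_size];
      for (int e = 0; e < emb_size; ++e) dst[e] = src[e];
    }
    for (int w = len; w < max_len; ++w) {
      (*out_padding)[s * max_len + w] = pad_id;
    }
  }
}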
+ +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchGroupPaddingCompute + : public KernelLite { + public: + using param_t = operators::SearchGroupPaddingParam; + + void Run() override; + virtual ~SearchGroupPaddingCompute() = default; + + private: + lite::Tensor _in_seq_offset; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_group_padding_compute_test.cc b/lite/kernels/cuda/search_group_padding_compute_test.cc new file mode 100644 index 0000000000..b831780c87 --- /dev/null +++ b/lite/kernels/cuda/search_group_padding_compute_test.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_group_padding_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +TEST(search_group_padding_cuda, run_test) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + lite::Tensor x, x_cpu, x_ref; + lite::Tensor out_emb_padding, out_emb_padding_cpu, out_emb_padding_ref; + lite::Tensor out_new, out_new_cpu, out_new_ref; + lite::Tensor out_padding, out_padding_cpu, out_padding_ref; + + int x_dims0 = 2; + int x_dims1 = 3; + + x.Resize({x_dims0, x_dims1}); + x_cpu.Resize({x_dims0, x_dims1}); + x_ref.Resize({x_dims0, x_dims1}); + out_emb_padding.Resize({1, x_dims1}); + out_emb_padding_cpu.Resize({1, x_dims1}); + out_emb_padding_ref.Resize({1, x_dims1}); + out_new.Resize({x_dims0, 1}); + out_new_cpu.Resize({x_dims0, 1}); + out_new_ref.Resize({x_dims0, 1}); + out_padding.Resize({1, 1}); + out_padding_cpu.Resize({1, 1}); + out_padding_ref.Resize({1, 1}); + + LoD x_lod{}; + x_lod.push_back({0, 1}); + x.set_lod(x_lod); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* x_ref_data = x_ref.mutable_data(); + auto* out_emb_padding_data = + out_emb_padding.mutable_data(TARGET(kCUDA)); + auto* out_emb_padding_cpu_data = out_emb_padding_cpu.mutable_data(); + auto* out_emb_padding_ref_data = out_emb_padding_ref.mutable_data(); + auto* out_new_data = out_new.mutable_data(TARGET(kCUDA)); + auto* out_new_cpu_data = out_new_cpu.mutable_data(); + auto* out_new_ref_data = out_new_ref.mutable_data(); + auto* out_padding_data = out_padding.mutable_data(TARGET(kCUDA)); + auto* out_padding_cpu_data = out_padding_cpu.mutable_data(); + auto* out_padding_ref_data = out_padding_ref.mutable_data(); + + for (int64_t i = 0; i < x_cpu.dims().production(); i++) { + x_cpu_data[i] = static_cast(i); + x_ref_data[i] = static_cast(i); + } + x.Assign(x_cpu_data, x_cpu.dims()); + out_emb_padding_ref_data[0] = 0.f; + out_emb_padding_ref_data[1] = 1.f; + out_emb_padding_ref_data[2] = 2.f; + out_new_ref_data[0] = 0.f; + out_new_ref_data[1] = 0.f; + out_padding_ref_data[0] = 0.f; + + SearchGroupPaddingCompute sgp_kernel; + 
operators::SearchGroupPaddingParam param; + + param.x = &x; + param.out_emb_padding = &out_emb_padding; + param.out_new = &out_new; + param.out_padding = &out_padding; + + sgp_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + sgp_kernel.SetContext(std::move(ctx)); + sgp_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync(out_emb_padding_cpu_data, + out_emb_padding_data, + sizeof(float) * out_emb_padding.numel(), + IoDirection::DtoH); + CopySync(out_new_cpu_data, + out_new_data, + sizeof(float) * out_new.numel(), + IoDirection::DtoH); + CopySync(out_padding_cpu_data, + out_padding_data, + sizeof(float) * out_padding.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_emb_padding_cpu.dims().production(); i++) { + EXPECT_NEAR(out_emb_padding_cpu_data[i], out_emb_padding_ref_data[i], 1e-5); + } + for (int i = 0; i < out_new_cpu.dims().production(); i++) { + EXPECT_NEAR(out_new_cpu_data[i], out_new_ref_data[i], 1e-5); + } + for (int i = 0; i < out_padding_cpu.dims().production(); i++) { + EXPECT_NEAR(out_padding_cpu_data[i], out_padding_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_group_padding, kCUDA, kFloat, kNCHW, def); diff --git a/lite/kernels/cuda/search_seq_depadding_compute.cu b/lite/kernels/cuda/search_seq_depadding_compute.cu new file mode 100644 index 0000000000..ecadceab58 --- /dev/null +++ b/lite/kernels/cuda/search_seq_depadding_compute.cu @@ -0,0 +1,115 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_seq_depadding_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +using Tensor = lite::Tensor; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void ker_sequence_depadding_fwd(Dtype* out_data, + const Dtype* in_data, + const int* seq_id_map, + const int seq_num, + const int max_len, + const int emb_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % emb_size; + int word_id = tid / emb_size; + int seq_id = seq_id_map[word_id]; + out_data[tid] = in_data[seq_id * emb_size + emb_id]; + } +} + +void SearchSeqDepaddingCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto cuda_stream = ctx.exec_stream(); + + auto* pad = param.pad; + auto* src = param.src; + auto* out = param.out; + + auto* in_data = pad->data(); + out->Resize({src->dims()[0], pad->dims()[1]}); + auto* out_data = out->mutable_data(TARGET(kCUDA)); + const int count = out->numel(); + + const auto& pad_seq_offset = pad->lod()[0]; + const auto& src_seq_offset = src->lod()[0]; + int max_len = pad_seq_offset[1]; + int seq_num = pad_seq_offset.size() - 1; + int emb_size = pad->dims()[1]; + + LoD out_lod; + out_lod.push_back(src_seq_offset); + out->set_lod(out_lod); + std::vector seq_id_map; + for (int i = 0; i < seq_num; i++) { + int cur_len = src_seq_offset[i + 1] - src_seq_offset[i]; + for (int j = 0; j < cur_len; j++) { + seq_id_map.push_back(i * max_len + j); + } + } + + int map_size = seq_id_map.size(); + seq_id_map_tensor.Resize({map_size, 1, 1, 1}); + int* seq_id_map_data = seq_id_map_tensor.mutable_data(TARGET(kCUDA)); + TargetW::MemcpyAsync(seq_id_map_data, + &seq_id_map[0], + seq_id_map.size() * sizeof(int), + IoDirection::HtoD, + cuda_stream); + + int threads = 512; + int blocks = (count + threads - 1) / threads; + ker_sequence_depadding_fwd<<>>( + out_data, in_data, seq_id_map_data, seq_num, max_len, emb_size, count); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_seq_depadding, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchSeqDepaddingCompute, + def) + .BindInput("Src", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Pad", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_seq_depadding_compute.h b/lite/kernels/cuda/search_seq_depadding_compute.h new file mode 100644 index 0000000000..a06f39bee2 --- /dev/null +++ b/lite/kernels/cuda/search_seq_depadding_compute.h @@ -0,0 +1,39 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchSeqDepaddingCompute + : public KernelLite { + public: + using param_t = operators::SearchSeqDepaddingParam; + using TargetW = TargetWrapper; + + void Run() override; + virtual ~SearchSeqDepaddingCompute() = default; + + private: + Tensor seq_id_map_tensor; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_seq_depadding_compute_test.cc b/lite/kernels/cuda/search_seq_depadding_compute_test.cc new file mode 100644 index 0000000000..9c23ff14ab --- /dev/null +++ b/lite/kernels/cuda/search_seq_depadding_compute_test.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
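Depadding is the inverse of the group padding above: each packed output row i is gathered from padded row seq_id_map[i]. A sketch of the index map the kernel builds on the host; for the test below, pad_lod {0, 4, 6} gives max_len = 4 and src_lod {0, 2, 3} gives sequence lengths {2, 1}, so seq_id_map = {0, 1, 4} and the expected output rows are pad rows 0, 1 and 4 (values 0..7 and 16..19).

#include <cstdint>
#include <vector>
std::vector<int> build_seq_id_map_sketch(
    const std::vector<uint64_t>& src_offset, int max_len) {
  std::vector<int> map;
  int seq_num = static_cast<int>(src_offset.size()) - 1;
  for (int i = 0; i < seq_num; ++i) {
    int cur_len = static_cast<int>(src_offset[i + 1] - src_offset[i]);
    for (int j = 0; j < cur_len; ++j) map.push_back(i * max_len + j);
  }
  return map;
}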
+ +#include "lite/kernels/cuda/search_seq_depadding_compute.h" +#include +#include +#include +#include +#include "lite/api/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +using Tensor = lite::Tensor; + +TEST(search_seq_depadding, normal) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + SearchSeqDepaddingCompute kernel; + operators::SearchSeqDepaddingParam param; + + Tensor pad, src, out; + pad.Resize({2 * 3, 4}); + src.Resize({3, 1}); + out.Resize({3, 4}); + LoD pad_lod{}; + pad_lod.push_back({0, 4, 6}); + pad.set_lod(pad_lod); + LoD src_lod{}; + src_lod.push_back({0, 2, 3}); + src.set_lod(src_lod); + + Tensor pad_cpu, src_cpu, out_cpu; + pad_cpu.Resize({2 * 3, 4}); + src_cpu.Resize({3, 1}); + out_cpu.Resize({3, 4}); + + auto* pad_cpu_data = pad_cpu.mutable_data(); + auto* src_cpu_data = src_cpu.mutable_data(); + for (int i = 0; i < pad_cpu.numel(); ++i) { + pad_cpu_data[i] = static_cast(i); + } + + pad.Assign(pad_cpu_data, pad_cpu.dims()); + src.Assign(src_cpu_data, src_cpu.dims()); + + param.pad = &pad; + param.src = &src; + param.out = &out; + kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + kernel.SetContext(std::move(ctx)); + kernel.Launch(); + cudaDeviceSynchronize(); + + auto* out_cpu_data = out_cpu.mutable_data(); + auto* out_data = out.mutable_data(TARGET(kCUDA)); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + + std::vector ref_results = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19}; + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], ref_results[i], 1e-5); + // LOG(INFO) << out_cpu_data[i]; + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_seq_fc_compute.cu b/lite/kernels/cuda/search_seq_fc_compute.cu new file mode 100644 index 0000000000..e3ac75afee --- /dev/null +++ b/lite/kernels/cuda/search_seq_fc_compute.cu @@ -0,0 +1,98 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_seq_fc_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__global__ void add_bias(int n, + int output_size, + const dtype* bias, + dtype* dout) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int bias_index = index % output_size; + if (index < n) { + dout[index] = dout[index] + bias[bias_index]; + } +} + +void SearchSeqFcCompute::PrepareForRun() { + gemm_impl_.reset(new lite::cuda::math::Gemm); +} + +void SearchSeqFcCompute::Run() { + auto& param = this->Param(); + CHECK(ctx_) << "running context should be set first"; + auto& cuda_ctx = ctx_->template As(); + auto cuda_stream = cuda_ctx.exec_stream(); + + auto x = param.x; + auto w = param.w; + auto b = param.b; + auto out = param.out; + auto out_size = param.out_size; + const auto x_dims = x->dims(); + const auto w_dims = w->dims(); + const auto out_dims = out->dims(); + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(out_dims.size(), 2) << "The Output(Out) should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size) << "Wrong shape: w_dims[0] != out_size"; + CHECK_EQ(out_dims[0], x_dims[0]) << "Wrong shape: out_dims[0] != x_dims[0]"; + CHECK_EQ(out_dims[1], out_size) << "Wrong shape: out_dims[1] != out_size"; + int M = x_dims[0]; + int K = x_dims[1]; + int N = w_dims[0]; + auto x_data = x->data(); + auto w_data = w->data(); + auto out_data = out->mutable_data(TARGET(kCUDA)); + + CHECK(gemm_impl_->init(false, true, M, N, K, &cuda_ctx)); + gemm_impl_->run(1.0f, 0.0f, x_data, w_data, out_data, &cuda_ctx); + + if (b != nullptr) { + auto b_dims = b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + auto b_data = b->mutable_data(); + int total_size = M * N; + add_bias<<>>(total_size, N, b_data, out_data); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_seq_fc, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchSeqFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_seq_fc_compute.h b/lite/kernels/cuda/search_seq_fc_compute.h new file mode 100644 index 0000000000..dff8ba2acf --- /dev/null +++ b/lite/kernels/cuda/search_seq_fc_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/context.h" +#include "lite/core/kernel.h" +#include "lite/core/types.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchSeqFcCompute : public KernelLite { + public: + using param_t = operators::SearchSeqFcParam; + + void PrepareForRun() override; + void Run() override; + virtual ~SearchSeqFcCompute() = default; + + private: + std::unique_ptr> gemm_impl_{nullptr}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_seq_fc_compute_test.cc b/lite/kernels/cuda/search_seq_fc_compute_test.cc new file mode 100644 index 0000000000..354d1bb5bc --- /dev/null +++ b/lite/kernels/cuda/search_seq_fc_compute_test.cc @@ -0,0 +1,175 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_seq_fc_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void search_seq_fc_compute_ref(const operators::SearchSeqFcParam& param) { + auto x = param.x; + auto w = param.w; + auto b = param.b; + auto out = param.out; + auto out_size = param.out_size; + const auto x_dims = x->dims(); + const auto w_dims = w->dims(); + const auto& x_lod = x->lod(); + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK(!x_lod.empty()) << "The Input(X) must hold lod info."; + const auto& x_lod_0 = x_lod[0]; + CHECK_GE(x_lod_0.size(), 2) << "The Input(X)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod_0.back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size) << "Wrong shape: w_dims[0] != out_size"; + int M = x_dims[0]; + int K = x_dims[1]; + int N = w_dims[0]; + auto x_data = x->data(); + auto w_data = w->data(); + auto out_data = out->mutable_data(); + + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + auto sum = static_cast(0); + for (int l = 0; l < K; l++) { + T xv = x_data[i * K + l]; + T wv = w_data[j * K + l]; + sum += xv * wv; + } + out_data[i * N + j] = sum; + } + } + + if (b != nullptr) { + auto b_dims = b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + auto b_data = b->data(); + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + out_data[i * N + j] += b_data[j]; + } + } + } +} + +TEST(search_seq_fc_compute, normal) { + Env::Init(); + for (auto x_lod_0 : {std::vector({0, 1, 3}), + std::vector({0, 3, 4, 5})}) { + for (auto feature_size : {2, 9}) { + for (auto out_size : {3, 5}) { + for (auto has_bias : {true, false}) { + // infer x_dims, 
w_dims, b_dims and out_dims + DDim x_dims({static_cast(x_lod_0.back()), feature_size}); + DDim w_dims({out_size, feature_size}); + DDim b_dims({has_bias ? out_size : 0}); + DDim out_dims({static_cast(x_lod_0.back()), out_size}); + LoD x_lod; + x_lod.push_back(x_lod_0); + LoD out_lod; + out_lod.push_back(x_lod_0); + // prepare input&output tensors + Tensor x_dev, x_host, w_dev, w_host, b_dev, b_host, out_dev, out_host, + out_ref; + x_host.Resize(x_dims); + w_host.Resize(w_dims); + b_host.Resize(b_dims); + out_host.Resize(out_dims); + x_dev.Resize(x_dims); + w_dev.Resize(w_dims); + b_dev.Resize(b_dims); + out_dev.Resize(out_dims); + out_ref.Resize(out_dims); + x_host.set_lod(x_lod); + out_host.set_lod(out_lod); + x_dev.set_lod(x_lod); + out_dev.set_lod(out_lod); + out_ref.set_lod(out_lod); + auto out_dev_data = out_dev.mutable_data(TARGET(kCUDA)); + auto x_host_data = x_host.mutable_data(); + auto w_host_data = w_host.mutable_data(); + auto out_host_data = out_host.mutable_data(); + auto out_ref_data = out_ref.mutable_data(); + for (int i = 0; i < x_host.dims().production(); i++) { + x_host_data[i] = i * 0.125f; + } + for (int i = 0; i < w_host.dims().production(); i++) { + w_host_data[i] = i * 0.5f; + } + x_dev.Assign(x_host_data, + x_host.dims()); + w_dev.Assign(w_host_data, + w_host.dims()); + // prepare cuda context, initialize param, and run kernel + operators::SearchSeqFcParam param; + param.x = &x_dev; + param.w = &w_dev; + param.out = &out_dev; + param.out_size = out_size; + if (has_bias) { + auto b_host_data = b_host.mutable_data(); + for (int i = 0; i < b_host.dims().production(); i++) { + b_host_data[i] = i * 0.5f; + } + b_dev.Assign(b_host_data, + b_host.dims()); + param.b = &b_dev; + } + std::unique_ptr ctx(new KernelContext); + auto& cuda_ctx = ctx->As(); + cuda_ctx.InitOnce(); + int dev_id = TargetWrapper::GetCurDevice(); + cuda_ctx.Init(dev_id); + SearchSeqFcCompute search_seq_fc; + search_seq_fc.SetParam(param); + search_seq_fc.SetContext(std::move(ctx)); + search_seq_fc.Launch(); + cudaDeviceSynchronize(); + CopySync(out_host_data, + out_dev_data, + sizeof(float) * out_dev.dims().production(), + IoDirection::DtoH); + // run reference + param.x = &x_host; + param.w = &w_host; + param.out = &out_ref; + if (has_bias) { + param.b = &b_host; + } + search_seq_fc_compute_ref(param); + // verify result + for (int i = 0; i < out_ref.dims().production(); i++) { + EXPECT_NEAR(out_host_data[i], out_ref_data[i], 1e-5); + } + } + } + } + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_arithmetic_compute.cu b/lite/kernels/cuda/sequence_arithmetic_compute.cu new file mode 100644 index 0000000000..7593632a14 --- /dev/null +++ b/lite/kernels/cuda/sequence_arithmetic_compute.cu @@ -0,0 +1,249 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
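Illustrative sketch (not part of the patch): the sequence_arithmetic kernels that follow apply an elementwise op (op_type 1 = sum, 2 = sub, 3 = mul) between matching words of two LoD tensors; where a sequence of Y is shorter than the corresponding sequence of X, the extra rows of X are copied through unchanged. A host-side reference for the add case (names are illustrative):

#include <cstddef>
#include <vector>

// x, y are row-major [rows, width]; x_off / y_off are per-sequence row offsets.
std::vector<float> seq_add_ref(const std::vector<float>& x,
                               const std::vector<float>& y,
                               const std::vector<std::size_t>& x_off,
                               const std::vector<std::size_t>& y_off,
                               std::size_t width) {
  std::vector<float> out(x.size());
  for (std::size_t s = 0; s + 1 < x_off.size(); ++s) {
    std::size_t x_len = x_off[s + 1] - x_off[s];
    std::size_t y_len = y_off[s + 1] - y_off[s];
    for (std::size_t r = 0; r < x_len; ++r) {
      for (std::size_t c = 0; c < width; ++c) {
        std::size_t xi = (x_off[s] + r) * width + c;
        out[xi] = x[xi];
        if (r < y_len) out[xi] += y[(y_off[s] + r) * width + c];
      }
    }
  }
  return out;
}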
+ +#include +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_arithmetic_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +const int CUDA_NUM_THREADS = 512; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__global__ void ker_arithmetic_sum(Dtype* out_data, + const Dtype* in_data_0, + const Dtype* in_data_1, + const int* offset_0, + const int* offset_1, + const int* word_id_to_seq_id, + const int seq_num, + const int inner_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % inner_size; + int word_id = tid / inner_size; + int seq_id = word_id_to_seq_id[word_id]; + int word_id_in_cur_seq = word_id - offset_0[seq_id]; + int seq_len_1 = offset_1[seq_id + 1] - offset_1[seq_id]; + if (word_id_in_cur_seq < seq_len_1) { + out_data[tid] = + in_data_0[tid] + + in_data_1[(offset_1[seq_id] + word_id_in_cur_seq) * inner_size + + emb_id]; + } else { + out_data[tid] = in_data_0[tid]; + } + } +} + +template +__global__ void ker_arithmetic_sub(Dtype* out_data, + const Dtype* in_data_0, + const Dtype* in_data_1, + const int* offset_0, + const int* offset_1, + const int* word_id_to_seq_id, + const int seq_num, + const int inner_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % inner_size; + int word_id = tid / inner_size; + int seq_id = word_id_to_seq_id[word_id]; + int word_id_in_cur_seq = word_id - offset_0[seq_id]; + int seq_len_1 = offset_1[seq_id + 1] - offset_1[seq_id]; + if (word_id_in_cur_seq < seq_len_1) { + out_data[tid] = + in_data_0[tid] - + in_data_1[(offset_1[seq_id] + word_id_in_cur_seq) * inner_size + + emb_id]; + } else { + out_data[tid] = in_data_0[tid]; + } + } +} + +template +__global__ void ker_arithmetic_mul(Dtype* out_data, + const Dtype* in_data_0, + const Dtype* in_data_1, + const int* offset_0, + const int* offset_1, + const int* word_id_to_seq_id, + const int seq_num, + const int inner_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % inner_size; + int word_id = tid / inner_size; + int seq_id = word_id_to_seq_id[word_id]; + int word_id_in_cur_seq = word_id - offset_0[seq_id]; + int seq_len_1 = offset_1[seq_id + 1] - offset_1[seq_id]; + if (word_id_in_cur_seq < seq_len_1) { + out_data[tid] = + in_data_0[tid] * + in_data_1[(offset_1[seq_id] + word_id_in_cur_seq) * inner_size + + emb_id]; + } else { + out_data[tid] = in_data_0[tid]; + } + } +} + +void SequenceArithmeticCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + auto x_data = param.X->data(); + auto x_lod = param.X->lod()[0]; + auto y_data = param.Y->data(); + auto y_lod = param.Y->lod()[0]; + auto out_data = param.Out->mutable_data(TARGET(kCUDA)); + + offset_x.Resize({static_cast(x_lod.size())}); + auto offset_x_data = offset_x.mutable_data(TARGET(kCUDA)); + + offset_y.Resize({static_cast(y_lod.size())}); + auto offset_y_data = offset_y.mutable_data(TARGET(kCUDA)); + + word_id_to_seq_id.Resize({param.X->numel()}); + auto word_id_to_seq_id_data = + word_id_to_seq_id.mutable_data(TARGET(kCUDA)); + + std::vector word_seq_map; + for (int i = 0; i < x_lod.size() - 1; i++) { + for (int j = x_lod[i]; j < x_lod[i + 1]; j++) { + word_seq_map.push_back(i); + } + } + + 
std::vector offset_x_data_cpu(x_lod.size(), 0); + auto x_lod_data = x_lod.data(); + for (int i = 0; i < offset_x_data_cpu.size(); i++) { + offset_x_data_cpu[i] = x_lod_data[i]; + } + + std::vector offset_y_data_cpu(y_lod.size(), 0); + auto y_lod_data = y_lod.data(); + for (int i = 0; i < offset_y_data_cpu.size(); i++) { + offset_y_data_cpu[i] = y_lod_data[i]; + } + + TargetWrapperCuda::MemcpyAsync(offset_x_data, + offset_x_data_cpu.data(), + sizeof(int) * x_lod.size(), + IoDirection::HtoD, + stream); + + TargetWrapperCuda::MemcpyAsync(offset_y_data, + offset_y_data_cpu.data(), + sizeof(int) * y_lod.size(), + IoDirection::HtoD, + stream); + + TargetWrapperCuda::MemcpyAsync(word_id_to_seq_id_data, + word_seq_map.data(), + sizeof(int) * word_seq_map.size(), + IoDirection::HtoD, + stream); + + int seq_num = x_lod.size() - 1; + int count = param.X->numel(); + int inner_size = param.X->dims()[1]; + switch (param.op_type) { + case 1: // sum + ker_arithmetic_sum< + float><<>>( + out_data, + x_data, + y_data, + offset_x_data, + offset_y_data, + word_id_to_seq_id_data, + seq_num, + inner_size, + count); + break; + case 2: // sub + ker_arithmetic_sub< + float><<>>( + out_data, + x_data, + y_data, + offset_x_data, + offset_y_data, + word_id_to_seq_id_data, + seq_num, + inner_size, + count); + break; + case 3: // mul + ker_arithmetic_mul< + float><<>>( + out_data, + x_data, + y_data, + offset_x_data, + offset_y_data, + word_id_to_seq_id_data, + seq_num, + inner_size, + count); + break; + default: + break; + } + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_arithmetic, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); +REGISTER_LITE_KERNEL(search_seq_arithmetic, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_arithmetic_compute.h b/lite/kernels/cuda/sequence_arithmetic_compute.h new file mode 100644 index 0000000000..a180c50eaa --- /dev/null +++ b/lite/kernels/cuda/sequence_arithmetic_compute.h @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
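Illustrative sketch (not part of the patch): Run() above precomputes a word-to-sequence map so each CUDA thread can find its row's sequence with a single lookup instead of searching the LoD. The expansion on the host looks like this (helper name is illustrative):

#include <cstddef>
#include <cstdint>
#include <vector>

// Expand LoD offsets ({0, 2, 5}) into one sequence id per row ({0, 0, 1, 1, 1}).
std::vector<int> word_to_seq(const std::vector<uint64_t>& lod0) {
  std::vector<int> map;
  for (std::size_t seq = 0; seq + 1 < lod0.size(); ++seq) {
    for (uint64_t w = lod0[seq]; w < lod0[seq + 1]; ++w) {
      map.push_back(static_cast<int>(seq));
    }
  }
  return map;
}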
+ +#pragma once + +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequenceArithmeticCompute + : public KernelLite { + public: + using param_t = operators::SequenceArithmeticParam; + + void Run() override; + virtual ~SequenceArithmeticCompute() = default; + + private: + lite::Tensor offset_x; + lite::Tensor offset_y; + lite::Tensor word_id_to_seq_id; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_arithmetic_compute_test.cc b/lite/kernels/cuda/sequence_arithmetic_compute_test.cc new file mode 100644 index 0000000000..c0746d375d --- /dev/null +++ b/lite/kernels/cuda/sequence_arithmetic_compute_test.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/sequence_arithmetic_compute.h" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +void sequence_arithmetic_compute_ref(const Tensor& x, + const Tensor& y, + Tensor* out, + int op_type) { + auto x_data = x.data(); + auto y_data = y.data(); + out->Resize(x.dims()); + out->set_lod(x.lod()); + auto out_data = out->mutable_data(); + auto x_seq_offset = x.lod()[0]; + auto y_seq_offset = y.lod()[0]; + int seq_num = x_seq_offset.size() - 1; + int inner_size = x.numel() / x.dims()[0]; + + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + switch (op_type) { + case 1: + t_out[j] = input_x[j] + input_y[j]; + break; + case 2: + t_out[j] = input_x[j] - input_y[j]; + break; + case 3: + t_out[j] = input_x[j] * input_y[j]; + break; + default: + break; + } + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(float) * (len_x - len)); + } + } +} + +void prepare_input(Tensor* x, const LoD& x_lod) { + x->Resize({static_cast(x_lod[0].back()), 3}); + x->set_lod(x_lod); + auto x_data = x->mutable_data(); + for (int i = 0; i < x->numel(); i++) { + x_data[i] = (i - x->numel() / 2) * 1.1; + } +} + +TEST(sequence_arithmetic_cuda, run_test) { + lite::Tensor x, y, x_cpu, y_cpu; + lite::Tensor out, out_cpu, out_ref; + lite::LoD x_lod{{0, 2, 5, 9}}, y_lod{{0, 2, 5, 9}}; + + prepare_input(&x_cpu, x_lod); + prepare_input(&y_cpu, y_lod); + + x.Resize(x_cpu.dims()); + x.set_lod(x_cpu.lod()); + auto x_cpu_data = x_cpu.mutable_data(); + x.Assign(x_cpu_data, x_cpu.dims()); + + y.Resize(y_cpu.dims()); + y.set_lod(y_cpu.lod()); + auto y_cpu_data = y_cpu.mutable_data(); + y.Assign(y_cpu_data, y_cpu.dims()); + + 
operators::SequenceArithmeticParam param; + param.X = &x; + param.Y = &y; + param.Out = &out; + param.op_type = 1; + + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + SequenceArithmeticCompute sequence_arithmetic; + sequence_arithmetic.SetContext(std::move(ctx)); + sequence_arithmetic.SetParam(param); + sequence_arithmetic.Run(); + cudaDeviceSynchronize(); + + auto out_data = out.mutable_data(TARGET(kCUDA)); + out_cpu.Resize(out.dims()); + auto out_cpu_data = out_cpu.mutable_data(); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + + sequence_arithmetic_compute_ref(x_cpu, y_cpu, &out_ref, param.op_type); + auto out_ref_data = out_ref.data(); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-3); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_concat_compute.cu b/lite/kernels/cuda/sequence_concat_compute.cu new file mode 100644 index 0000000000..d4390046b0 --- /dev/null +++ b/lite/kernels/cuda/sequence_concat_compute.cu @@ -0,0 +1,151 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
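Illustrative sketch (not part of the patch): sequence_concat interleaves its inputs per sequence, so the output LoD offsets are the elementwise sum of the input offsets — this is what ConcatLoD in the kernel below computes, and what the test further below builds by hand as lod_info_y. A minimal version:

#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint64_t> concat_lod(const std::vector<std::vector<uint64_t>>& lods) {
  std::vector<uint64_t> out(lods[0].size(), 0);
  for (std::size_t i = 1; i < out.size(); ++i) {
    for (const auto& lod : lods) out[i] += lod[i];
  }
  return out;
}

// {0,3,5,6,10} + {0,1,2,3,4} + {0,2,4,6,8} -> {0,6,11,15,22},
// matching lod_info_y in the sequence_concat test further below.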
+ +#include +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_concat_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +const int CUDA_NUM_THREADS = 512; + +template +inline LoD ConcatLoD(const std::vector& xs) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +template +__global__ void ker_sequence_concat(Dtype* out_data, + const uint64_t* in_locate_data, + const int* o2i_map, + const int* o2i_w_map, + const int seq_num, + const int emb_size, + const int count) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + for (int tid = idx; tid < count; tid += blockDim.x * gridDim.x) { + int emb_id = tid % emb_size; + int word_id = tid / emb_size; + int input_id = o2i_map[word_id]; + int cur_work_id = o2i_w_map[word_id]; + const Dtype* in_data = reinterpret_cast( + reinterpret_cast(in_locate_data[input_id])); + out_data[tid] = in_data[cur_work_id * emb_size + emb_id]; + } +} + +void SequenceConcatCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + float* out_data = param.Out->mutable_data(TARGET(kCUDA)); + + int seq_num = param.X[0]->lod()[0].size() - 1; + const int emb_size = param.X[0]->numel() / param.X[0]->dims()[0]; + std::vector in_locate_vec; + for (size_t i = 0; i < param.X.size(); ++i) { + in_locate_vec.push_back( + reinterpret_cast(param.X[i]->data())); + } + in_locate_tensor.Resize({static_cast(in_locate_vec.size())}); + + std::vector out2in_map; + std::vector out2in_word_map; + for (int i = 0; i < seq_num; ++i) { + for (int j = 0; j < param.X.size(); ++j) { + auto offset = param.X[j]->lod()[0]; + int cur_len = offset[i + 1] - offset[i]; + for (int k = 0; k < cur_len; ++k) { + out2in_map.push_back(j); + out2in_word_map.push_back(offset[i] + k); + } + } + } + int word_num = out2in_map.size(); + out2in_map_tensor.Resize({word_num}); + out2in_word_map_tensor.Resize({word_num}); + int* gpu_o2i_map_data = out2in_map_tensor.mutable_data(TARGET(kCUDA)); + int* gpu_o2i_w_map_data = + out2in_word_map_tensor.mutable_data(TARGET(kCUDA)); + uint64_t* gpu_in_locate_data = + in_locate_tensor.mutable_data(TARGET(kCUDA)); + + TargetWrapperCuda::MemcpyAsync(gpu_o2i_map_data, + out2in_map.data(), + sizeof(int) * out2in_map.size(), + IoDirection::HtoD, + stream); + TargetWrapperCuda::MemcpyAsync(gpu_o2i_w_map_data, + out2in_word_map.data(), + sizeof(int) * out2in_word_map.size(), + IoDirection::HtoD, + stream); + TargetWrapperCuda::MemcpyAsync(gpu_in_locate_data, + in_locate_vec.data(), + sizeof(uint64_t) * in_locate_vec.size(), + IoDirection::HtoD, + stream); + + param.Out->set_lod(ConcatLoD(param.X)); + + int count = param.X[0]->numel(); + for (int i = 1; i < param.X.size(); ++i) { + count += param.X[i]->numel(); + } + + int blocks = (count + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; + ker_sequence_concat<<>>( + out_data, + gpu_in_locate_data, + gpu_o2i_map_data, + gpu_o2i_w_map_data, + seq_num, + emb_size, + count); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_concat, + kCUDA, 
+ kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequenceConcatCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_concat_compute.h b/lite/kernels/cuda/sequence_concat_compute.h new file mode 100644 index 0000000000..1737c18dd3 --- /dev/null +++ b/lite/kernels/cuda/sequence_concat_compute.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequenceConcatCompute + : public KernelLite { + public: + using param_t = operators::SequenceConcatParam; + + void Run() override; + virtual ~SequenceConcatCompute() = default; + + private: + lite::Tensor out2in_map_tensor; + lite::Tensor out2in_word_map_tensor; + lite::Tensor in_locate_tensor; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_concat_compute_test.cc b/lite/kernels/cuda/sequence_concat_compute_test.cc new file mode 100644 index 0000000000..477dc48dbb --- /dev/null +++ b/lite/kernels/cuda/sequence_concat_compute_test.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/cuda/sequence_concat_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +namespace { +inline LoD ConcatLoD(const std::vector& xs, + std::vector* xs_in_order) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + if (x_lod[i - 1] < x_lod[i]) { + xs_in_order->emplace_back(xs[j]->Slice(x_lod[i - 1], x_lod[i])); + } + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +static void sequence_concat_ref(const std::vector& xs, + lite::Tensor* out) { + std::vector out_dims; + int64_t batch_size = 0; + int64_t feature_size = 0; + for (const auto& tensor : xs) { + const auto x_dims = tensor->dims(); + if (out_dims.empty()) { + out_dims = x_dims.Vectorize(); + } + batch_size += x_dims[0]; + if (feature_size == 0) { + feature_size = x_dims.production() / x_dims[0]; + } else { + CHECK_EQ(feature_size, x_dims.production() / x_dims[0]) + << "Inputs of sequence concat must have same feature size"; + } + } + out_dims[0] = batch_size; + out->Resize(out_dims); + std::vector x_in_order; + out->set_lod(ConcatLoD(xs, &x_in_order)); + + int num = x_in_order.size(); + std::vector input_cols(num); + for (int i = 0; i < num; ++i) { + input_cols[i] = x_in_order[i].numel(); + } + float* out_data = out->mutable_data(); + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = x_in_order[j].data(); + memcpy(out_data + col_idx, input_data, sizeof(float) * col_len); + col_idx += col_len; + } +} + +#define PREPARE_INPUT_DATA(name) \ + name.Resize({name##_lod_len, feature_len}); \ + name##_cpu.Resize({name##_lod_len, feature_len}); \ + name##_ref.Resize({name##_lod_len, feature_len}); \ + name.set_lod(lod_info_##name); \ + name##_cpu.set_lod(lod_info_##name); \ + name##_ref.set_lod(lod_info_##name); \ + float* name##_cpu_data = name##_cpu.mutable_data(); \ + float* name##_ref_data = name##_ref.mutable_data(); \ + for (int i = 0; i < name##_cpu.numel(); ++i) { \ + name##_cpu_data[i] = (i - 2.0) * 1.0; \ + name##_ref_data[i] = (i - 2.0) * 1.0; \ + } \ + name.Assign(name##_cpu_data, \ + name##_cpu.dims()); + +#define PREPARE_OUTPUT_INFO(name) \ + name##_cpu.Resize({y_lod_len, feature_len}); \ + name##_ref.Resize({y_lod_len, feature_len}); \ + name.Resize({y_lod_len, feature_len}); \ + float* name##_cpu_data = name##_cpu.mutable_data(); + +} // namespace + +TEST(sequence_concat_cuda, normal) { + SequenceConcatCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::SequenceConcatParam param; + lite::Tensor x1, x2, x3, x1_cpu, x2_cpu, x3_cpu, x1_ref, x2_ref, x3_ref; + lite::Tensor y, y_cpu, y_ref; + + int32_t x1_lod_len = 10, feature_len = 4; + int32_t x2_lod_len = 4, x3_lod_len = 8; + int32_t y_lod_len = x1_lod_len + x2_lod_len + x3_lod_len; + LoD lod_info_x1{{0, 3, 5, 6, 10}}; + LoD lod_info_x2{{0, 1, 2, 3, 4}}; + LoD lod_info_x3{{0, 2, 4, 6, 8}}; + LoD lod_info_y{{0, 0, 0, 0, 0}}; + for (size_t i = 0; i < lod_info_x1[0].size(); ++i) { + lod_info_y[0][i] = + lod_info_x1[0][i] + lod_info_x2[0][i] + lod_info_x3[0][i]; + } + + PREPARE_INPUT_DATA(x1); + PREPARE_INPUT_DATA(x2); + PREPARE_INPUT_DATA(x3); + PREPARE_OUTPUT_INFO(y); + + param.X = std::vector({&x1, &x2, &x3}); + param.Out = &y; + seq_kernel.SetParam(param); + + 
cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + seq_kernel.SetContext(std::move(ctx)); + seq_kernel.Run(); + cudaDeviceSynchronize(); + + auto* y_data = y.mutable_data(TARGET(kCUDA)); + CopySync( + y_cpu_data, y_data, sizeof(float) * y.numel(), IoDirection::DtoH); + + std::vector input_ref({&x1_ref, &x2_ref, &x3_ref}); + sequence_concat_ref(input_ref, &y_ref); + float* y_ref_data = y_ref.mutable_data(); + for (int i = 0; i < y.numel(); i++) { + EXPECT_NEAR(y_cpu_data[i], y_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_pool_compute.cu b/lite/kernels/cuda/sequence_pool_compute.cu new file mode 100644 index 0000000000..97876ec32f --- /dev/null +++ b/lite/kernels/cuda/sequence_pool_compute.cu @@ -0,0 +1,258 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_pool_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void seq_pool_average_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_slice_num = static_cast(seq_offset[out_batch_id + 1] - + seq_offset[out_batch_id]); + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + src_in += in_offset + out_id; + Dtype sum = (Dtype)0; + for (int i = 0; i < in_slice_num; ++i) { + sum += src_in[i * slice_size]; + } + dst[out_batch_id * slice_size + out_id] = sum / in_slice_num; + } +} + +template +__global__ void seq_pool_sum_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_slice_num = static_cast(seq_offset[out_batch_id + 1] - + seq_offset[out_batch_id]); + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + src_in += in_offset + out_id; + Dtype sum = (Dtype)0; + for (int i = 0; i < in_slice_num; ++i) { + sum += src_in[i * slice_size]; + } + dst[out_batch_id * slice_size + out_id] = sum; + } +} + +template +__global__ void seq_pool_sqrt_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int 
in_slice_num = static_cast(seq_offset[out_batch_id + 1] - + seq_offset[out_batch_id]); + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + src_in += in_offset + out_id; + Dtype sum = (Dtype)0; + for (int i = 0; i < in_slice_num; ++i) { + sum += src_in[i * slice_size]; + } + dst[out_batch_id * slice_size + out_id] = sum * rsqrtf(in_slice_num); + } +} + +template +__global__ void seq_pool_max_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_slice_num = static_cast(seq_offset[out_batch_id + 1] - + seq_offset[out_batch_id]); + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + src_in += in_offset + out_id; + Dtype max = src_in[0]; + for (int i = 1; i < in_slice_num; ++i) { + Dtype val = src_in[i * slice_size]; + if (val > max) { + max = val; + } + } + dst[out_batch_id * slice_size + out_id] = max; + } +} + +template +__global__ void seq_pool_last_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_offset = + (static_cast(seq_offset[out_batch_id + 1]) - 1) * slice_size; + dst[tid] = src_in[in_offset + out_id]; + } +} + +template +__global__ void seq_pool_first_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + dst[tid] = src_in[in_offset + out_id]; + } +} + +void SequencePoolCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + std::vector seq_offset = param.X->lod()[0]; + int batch_size = param.X->lod()[0].size() - 1; + int slice_size = param.Out->dims().production() / batch_size; + + float* out_data = param.Out->mutable_data(TARGET(kCUDA)); + const float* in_data = param.X->data(); + + lite::Tensor seq_offset_D; + seq_offset_D.Resize({static_cast(seq_offset.size())}); + TargetWrapperCuda::MemcpyAsync( + seq_offset_D.mutable_data(TARGET(kCUDA)), + seq_offset.data(), + sizeof(uint64_t) * seq_offset.size(), + IoDirection::HtoD, + stream); + + if (param.pool_type == "MAX") { + seq_pool_max_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "AVERAGE") { + seq_pool_average_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "SUM") { + seq_pool_sum_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "SQRT") { + seq_pool_sqrt_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "FIRST") { + seq_pool_first_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "LAST") { + seq_pool_last_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else { + LOG(ERROR) << "pool type " << param.pool_type << " is not supoorted."; + } + + 
std::vector offset_new(static_cast(batch_size + 1)); + + for (int i = 0; i <= batch_size; ++i) { + offset_new[i] = i; + } + std::vector> voffset_new; + voffset_new.push_back(offset_new); + param.Out->set_lod(voffset_new); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_pool, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequencePoolCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("MaxIndex", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_pool_compute.h b/lite/kernels/cuda/sequence_pool_compute.h new file mode 100644 index 0000000000..9309454d18 --- /dev/null +++ b/lite/kernels/cuda/sequence_pool_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequencePoolCompute + : public KernelLite { + public: + using param_t = operators::SequencePoolParam; + + void Run() override; + virtual ~SequencePoolCompute() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_pool_compute_test.cc b/lite/kernels/cuda/sequence_pool_compute_test.cc new file mode 100644 index 0000000000..0f2656cd1d --- /dev/null +++ b/lite/kernels/cuda/sequence_pool_compute_test.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
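Illustrative sketch (not part of the patch): each pooling kernel above reduces one sequence (a [len, slice_size] slice of the input) to a single output row according to pool_type: MAX, AVERAGE, SUM, SQRT (sum scaled by 1/sqrt(len)), FIRST and LAST. A CPU reference for the SQRT case, which the test below exercises (column 0 sums to 396 and 396 / sqrt(10) ≈ 125.226):

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// One output row per sequence: per-column sum over the sequence, scaled by 1/sqrt(len).
std::vector<float> seq_pool_sqrt_ref(const std::vector<float>& in,
                                     const std::vector<uint64_t>& lod0,
                                     std::size_t width) {
  std::vector<float> out((lod0.size() - 1) * width, 0.0f);
  for (std::size_t s = 0; s + 1 < lod0.size(); ++s) {
    std::size_t len = lod0[s + 1] - lod0[s];
    for (uint64_t r = lod0[s]; r < lod0[s + 1]; ++r) {
      for (std::size_t c = 0; c < width; ++c) {
        out[s * width + c] += in[r * width + c];
      }
    }
    for (std::size_t c = 0; c < width; ++c) {
      out[s * width + c] /= std::sqrt(static_cast<float>(len));
    }
  }
  return out;
}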
+ +#include "lite/kernels/cuda/sequence_pool_compute.h" +#include +#include +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +TEST(sequence_pool_cuda, normal) { + SequencePoolCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + lite::Tensor x, x_cpu, out, out_cpu; + lite::LoD lod; + lod.push_back(std::vector{0, 10}); + + x.set_lod(lod); + x_cpu.set_lod(lod); + const size_t second_dim = 8u; + std::vector input_shape{static_cast(lod[0].back()), + static_cast(second_dim)}; + lite::DDim in_dims(input_shape); + x.Resize(in_dims); + x_cpu.Resize(in_dims); + + const size_t out_first_dim = lod[0].size() - 1; + std::vector output_shape{static_cast(out_first_dim), + static_cast(second_dim)}; + lite::DDim out_dims(output_shape); + out.Resize(out_dims); + out_cpu.Resize(out_dims); + + auto x_cpu_data = x_cpu.mutable_data(); + auto out_data = out.mutable_data(TARGET(kCUDA)); + auto out_cpu_data = out_cpu.mutable_data(); + + for (int64_t i = 0; i < x_cpu.dims().production(); i++) { + x_cpu_data[i] = 1.1f * i; + } + x.Assign(x_cpu_data, x_cpu.dims()); + + operators::SequencePoolParam param; + param.X = &x; + param.Out = &out; + std::vector pool_types( + {"MAX", "AVERAGE", "SUM", "SQRT", "FIRST", "LAST"}); + std::map> type_map; + type_map["MAX"] = {79.2, 80.3, 81.4, 82.5, 83.6, 84.7, 85.8, 86.9}; + type_map["AVERAGE"] = {39.6, 40.7, 41.8, 42.9, 44, 45.1, 46.2, 47.3}; + type_map["SUM"] = {396, 407, 418, 429, 440, 451, 462, 473}; + type_map["SQRT"] = { + 125.226, 128.705, 132.183, 135.662, 139.14, 142.619, 146.097, 149.576}; + type_map["FIRST"] = {0, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7}; + type_map["LAST"] = {79.2, 80.3, 81.4, 82.5, 83.6, 84.7, 85.8, 86.9}; + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + seq_kernel.SetContext(std::move(ctx)); + for (std::string pool_type : pool_types) { + param.pool_type = pool_type; + seq_kernel.SetParam(param); + + seq_kernel.Run(); + cudaDeviceSynchronize(); + + CopySync(out_cpu_data, + out_data, + sizeof(float) * out_cpu.numel(), + IoDirection::DtoH); + + std::vector ref_results = type_map[pool_type]; + + for (int i = 0; i < out_cpu.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], ref_results[i], 1e-3); + } + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_reverse_compute.cu b/lite/kernels/cuda/sequence_reverse_compute.cu new file mode 100644 index 0000000000..68447fcebb --- /dev/null +++ b/lite/kernels/cuda/sequence_reverse_compute.cu @@ -0,0 +1,130 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_reverse_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__host__ __device__ inline size_t UpperBound(const T* x, + size_t num, + const T& val) { + // The following code is from + // https://en.cppreference.com/w/cpp/algorithm/upper_bound + auto* first = x; + int64_t count = static_cast(num); + while (count > 0) { + auto step = (count >> 1); + auto* it = first + step; + if (val < *it) { + count = step; + } else { + first = ++it; + count -= (step + 1); + } + } + return static_cast(first - x); +} + +template +__global__ void SequenceReverseKernelGridIsOne( + const T* x, T* y, const int64_t* lod, size_t lod_count, int64_t row_numel) { + int64_t idx = static_cast(threadIdx.x); + auto row_idx_x = idx / row_numel; + auto lod_idx = UpperBound(lod, lod_count, row_idx_x); + auto row_idx_y = lod[lod_idx - 1] + (lod[lod_idx] - 1 - row_idx_x); + auto idx_y = row_idx_y * row_numel + idx % row_numel; + y[idx_y] = x[idx]; +} + +template +__global__ void SequenceReverseKernel(const T* x, + T* y, + const int64_t* lod, + size_t lod_count, + int64_t row_numel, + size_t limit) { + int64_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx < limit) { + auto row_idx_x = idx / row_numel; + auto lod_idx = UpperBound(lod, lod_count, row_idx_x); + auto row_idx_y = lod[lod_idx - 1] + (lod[lod_idx] - 1 - row_idx_x); + auto idx_y = row_idx_y * row_numel + idx % row_numel; + y[idx_y] = x[idx]; + } +} + +template +void SequenceReverseCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + size_t limit = static_cast(param.X->numel()); + int64_t row_numel = static_cast(limit / param.X->dims()[0]); + const auto* x_data = param.X->template data(); + auto y_data = param.Out->template mutable_data(TARGET(kCUDA)); + CHECK_NE(x_data, y_data) + << "SequenceReverse Op does not support in-place operation"; + const auto lod = param.X->lod()[param.X->lod().size() - 1]; + const size_t lod_count = lod.size(); + param.Out->set_lod(param.X->lod()); + + lod_cuda.Resize({static_cast(lod.size())}); + int64_t* lod_data = lod_cuda.mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemcpyAsync(lod_data, + lod.data(), + sizeof(int64_t) * lod.size(), + IoDirection::HtoD, + stream); + constexpr int num_threads = 1024; + int block_size = limit <= num_threads ? 
limit : num_threads; + int grid_size = (limit + num_threads - 1) / num_threads; + if (grid_size == 1) { + SequenceReverseKernelGridIsOne<<<1, block_size, 0, stream>>>( + x_data, y_data, lod_data, lod_count, row_numel); + } else { + SequenceReverseKernel<<>>( + x_data, y_data, lod_data, lod_count, row_numel, limit); + } + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +typedef paddle::lite::kernels::cuda::SequenceReverseCompute + ReverseFp32; + +typedef paddle::lite::kernels::cuda::SequenceReverseCompute + ReverseInt64; + +REGISTER_LITE_KERNEL(sequence_reverse, kCUDA, kFloat, kNCHW, ReverseFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_reverse, kCUDA, kInt64, kNCHW, ReverseInt64, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_reverse_compute.h b/lite/kernels/cuda/sequence_reverse_compute.h new file mode 100644 index 0000000000..6b6199e020 --- /dev/null +++ b/lite/kernels/cuda/sequence_reverse_compute.h @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SequenceReverseCompute : public KernelLite { + public: + using param_t = operators::SequenceReverseParam; + + void Run() override; + virtual ~SequenceReverseCompute() = default; + + private: + lite::Tensor lod_cuda; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_reverse_compute_test.cc b/lite/kernels/cuda/sequence_reverse_compute_test.cc new file mode 100644 index 0000000000..3317b52303 --- /dev/null +++ b/lite/kernels/cuda/sequence_reverse_compute_test.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/cuda/sequence_reverse_compute.h" +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +static void sequence_reverse_ref(const lite::Tensor* x, lite::Tensor* y) { + const auto* x_data = x->data(); + auto seq_offset = x->lod()[x->lod().size() - 1]; + int width = x->numel() / x->dims()[0]; + auto* y_data = y->mutable_data(); + for (int i = 0; i < static_cast(seq_offset.size()) - 1; ++i) { + auto start_pos = seq_offset[i]; + auto end_pos = seq_offset[i + 1]; + for (auto pos = start_pos; pos < end_pos; ++pos) { + auto cur_pos = end_pos - pos - 1 + start_pos; + std::memcpy(y_data + pos * width, + x_data + cur_pos * width, + width * sizeof(float)); + } + } +} + +TEST(sequence_reverse_cuda, normal) { + SequenceReverseCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::SequenceReverseParam param; + lite::Tensor x, x_cpu, x_ref; + lite::Tensor y, y_cpu, y_ref; + + int32_t lod_len = 10, feature_len = 4; + LoD lod_info{{0, 2, 4}, {0, 3, 5, 6, 10}}; + + x.Resize({lod_len, feature_len}); + x_cpu.Resize({lod_len, feature_len}); + x_ref.Resize({lod_len, feature_len}); + y.Resize({lod_len, feature_len}); + y_cpu.Resize({lod_len, feature_len}); + y_ref.Resize({lod_len, feature_len}); + x.set_lod(lod_info); + x_cpu.set_lod(lod_info); + x_ref.set_lod(lod_info); + y.set_lod(lod_info); + y_cpu.set_lod(lod_info); + y_ref.set_lod(lod_info); + + auto* y_data = y.mutable_data(TARGET(kCUDA)); + + float* x_cpu_data = x_cpu.mutable_data(); + float* x_ref_data = x_ref.mutable_data(); + float* y_cpu_data = y_cpu.mutable_data(); + float* y_ref_data = y_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = (i - 2.0) * 1.0; + x_ref_data[i] = (i - 2.0) * 1.0; + } + + x.Assign(x_cpu_data, x_cpu.dims()); + + param.X = &x; + param.Out = &y; + seq_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + seq_kernel.SetContext(std::move(ctx)); + seq_kernel.Run(); + cudaDeviceSynchronize(); + + CopySync( + y_cpu_data, y_data, sizeof(float) * y.numel(), IoDirection::DtoH); + + sequence_reverse_ref(&x_ref, &y_ref); + for (int i = 0; i < y.numel(); i++) { + EXPECT_NEAR(y_cpu_data[i], y_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_topk_avg_pooling_compute.cu b/lite/kernels/cuda/sequence_topk_avg_pooling_compute.cu new file mode 100644 index 0000000000..8ea3edb30d --- /dev/null +++ b/lite/kernels/cuda/sequence_topk_avg_pooling_compute.cu @@ -0,0 +1,209 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/sequence_topk_avg_pooling_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__global__ void topk_avg_pooling_kernel_by_row_improve( + Dtype *output_data, + const Dtype *input, + const int *gpu_input_offset_l, + const int *gpu_input_offset_r, + const int topk_size, + const int *topks, + const int feat_map_num) { + int row = + gpu_input_offset_l[blockIdx.x + 1] - gpu_input_offset_l[blockIdx.x]; // 8 + int col = gpu_input_offset_r[blockIdx.x + 1] - + gpu_input_offset_r[blockIdx.x]; // 30 + int max_k = topks[topk_size - 1]; + max_k = max_k < col ? max_k : col; + + extern __shared__ Dtype smem[]; // H*W + + const Dtype *fm_row_in_data = input; + for (int i = 0; i < blockIdx.x; ++i) { + int tmp_row = gpu_input_offset_l[i + 1] - gpu_input_offset_l[i]; + int tmp_col = gpu_input_offset_r[i + 1] - gpu_input_offset_r[i]; + fm_row_in_data += tmp_row * feat_map_num * tmp_col; + } + fm_row_in_data += blockIdx.y * row * col; + + for (int i = threadIdx.x; i < row * col; i += blockDim.x) { + smem[i] = fm_row_in_data[i]; + } + __syncthreads(); + + for (int idx = threadIdx.x; idx < row; idx += blockDim.x) { + Dtype *fm_row_out_data = + output_data + + (gpu_input_offset_l[blockIdx.x] + idx) * feat_map_num * topk_size + + blockIdx.y * topk_size; + + Dtype *smem_start_col = smem + idx * col; + + int counter = max_k; // topk_size; + Dtype last_max_val = -20000.0; + while (counter) { + Dtype max_val = -10000.0; + int max_pos = 0; + int m = 0; + for (; m < col; m++) { + Dtype cur_data = smem_start_col[m]; + if (cur_data > max_val) { + max_val = cur_data; + max_pos = m; + last_max_val = max_val; + } + } + if (max_val < -9999.0) { // == -10000.0 + max_val = last_max_val; + } + smem_start_col[max_pos] = -10000000.0; + int i = max_k - counter; + for (int c = 0; c < topk_size; c++) { + if (i <= topks[c] - 1) { + fm_row_out_data[c] += max_val; + } + } + counter--; + } + __syncthreads(); + // compute avg + for (int i = 0; i < topk_size; i++) { + fm_row_out_data[i] = fm_row_out_data[i] / topks[i]; + } + } +} + +template +void SequenceTopkAvgPoolingCompute::Run() { + auto ¶m = this->Param(); + auto &ctx = this->ctx_->template As(); + auto cuda_stream = ctx.exec_stream(); + int topk_num = param.topks.size(); + lite::DDim top_ks_shape(std::vector{topk_num, 1, 1, 1}); + _top_ks.Resize(top_ks_shape); + cudaMemcpyAsync(_top_ks.mutable_data(TARGET(kCUDA)), + ¶m.topks[0], + sizeof(int) * topk_num, + cudaMemcpyHostToDevice, + cuda_stream); + + int width_offset_len = param.COLUMN->lod()[0].size(); + lite::DDim width_offset_shape( + std::vector{width_offset_len, 1, 1, 1}); + _width_offset.Resize(width_offset_shape); + std::vector width_lod_0(width_offset_len, 0); + for (size_t i = 0; i < param.COLUMN->lod()[0].size(); ++i) { + width_lod_0[i] = static_cast(param.COLUMN->lod()[0][i]); + } + cudaMemcpyAsync(_width_offset.mutable_data(TARGET(kCUDA)), + &width_lod_0[0], + sizeof(int) * width_offset_len, + cudaMemcpyHostToDevice, + cuda_stream); + + int height_offset_len = param.ROW->lod()[0].size(); + lite::DDim height_offset_shape( + std::vector{height_offset_len, 1, 1, 1}); + _height_offset.Resize(height_offset_shape); + std::vector height_lod_0(height_offset_len, 0); + for (size_t i = 0; i < param.ROW->lod()[0].size(); ++i) { + height_lod_0[i] = static_cast(param.ROW->lod()[0][i]); + } + cudaMemcpyAsync(_height_offset.mutable_data(TARGET(kCUDA)), + &height_lod_0[0], + sizeof(int) 
* height_offset_len, + cudaMemcpyHostToDevice, + cuda_stream); + + const Tensor *x_tensor = param.X; + Tensor *out_tensor = param.Out; + const T *in_data = x_tensor->data(); + T *out_data = out_tensor->mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemsetAsync(out_tensor->mutable_data(TARGET(kCUDA)), + 0, + sizeof(T) * out_tensor->numel(), + cuda_stream); + + int num = param.ROW->lod()[0].size() - 1; + int channel = param.channel_num; + + const int *height_offset = _height_offset.data(); + const int *width_offset = _width_offset.data(); + + int feat_map_size = 0; + for (size_t i = 0; i < height_lod_0.size() - 1; ++i) { + int height = height_lod_0[i + 1] - height_lod_0[i]; + int width = width_lod_0[i + 1] - width_lod_0[i]; + if (height * width > feat_map_size) { + feat_map_size = height * width; + } + } + dim3 blocks(num, channel); + dim3 threads(32, 1); + topk_avg_pooling_kernel_by_row_improve< + T><<>>( + out_data, + in_data, + height_offset, + width_offset, + param.topks.size(), + _top_ks.data(), + param.channel_num); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + sequence_topk_avg_pooling, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequenceTopkAvgPoolingCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("ROW", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("COLUMN", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("pos", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_topk_avg_pooling_compute.h b/lite/kernels/cuda/sequence_topk_avg_pooling_compute.h new file mode 100644 index 0000000000..321ec9cfce --- /dev/null +++ b/lite/kernels/cuda/sequence_topk_avg_pooling_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
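Editor's note: the following is a minimal CPU sketch (not part of this patch) of what topk_avg_pooling_kernel_by_row_improve above computes for a single (sequence, channel) feature map of shape row x col. It assumes topks is sorted ascending, as the kernel's use of topks[topk_size - 1] implies; the function name and signature are illustrative only.

#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>

// CPU reference for one row of a (row x col) map: accumulate the rank-th
// largest value into every output slot c whose topks[c] covers that rank,
// then average by topks[c] (even when col < topks[c], matching the kernel).
void topk_avg_pool_row_ref(const float* row_data, int col,
                           const std::vector<int>& topks, float* out) {
  std::vector<float> vals(row_data, row_data + col);
  std::sort(vals.begin(), vals.end(), std::greater<float>());
  const int max_k = std::min(topks.back(), col);  // topks assumed ascending
  for (std::size_t c = 0; c < topks.size(); ++c) out[c] = 0.f;
  for (int rank = 0; rank < max_k; ++rank) {
    for (std::size_t c = 0; c < topks.size(); ++c) {
      if (rank <= topks[c] - 1) out[c] += vals[rank];
    }
  }
  for (std::size_t c = 0; c < topks.size(); ++c) out[c] /= topks[c];
}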
+ +#pragma once +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/kernel.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SequenceTopkAvgPoolingCompute + : public KernelLite { + public: + using param_t = operators::SequenceTopkAvgPoolingParam; + + void Run() override; + + virtual ~SequenceTopkAvgPoolingCompute() = default; + + protected: + lite::Tensor _height_offset; + lite::Tensor _width_offset; + lite::Tensor _top_ks; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/softmax_compute.cu b/lite/kernels/cuda/softmax_compute.cu index d8d2987524..6293f7295e 100644 --- a/lite/kernels/cuda/softmax_compute.cu +++ b/lite/kernels/cuda/softmax_compute.cu @@ -173,9 +173,10 @@ void SoftmaxCompute::Run() { cudaGetDeviceProperties(&deviceProp, device_id); size_t sharedmem_size = deviceProp.sharedMemPerBlock; int max_dimsize = sharedmem_size / sizeof(float) / threads; - auto input_data = param.x->data(); auto output_data = param.output->mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemsetSync( + output_data, 0, param.output->numel() * sizeof(float)); if (axis_size <= max_dimsize) { int use_sharemem_size = axis_size * threads * sizeof(float); sharemem_softmax_kernel<<>>( @@ -194,7 +195,7 @@ void SoftmaxCompute::Run() { auto max_data = tmax_data.mutable_data(TARGET(kCUDA)); auto sum_data = tsum_data.mutable_data(TARGET(kCUDA)); //! firstly, get maximum data - float min_data = std::numeric_limits::min(); + float min_data = std::numeric_limits::lowest(); softmax_max_kernel<<>>(total_threads, input_data, max_data, @@ -217,7 +218,7 @@ void SoftmaxCompute::Run() { total_threads, output_data, sum_data, inner_num, outer_num, axis_size); } cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); } } // namespace cuda @@ -244,3 +245,19 @@ REGISTER_LITE_KERNEL(softmax, PRECISION(kFloat), DATALAYOUT(kNCHW))}) .Finalize(); +REGISTER_LITE_KERNEL(search_seq_softmax, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SoftmaxCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out_log", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/var_conv_2d_compute.cu b/lite/kernels/cuda/var_conv_2d_compute.cu new file mode 100644 index 0000000000..f2588a8f53 --- /dev/null +++ b/lite/kernels/cuda/var_conv_2d_compute.cu @@ -0,0 +1,263 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/core/tensor.h" +#include "lite/kernels/cuda/var_conv_2d_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +const int CUDA_NUM_THREADS = 512; + +template +__global__ void var_im2col_gpu_kernel(const int n, + const Dtype* data_im, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int height_col, + const int width_col, + Dtype* data_col) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + for (int index = idx; index < n; index += blockDim.x * gridDim.x) { + const int h_index = index / width_col; + const int h_col = h_index % height_col; + const int w_col = index % width_col; + const int c_im = h_index / height_col; + const int c_col = c_im * kernel_h * kernel_w; + const int h_offset = h_col * stride_h - pad_h; + const int w_offset = w_col * stride_w - pad_w; + + Dtype* data_col_ptr = data_col; + data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; + const Dtype* data_im_ptr = data_im; + data_im_ptr += (c_im * height + h_offset) * width + w_offset; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + int h_im = h_offset + i; + int w_im = w_offset + j; + *data_col_ptr = + (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) + ? data_im_ptr[i * width + j] + : 0; + data_col_ptr += height_col * width_col; + } + } + } +} + +void VarConv2DCompute::var_im2col(const cudaStream_t& stream) { + auto& param = this->Param(); + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + // auto* in_row = param.ROW; + // auto* in_col = param.COLUMN; + const auto* input = param.X; + auto* col = param.Col; + + int batch = input->lod()[0].size() - 1; + const auto& bottom_offset = input->lod()[0]; + // 2-D lod info. 
+ // const auto& offset_x = in_col->lod()[0]; + // const auto& offset_y = in_row->lod()[0]; + const auto& offset_y = param.X->lod()[1]; + const auto& offset_x = param.X->lod()[2]; + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_x * top_im_y; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + + LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + col->Resize(col_dims_vec); + auto* top_data = col->mutable_data(TARGET(kCUDA)); + const auto* bottom_data = input->data(); + + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int width_col = (width - 1) / stride_w + 1; + int height_col = (height - 1) / stride_h + 1; + const float* data_im = bottom_data + b_offset; + float* data_col = top_data + t_offset; + + // We are going to launch channels * height_col * width_col kernels, each + // kernel responsible for copying a single-channel grid. + int num_kernels = height_col * width_col * input_channel; + const int CUDA_NUM_BLOCKS = + (num_kernels + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; + var_im2col_gpu_kernel< + float><<>>( + num_kernels, + data_im, + height, + width, + kernel_h, + kernel_w, + ((stride_h - 1) * height + kernel_h - 1) / 2, + ((stride_w - 1) * width + kernel_w - 1) / 2, + stride_h, + stride_w, + height_col, + width_col, + data_col); + } +} + +void VarConv2DCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + auto* bottom = param.X; + // auto* in_row = param.ROW; + // auto* in_col = param.COLUMN; + auto* w = param.W; + auto* top = param.Out; + auto* col = param.Col; + int output_channel = param.output_channel; + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + + var_im2col(stream); + + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + // const auto& offset_x = in_col->lod()[0]; + // const auto& offset_y = in_row->lod()[0]; + const auto& offset_y = param.X->lod()[1]; + const auto& offset_x = param.X->lod()[2]; + std::vector top_offset; + std::vector height_vector; + std::vector width_vector; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + height_vector.push_back(top_im_y); + width_vector.push_back(top_im_x); + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + 
top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); + std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + + auto* top_data = top->mutable_data(TARGET(kCUDA)); + const auto* w_data = w->data(); + const auto* col_data = col->data(); + + std::unique_ptr> gemm_impl_; + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } + float* out_data = top_data + top_offset[b]; + const float* in_data = col_data + col->lod()[0][b]; + gemm_impl_.reset(new lite::cuda::math::Gemm); + gemm_impl_->init(false, + false, + w->dims()[0], + height_vector[b] * width_vector[b], + input_channel * kernel_h * kernel_w, + &ctx); + gemm_impl_->run(1., 0., w_data, in_data, out_data, &ctx); + } + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(var_conv_2d, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::VarConv2DCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Col", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/var_conv_2d_compute.h b/lite/kernels/cuda/var_conv_2d_compute.h new file mode 100644 index 0000000000..e0b8e30c50 --- /dev/null +++ b/lite/kernels/cuda/var_conv_2d_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class VarConv2DCompute : public KernelLite { + public: + using param_t = operators::VarConv2DParam; + + void Run() override; + virtual ~VarConv2DCompute() = default; + + private: + void var_im2col(const cudaStream_t& stream); +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/var_conv_2d_compute_test.cc b/lite/kernels/cuda/var_conv_2d_compute_test.cc new file mode 100644 index 0000000000..98e9c73cdd --- /dev/null +++ b/lite/kernels/cuda/var_conv_2d_compute_test.cc @@ -0,0 +1,360 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/var_conv_2d_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +static void im2col_ref(const lite::Tensor& input, + const lite::Tensor* in_row, + const lite::Tensor* in_col, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int input_channel, + lite::Tensor* col) { + int batch = input.lod()[0].size() - 1; + const auto& bottom_offset = input.lod()[0]; + // 2-D lod info. + const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_x * top_im_y; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + col->Resize(col_dims_vec); + auto* top_data = col->mutable_data(); + const auto* bottom_data = input.data(); + + int kernel_win_size = kernel_h * kernel_w; + int half_kernel_h = kernel_h / 2; + int half_kernel_w = kernel_w / 2; + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int top_im_x = (width - 1) / stride_w + 1; + int top_im_y = (height - 1) / stride_h + 1; + int top_x = top_im_y * top_im_x; + for (int z = 0; z < input_channel; ++z) { + int row_offset = kernel_win_size * z; + int im_offset = z * width * height; + for (int y = 0; y < height; y += stride_h) { + for (int x = 0; x < width; x += stride_w) { + int col_offset = x / stride_w + y / stride_h * top_im_x; + for (int ky = 0; ky < kernel_h; ++ky) { + for (int kx = 0; kx < kernel_w; ++kx) { + int im_y = y + ky - half_kernel_h; + int im_x = x + kx - half_kernel_w; + if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) { + top_data[t_offset + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = + bottom_data[b_offset + im_offset + im_y * width + im_x]; + } else { + top_data[t_offset + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = 0; + } + } + } + } + } + } + } +} + +static void naive_sgemm(const bool transpose_A, + const bool transpose_B, + const int M, + const int N, + const int K, + const float alpha, + const float* A, // m x k (after transpose if TransA) + const int lda, // leading dimension of a + const float* B, // k x n (after transpose if TransB) + const int ldb, // leading dimension of b + const float beta, + float* C, // m x n + const int ldc) { + for (int m = 0; m < M; ++m) { + for (int k = 0; k < K; ++k) { + for (int n = 0; n < N; ++n) { + C[m * N + n] += beta * C[m * N + n]; + size_t A_idx = 0, B_idx = 0; + if (transpose_A) { + A_idx = k * M + m; // A is k x m + } else { + A_idx = m * K + k; // A is m x k + } + + if 
(transpose_B) { + B_idx = n * K + k; // B is n x k + } else { + B_idx = k * N + n; // B is k x n + } + + C[m * N + n] += alpha * A[A_idx] * B[B_idx]; + } + } + } +} + +static void var_conv_2d_ref(const lite::Tensor* bottom, + const lite::Tensor* w, + const lite::Tensor* in_row, + const lite::Tensor* in_col, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int input_channel, + const int output_channel, + lite::Tensor* top, + lite::Tensor* col) { + im2col_ref(*bottom, + in_row, + in_col, + kernel_h, + kernel_w, + stride_h, + stride_w, + input_channel, + col); + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); + std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + auto* top_data = top->mutable_data(); + const auto* w_data = w->data(); + const auto* col_data = col->data(); + + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } + + naive_sgemm(false, + false, + output_channel, + top_im_size, + input_channel * kernel_h * kernel_w, + 1.0, + w_data, + input_channel * kernel_h * kernel_w, + col_data + col_offset[b], + top_im_size, + 0.0, + top_data + top_offset[b], + top_im_size); + } +} + +TEST(var_conv_2d_cuda, normal) { + VarConv2DCompute var_conv_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::VarConv2DParam param; + + lite::Tensor X, W, ROW, COLUMN; + lite::Tensor x_cpu, w_cpu; + lite::Tensor Out, Col, out_cpu, col_cpu; + int kernel_h = 5, kernel_w = 5; + int stride_h = 1, stride_w = 1; + int input_channel = 5, output_channel = 5; + + std::vector w_dims_vec; + w_dims_vec.push_back(output_channel); + w_dims_vec.push_back(input_channel * kernel_h * kernel_w); + W.Resize(w_dims_vec); + w_cpu.Resize(w_dims_vec); + auto* w_cpu_data = w_cpu.mutable_data(); + for (int i = 0; i < W.numel(); ++i) { + w_cpu_data[i] = i - 1.f; + } + + std::vector row_lod_vec{0, 10, 20}; + LoD row_lod; + row_lod.push_back(row_lod_vec); + ROW.set_lod(row_lod); + + std::vector column_lod_vec{0, 10, 20}; + LoD column_lod; + column_lod.push_back(column_lod_vec); + COLUMN.set_lod(column_lod); + + int x_size = 0; + std::vector x_lod_vec; + x_lod_vec.push_back(0); + for (size_t i = 0; i < row_lod_vec.size() - 1; ++i) { + int height = row_lod_vec[i + 1] - row_lod_vec[i]; + int width = column_lod_vec[i + 1] - column_lod_vec[i]; + x_lod_vec.push_back(x_lod_vec.back() + height * width); + x_size += height * width; + } + for (size_t i = 0; i < x_lod_vec.size(); ++i) { + x_lod_vec[i] *= input_channel; + } + x_size *= input_channel; + std::vector x_dims_vec{x_size, 1}; + LoD x_lod; + x_lod.push_back(x_lod_vec); + x_lod.push_back(row_lod_vec); + 
x_lod.push_back(column_lod_vec); + X.Resize(x_dims_vec); + x_cpu.Resize(x_dims_vec); + X.set_lod(x_lod); + x_cpu.set_lod(x_lod); + auto* x_cpu_data = x_cpu.mutable_data(); + for (int i = 0; i < X.numel(); ++i) { + x_cpu_data[i] = i % 20 * 1.f; + } + + int sum_num = 0; + int out_sum_num = 0; + for (size_t i = 0; i < row_lod_vec.size() - 1; ++i) { + int height = row_lod_vec[i + 1] - row_lod_vec[i]; + int width = column_lod_vec[i + 1] - column_lod_vec[i]; + sum_num += height * width * input_channel * kernel_h * kernel_w; + out_sum_num += height * width * output_channel; + } + col_cpu.Resize({sum_num, 1}); + out_cpu.Resize({out_sum_num, 1}); + float* out_cpu_data = out_cpu.mutable_data(); + float* col_cpu_data = col_cpu.mutable_data(); + + X.Assign(x_cpu_data, x_cpu.dims()); + W.Assign(w_cpu_data, w_cpu.dims()); + + param.X = &X; + param.W = &W; + // param.ROW = &ROW; + // param.COLUMN = &COLUMN; + param.Out = &Out; + param.Col = &Col; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.kernel_h = kernel_h; + param.kernel_w = kernel_w; + param.input_channel = input_channel; + param.output_channel = output_channel; + var_conv_kernel.SetParam(param); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + var_conv_kernel.SetContext(std::move(ctx)); + var_conv_kernel.Run(); + cudaDeviceSynchronize(); + + const float* out_data = Out.data(); + const float* col_data = Col.data(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * Out.numel(), IoDirection::DtoH); + CopySync( + col_cpu_data, col_data, sizeof(float) * Col.numel(), IoDirection::DtoH); + + lite::Tensor top_ref, col_ref; + var_conv_2d_ref(&x_cpu, + &w_cpu, + &ROW, + &COLUMN, + kernel_h, + kernel_w, + stride_h, + stride_w, + input_channel, + output_channel, + &top_ref, + &col_ref); + + for (int i = 0; i < Out.numel(); ++i) { + EXPECT_NEAR(out_cpu_data[i], top_ref.data()[i], 1e-5); + } + for (int i = 0; i < Col.numel(); ++i) { + EXPECT_NEAR(col_cpu_data[i], col_ref.data()[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/fpga/conv_compute.cc b/lite/kernels/fpga/conv_compute.cc index 3e06e103bb..8bc171dd67 100644 --- a/lite/kernels/fpga/conv_compute.cc +++ b/lite/kernels/fpga/conv_compute.cc @@ -36,8 +36,15 @@ void ConvCompute::PrepareForRun() { conv_param.filter = param.filter->ZynqTensor(); conv_param.groups = param.groups; conv_param.strides = param.strides; + auto paddings = *param.paddings; conv_param.paddings = param.paddings; conv_param.dilations = param.dilations; + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (!pad_equal) { + LOG(FATA) << "This pad not support ! 
" << paddings[0] << ", " << paddings[1] + << ", " << paddings[2] << ", " << paddings[3]; + } fill_scale_bias_const(&conv_param); conv_param.bias()->copyFrom(param.bias->ZynqTensor()); conv_param.relu.enabled = param.fuse_relu; diff --git a/lite/kernels/fpga/conv_compute_test.cc b/lite/kernels/fpga/conv_compute_test.cc index f166974cc9..1e05c1fa0c 100644 --- a/lite/kernels/fpga/conv_compute_test.cc +++ b/lite/kernels/fpga/conv_compute_test.cc @@ -141,13 +141,15 @@ void conv_compute_ref(const operators::ConvParam& param) { int group = param.groups; int kernel_w = param.filter->dims()[2]; int kernel_h = param.filter->dims()[3]; + + auto paddings = *param.paddings; + auto dilations = *para.dilations; int stride_w = param.strides[0]; int stride_h = param.strides[1]; - int dila_w = param.dilations[0]; - int dila_h = param.dilations[1]; - - int pad_w = param.paddings[0]; - int pad_h = param.paddings[1]; + int dila_w = dilations[0]; + int dila_h = dilations[1]; + int pad_w = paddings[2]; + int pad_h = paddings[0]; bool flag_bias = (param.bias != nullptr); bool flag_relu = param.fuse_relu; @@ -277,10 +279,14 @@ TEST(conv_fpga, compute) { param.bias = &bias; } param.fuse_relu = flag_relu; - param.paddings = std::vector({padding, padding}); + std::vector paddings = { + padding, padding, padding, padding}; param.strides = std::vector({stride, stride}); + std::vector dilations = {dilation, dilation}; + param.paddings = + std::make_shared>(paddings); param.dilations = - std::vector({dilation, dilation}); + std::make_shared>(dilations); param.groups = group; conv.SetParam(param); conv.Launch(); diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt index 032de81974..79d1bf2fd5 100644 --- a/lite/kernels/npu/bridges/CMakeLists.txt +++ b/lite/kernels/npu/bridges/CMakeLists.txt @@ -19,6 +19,9 @@ lite_cc_library(npu_bridge_split_op SRCS split_op.cc DEPS ${npu_bridge_deps}) lite_cc_library(npu_bridge_concat_op SRCS concat_op.cc DEPS ${npu_bridge_deps}) lite_cc_library(npu_bridge_shuffle_channel_op SRCS shuffle_channel_op.cc DEPS ${npu_bridge_deps}) lite_cc_library(npu_bridge_pad2d_op SRCS pad2d_op.cc DEPS ${npu_bridge_deps}) +lite_cc_library(npu_bridge_square_op SRCS square_op.cc DEPS ${npu_bridge_deps}) +lite_cc_library(npu_bridge_sqrt_op SRCS sqrt_op.cc DEPS ${npu_bridge_deps}) +lite_cc_library(npu_bridge_reduce_mean_op SRCS reduce_mean_op.cc DEPS ${npu_bridge_deps}) set(npu_bridges npu_bridge_registry @@ -39,6 +42,9 @@ set(npu_bridges npu_bridge_concat_op npu_bridge_shuffle_channel_op npu_bridge_pad2d_op + npu_bridge_square_op + npu_bridge_sqrt_op + npu_bridge_reduce_mean_op CACHE INTERNAL "npu_bridges") set(npu_bridge_test_deps ${npu_bridges} ${npu_kernels} ${ops}) @@ -60,5 +66,8 @@ lite_cc_test(test_npu_bridge_split_op SRCS split_op_test.cc test_helper.cc DEPS lite_cc_test(test_npu_bridge_concat_op SRCS concat_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) lite_cc_test(test_npu_bridge_shuffle_channel_op SRCS shuffle_channel_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) lite_cc_test(test_npu_bridge_pad2d_op SRCS pad2d_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) +lite_cc_test(test_npu_bridge_square_op SRCS square_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) +lite_cc_test(test_npu_bridge_sqrt_op SRCS sqrt_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) +lite_cc_test(test_npu_bridge_reduce_mean_op SRCS reduce_mean_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) message(STATUS "+++++ npu_bridges: ${npu_bridges}") 
diff --git a/lite/kernels/npu/bridges/act_op.cc b/lite/kernels/npu/bridges/act_op.cc index 51b49091cd..ac62891113 100644 --- a/lite/kernels/npu/bridges/act_op.cc +++ b/lite/kernels/npu/bridges/act_op.cc @@ -41,6 +41,19 @@ node_map_type ActConverter(const std::shared_ptr act_op, // clipped_relu etc. act_node->set_attr_mode(lite::npu::CvtActMode(op_type)); + if (op_type == "relu_clipped") { + auto Relu_clipped_coef = op_info->GetAttr("Relu_clipped_coef"); + act_node->set_attr_coef(Relu_clipped_coef); + } else if (op_type == "leaky_relu") { + auto alpha = op_info->GetAttr("alpha"); + act_node->set_attr_negative_slope(alpha); + } else if (op_type == "hard_sigmoid") { + auto slope = op_info->GetAttr("slope"); + auto offset = op_info->GetAttr("offset"); + act_node->set_attr_negative_slope(slope); + act_node->set_attr_coef(offset); + } + node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = act_node; return outputs_map; @@ -52,14 +65,18 @@ node_map_type ActConverter(const std::shared_ptr act_op, } // namespace lite } // namespace paddle -REGISTER_NPU_BRIDGE(sigmod, paddle::lite::kernels::npu::bridges::ActConverter); +REGISTER_NPU_BRIDGE(sigmoid, paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(relu, paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(tanh, paddle::lite::kernels::npu::bridges::ActConverter); -REGISTER_NPU_BRIDGE(elu, paddle::lite::kernels::npu::bridges::ActConverter); +REGISTER_NPU_BRIDGE(relu_clipped, + paddle::lite::kernels::npu::bridges::ActConverter); +// REGISTER_NPU_BRIDGE(elu, paddle::lite::kernels::npu::bridges::ActConverter); +REGISTER_NPU_BRIDGE(leaky_relu, + paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(abs, paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(softsign, paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(softplus, paddle::lite::kernels::npu::bridges::ActConverter); -REGISTER_NPU_BRIDGE(hardsigmoid, +REGISTER_NPU_BRIDGE(hard_sigmoid, paddle::lite::kernels::npu::bridges::ActConverter); diff --git a/lite/kernels/npu/bridges/act_op_test.cc b/lite/kernels/npu/bridges/act_op_test.cc index 420de655dc..d50b1968b1 100644 --- a/lite/kernels/npu/bridges/act_op_test.cc +++ b/lite/kernels/npu/bridges/act_op_test.cc @@ -17,7 +17,7 @@ #include "lite/core/op_registry.h" #include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/test_helper.h" -#include "lite/operators/relu_op.h" +#include "lite/operators/activation_ops.h" namespace paddle { namespace lite { @@ -25,69 +25,112 @@ namespace kernels { namespace npu { namespace bridges { -void relu_ref(const std::shared_ptr op) { +void act_ref(const std::shared_ptr op) { Scope* scope = op->scope(); const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto op_type = op_info->Type(); + auto x = scope->FindTensor("x"); + auto out = scope->FindMutableTensor("out_ref"); + out->Resize(x->dims()); auto x_data = x->data(); auto out_data = out->mutable_data(); - DDim x_dims = x->dims(); - DDim out_dims = out->dims(); - CHECK_EQ(x_dims.production(), out_dims.production()); - for (int i = 0; i < out_dims.production(); i++) { - out_data[i] = std::max(0.f, x_data[i]); + CHECK_EQ(x->numel(), out->numel()); + + // "sigmoid","relu","tanh","relu_clipped","leaky_relu","softsign","hard_sigmoid" + if (op_type == "sigmoid") { + for (size_t i = 0; i < out->numel(); 
i++) { + out_data[i] = 1.f / (1.f + std::exp(-x_data[i])); + } + } else if (op_type == "relu") { + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = std::max(0.f, x_data[i]); + } + } else if (op_type == "tanh") { + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = (std::exp(x_data[i]) - std::exp(-x_data[i])) / + (std::exp(x_data[i]) + std::exp(-x_data[i])); + } + } else if (op_type == "relu_clipped") { + auto relu_clipped_coef = op_info->GetAttr("Relu_clipped_coef"); + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = std::min(std::max(0.f, x_data[i]), relu_clipped_coef); + } + } else if (op_type == "leaky_relu") { + auto alpha = op_info->GetAttr("alpha"); + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = std::max(x_data[i], x_data[i] * alpha); + } + } else if (op_type == "softsign") { + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = x_data[i] / (1 + std::abs(x_data[i])); + } + } else if (op_type == "hard_sigmoid") { + auto slope = op_info->GetAttr("slope"); + auto offset = op_info->GetAttr("offset"); + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = std::min(1.f, slope * x_data[i] + offset); + out_data[i] = std::max(0.f, out_data[i]); + } + } else { + LOG(FATAL) << "unsupported activation type: " << op_type; } } -void test_relu(int bs, int ic, int ih, int iw) { +void test_act(std::vector x_shape, std::string op_type) { // prepare input&output variables Scope scope; std::string x_var_name("x"); std::string out_var_name("out"); std::string out_ref_var_name("out_ref"); - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + auto* out_ref = scope.NewTensor(out_ref_var_name); + x->Resize(x_shape); // initialize input&output data - FillTensor(x); + FillTensor(x, -8, 8); // initialize op desc cpp::OpDesc opdesc; - opdesc.SetType("relu"); + opdesc.SetType(op_type); opdesc.SetInput("X", {x_var_name}); opdesc.SetOutput("Out", {out_var_name}); + if (op_type == "relu_clipped") { + opdesc.SetAttr("Relu_clipped_coef", 6.f); + } else if (op_type == "leaky_relu") { + opdesc.SetAttr("alpha", 0.02f); + } else if (op_type == "hard_sigmoid") { + opdesc.SetAttr("slope", 0.2f); + opdesc.SetAttr("offset", 0.5f); + } // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); + auto op = CreateOp(opdesc, &scope); LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); // execute reference implementation and save to output tensor - relu_ref(op); + act_ref(op); // compare results auto* out_data = out->mutable_data(); auto* out_ref_data = out_ref->mutable_data(); for (int i = 0; i < out->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); } } -TEST(NPUBridges, relu) { - for (auto bs : {1, 3}) { - for (auto ic : {3, 4}) { - for (auto ih : {2, 5}) { - for (auto iw : {5, 9}) { - VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih - << " iw: " << iw; - test_relu(bs, ic, ih, iw); - } - } +TEST(NPUBridges, activation) { + std::vector> shapes{{1}, {2, 3}, {1, 2, 3, 4}}; + std::vector types{"sigmoid", + "relu", + "tanh", + "relu_clipped", + "leaky_relu", + "softsign", + "hard_sigmoid"}; + for (auto x_shape : shapes) { + for (auto op_type : types) { + test_act(x_shape, op_type); } } } 
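Editor's note: a minimal sketch (not part of this patch) of the closed-form references that act_ref and test_act above exercise, written as standalone functions. The default attribute values mirror the ones the test sets (Relu_clipped_coef = 6, alpha = 0.02, slope = 0.2, offset = 0.5) and are shown for illustration only.

#include <algorithm>

// relu_clipped: clamp the ReLU output at coef.
inline float relu_clipped_ref(float x, float coef = 6.f) {
  return std::min(std::max(0.f, x), coef);
}
// leaky_relu: pass positives through, scale negatives by alpha.
inline float leaky_relu_ref(float x, float alpha = 0.02f) {
  return std::max(x, x * alpha);
}
// hard_sigmoid: linear ramp slope * x + offset, clipped to [0, 1].
inline float hard_sigmoid_ref(float x, float slope = 0.2f, float offset = 0.5f) {
  return std::max(0.f, std::min(1.f, slope * x + offset));
}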
@@ -98,5 +141,20 @@ TEST(NPUBridges, relu) { } // namespace lite } // namespace paddle +USE_LITE_OP(sigmoid); +USE_NPU_BRIDGE(sigmoid); USE_LITE_OP(relu); USE_NPU_BRIDGE(relu); +USE_LITE_OP(tanh); +USE_NPU_BRIDGE(tanh); +USE_LITE_OP(relu_clipped); +USE_NPU_BRIDGE(relu_clipped); + +USE_LITE_OP(leaky_relu); +USE_NPU_BRIDGE(leaky_relu); + +USE_LITE_OP(softsign); +USE_NPU_BRIDGE(softsign); + +USE_LITE_OP(hard_sigmoid); +USE_NPU_BRIDGE(hard_sigmoid); diff --git a/lite/kernels/npu/bridges/batch_norm_op.cc b/lite/kernels/npu/bridges/batch_norm_op.cc index 6f5f00959b..8c3153d242 100644 --- a/lite/kernels/npu/bridges/batch_norm_op.cc +++ b/lite/kernels/npu/bridges/batch_norm_op.cc @@ -30,8 +30,8 @@ node_map_type BatchNormConverter( auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "[NPU] Converting " + op_type + "..."; - std::shared_ptr batch_norm_node = - std::make_shared(unique_op_type); + std::shared_ptr batch_norm_node = + std::make_shared(unique_op_type); auto x_var_name = op_info->Input("X").front(); auto scale_var_name = op_info->Input("Scale").front(); @@ -66,7 +66,7 @@ node_map_type BatchNormConverter( batch_norm_node->set_input_x(*inputs_map.at(x_var_name)); batch_norm_node->set_input_scale(*npu_scale); - batch_norm_node->set_input_b(*npu_bias); + batch_norm_node->set_input_offset(*npu_bias); batch_norm_node->set_input_mean(*npu_mean); batch_norm_node->set_input_variance(*npu_variance); batch_norm_node->set_attr_momentum(npu_momentum); diff --git a/lite/kernels/npu/bridges/conv_op.cc b/lite/kernels/npu/bridges/conv_op.cc index 32f4d511d5..8dc9ab1f0f 100644 --- a/lite/kernels/npu/bridges/conv_op.cc +++ b/lite/kernels/npu/bridges/conv_op.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "lite/operators/conv_op.h" #include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" @@ -42,9 +43,9 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, auto bs = input_dims[0]; auto ic = input_dims[1]; auto oc = filter_dims[0]; - CHECK_EQ(input_dims.size(), 4); - CHECK_EQ(output_dims.size(), 4); - CHECK_EQ(filter_dims.size(), 4); + CHECK_EQ(input_dims.size(), 4L); + CHECK_EQ(output_dims.size(), 4L); + CHECK_EQ(filter_dims.size(), 4L); CHECK_EQ(output_dims[0], bs); CHECK_EQ(output_dims[1], oc); auto strides = op_info->GetAttr>("strides"); @@ -52,9 +53,28 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, auto groups = op_info->GetAttr("groups"); auto dilations = op_info->GetAttr>("dilations"); auto fuse_relu = op_info->GetAttr("fuse_relu"); - CHECK_EQ(strides.size(), 2); - CHECK_EQ(paddings.size(), 2); - CHECK_EQ(dilations.size(), 2); + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "Paddings size should be the same or twice as the input size."; + + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); // check depthwise mode, and decide whether use ConvolutionDepthwise Op bool use_depthwise_conv = @@ -134,7 +154,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, depthwise_conv_node->set_attr_pad_mode(5); // VALID depthwise_conv_node->set_attr_group(groups); depthwise_conv_node->set_attr_pad(ge::AttrValue::LIST_INT( - {paddings[0], paddings[0], paddings[1], paddings[1]})); + {paddings[0], paddings[1], paddings[2], paddings[3]})); depthwise_conv_node->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); depthwise_conv_node->set_attr_stride( @@ -161,7 +181,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, common_conv_node->set_attr_pad_mode(0); // NOTSET common_conv_node->set_attr_group(groups); common_conv_node->set_attr_pad(ge::AttrValue::LIST_INT( - {paddings[0], paddings[0], paddings[1], paddings[1]})); + {paddings[0], paddings[0], paddings[2], paddings[2]})); common_conv_node->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); common_conv_node->set_attr_stride( diff --git a/lite/kernels/npu/bridges/conv_op_test.cc b/lite/kernels/npu/bridges/conv_op_test.cc index 26309aa9e2..909061d2ba 100644 --- a/lite/kernels/npu/bridges/conv_op_test.cc +++ b/lite/kernels/npu/bridges/conv_op_test.cc @@ -54,7 +54,7 @@ void conv_ref(const std::shared_ptr op) { int stride_h = strides[0]; int dila_w = dilations[1]; int dila_h = dilations[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int pad_h = paddings[0]; int batch_size = input_dims[0]; int in_ch_size = input_dims[1]; @@ -175,7 +175,8 @@ void test_conv(int bs, opdesc.SetOutput("Output", {output_var_name}); opdesc.SetAttr("dilations", std::vector({dilation, dilation})); opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); opdesc.SetAttr("groups", groups); opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); if 
(has_bias) { diff --git a/lite/kernels/npu/bridges/conv_transpose_op.cc b/lite/kernels/npu/bridges/conv_transpose_op.cc index 5ae99ef046..6eff4cb2d2 100644 --- a/lite/kernels/npu/bridges/conv_transpose_op.cc +++ b/lite/kernels/npu/bridges/conv_transpose_op.cc @@ -44,9 +44,17 @@ node_map_type ConvTransposeConverter( auto groups = op_info->GetAttr("groups"); auto dilations = op_info->GetAttr>("dilations"); auto fuse_relu = op_info->GetAttr("fuse_relu"); - CHECK_EQ(strides.size(), 2); - CHECK_EQ(paddings.size(), 2); - CHECK_EQ(dilations.size(), 2); + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "Paddings size should be the same or twice as the input size."; // create deconv node auto conv_transpose_node = @@ -82,12 +90,11 @@ node_map_type ConvTransposeConverter( lite::npu::OpList::Global().add(inputs_map.at(input_var_name)); // set attributes - conv_transpose_node->set_attr_mode(1); conv_transpose_node->set_attr_format(0); // NCHW conv_transpose_node->set_attr_pad_mode(0); // NOTSET conv_transpose_node->set_attr_group(groups); conv_transpose_node->set_attr_pad(ge::AttrValue::LIST_INT( - {paddings[0], paddings[0], paddings[1], paddings[1]})); + {paddings[0], paddings[1], paddings[2], paddings[3]})); conv_transpose_node->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); conv_transpose_node->set_attr_stride( diff --git a/lite/kernels/npu/bridges/conv_transpose_op_test.cc b/lite/kernels/npu/bridges/conv_transpose_op_test.cc index a009ef588e..f96e57c06f 100644 --- a/lite/kernels/npu/bridges/conv_transpose_op_test.cc +++ b/lite/kernels/npu/bridges/conv_transpose_op_test.cc @@ -278,7 +278,8 @@ void test_conv_transpose(int bs, opdesc.SetOutput("Output", {output_var_name}); opdesc.SetAttr("dilations", std::vector({dilation, dilation})); opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); opdesc.SetAttr("groups", groups); opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); if (has_bias) { diff --git a/lite/kernels/npu/bridges/elementwise_ops.cc b/lite/kernels/npu/bridges/elementwise_ops.cc index 2ec757ab14..5eb5f4e271 100644 --- a/lite/kernels/npu/bridges/elementwise_ops.cc +++ b/lite/kernels/npu/bridges/elementwise_ops.cc @@ -21,6 +21,30 @@ namespace kernels { namespace npu { namespace bridges { +std::vector CvtYShape(const Tensor& x, Tensor* y, int axis) { + auto x_dims = x.dims(); + CHECK_EQ(x_dims.size(), 4UL) << "[NPU] only support 4-dimension x"; + auto y_dims = y->dims(); + CHECK_GE(x_dims.size(), y_dims.size()); + + if (axis < 0) { + axis += x_dims.size(); + } + + std::vector y_new_shape(y_dims.Vectorize()); + if (y_new_shape.size() == 4UL) { + return y_new_shape; + } + for (int i = 0; i < axis; i++) { + y_new_shape.insert(y_new_shape.begin(), 1); + } + while (y_new_shape.size() < 4) { + y_new_shape.push_back(1); + } + CHECK_EQ(y_new_shape.size(), 4UL); + return y_new_shape; +} + node_map_type ElementwiseConverter( const std::shared_ptr elementwise_op, const node_map_type& inputs_map) { @@ -30,34 +54,53 @@ node_map_type ElementwiseConverter( auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "[NPU] Converting " + op_type + "..."; - std::shared_ptr elementwise_node = - 
std::make_shared(unique_op_type); - auto x_var_name = op_info->Input("X").front(); auto y_var_name = op_info->Input("Y").front(); - - CHECK_EQ(op_info->GetAttr("axis"), -1) - << "[NPU] elementwise only support inputs with same size"; - CHECK(inputs_map.find(x_var_name) != inputs_map.end()); - elementwise_node->set_input_x1(*inputs_map.at(x_var_name)); - lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + auto axis = op_info->GetAttr("axis"); + std::shared_ptr elementwise_node = nullptr; + std::shared_ptr x_node = inputs_map.at(x_var_name); + std::shared_ptr y_node = nullptr; if (inputs_map.find(y_var_name) != inputs_map.end()) { - elementwise_node->set_input_x2(*inputs_map.at(y_var_name)); - lite::npu::OpList::Global().add(inputs_map.at(y_var_name)); + y_node = inputs_map.at(y_var_name); } else { auto y_const_node = std::make_shared(y_var_name); - auto* y = scope->FindVar(y_var_name)->GetMutable(); - y_const_node->set_attr_value(lite::npu::CvtTensor(y)); - elementwise_node->set_input_x2(*y_const_node); - lite::npu::OpList::Global().add(y_const_node); + auto x = scope->FindTensor(x_var_name); + auto y = scope->FindMutableTensor(y_var_name); + auto y_new_shape = CvtYShape(*x, y, axis); + y_const_node->set_attr_value(lite::npu::CvtTensor(y, y_new_shape)); + y_node = y_const_node; } + lite::npu::OpList::Global().add(x_node); + lite::npu::OpList::Global().add(y_node); - lite::npu::OpList::Global().add(elementwise_node); + if (op_type == "elementwise_add" || + op_type == "fusion_elementwise_add_activation") { + auto elt_node = std::make_shared(unique_op_type); + elt_node->set_input_x1(*x_node); + elt_node->set_input_x2(*y_node); + elementwise_node = elt_node; + } else if (op_type == "elementwise_sub") { + auto elt_node = std::make_shared(unique_op_type); + elt_node->set_input_x1(*x_node); + elt_node->set_input_x2(*y_node); + elementwise_node = elt_node; + } else if (op_type == "elementwise_mul") { + auto elt_node = std::make_shared(unique_op_type); + elt_node->set_input_x(*x_node); + elt_node->set_input_y(*y_node); + elementwise_node = elt_node; + } else if (op_type == "elementwise_div") { + auto elt_node = std::make_shared(unique_op_type); + elt_node->set_input_x1(*x_node); + elt_node->set_input_x2(*y_node); + elementwise_node = elt_node; + } else { + LOG(FATAL) << "unsupported op type: " << op_type; + } - // paddlelite has sum only - elementwise_node->set_attr_mode(1); + lite::npu::OpList::Global().add(elementwise_node); node_map_type outputs_map; if (op_type == "fusion_elementwise_add_activation") { @@ -86,3 +129,9 @@ REGISTER_NPU_BRIDGE(elementwise_add, paddle::lite::kernels::npu::bridges::ElementwiseConverter); REGISTER_NPU_BRIDGE(fusion_elementwise_add_activation, paddle::lite::kernels::npu::bridges::ElementwiseConverter); +REGISTER_NPU_BRIDGE(elementwise_sub, + paddle::lite::kernels::npu::bridges::ElementwiseConverter); +REGISTER_NPU_BRIDGE(elementwise_mul, + paddle::lite::kernels::npu::bridges::ElementwiseConverter); +REGISTER_NPU_BRIDGE(elementwise_div, + paddle::lite::kernels::npu::bridges::ElementwiseConverter); diff --git a/lite/kernels/npu/bridges/elementwise_ops_test.cc b/lite/kernels/npu/bridges/elementwise_ops_test.cc index 0e2fc9f262..8dd4c851ca 100644 --- a/lite/kernels/npu/bridges/elementwise_ops_test.cc +++ b/lite/kernels/npu/bridges/elementwise_ops_test.cc @@ -29,37 +29,28 @@ template void elementwise_add_ref(const std::shared_ptr op) { Scope* scope = op->scope(); const OpInfo* op_info = op->op_info(); - auto x = 
scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto y = scope->FindVar(op_info->Input("Y").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto x = scope->FindTensor("x"); + auto y = scope->FindTensor("y"); + auto out = scope->FindMutableTensor("out_ref"); + out->Resize(x->dims()); auto x_data = x->data(); auto y_data = y->data(); - dtype* out_data = out->mutable_data(); + auto out_data = out->mutable_data(); auto x_dims = x->dims(); auto y_dims = y->dims(); int axis = op_info->GetAttr("axis"); if (axis < 0) { - axis = x_dims.size() - y_dims.size(); + axis += x_dims.size(); } int batch = 1; - int channels = 1; - int num = 1; - for (int i = 0; i < axis; ++i) { - batch *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - channels *= y_dims[i]; - } - for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { - num *= x_dims[i]; - } + int channels = y->numel(); + int num = x->numel() / channels / batch; // do elementwise add/sub/max... - std::string elt_type = "add"; - if (elt_type == "add") { + std::string op_type = op_info->Type(); + if (op_type == "elementwise_add") { for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { int offset = (i * channels + j) * num; @@ -73,7 +64,7 @@ void elementwise_add_ref(const std::shared_ptr op) { } } } - } else if (elt_type == "sub") { + } else if (op_type == "elementwise_sub") { for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { int offset = (i * channels + j) * num; @@ -87,7 +78,7 @@ void elementwise_add_ref(const std::shared_ptr op) { } } } - } else if (elt_type == "mul") { + } else if (op_type == "elementwise_mul") { for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { int offset = (i * channels + j) * num; @@ -101,7 +92,21 @@ void elementwise_add_ref(const std::shared_ptr op) { } } } - } else if (elt_type == "max") { + } else if (op_type == "elementwise_div") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr / diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (op_type == "elementwise_max") { for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { int offset = (i * channels + j) * num; @@ -116,11 +121,14 @@ void elementwise_add_ref(const std::shared_ptr op) { } } } else { - LOG(FATAL) << "unsupported Elementwise type: " << elt_type; + LOG(FATAL) << "unsupported Elementwise type: " << op_type; } } -void test_elementwise_add(int bs, int ic, int ih, int iw, int axis) { +void test_elementwise_add(const std::vector& x_shape, + const std::vector& y_shape, + int axis, + std::string elt_type) { // prepare input&output variables Scope scope; std::string x_var_name = "x"; @@ -131,16 +139,16 @@ void test_elementwise_add(int bs, int ic, int ih, int iw, int axis) { auto* y = scope.Var(y_var_name)->GetMutable(); auto* out = scope.Var(out_var_name)->GetMutable(); auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - y->Resize({bs, ic, ih, iw}); + x->Resize(x_shape); + y->Resize(y_shape); // initialize input&output data - FillTensor(x); - FillTensor(y); + FillTensor(x, 1, 3); + FillTensor(y, 1, 3); // initialize op desc cpp::OpDesc opdesc; - opdesc.SetType("elementwise_add"); + opdesc.SetType("elementwise_" + elt_type); 
opdesc.SetInput("X", {x_var_name}); opdesc.SetInput("Y", {y_var_name}); opdesc.SetOutput("Out", {out_var_name}); @@ -149,7 +157,6 @@ void test_elementwise_add(int bs, int ic, int ih, int iw, int axis) { // create and convert op to NPU model, then run it on NPU auto op = CreateOp(opdesc, &scope); LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); // execute reference implementation and save to output tensor elementwise_add_ref(op); @@ -158,19 +165,15 @@ void test_elementwise_add(int bs, int ic, int ih, int iw, int axis) { auto* out_data = out->mutable_data(); auto* out_ref_data = out_ref->mutable_data(); for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-1); + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); } } TEST(NPUBridges, elementwise_add) { - for (auto bs : {1, 4, 7}) { - for (auto ic : {1, 4, 7}) { - for (auto ih : {1, 4, 7}) { - for (auto iw : {1, 4, 7}) { - for (auto axis : {-1}) test_elementwise_add(bs, ic, ih, iw, axis); - } - } - } + for (auto elt_type : {"add", "sub", "mul", "div"}) { + test_elementwise_add({1, 2, 3, 4}, {2}, 1, elt_type); + test_elementwise_add({1, 2, 3, 4}, {1, 2, 1, 1}, 1, elt_type); + test_elementwise_add({1, 2, 3, 4}, {1, 2, 3, 4}, 3, elt_type); } } @@ -182,3 +185,9 @@ TEST(NPUBridges, elementwise_add) { USE_LITE_OP(elementwise_add); USE_NPU_BRIDGE(elementwise_add); +USE_LITE_OP(elementwise_sub); +USE_NPU_BRIDGE(elementwise_sub); +USE_LITE_OP(elementwise_mul); +USE_NPU_BRIDGE(elementwise_mul); +USE_LITE_OP(elementwise_div); +USE_NPU_BRIDGE(elementwise_div); diff --git a/lite/kernels/npu/bridges/interpolate_op.cc b/lite/kernels/npu/bridges/interpolate_op.cc index 71f5eac57a..8e60a39fe4 100644 --- a/lite/kernels/npu/bridges/interpolate_op.cc +++ b/lite/kernels/npu/bridges/interpolate_op.cc @@ -45,6 +45,7 @@ node_map_type InterpolateConverter( auto out_h = op_info->GetAttr("out_h"); auto align_corners = op_info->GetAttr("align_corners"); int align_mode = op_info->GetAttr("align_mode"); + auto interp_method = op_info->GetAttr("interp_method"); CHECK(!(align_mode == 0 && !align_corners)) << "[NPU] align_mode = 0 && " "align_corners = false isn't " "supported in HiAI DDK"; @@ -58,11 +59,11 @@ node_map_type InterpolateConverter( } // update out_h and out_w if has OutSize - bool inputs_map_has_w = false; + std::shared_ptr out_size_node = nullptr; if (lite::npu::HasInputArg(op_info, scope, "OutSize")) { auto out_size_var_name = op_info->Input("OutSize").front(); if (inputs_map.count(out_size_var_name)) { - inputs_map_has_w = true; + out_size_node = inputs_map.at(out_size_var_name); } else { auto out_size = scope->FindVar(out_size_var_name)->GetMutable(); @@ -73,58 +74,45 @@ node_map_type InterpolateConverter( out_w = out_size_data[1]; } } - - node_map_type outputs_map; - auto interp_method = op_info->GetAttr("interp_method"); - if (interp_method == "bilinear") { - auto interp_node = std::make_shared(unique_op_type); - lite::npu::OpList::Global().add(interp_node); - interp_node->set_input_x(*inputs_map.at(x_var_name)); - if (inputs_map_has_w) { - auto out_size_var_name = op_info->Input("OutSize").front(); - interp_node->set_input_w(*inputs_map.at(out_size_var_name)); - lite::npu::OpList::Global().add(inputs_map.at(out_size_var_name)); - } else { + if (out_size_node == nullptr) { + if (interp_method == "bilinear") { const float largest_multiple = 7.0f; float multiple = static_cast(x_h * x_w) / (out_h * out_w); CHECK_LT(multiple, largest_multiple) << "[NPU] multiple=(ih*iw)/(oh*ow)=" << multiple 
<< " is too large, should not exceed " << largest_multiple << " in HiAI DDK"; - auto w_const_node = - std::make_shared(unique_op_type + "/w"); - w_const_node->set_attr_value( - lite::npu::CreateTensorAndFillData(std::vector({out_h, out_w}))); - interp_node->set_input_w(*w_const_node); - lite::npu::OpList::Global().add(w_const_node); } - interp_node->set_attr_output_dim_mode( - 2); // 0: zoom_factor, 1: shrink_factor, 2: height/width - interp_node->set_attr_align_corners(align_corners); - outputs_map[op_info->Output("Out").front()] = interp_node; + auto out_size_const_node = + std::make_shared(unique_op_type + "/out_size"); + out_size_const_node->set_attr_value( + lite::npu::CreateTensorAndFillData(std::vector({out_h, out_w}))); + out_size_node = out_size_const_node; + } + lite::npu::OpList::Global().add(out_size_node); + + std::shared_ptr interp_node = nullptr; + if (interp_method == "bilinear") { + auto bilinear_interp_node = + std::make_shared(unique_op_type); + bilinear_interp_node->set_input_x(*inputs_map.at(x_var_name)); + bilinear_interp_node->set_input_size(*out_size_node); + bilinear_interp_node->set_attr_align_corners(align_corners); + interp_node = bilinear_interp_node; } else if (interp_method == "nearest") { - auto interp_node = + auto nearest_interp_node = std::make_shared(unique_op_type); - lite::npu::OpList::Global().add(interp_node); - interp_node->set_input_image(*inputs_map.at(x_var_name)); - if (inputs_map_has_w) { - auto out_size_var_name = op_info->Input("OutSize").front(); - interp_node->set_input_size(*inputs_map.at(out_size_var_name)); - lite::npu::OpList::Global().add(inputs_map.at(out_size_var_name)); - } else { - auto w_const_node = - std::make_shared(unique_op_type + "/w"); - w_const_node->set_attr_value( - lite::npu::CreateTensorAndFillData(std::vector({out_h, out_w}))); - interp_node->set_input_size(*w_const_node); - lite::npu::OpList::Global().add(w_const_node); - } - interp_node->set_attr_align_corners(align_corners); - outputs_map[op_info->Output("Out").front()] = interp_node; + nearest_interp_node->set_input_image(*inputs_map.at(x_var_name)); + nearest_interp_node->set_input_size(*out_size_node); + nearest_interp_node->set_attr_align_corners(align_corners); + interp_node = nearest_interp_node; } else { LOG(FATAL) << "[NPU] Unsupported interpolate method: " << interp_method; } + lite::npu::OpList::Global().add(interp_node); + node_map_type outputs_map; + outputs_map[op_info->Output("Out").front()] = interp_node; return outputs_map; } diff --git a/lite/kernels/npu/bridges/mul_op.cc b/lite/kernels/npu/bridges/mul_op.cc index 5f8bdc4ee9..2313351f6c 100644 --- a/lite/kernels/npu/bridges/mul_op.cc +++ b/lite/kernels/npu/bridges/mul_op.cc @@ -31,82 +31,67 @@ node_map_type MulConverter(const std::shared_ptr mul_op, auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "[NPU] Converting " + op_type + "..."; - auto output_node = std::make_shared(unique_op_type); - auto x_var_name = op_info->Input("X").front(); auto y_var_name = op_info->Input("Y").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto y = scope->FindVar(y_var_name)->GetMutable(); + auto x_dims = x->dims(); + auto y_dims = y->dims(); int x_num_col_dims = op_info->GetAttr("x_num_col_dims"); int y_num_col_dims = op_info->GetAttr("y_num_col_dims"); - auto* xtensor = scope->FindVar(x_var_name)->GetMutable(); - auto* ytensor = scope->FindVar(y_var_name)->GetMutable(); - - int m = xtensor->dims().Slice(0, x_num_col_dims).production(); - int x_w = xtensor->dims() - 
.Slice(x_num_col_dims, xtensor->dims().size()) - .production(); - int y_h = ytensor->dims().Slice(0, y_num_col_dims).production(); - int n = ytensor->dims() - .Slice(y_num_col_dims, ytensor->dims().size()) - .production(); - CHECK_EQ(x_w, y_h) << "[NPU] x_w must be equal with y_h"; - int k = x_w; + int m = x_dims.Slice(0, x_num_col_dims).production(); + int k = x_dims.Slice(x_num_col_dims, x_dims.size()).production(); + CHECK_EQ(k, y_dims.Slice(0, y_num_col_dims).production()) + << "[NPU] columns of X must be equal with rows of Y"; + int n = y_dims.Slice(y_num_col_dims, y_dims.size()).production(); LOG(INFO) << "m:" << m << ",n:" << n << ",k:" << k; LOG(INFO) << "x_var_name:" << x_var_name << ", is data: " << inputs_map.count(x_var_name); LOG(INFO) << "y_var_name:" << y_var_name << ", is data: " << inputs_map.count(y_var_name); CHECK(inputs_map.count(x_var_name)) - << "[NPU] MatMul only support X is data, Y is const yet"; + << "[NPU] MatMul in HiAI DDK only support X is data, Y is const yet."; + + auto mul_node = std::make_shared(unique_op_type); + // add input x node which supports persistable and non-persistable tensor, and + // reshape to (m, k) if (inputs_map.count(x_var_name)) { - auto xsrc = inputs_map.at(x_var_name); - auto reshapex = std::make_shared(x_var_name + "_reshape"); - reshapex->set_input_tensor(*xsrc); - reshapex->set_attr_shape({m, k}); - reshapex->set_attr_axis(0); - lite::npu::OpList::Global().add(xsrc); - lite::npu::OpList::Global().add(reshapex); - output_node->set_input_x(*reshapex); + auto reshaped_x_node = + std::make_shared(x_var_name + "_reshape"); + reshaped_x_node->set_input_tensor(*inputs_map.at(x_var_name)); + reshaped_x_node->set_attr_shape({m, k}); + reshaped_x_node->set_attr_axis(0); + mul_node->set_input_x1(*reshaped_x_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(reshaped_x_node); } else { - auto constx = std::make_shared(x_var_name); - ge::TensorDesc desc(ge::Shape({m, k}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = desc.GetShape().GetShapeSize(); - CHECK_EQ(size, xtensor->dims().production()); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(desc); - auto* pdata = reinterpret_cast(xtensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - constx->set_attr_value(ptensor); - lite::npu::OpList::Global().add(constx); - output_node->set_input_x(*constx); + auto x_const_node = std::make_shared(x_var_name); + x_const_node->set_attr_value(lite::npu::CvtTensor(x, {m, k})); + mul_node->set_input_x1(*x_const_node); + lite::npu::OpList::Global().add(x_const_node); } - + // add input y node which only supports persistable tensor, and reshape to (k, + // n) if (inputs_map.count(y_var_name)) { - auto ysrc = inputs_map.at(y_var_name); - auto reshapey = std::make_shared(y_var_name + "_reshape"); - reshapey->set_input_tensor(*ysrc); - reshapey->set_attr_shape({k, n}); - reshapey->set_attr_axis(0); - lite::npu::OpList::Global().add(ysrc); - lite::npu::OpList::Global().add(reshapey); - output_node->set_input_w(*reshapey); + auto reshaped_y_node = + std::make_shared(y_var_name + "_reshape"); + reshaped_y_node->set_input_tensor(*inputs_map.at(y_var_name)); + reshaped_y_node->set_attr_shape({k, n}); + reshaped_y_node->set_attr_axis(0); + mul_node->set_input_x2(*reshaped_y_node); + lite::npu::OpList::Global().add(inputs_map.at(y_var_name)); + lite::npu::OpList::Global().add(reshaped_y_node); } else { - auto consty = std::make_shared(y_var_name); - ge::TensorDesc 
desc(ge::Shape({k, n}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = desc.GetShape().GetShapeSize(); - CHECK_EQ(size, ytensor->dims().production()); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(desc); - auto* pdata = reinterpret_cast(ytensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - consty->set_attr_value(ptensor); - lite::npu::OpList::Global().add(consty); - output_node->set_input_w(*consty); + auto y_const_node = std::make_shared(y_var_name); + y_const_node->set_attr_value(lite::npu::CvtTensor(y, {k, n})); + mul_node->set_input_x2(*y_const_node); + lite::npu::OpList::Global().add(y_const_node); } - lite::npu::OpList::Global().add(output_node); + lite::npu::OpList::Global().add(mul_node); node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = output_node; + outputs_map[op_info->Output("Out").front()] = mul_node; return outputs_map; } diff --git a/lite/kernels/npu/bridges/paddle_use_npu_bridges.h b/lite/kernels/npu/bridges/paddle_use_npu_bridges.h index 8b4252de06..9a432d17e5 100644 --- a/lite/kernels/npu/bridges/paddle_use_npu_bridges.h +++ b/lite/kernels/npu/bridges/paddle_use_npu_bridges.h @@ -16,23 +16,40 @@ #include "lite/kernels/npu/bridges/registry.h" -USE_NPU_BRIDGE(mul); -USE_NPU_BRIDGE(fc); +USE_NPU_BRIDGE(sigmoid); +USE_NPU_BRIDGE(relu); +USE_NPU_BRIDGE(tanh); +USE_NPU_BRIDGE(relu_clipped); +USE_NPU_BRIDGE(leaky_relu); +USE_NPU_BRIDGE(softsign); +USE_NPU_BRIDGE(hard_sigmoid); + +USE_NPU_BRIDGE(batch_norm); +USE_NPU_BRIDGE(concat); USE_NPU_BRIDGE(conv2d); USE_NPU_BRIDGE(depthwise_conv2d); -USE_NPU_BRIDGE(pool2d); -USE_NPU_BRIDGE(relu); +USE_NPU_BRIDGE(conv2d_transpose); + USE_NPU_BRIDGE(elementwise_add); USE_NPU_BRIDGE(fusion_elementwise_add_activation); +USE_NPU_BRIDGE(elementwise_sub); +USE_NPU_BRIDGE(elementwise_mul); +USE_NPU_BRIDGE(elementwise_div); + +USE_NPU_BRIDGE(fc); +USE_NPU_BRIDGE(bilinear_interp); +USE_NPU_BRIDGE(nearest_interp); +USE_NPU_BRIDGE(mul); +USE_NPU_BRIDGE(pad2d); +USE_NPU_BRIDGE(pool2d); +USE_NPU_BRIDGE(reduce_mean); +USE_NPU_BRIDGE(reshape); +USE_NPU_BRIDGE(reshape2); USE_NPU_BRIDGE(scale); +USE_NPU_BRIDGE(shuffle_channel); USE_NPU_BRIDGE(softmax); -USE_NPU_BRIDGE(concat); USE_NPU_BRIDGE(split); +USE_NPU_BRIDGE(sqrt); +USE_NPU_BRIDGE(square); USE_NPU_BRIDGE(transpose); USE_NPU_BRIDGE(transpose2); -USE_NPU_BRIDGE(shuffle_channel); -USE_NPU_BRIDGE(batch_norm); -USE_NPU_BRIDGE(bilinear_interp); -USE_NPU_BRIDGE(conv2d_transpose); -USE_NPU_BRIDGE(reshape); -USE_NPU_BRIDGE(reshape2); diff --git a/lite/kernels/npu/bridges/pool_op.cc b/lite/kernels/npu/bridges/pool_op.cc index 5915b7a8aa..7bbe94d5db 100644 --- a/lite/kernels/npu/bridges/pool_op.cc +++ b/lite/kernels/npu/bridges/pool_op.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
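// A minimal sketch, for illustration only (not part of the patch), of the
// flattening that MulConverter above performs before handing X and Y to the
// NPU matmul: dims in front of *_num_col_dims fold into the row count, the
// remaining dims fold into the column count, and the two inner sizes must
// match. E.g. x_dims={2,3,4}, x_num_col_dims=1, y_dims={12,5},
// y_num_col_dims=1 gives m=2, k=12, n=5.
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

static int64_t Prod(const std::vector<int64_t>& dims, size_t begin, size_t end) {
  return std::accumulate(dims.begin() + begin, dims.begin() + end, int64_t{1},
                         std::multiplies<int64_t>());
}

static void FlattenForMul(const std::vector<int64_t>& x_dims, int x_num_col_dims,
                          const std::vector<int64_t>& y_dims, int y_num_col_dims,
                          int64_t* m, int64_t* k, int64_t* n) {
  *m = Prod(x_dims, 0, x_num_col_dims);
  *k = Prod(x_dims, x_num_col_dims, x_dims.size());
  // The converter CHECKs that k equals Prod(y_dims, 0, y_num_col_dims).
  *n = Prod(y_dims, y_num_col_dims, y_dims.size());
}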
+#include "lite/operators/pool_op.h" #include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" @@ -32,44 +33,78 @@ node_map_type PoolConverter(const std::shared_ptr pool_op, std::shared_ptr pool_node = std::make_shared(unique_op_type); auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindTensor(x_var_name); + pool_node->set_input_x(*inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(pool_node); + + int mode = 0; auto pooling_type = op_info->GetAttr("pooling_type"); - int npu_mode = 0; if (pooling_type == "max") { - npu_mode = 0; + mode = 0; } else if (pooling_type == "avg") { - npu_mode = 1; + mode = 1; CHECK(op_info->GetAttr("exclusive")) << "[NPU] exclusive must be true in HiAI DDK"; } else { LOG(FATAL) << "[NPU] Unsupported pooling type: " << pooling_type; } - bool npu_global_pooling = op_info->GetAttr("global_pooling"); + pool_node->set_attr_mode(mode); + + int pad_mode = 0; + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + if (padding_algorithm == "SAME") { + pad_mode = 6; + } else if (padding_algorithm == "VALID") { + pad_mode = 5; + } + pool_node->set_attr_pad_mode(pad_mode); + + bool global_pooling = op_info->GetAttr("global_pooling"); + pool_node->set_attr_global_pooling(global_pooling); + auto ksize = op_info->GetAttr>("ksize"); - auto npu_window = ge::AttrValue::LIST_INT(ksize.begin(), ksize.end()); + auto window = ge::AttrValue::LIST_INT(ksize.begin(), ksize.end()); + pool_node->set_attr_window(window); - auto padding = op_info->GetAttr>("paddings"); - auto npu_pad = - ge::AttrValue::LIST_INT{padding[0], padding[0], padding[1], padding[1]}; + auto paddings = op_info->GetAttr>("paddings"); + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "Paddings size should be the same or twice as the inputs size."; + bool adaptive = false; + if (op_info->HasAttr("adaptive")) { + adaptive = op_info->GetAttr("adaptive"); + } auto strides = op_info->GetAttr>("strides"); + operators::UpdatePadding(&paddings, + global_pooling, + adaptive, + padding_algorithm, + x->dims(), + strides, + ksize); + auto npu_pad = ge::AttrValue::LIST_INT{ + paddings[0], paddings[1], paddings[2], paddings[3]}; + pool_node->set_attr_pad(npu_pad); + auto npu_stride = ge::AttrValue::LIST_INT(strides.begin(), strides.end()); - int npu_ceil_mode = 0; + pool_node->set_attr_stride(npu_stride); + + int ceil_mode = 0; if (op_info->HasAttr("ceil_mode")) { - npu_ceil_mode = op_info->GetAttr("ceil_mode") ? 1 : 0; + ceil_mode = op_info->GetAttr("ceil_mode") ? 
1 : 0; } - - pool_node->set_input_x(*inputs_map.at(x_var_name)); - pool_node->set_attr_mode(npu_mode); - pool_node->set_attr_pad_mode(0); - pool_node->set_attr_global_pooling(npu_global_pooling); - pool_node->set_attr_window(npu_window); - pool_node->set_attr_pad(npu_pad); - pool_node->set_attr_stride(npu_stride); - pool_node->set_attr_ceil_mode(npu_ceil_mode); + pool_node->set_attr_ceil_mode(ceil_mode); // output_node->set_attr_data_mode(npu_data_mode); - lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); - lite::npu::OpList::Global().add(pool_node); - node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = pool_node; return outputs_map; diff --git a/lite/kernels/npu/bridges/pool_op_test.cc b/lite/kernels/npu/bridges/pool_op_test.cc index d4543a6ae1..298e065547 100644 --- a/lite/kernels/npu/bridges/pool_op_test.cc +++ b/lite/kernels/npu/bridges/pool_op_test.cc @@ -61,7 +61,7 @@ void pool_ref(const std::shared_ptr op) { int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; if (global_pooling == true) { for (int n = 0; n < in_n; ++n) { @@ -163,7 +163,8 @@ void test_pool(int bs, opdesc.SetAttr("global_pooling", global_pooling); opdesc.SetAttr("exclusive", exclusive); opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); // create and convert op to NPU model, then run it on NPU auto op = CreateOp(opdesc, &scope); diff --git a/lite/kernels/npu/bridges/reduce_mean_op.cc b/lite/kernels/npu/bridges/reduce_mean_op.cc new file mode 100644 index 0000000000..4725bdfb0e --- /dev/null +++ b/lite/kernels/npu/bridges/reduce_mean_op.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
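// A minimal sketch, for illustration only, of the paddings convention the
// pool_op changes above assume: a 2-element {pad_h, pad_w} vector is expanded
// to the 4-element {pad_top, pad_bottom, pad_left, pad_right} form by
// duplicating each entry, which is why the reference test now reads pad_w
// from paddings[2].
#include <vector>

static std::vector<int> ExpandPaddings(std::vector<int> pads) {
  if (pads.size() == 2) {
    // {pad_h, pad_w} -> {pad_h, pad_h, pad_w, pad_w}
    pads = {pads[0], pads[0], pads[1], pads[1]};
  }
  return pads;  // {top, bottom, left, right}
}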
+ +#include "lite/backends/npu/builder.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +node_map_type ReduceMeanConverter( + const std::shared_ptr reduce_mean_op, + const node_map_type& inputs_map) { + auto scope = reduce_mean_op->scope(); + auto op_info = reduce_mean_op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::npu::UniqueName(op_type); + LOG(INFO) << "[NPU] Converting " + op_type + "..."; + + // get input, and op attributes + auto x_var_name = op_info->Input("X").front(); + auto x_dims = scope->FindTensor(x_var_name)->dims(); + auto keep_dim = op_info->GetAttr("keep_dim"); + auto dim = op_info->GetAttr>("dim"); + CHECK(!dim.empty()) << "\"dim\" of reduce_mean should not be empty."; + for (size_t i = 0; i < dim.size(); i++) { + if (dim[i] < 0) { + dim[i] += x_dims.size(); + } + } + std::sort(dim.begin(), dim.end()); + + // create reduce_mean(reduce_sum + scale) node and set input node from + // inputs_map + // creat reduce_sum node + auto unique_reduce_sum = lite::npu::UniqueName("reduce_sum"); + auto reduce_sum_node = std::make_shared(unique_reduce_sum); + CHECK(inputs_map.count(x_var_name)); + reduce_sum_node->set_input_x(*inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(reduce_sum_node); + + auto dim_const_node = + std::make_shared(unique_reduce_sum + "/dim"); + dim_const_node->set_attr_value(lite::npu::CreateTensorAndFillData(dim)); + reduce_sum_node->set_input_w(*dim_const_node); + lite::npu::OpList::Global().add(dim_const_node); + + reduce_sum_node->set_attr_keep_dims(keep_dim); + + // create scale node + auto unique_scale = lite::npu::UniqueName("scale"); + auto scale_node = std::make_shared(unique_scale); + scale_node->set_input_x(*reduce_sum_node); + lite::npu::OpList::Global().add(scale_node); + + float scale = 1; + for (size_t i = 0; i < dim.size(); i++) { + scale /= x_dims[dim[i]]; + } + + std::vector scale_bias_shape = x_dims.Vectorize(); + if (keep_dim) { + for (size_t i = 0; i < dim.size(); i++) { + scale_bias_shape[dim[i]] = 1; + } + } else { + const int64_t kDelFlag = -2; + for (size_t i = 0; i < dim.size(); ++i) { + scale_bias_shape[dim[i]] = kDelFlag; + } + scale_bias_shape.erase( + remove(scale_bias_shape.begin(), scale_bias_shape.end(), kDelFlag), + scale_bias_shape.end()); + } + + auto filter_const_node = + std::make_shared(unique_scale + "/filter"); + filter_const_node->set_attr_value( + lite::npu::CreateTensorAndFillData(scale, scale_bias_shape)); + scale_node->set_input_filter(*filter_const_node); + lite::npu::OpList::Global().add(filter_const_node); + + scale_node->set_attr_axis(1); + + node_map_type outputs_map; + outputs_map[op_info->Output("Out").front()] = scale_node; + return outputs_map; +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_NPU_BRIDGE(reduce_mean, + paddle::lite::kernels::npu::bridges::ReduceMeanConverter); diff --git a/lite/kernels/npu/bridges/reduce_mean_op_test.cc b/lite/kernels/npu/bridges/reduce_mean_op_test.cc new file mode 100644 index 0000000000..8646ce5c25 --- /dev/null +++ b/lite/kernels/npu/bridges/reduce_mean_op_test.cc @@ -0,0 +1,347 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/reduce_mean_op.h" +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/npu/bridges/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +void reduce_mean_n(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int hw_size = height_in * width_in; + int chw_size = channel_in * hw_size; + int data_index, src_index; + for (int c = 0; c < channel_in; ++c) { + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + data_index = c * hw_size + h * width_in + w; + dst[data_index] = 0.0; + for (int n = 0; n < num_in; ++n) { + src_index = n * chw_size + data_index; + dst[data_index] += static_cast(src[src_index]) / num_in; + } + } + } + } +} + +void reduce_mean_c(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int hw_size = height_in * width_in; + int chw_size = hw_size * channel_in; + int data_index, src_index0, src_index; + for (int n = 0; n < num_in; ++n) { + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + data_index = n * hw_size + h * width_in + w; + src_index0 = n * chw_size + h * width_in + w; + dst[data_index] = 0.0; + for (int c = 0; c < channel_in; ++c) { + src_index = src_index0 + c * hw_size; + dst[data_index] += static_cast(src[src_index]) / channel_in; + } + } + } + } +} + +void reduce_mean_h(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int cw_size = channel_in * width_in; + int chw_size = cw_size * height_in; + int hw_size = height_in * width_in; + int data_index, src_index, src_index0; + for (int n = 0; n < num_in; ++n) { + for (int c = 0; c < channel_in; ++c) { + for (int w = 0; w < width_in; ++w) { + data_index = n * cw_size + c * width_in + w; + src_index0 = n * chw_size + c * hw_size + w; + dst[data_index] = 0.0; + for (int h = 0; h < height_in; ++h) { + src_index = src_index0 + h * width_in; + dst[data_index] += static_cast(src[src_index]) / height_in; + } + } + } + } +} + +void reduce_mean_w(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int ch_size = channel_in * height_in; + int hw_size = height_in * width_in; + int chw_size = ch_size * width_in; + int data_index = 0; + int src_index0 = 0; + int src_index = 0; + for (int n = 0; n < num_in; ++n) { + for (int c = 0; c < channel_in; ++c) { + for (int h = 0; h < height_in; ++h) { + data_index = n * ch_size + c * height_in + h; + src_index0 = n * chw_size + c * hw_size + h * width_in; + dst[data_index] = 0.0; + for (int w = 0; w < width_in; ++w) { + src_index = src_index0 + w; + dst[data_index] += static_cast(src[src_index]) / width_in; + } + } + } + } +} + +void reduce_mean_all(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + float mean = 0.0; + int src_index; + int n_id, c_id; + int all = num_in * channel_in * height_in * 
width_in; + for (int n = 0; n < num_in; ++n) { + n_id = n * channel_in * height_in * width_in; + for (int c = 0; c < channel_in; ++c) { + c_id = c * height_in * width_in; + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + src_index = n_id + c_id + h * width_in + w; + mean = src[src_index] / all; + } + } + } + } + dst[0] = mean; +} + +void reduce_mean_nc(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + // reduce n first. + DDimLite ddimA({1, channel_in, height_in, width_in}); + lite::Tensor tensor_tmp; + tensor_tmp.Resize(ddimA); + float* tmp_out = tensor_tmp.mutable_data(); + reduce_mean_n(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_mean_c(tmp_out, dst, 1, channel_in, height_in, width_in); +} + +void reduce_mean_ch(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + // reduce c first + DDimLite ddimA({num_in, 1, height_in, width_in}); + lite::Tensor tensor_tmp; + tensor_tmp.Resize(ddimA); + float* tmp_out = tensor_tmp.mutable_data(); + reduce_mean_c(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_mean_h(tmp_out, dst, num_in, 1, height_in, width_in); +} + +void reduce_mean_hw(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + // reduce h first + DDimLite ddimA({num_in, channel_in, 1, width_in}); + lite::Tensor tensor_tmp; + tensor_tmp.Resize(ddimA); + float* tmp_out = tensor_tmp.mutable_data(); + reduce_mean_h(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in); +} + +void reduce_mean_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + + auto x = scope->FindTensor("x"); + auto x_dims = x->dims(); + auto x_data = x->data(); + auto out = scope->FindMutableTensor("out_ref"); + + auto dim = op_info->GetAttr>("dim"); + auto keep_dim = op_info->GetAttr("keep_dim"); + + auto x_rank = x_dims.size(); + if (!dim.empty()) { + for (size_t i = 0; i < dim.size(); i++) { + if (dim[i] < 0) { + dim[i] += x_rank; + } + } + } + + bool reduce_all = false; + sort(dim.begin(), dim.end()); + if (dim.size() == 0) { + reduce_all = true; + } + + std::vector out_dims; + if (reduce_all) { + if (keep_dim) { + for (size_t i = 0; i < x_dims.size(); i++) { + out_dims.push_back(1); + } + } else { + out_dims.push_back(1); + } + } else { + for (int i = 0; i < x_dims.size(); i++) { + out_dims.push_back(x_dims[i]); + } + if (keep_dim) { + for (size_t i = 0; i < dim.size(); ++i) { + out_dims[dim[i]] = 1L; + } + } else { + int64_t kDelFlag = -2; + for (size_t i = 0; i < dim.size(); ++i) { + out_dims[dim[i]] = kDelFlag; + } + out_dims.erase(remove(out_dims.begin(), out_dims.end(), kDelFlag), + out_dims.end()); + } + out->Resize(DDim(out_dims)); + } + + auto out_data = out->mutable_data(); + int in_n = x_dims[0]; + int in_c = x_dims[1]; + int in_h = x_dims[2]; + int in_w = x_dims[3]; + + if (dim.size() == 0) { + reduce_mean_all(x_data, out_data, in_n, in_c, in_h, in_w); + } else if (dim.size() == 1) { + switch (dim[0]) { + case 0: + reduce_mean_n(x_data, out_data, in_n, in_c, in_h, in_w); + break; + case 1: + reduce_mean_c(x_data, out_data, in_n, in_c, in_h, in_w); + break; + case 2: + reduce_mean_h(x_data, out_data, in_n, in_c, in_h, in_w); + break; + case 3: + reduce_mean_w(x_data, out_data, in_n, in_c, in_h, in_w); + break; + default: + LOG(FATAL) << "error!!!"; + } + } else if (dim.size() == 2) 
{ + if (dim[0] == 0 && dim[1] == 1) { + reduce_mean_nc(x_data, out_data, in_n, in_c, in_h, in_w); + } else if (dim[0] == 1 && dim[1] == 2) { + reduce_mean_ch(x_data, out_data, in_n, in_c, in_h, in_w); + } else if (dim[0] == 2 && dim[1] == 3) { + reduce_mean_hw(x_data, out_data, in_n, in_c, in_h, in_w); + } else { + LOG(FATAL) << "invalid dim!!"; + } + } +} + +void test_reduce_mean(const std::vector& input_shape, + std::vector dim, + bool keep_dim) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string out_ref_var_name("out_ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(input_shape); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("reduce_mean"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("dim", dim); + opdesc.SetAttr("keep_dim", keep_dim); + + // create and convert op to NPU model, then run it on NPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name}, {out_var_name}); + + // execute reference implementation and save to output tensor + reduce_mean_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(NPUBridges, reduce_mean) { + std::vector> reduce_dim{ + {0}, {1}, {2}, {3}, {0, 1}, {1, 2}, {2, 3}, {-2, -1}}; + for (auto dim : reduce_dim) { + for (auto keep_dim : {true, false}) { + test_reduce_mean({1, 2, 3, 4}, dim, keep_dim); + } + } +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(reduce_mean); +USE_NPU_BRIDGE(reduce_mean); diff --git a/lite/kernels/npu/bridges/reshape_op.cc b/lite/kernels/npu/bridges/reshape_op.cc index b2ed556faf..a554aac94f 100644 --- a/lite/kernels/npu/bridges/reshape_op.cc +++ b/lite/kernels/npu/bridges/reshape_op.cc @@ -41,8 +41,10 @@ node_map_type ReshapeConverter(const std::shared_ptr reshape_op, reshape_node->set_input_tensor(*inputs_map.at(x_var_name)); lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); - // read shape from actual shape tensor as input "w" if 'Shape' is found - if (lite::npu::HasInputArg(op_info, scope, "Shape")) { + // read shape from "ShapeTensor"(input), or "Shape"(input), or "shape"(attr) + if (lite::npu::HasInputArg(op_info, scope, "ShapeTensor")) { + LOG(FATAL) << "[NPU] not support \"Shape\" from more than one Tensor."; + } else if (lite::npu::HasInputArg(op_info, scope, "Shape")) { auto actual_shape_var_name = op_info->Input("Shape").front(); if (!inputs_map.count(actual_shape_var_name)) { auto actual_shape = diff --git a/lite/kernels/npu/bridges/sqrt_op.cc b/lite/kernels/npu/bridges/sqrt_op.cc new file mode 100644 index 0000000000..84ab3a9eb2 --- /dev/null +++ b/lite/kernels/npu/bridges/sqrt_op.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/npu/builder.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +node_map_type SqrtConverter(const std::shared_ptr sqrt_op, + const node_map_type& inputs_map) { + auto scope = sqrt_op->scope(); + auto op_info = sqrt_op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::npu::UniqueName(op_type); + LOG(INFO) << "[NPU] Converting " + op_type + "..."; + + std::shared_ptr sqrt_node = + std::make_shared(unique_op_type); + + auto x_var_name = op_info->Input("X").front(); + + CHECK(inputs_map.count(x_var_name)); + sqrt_node->set_input_x(*inputs_map.at(x_var_name)); + + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(sqrt_node); + + node_map_type outputs_map; + outputs_map[op_info->Output("Out").front()] = sqrt_node; + return outputs_map; +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_NPU_BRIDGE(sqrt, paddle::lite::kernels::npu::bridges::SqrtConverter); diff --git a/lite/kernels/npu/bridges/sqrt_op_test.cc b/lite/kernels/npu/bridges/sqrt_op_test.cc new file mode 100644 index 0000000000..015d61685b --- /dev/null +++ b/lite/kernels/npu/bridges/sqrt_op_test.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/npu/bridges/test_helper.h" +#include "lite/operators/activation_ops.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +template +void sqrt_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + + auto x = scope->FindTensor("x"); + auto out = scope->FindMutableTensor("out_ref"); + out->Resize(x->dims()); + auto x_data = x->data(); + auto out_data = out->mutable_data(); + + for (size_t i = 0; i < x->numel(); i++) { + out_data[i] = std::sqrtf(x_data[i]); + } +} + +void test_sqrt(const std::vector& input_shape) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + auto* out_ref = scope.NewTensor(out_ref_var_name); + x->Resize(input_shape); + + // initialize input&output data + FillTensor(x, 0, 5); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("sqrt"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + // create and convert op to NPU model, then run it on NPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name}, {out_var_name}); + + // execute reference implementation and save to output tensor + sqrt_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(NPUBridges, sqrt) { + test_sqrt({2}); + test_sqrt({2, 3}); + test_sqrt({1, 2, 3, 4}); + test_sqrt({5, 6, 7, 8}); +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(sqrt); +USE_NPU_BRIDGE(sqrt); diff --git a/lite/kernels/npu/bridges/square_op.cc b/lite/kernels/npu/bridges/square_op.cc new file mode 100644 index 0000000000..2ca91adba0 --- /dev/null +++ b/lite/kernels/npu/bridges/square_op.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
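// A side note on sqrt_ref above: std::sqrtf is missing from some standard
// library implementations, so a more portable element-wise reference would
// rely on the float overload of std::sqrt, e.g.:
#include <cmath>
#include <cstddef>

static void SqrtRef(const float* x, float* out, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = std::sqrt(x[i]);  // float overload of std::sqrt
  }
}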
+ +#include "lite/backends/npu/builder.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +node_map_type SquareConverter(const std::shared_ptr square_op, + const node_map_type& inputs_map) { + auto scope = square_op->scope(); + auto op_info = square_op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::npu::UniqueName(op_type); + LOG(INFO) << "[NPU] Converting " + op_type + "..."; + + std::shared_ptr square_node = + std::make_shared(unique_op_type); + + auto x_var_name = op_info->Input("X").front(); + + CHECK(inputs_map.count(x_var_name)); + square_node->set_input_x(*inputs_map.at(x_var_name)); + + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(square_node); + + node_map_type outputs_map; + outputs_map[op_info->Output("Out").front()] = square_node; + return outputs_map; +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_NPU_BRIDGE(square, + paddle::lite::kernels::npu::bridges::SquareConverter); diff --git a/lite/kernels/npu/bridges/square_op_test.cc b/lite/kernels/npu/bridges/square_op_test.cc new file mode 100644 index 0000000000..d715c11430 --- /dev/null +++ b/lite/kernels/npu/bridges/square_op_test.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/npu/bridges/test_helper.h" +#include "lite/operators/activation_ops.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +template +void square_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + + auto x = scope->FindTensor("x"); + auto out = scope->FindMutableTensor("out_ref"); + out->Resize(x->dims()); + auto x_data = x->data(); + auto out_data = out->mutable_data(); + + for (size_t i = 0; i < x->numel(); i++) { + out_data[i] = x_data[i] * x_data[i]; + } +} + +void test_square(const std::vector& input_shape) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + auto* out_ref = scope.NewTensor(out_ref_var_name); + x->Resize(input_shape); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("square"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + // create and convert op to NPU model, then run it on NPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name}, {out_var_name}); + + // execute reference implementation and save to output tensor + square_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(NPUBridges, square) { + test_square({2}); + test_square({2, 3}); + test_square({1, 2, 3, 4}); + test_square({5, 6, 7, 8}); +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(square); +USE_NPU_BRIDGE(square); diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index d070eb84c5..99b23c19f0 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -1,4 +1,4 @@ -if (NOT LITE_WITH_OPENCL) +if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_OPENCL)) return () endif() diff --git a/lite/kernels/opencl/conv_compute.cc b/lite/kernels/opencl/conv_compute.cc index 04a78face2..e13d12ec22 100644 --- a/lite/kernels/opencl/conv_compute.cc +++ b/lite/kernels/opencl/conv_compute.cc @@ -38,15 +38,20 @@ void ConvCompute::PrepareForRun() { int w_out = output_dims[3]; int kernel_h = filter_dims[2]; // oihw int kernel_w = filter_dims[3]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + auto dilations = *param.dilations; int stride_h = param.strides[0]; int stride_w = param.strides[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int groups = param.groups; bool relu_fused = param.fuse_relu; - bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); bool zero_pad = (pad_h == 0) && (pad_w == 0); + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + VLOG(3) << "Is relu fused? / " << (relu_fused ? 
"Yes" : "No"); VLOG(3) << "groups:" << groups << " stride_h:" << stride_h << " stride_w:" << stride_w << " pad_h:" << pad_h @@ -60,7 +65,7 @@ void ConvCompute::PrepareForRun() { << filter_dims[2] << " " << filter_dims[3]; if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && - zero_pad && no_dilation) { + zero_pad && no_dilation && pad_equal) { // conv2d_1x1 kernel_func_names_.push_back("gemm_batch"); kernel_func_paths_.push_back("buffer/fc_kernel.cl"); @@ -70,7 +75,7 @@ void ConvCompute::PrepareForRun() { build_options_.push_back("-DCL_DTYPE=float"); } impl_ = &ConvCompute::Conv2d1x1; - } else { + } else if (pad_equal) { kernel_func_names_.push_back("im2col"); kernel_func_names_.push_back("gemm_batch"); kernel_func_paths_.push_back("buffer/im2col_kernel.cl"); @@ -85,6 +90,9 @@ void ConvCompute::PrepareForRun() { col_buffer_.reset(new lite::Tensor); col_buffer_->Resize({bs, c_in, kernel_h * kernel_w, h_out * w_out}); col_buffer_->mutable_data(TARGET(kOpenCL)); + } else { + LOG(FATAL) << "This pad not support ! " << paddings[0] << ", " + << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; } for (size_t i = 0; i < kernel_func_names_.size(); i++) { @@ -102,17 +110,19 @@ void ConvCompute::GemmlikeConv2d() { int c_in = x_dims[1]; int h_in = x_dims[2]; int w_in = x_dims[3]; + auto paddings = *param.paddings; + auto dilations = *param.dilations; int c_out = output_dims[1]; int h_out = output_dims[2]; int w_out = output_dims[3]; int kernel_h = filter_dims[2]; int kernel_w = filter_dims[3]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride_h = param.strides[0]; int stride_w = param.strides[1]; - int dilation_h = param.dilations[0]; - int dilation_w = param.dilations[1]; + int dilation_h = dilations[0]; + int dilation_w = dilations[1]; auto* x_buf = param.x->data(); auto* filter_buf = param.filter->data(); diff --git a/lite/kernels/opencl/conv_compute_test.cc b/lite/kernels/opencl/conv_compute_test.cc index a7417e3525..3bc7a0734d 100644 --- a/lite/kernels/opencl/conv_compute_test.cc +++ b/lite/kernels/opencl/conv_compute_test.cc @@ -24,7 +24,6 @@ namespace lite { #define A(i, j) a[i * lda + j] #define B(i, j) cur_b[i * ldb + j] #define C(i, j) cur_c[i * ldc + j] - template static void conv_basic(const Dtype1* din, Dtype2* dout, @@ -227,10 +226,12 @@ TEST(conv2d, compute_conv2d_1x1) { param.bias = bias_flag ? &bias : nullptr; param.output = &out; param.strides = {stride, stride}; - param.paddings = {pad, pad}; + std::vector paddings = {pad, pad, pad, pad}; param.groups = group; - param.dilations = {dilation, dilation}; + std::vector dilations = {dilation, dilation}; param.fuse_relu = relu_flag; + param.paddings = std::make_shared>(paddings); + param.dilations = std::make_shared>(dilations); kernel->SetParam(param); std::unique_ptr conv_context(new KernelContext); @@ -454,11 +455,14 @@ TEST(conv2d, compute_conv2d_gemm) { param.bias = bias_flag ? 
&bias : nullptr; param.output = &out; param.strides = {stride, stride}; - param.paddings = {pad, pad}; + std::vector paddings = {pad, pad, pad, pad}; param.groups = group; - param.dilations = {dilation, dilation}; + std::vector dilations = {dilation, dilation}; param.fuse_relu = relu_flag; + param.paddings = std::make_shared>(paddings); + param.dilations = std::make_shared>(dilations); + kernel->SetParam(param); std::unique_ptr conv_context(new KernelContext); context->As().CopySharedTo( diff --git a/lite/kernels/opencl/depthwise_conv2d_compute.cc b/lite/kernels/opencl/depthwise_conv2d_compute.cc index 62734610e2..ed942d7f0c 100644 --- a/lite/kernels/opencl/depthwise_conv2d_compute.cc +++ b/lite/kernels/opencl/depthwise_conv2d_compute.cc @@ -44,7 +44,7 @@ class DepthwiseConv2dCompute auto x_dims = param.x->dims(); auto filter_dims = param.filter->dims(); auto output_dims = param.output->dims(); - auto paddings = param.paddings; + auto paddings = *param.paddings; auto strides = param.strides; auto& context = ctx_->As(); diff --git a/lite/kernels/opencl/depthwise_conv2d_compute_test.cc b/lite/kernels/opencl/depthwise_conv2d_compute_test.cc index a189acaf91..3556d1abed 100644 --- a/lite/kernels/opencl/depthwise_conv2d_compute_test.cc +++ b/lite/kernels/opencl/depthwise_conv2d_compute_test.cc @@ -105,7 +105,8 @@ TEST(depthwise_conv2d, compute) { param.x = &input; param.filter = &filter; param.output = &output; - param.paddings = std::vector{0, 0}; + std::vector paddings = {0, 0}; + param.paddings = std::make_shared>(paddings); param.strides = std::vector{1, 1}; std::unique_ptr context(new KernelContext); diff --git a/lite/kernels/opencl/io_copy_compute.cc b/lite/kernels/opencl/io_copy_compute.cc index dc4bdfe64c..3387a0887d 100644 --- a/lite/kernels/opencl/io_copy_compute.cc +++ b/lite/kernels/opencl/io_copy_compute.cc @@ -103,8 +103,9 @@ class IoCopykOpenCLToHostCompute auto* wait_list = context.cl_wait_list(); auto* x_ptr = param.x->data(); - /* TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list` - in kernel and enable wait_list + /* TODO(ysh329): io_copy(device->host) jammed if `it` emplaced to + `cl_wait_list` + in kernel and `wait_list` enabled auto it = wait_list->find(x_ptr); if (it != wait_list->end()) { VLOG(4) << "--- Find the sync event for the target cl tensor. 
---"; diff --git a/lite/kernels/opencl/pool_compute.cc b/lite/kernels/opencl/pool_compute.cc index dc2e851595..d275b312d6 100644 --- a/lite/kernels/opencl/pool_compute.cc +++ b/lite/kernels/opencl/pool_compute.cc @@ -44,16 +44,22 @@ class PoolCompute const auto& out_dims = param.output->dims(); const std::string pooling_type = param.pooling_type; const bool global_pooling = param.global_pooling; - std::vector paddings = param.paddings; + std::vector paddings = *param.paddings; std::vector strides = param.strides; std::vector ksize = param.ksize; if (global_pooling) { for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; + paddings[2 * i] = 0; + paddings[2 * i + 1] = 0; ksize[i] = static_cast(in_dims[i + 2]); } } - + bool pads_equal = + (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]); + if (!pads_equal) { + LOG(FATAL) + << "padding requires pad_left == pad_right, pad_top == pad_bottom"; + } auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); auto* input_buf = param.x->data(); @@ -89,7 +95,7 @@ class PoolCompute CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, static_cast(paddings[0])); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(paddings[1])); + status = kernel.setArg(++arg_idx, static_cast(paddings[2])); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, *output_buf); CL_CHECK_FATAL(status); diff --git a/lite/kernels/opencl/pool_compute_test.cc b/lite/kernels/opencl/pool_compute_test.cc index 53f64e9505..25f0e72634 100644 --- a/lite/kernels/opencl/pool_compute_test.cc +++ b/lite/kernels/opencl/pool_compute_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include "lite/backends/opencl/target_wrapper.h" #include "lite/core/op_registry.h" @@ -88,9 +89,10 @@ TEST(pool2d, compute) { param.output = &out; param.global_pooling = true; param.pooling_type = "avg"; - param.paddings = std::vector{0, 0}; + std::vector paddings = {0, 0, 0, 0}; param.strides = std::vector{1, 1}; param.ksize = std::vector{7, 7}; + param.paddings = std::make_shared>(paddings); std::unique_ptr context(new KernelContext); context->As().InitOnce(); diff --git a/lite/kernels/x86/CMakeLists.txt b/lite/kernels/x86/CMakeLists.txt index da955e4fd5..bf3a1685f0 100644 --- a/lite/kernels/x86/CMakeLists.txt +++ b/lite/kernels/x86/CMakeLists.txt @@ -5,6 +5,7 @@ add_kernel(activation_compute_x86 X86 basic SRCS activation_compute.cc DEPS ${li # lite_cc_library(fc_compute_x86 SRCS fc_compute.cc DEPS ${lite_kernel_deps}) add_kernel(scale_compute_x86 X86 basic SRCS scale_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(cast_compute_x86 X86 basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} fluid_data_type) add_kernel(slice_compute_x86 X86 basic SRCS slice_compute.cc DEPS ${lite_kernel_deps}) add_kernel(squeeze_compute_x86 X86 basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps}) add_kernel(fill_constant_batch_size_like_compute_x86 X86 basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps} math_function) @@ -15,8 +16,10 @@ add_kernel(conv_compute_x86 X86 basic SRCS conv_compute.cc DEPS ${lite_kernel_de # lite_cc_library(dropout_compute_x86 SRCS dropout_compute.cc DEPS ${lite_kernel_deps} ) # lite_cc_library(conv_compute_x86 SRCS conv_compute.cc DEPS ${lite_kernel_deps} blas im2col vol2col) add_kernel(pool_compute_x86 X86 basic SRCS pool_compute.cc DEPS ${lite_kernel_deps} pooling) +add_kernel(stack_compute_x86 X86 basic SRCS stack_compute.cc DEPS ${lite_kernel_deps}) add_kernel(dropout_compute_x86 
X86 basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps}) add_kernel(transpose_compute_x86 X86 basic SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_function) +add_kernel(layer_norm_compute_x86 X86 basic SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps} jit_kernel_helper) # add_kernel(fc_compute_x86 X86 basic SRCS fc_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(batch_norm_compute_x86 SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(uniform_random_compute_x86 SRCS uniform_random_compute.cc DEPS ${lite_kernel_deps} ) @@ -26,6 +29,7 @@ add_kernel(sequence_expand_as_compute_x86 X86 basic SRCS sequence_expand_as_comp # lite_cc_test(test_fc_compute_x86 SRCS fc_compute_test.cc DEPS fc_compute_x86) # lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86) +add_kernel(gather_compute_x86 X86 basic SRCS gather_compute.cc DEPS ${lite_kernel_deps} fluid_data_type) # lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86) # lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86) # lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86) @@ -33,12 +37,27 @@ add_kernel(mul_compute_x86 X86 basic SRCS mul_compute.cc DEPS ${lite_kernel_deps add_kernel(concat_compute_x86 X86 basic SRCS concat_compute.cc DEPS ${lite_kernel_deps}) add_kernel(shape_compute_x86 X86 basic SRCS shape_compute.cc DEPS ${lite_kernel_deps}) add_kernel(sequence_pool_compute_x86 X86 basic SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} sequence_pooling) +add_kernel(search_group_padding_compute_x86 X86 basic SRCS search_group_padding_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(sequence_reverse_compute_x86 X86 basic SRCS sequence_reverse_compute.cc DEPS ${lite_kernel_deps}) add_kernel(softmax_compute_x86 X86 basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps} softmax) add_kernel(elementwise_compute_x86 X86 basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps}) add_kernel(batch_norm_compute_x86 X86 basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps}) add_kernel(reduce_sum_compute_x86 X86 basic SRCS reduce_compute.cc DEPS ${lite_kernel_deps}) add_kernel(lookup_table_compute_x86 X86 basic SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps}) add_kernel(sequence_reshape_compute_x86 X86 basic SRCS sequence_reshape_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(match_matrix_tensor_compute_x86 X86 basic SRCS match_matrix_tensor_compute.cc DEPS ${lite_kernel_deps} blas math_function) +add_kernel(search_seq_depadding_compute_x86 X86 basic SRCS search_seq_depadding_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(search_grnn_compute_x86 X86 basic SRCS search_grnn_compute.cc DEPS ${lite_kernel_deps} blas math_function) +add_kernel(sequence_concat_compute_x86 X86 basic SRCS sequence_concat_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(var_conv_2d_compute_x86 X86 basic SRCS var_conv_2d_compute.cc DEPS ${lite_kernel_deps} blas fluid_data_type) +add_kernel(attention_padding_mask_compute_x86 X86 basic SRCS attention_padding_mask_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(sequence_arithmetic_compute_x86 X86 basic SRCS sequence_arithmetic_compute.cc DEPS ${lite_kernel_deps}) + +# for content-dnn specific +add_kernel(search_aligned_mat_mul_compute_x86 X86 extra SRCS search_aligned_mat_mul_compute.cc DEPS ${lite_kernel_deps} blas) +add_kernel(search_seq_fc_compute_x86 X86 extra SRCS search_seq_fc_compute.cc DEPS ${lite_kernel_deps} blas) 
+add_kernel(sequence_topk_avg_pooling_compute_x86 X86 basic SRCS sequence_topk_avg_pooling_compute.cc DEPS ${lite_kernel_deps} sequence_topk_avg_pooling) +add_kernel(search_fc_compute_x86 X86 basic SRCS search_fc_compute.cc DEPS ${lite_kernel_deps} search_fc) if(NOT LITE_WITH_X86) return() @@ -47,12 +66,14 @@ add_kernel(matmul_compute_x86 X86 basic SRCS matmul_compute.cc DEPS ${lite_kerne lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86) lite_cc_test(test_mul_compute_x86 SRCS mul_compute_test.cc DEPS mul_compute_x86) +lite_cc_test(test_gather_compute_x86 SRCS gather_compute_test.cc DEPS gather_compute_x86) lite_cc_test(test_slice_compute_x86 SRCS slice_compute_test.cc DEPS slice_compute_x86) lite_cc_test(test_squeeze_compute_x86 SRCS squeeze_compute_test.cc DEPS squeeze_compute_x86) lite_cc_test(test_fill_constant_batch_size_like_compute_x86 SRCS fill_constant_batch_size_like_compute_test.cc DEPS fill_constant_batch_size_like_compute_x86) lite_cc_test(test_reshape_compute_x86 SRCS reshape_compute_test.cc DEPS reshape_compute_x86) lite_cc_test(test_concat_compute_x86 SRCS concat_compute_test.cc DEPS concat_compute_x86) lite_cc_test(test_sequence_pool_compute_x86 SRCS sequence_pool_compute_test.cc DEPS sequence_pool_compute_x86) +lite_cc_test(test_sequence_reverse_compute_x86 SRCS sequence_reverse_compute_test.cc DEPS sequence_reverse_compute_x86) lite_cc_test(test_shape_compute_x86 SRCS shape_compute_test.cc DEPS shape_compute_x86) lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86) lite_cc_test(test_softmax_compute_x86 SRCS softmax_compute_test.cc DEPS softmax_compute_x86) @@ -63,7 +84,19 @@ lite_cc_test(test_gelu_compute_x86 SRCS gelu_compute_test.cc DEPS activation_com lite_cc_test(test_sequence_expand_as_compute_x86 SRCS sequence_expand_as_compute_test.cc DEPS sequence_expand_as_compute_x86) lite_cc_test(test_gru_compute_x86 SRCS gru_compute_test.cc DEPS gru_compute_x86) lite_cc_test(test_matmul_compute_x86 SRCS matmul_compute_test.cc DEPS matmul_compute_x86) - +lite_cc_test(test_cast_compute_x86 SRCS cast_compute_test.cc DEPS cast_compute_x86) lite_cc_test(test_pool2d_compute_x86 SRCS pool_compute_test.cc DEPS pool_compute_x86) +lite_cc_test(test_layer_norm_compute_x86 SRCS layer_norm_compute_test.cc DEPS layer_norm_compute_x86) lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86) lite_cc_test(test_transpose_compute_x86 SRCS transpose_compute_test.cc DEPS transpose_compute_x86) +lite_cc_test(test_search_fc_compute_x86 SRCS search_fc_compute_test.cc DEPS search_fc_compute_x86) +lite_cc_test(test_search_seq_depadding_compute_x86 SRCS search_seq_depadding_compute_test.cc DEPS search_seq_depadding_compute_x86) +lite_cc_test(test_search_grnn_compute_x86 SRCS search_grnn_compute_test.cc DEPS search_grnn_compute_x86) +lite_cc_test(test_match_matrix_compute_x86 SRCS match_matrix_tensor_compute_test.cc DEPS match_matrix_tensor_compute_x86) +lite_cc_test(test_lookup_table_compute_x86 SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_x86) +lite_cc_test(test_stack_compute_x86 SRCS stack_compute_test.cc DEPS stack_compute_x86) +lite_cc_test(test_search_group_padding_compute_x86 SRCS search_group_padding_compute_test.cc DEPS search_group_padding_compute_x86) +lite_cc_test(test_sequence_concat_compute_x86 SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_x86) +lite_cc_test(test_var_conv_2d_compute_x86 SRCS var_conv_2d_compute_test.cc DEPS 
var_conv_2d_compute_x86) +#lite_cc_test(test_attention_padding_mask_compute_x86 SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_x86) +lite_cc_test(test_sequence_arithmetic_compute_x86 SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_x86) diff --git a/lite/kernels/x86/attention_padding_mask_compute.cc b/lite/kernels/x86/attention_padding_mask_compute.cc new file mode 100644 index 0000000000..0c35c416e7 --- /dev/null +++ b/lite/kernels/x86/attention_padding_mask_compute.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/attention_padding_mask_compute.h" + +REGISTER_LITE_KERNEL( + search_attention_padding_mask, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::AttentionPaddingMaskCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("pad_begin", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/attention_padding_mask_compute.h b/lite/kernels/x86/attention_padding_mask_compute.h new file mode 100644 index 0000000000..b9124e5ad4 --- /dev/null +++ b/lite/kernels/x86/attention_padding_mask_compute.h @@ -0,0 +1,83 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/fluid/eigen.h" +#include "lite/operators/attention_padding_mask_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class AttentionPaddingMaskCompute + : public KernelLite { + public: + using param_t = operators::AttentionPaddingMaskParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto* bottom0 = param.X; + auto* bottom1 = param.Y; + auto* _pad_begin = param.pad_begin; + auto* top = param.Out; + int _pad_id = param.pad_id; + float _mask = param.mask; + auto src_len = static_cast(bottom1->lod()[0][1]); + const int att_batch = bottom0->lod()[0].size() - 1; + const int src_batch = bottom1->lod()[0].size() - 1; + int* pad_begin = _pad_begin->mutable_data(); + for (int i = 0; i < src_batch; ++i) { + const auto* src_data = bottom1->data() + src_len * i; + int index = src_len - 1; + for (; index >= 0 && _pad_id == static_cast(src_data[index]); + --index) { + } + pad_begin[i] = index + 1; + } + + const auto att_len = static_cast(bottom0->lod()[0][1]); + auto* top_data = top->mutable_data(); + memcpy(top_data, + bottom0->data(), + bottom0->dims()[0] * bottom0->dims()[1] * sizeof(T)); + for (int i = 0; i < att_batch; ++i) { + for (int j = 0; j < att_len; ++j) { + top_data = top->mutable_data() + src_len * (att_len * i + j); + int src_idx = i % src_batch; + for (int k = pad_begin[src_idx]; k < src_len; ++k) { + top_data[k] = _mask; + } + } + } + } + + virtual ~AttentionPaddingMaskCompute() = default; + + private: + lite::Tensor src_offset_; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/attention_padding_mask_compute_test.cc b/lite/kernels/x86/attention_padding_mask_compute_test.cc new file mode 100644 index 0000000000..35ce822e01 --- /dev/null +++ b/lite/kernels/x86/attention_padding_mask_compute_test.cc @@ -0,0 +1,132 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
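// A minimal sketch, for illustration only, of the pad_begin logic in
// AttentionPaddingMaskCompute above: each source row is scanned from the back
// while the value equals pad_id, and every position from pad_begin to src_len
// in the matching attention rows is then overwritten with the mask value.
static int ComputePadBegin(const float* src_row, int src_len, int pad_id) {
  int index = src_len - 1;
  while (index >= 0 && static_cast<int>(src_row[index]) == pad_id) {
    --index;
  }
  return index + 1;  // first position that belongs to padding
}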
+ +#include "lite/kernels/x86/attention_padding_mask_compute.cc" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +void attention_padding_mask_ref( + const Tensor& x, + const Tensor& y, + Tensor* out, + Tensor* pad_begin, + const operators::AttentionPaddingMaskParam& param) { + auto attn_offset = x.lod()[0]; + auto src_offset = y.lod()[0]; + int attn_seq_num = attn_offset.size() - 1; + int src_seq_num = src_offset.size() - 1; + int attn_seq_len = attn_offset[1]; + int src_seq_len = x.dims()[1]; + CHECK_EQ(attn_seq_num % src_seq_num, 0); + + auto count = x.numel(); + auto attn_data = x.data(); + out->Resize(x.dims()); + out->set_lod(x.lod()); + auto out_data = out->mutable_data(); + memcpy(out_data, attn_data, count * sizeof(float)); + + for (int i = 0; i < attn_seq_num; ++i) { + for (int j = 0; j < attn_seq_len; ++j) { + auto tmp_out_data = out_data + src_seq_len * (attn_seq_len * i + j); + int src_seq_idx = i % src_seq_num; + int cur_len = src_offset[src_seq_idx + 1] - src_offset[src_seq_idx]; + for (int k = cur_len; k < src_seq_len; k++) { + tmp_out_data[k] = param.mask; + } + } + } +} + +void prepare_input(Tensor* x, const LoD& lod, int64_t dim2rd) { + std::vector x_dims{static_cast(lod[0].back()), dim2rd}; + x->Resize(x_dims); + x->set_lod(lod); + auto x_data = x->mutable_data(); + auto x_num = x->numel(); + for (int i = 0; i < x_num; i++) { + x_data[i] = (i - x_num) * 1.1; + } +} + +int get_max_len(const LoD& lod) { + int max_len = 0; + auto offset = lod[0]; + for (int i = 0; i < offset.size() - 1; i++) { + int cur_len = offset[i + 1] - offset[i]; + max_len = max_len < cur_len ? cur_len : max_len; + } + return max_len; +} + +TEST(attention_padding_mask_x86, retrive_op) { + auto attention_padding_mask = + KernelRegistry::Global().Create( + "attention_padding_mask"); + ASSERT_FALSE(attention_padding_mask.empty()); + ASSERT_TRUE(attention_padding_mask.front()); +} + +TEST(attention_padding_mask_x86, init) { + AttentionPaddingMaskCompute attention_padding_mask; + ASSERT_EQ(attention_padding_mask.precision(), PRECISION(kFloat)); + ASSERT_EQ(attention_padding_mask.target(), TARGET(kX86)); +} + +TEST(attention_padding_mask_x86, run_test) { + lite::Tensor x, y; + lite::Tensor out, pad_begin, out_ref, pad_begin_ref; + + LoD x_lod{{0, 3, 6, 9, 12}}, y_lod{{0, 4, 6}}; + prepare_input(&x, x_lod, get_max_len(y_lod)); + prepare_input(&y, y_lod, 1); + + operators::AttentionPaddingMaskParam param; + param.X = &x; + param.Y = &y; + param.pad_id = 12800001; + param.mask = -90000000.f; + param.Out = &out; + param.pad_begin = &pad_begin; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + AttentionPaddingMaskCompute attention_padding_mask_kernel; + attention_padding_mask_kernel.SetParam(param); + attention_padding_mask_kernel.SetContext(std::move(ctx)); + attention_padding_mask_kernel.Run(); + + attention_padding_mask_ref(x, y, &out_ref, &pad_begin_ref, param); + auto out_data = out.data(); + auto out_ref_data = out_ref.data(); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_attention_padding_mask, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/cast_compute.cc b/lite/kernels/x86/cast_compute.cc new file mode 100644 index 0000000000..d342056c7f --- /dev/null +++ b/lite/kernels/x86/cast_compute.cc @@ -0,0 
+1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/cast_compute.h" + +REGISTER_LITE_KERNEL(cast, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::CastCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/cast_compute.h b/lite/kernels/x86/cast_compute.h new file mode 100644 index 0000000000..06e47e9a50 --- /dev/null +++ b/lite/kernels/x86/cast_compute.h @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/fluid/data_type.h" +#include "lite/fluid/hostdevice.h" +#include "lite/fluid/transform.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +struct CastOpTransformFunctor { + HOSTDEVICE OutT operator()(InT in) const { return static_cast(in); } +}; + +template +class CastOpFunctor { + public: + CastOpFunctor(const lite::Tensor* in, + lite::Tensor* out, + const lite::Context& context) + : input(in), output(out), ctx(context) {} + + template + void apply() const { + auto* in_begin = input->data(); + auto numel = input->dims().production(); + auto* in_end = in_begin + numel; + auto* out_begin = output->mutable_data(); + paddle::lite::fluid::Transform trans; + trans( + ctx, in_begin, in_end, out_begin, CastOpTransformFunctor()); + } + + private: + const lite::Tensor* input; + lite::Tensor* output; + const lite::Context& ctx; +}; + +template +class CastCompute : public KernelLite { + public: + using param_t = operators::CastParam; + + void Run() override { + auto param = param_.get_mutable(); + auto& context = ctx_->As(); + auto x = param->X; + auto out = param->Out; + auto out_dtype = param->out_dtype; + paddle::lite::fluid::VisitDataType( + static_cast(out_dtype), + CastOpFunctor(x, out, context)); + } + virtual ~CastCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/cast_compute_test.cc b/lite/kernels/x86/cast_compute_test.cc new file mode 100644 index 0000000000..f7aa52ca6d --- /dev/null +++ b/lite/kernels/x86/cast_compute_test.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/cast_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(cast_x86, retrive_op) { + auto cast = + KernelRegistry::Global().Create("cast"); + ASSERT_FALSE(cast.empty()); + ASSERT_TRUE(cast.front()); +} + +TEST(cast_x86, init) { + CastCompute cast; + ASSERT_EQ(cast.precision(), PRECISION(kFloat)); + ASSERT_EQ(cast.target(), TARGET(kX86)); +} + +TEST(cast_x86, run_test) { + lite::Tensor x, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 1, 3, 3}; + x.Resize(lite::DDim(x_shape)); + + std::vector out_shape{batch_size, 1, 3, 3}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(1); + } + + CastCompute cast; + operators::CastParam param; + param.X = &x; + param.Out = &out; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + cast.SetContext(std::move(ctx)); + cast.SetParam(param); + cast.Run(); + + std::vector ref_results = {1, 1, 1, 1, 1, 1, 1, 1, 1}; + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_results[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(cast, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/conv_compute.h b/lite/kernels/x86/conv_compute.h index 48cb3c74ef..e9f403059f 100644 --- a/lite/kernels/x86/conv_compute.h +++ b/lite/kernels/x86/conv_compute.h @@ -67,7 +67,7 @@ class Conv2dCompute : public KernelLite { lite::DDim col_shape(col_shape_vec); lite::DDim col_matrix_shape = col_shape.Flatten2D(data_dim + 1); bool is_expand = IsExpand( - filter_shape_vec, param.strides, param.paddings, param.dilations); + filter_shape_vec, param.strides, *param.paddings, *param.dilations); lite::Tensor col; lite::Tensor col_matrix; if (is_expand) { @@ -95,20 +95,15 @@ class Conv2dCompute : public KernelLite { auto blas = paddle::lite::x86::math::GetBlas(context); for (int i = 0; i < batch_size; i++) { - lite::Tensor in_batch; - lite::Tensor tmp_in_batch = param.x->Slice(i, i + 1); - tmp_in_batch.Resize(input_shape); - in_batch.ShareDataWith(tmp_in_batch); - lite::Tensor out_batch; - lite::Tensor tmp_out_batch = param.output->Slice(i, i + 1); - tmp_out_batch.Resize(output_matrix_shape); - out_batch.ShareDataWith(tmp_out_batch); + lite::Tensor in_batch = param.x->Slice(i, i + 1); + in_batch.Resize(input_shape); + lite::Tensor out_batch = param.output->Slice(i, i + 1); + out_batch.Resize(output_matrix_shape); for (int g = 0; g < param.groups; g++) { - lite::Tensor in_slice; - in_slice.ShareDataWith( + lite::Tensor in_slice = in_batch.Slice(static_cast(g * in_step), - static_cast((g + 1) * in_step))); - + static_cast((g + 1) * in_step)); + auto paddings = *param.paddings; if (!is_expand) 
{ col.ShareDataWith(in_slice); col_matrix.ShareDataWith(col); @@ -117,32 +112,30 @@ class Conv2dCompute : public KernelLite { // im2col im2col(context, in_slice, - param.dilations, + *param.dilations, param.strides, - std::vector{param.paddings[0], - param.paddings[1], - param.paddings[0], - param.paddings[1]}, + std::vector{ + paddings[0], paddings[2], paddings[0], paddings[2]}, &(col)); } else if (data_dim == 3U) { // vol2col vol2col(context, in_slice, - param.dilations, + *param.dilations, param.strides, - param.paddings, + *param.paddings, &(col)); } // gemm lite::Tensor out_slice; - out_slice.ShareDataWith( + out_slice = out_batch.Slice(static_cast(g * out_step), - static_cast((g + 1) * out_step))); + static_cast((g + 1) * out_step)); lite::Tensor filter_slice; - filter_slice.ShareDataWith( + filter_slice = filter.Slice(static_cast(g * out_step), - static_cast((g + 1) * out_step))); + static_cast((g + 1) * out_step)); blas.MatMul(filter_slice, false, col_matrix, diff --git a/lite/kernels/x86/conv_compute_test.cc b/lite/kernels/x86/conv_compute_test.cc index f2dde962b9..2827c6577e 100644 --- a/lite/kernels/x86/conv_compute_test.cc +++ b/lite/kernels/x86/conv_compute_test.cc @@ -73,9 +73,11 @@ TEST(conv2d_x86, run_test) { param.bias = &b; param.output = &out; param.strides = {1, 1}; - param.paddings = {0, 0}; + std::vector paddings = {0, 0, 0, 0}; param.groups = 1; - param.dilations = {1, 1}; + std::vector dilations = {1, 1}; + param.paddings = std::make_shared>(paddings); + param.dilations = std::make_shared>(dilations); LOG(INFO) << 123; std::unique_ptr ctx(new KernelContext); ctx->As(); diff --git a/lite/kernels/x86/fill_constant_compute.cc b/lite/kernels/x86/fill_constant_compute.cc index 1eb76332cc..dace1e9025 100644 --- a/lite/kernels/x86/fill_constant_compute.cc +++ b/lite/kernels/x86/fill_constant_compute.cc @@ -29,6 +29,38 @@ class FillConstantCompute : public KernelLite { public: using param_t = operators::FillConstantParam; + inline DDimLite GetShape(const param_t& param) { + // 1. shape is a Tensor + if (param.shape_tensor != nullptr) { + auto* shape_tensor = param.shape_tensor; + auto* shape_data = shape_tensor->data(); + auto vec_shape = + std::vector(shape_data, shape_data + shape_tensor->numel()); + return DDimLite(vec_shape); + } + + // 2. shape is a list/tuple containing Tensor + auto shape_tensor_list = param.shape_tensor_list; + if (shape_tensor_list.size() > 0) { + std::vector vec_shape; + for (size_t i = 0; i < shape_tensor_list.size(); ++i) { + auto tensor = shape_tensor_list[i]; + vec_shape.push_back(*tensor->data()); + } + return DDimLite(vec_shape); + } + + // 3. 
shape is a list/tuple without containing Tensor + auto vec_shape = param.shape; + return DDimLite(vec_shape); + } + + void PrepareForRun() override { + auto& param = *param_.get_mutable(); + auto outdims = GetShape(param); + param.Out->Resize(outdims); + } + void Run() override { auto& param = *param_.get_mutable(); auto& context = ctx_->As(); @@ -55,5 +87,9 @@ REGISTER_LITE_KERNEL(fill_constant, kNCHW, paddle::lite::kernels::x86::FillConstantCompute, def) + .BindInput("ShapeTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("ShapeTensorList", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); diff --git a/lite/kernels/x86/gather_compute.cc b/lite/kernels/x86/gather_compute.cc new file mode 100644 index 0000000000..836f336271 --- /dev/null +++ b/lite/kernels/x86/gather_compute.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/gather_compute.h" + +typedef paddle::lite::kernels::x86::GatherCompute GatherInt32; +typedef paddle::lite::kernels::x86::GatherCompute GatherInt64; + +REGISTER_LITE_KERNEL(gather, kX86, kFloat, kNCHW, GatherInt32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); + +REGISTER_LITE_KERNEL(gather, kX86, kFloat, kNCHW, GatherInt64, int64_in) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/gather_compute.h b/lite/kernels/x86/gather_compute.h new file mode 100644 index 0000000000..6ee270647f --- /dev/null +++ b/lite/kernels/x86/gather_compute.h @@ -0,0 +1,99 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
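// Note: the two REGISTER_LITE_KERNEL(gather, ...) entries above ("def" and
// "int64_in") bind one float gather computation to int32 and int64 index
// tensors respectively. A standalone sketch of that pattern, using
// hypothetical helper names for illustration only:
#include <cstdint>
#include <vector>

template <typename IndexT>
std::vector<float> GatherRows(const std::vector<float>& src, int64_t row_width,
                              const std::vector<IndexT>& index) {
  std::vector<float> out(index.size() * row_width);
  for (size_t i = 0; i < index.size(); ++i) {
    const int64_t row = static_cast<int64_t>(index[i]);
    for (int64_t j = 0; j < row_width; ++j) {
      out[i * row_width + j] = src[row * row_width + j];
    }
  }
  return out;
}
// The same body serves both registrations:
//   GatherRows<int32_t>(...)  -> the "def" kernel (kInt32 Index)
//   GatherRows<int64_t>(...)  -> the "int64_in" kernel (kInt64 Index)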
+ +#pragma once + +#include +#include "lite/api/paddle_place.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/fluid/data_type.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +/** + * A thin wrapper for gathering on cpu tensor + * Return a new tensor from source tensor, gathered according to index + * input[src]: type-T source Tensor + * input[index]: type-IndexT index Tensor (1-D) + * return: output tensor + */ +template +void CPUGather(const lite::Tensor* src, + const lite::Tensor* index, + lite::Tensor* output) { + // check index of shape 1-D + if (index->dims().size() == 2) { + CHECK(index->dims()[1] == 1) << "Index(Input)'s dimension[1] should be 1 " + "when Index(input)'s dimension's size " + "equal to 2 in Gather(Op)."; + } else { + CHECK(index->dims().size() == 1) + << "Index(Input)'s dimension's size() should be 1 or 2 in Gather(Op)."; + } + int64_t index_size = index->dims()[0]; + + auto src_dims = src->dims(); + + const T* p_src = src->data(); + const IndexT* p_index = index->data(); + T* p_output = output->mutable_data(); + + // slice size + int slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + const size_t slice_bytes = slice_size * sizeof(T); + for (int64_t i = 0; i < index_size; ++i) { + int index_ = p_index[i]; + memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes); + } +} + +template +class GatherCompute : public KernelLite { + public: + using param_t = operators::GatherParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + auto x = param.X; + auto index = param.Index; + auto out = param.Out; + + out->mutable_data(); + if (x->dims().production() == 0) return; + /* + * Since there's no type defined for lite::Tensor in Paddle-Lite, then + * convert the Index's value to float which must be int32_t or int64_t and + * this supposes to cause no precision difference during inference just for + * now. + * Alternatively, if define the Tensor's type during registering, may cause + * a redefinition error. + */ + CPUGather(x, index, out); + } + + virtual ~GatherCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/gather_compute_test.cc b/lite/kernels/x86/gather_compute_test.cc new file mode 100644 index 0000000000..286dfcb08a --- /dev/null +++ b/lite/kernels/x86/gather_compute_test.cc @@ -0,0 +1,159 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
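// Note: a worked example of the CPUGather copy above (illustration only).
// For src dims {10, 20}, slice_size is 20 floats (80 bytes per row); with
// Index = {1, 3, 5} the loop issues three memcpy calls that copy source
// rows 1, 3 and 5 into output rows 0, 1 and 2, giving an output of dims
// {3, 20}. This is exactly the 2-D case exercised in the test below.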
+ +#include "lite/kernels/x86/gather_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(gather_x86, retrive_op) { + auto gather = + KernelRegistry::Global().Create( + "gather"); + ASSERT_FALSE(gather.empty()); + int cnt = 0; + for (auto item = gather.begin(); item != gather.end(); ++item) { + cnt++; + ASSERT_TRUE(*item); + } + ASSERT_EQ(cnt, 2); +} + +TEST(gather_x86, int32_init) { + GatherCompute gather; + ASSERT_EQ(gather.precision(), PRECISION(kFloat)); + ASSERT_EQ(gather.target(), TARGET(kX86)); +} + +TEST(gather_x86, int64_init) { + GatherCompute gather; + ASSERT_EQ(gather.precision(), PRECISION(kFloat)); + ASSERT_EQ(gather.target(), TARGET(kX86)); +} + +template +void test_case_1dims() { + lite::Tensor x, index, out; + std::vector x_shape{10}; + x.Resize(lite::DDim(x_shape)); + std::vector index_shape{3}; + index.Resize(lite::DDim(index_shape)); + std::vector out_shape{3}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto index_data = index.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); ++i) { + x_data[i] = static_cast(i); + } + std::vector index_value{1, 3, 5}; + for (int i = 0; i < index.dims().production(); ++i) { + index_data[i] = static_cast(index_value[i]); + } + + GatherCompute gather; + operators::GatherParam param; + + param.X = &x; + param.Index = &index; + param.Out = &out; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + gather.SetContext(std::move(ctx)); + gather.SetParam(param); + gather.Run(); + + std::vector ref_data{1, 3, 5}; + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); + } +} + +template +void test_case_2dims() { + lite::Tensor x, index, out; + std::vector x_shape{10, 20}; + x.Resize(lite::DDim(x_shape)); + std::vector index_shape{3}; + index.Resize(lite::DDim(index_shape)); + std::vector out_shape{3, 20}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto index_data = index.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); ++i) { + x_data[i] = static_cast(i); + } + std::vector index_value{1, 3, 5}; + for (int i = 0; i < index.dims().production(); ++i) { + index_data[i] = static_cast(index_value[i]); + } + + GatherCompute gather; + operators::GatherParam param; + + param.X = &x; + param.Index = &index; + param.Out = &out; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + gather.SetContext(std::move(ctx)); + gather.SetParam(param); + gather.Run(); + + std::vector ref_data(60); + for (int i = 0; i < 20; ++i) { + ref_data[i] = static_cast(20 + i); + } + for (int i = 20; i < 40; ++i) { + ref_data[i] = static_cast(40 + i); + } + for (int i = 40; i < 60; ++i) { + ref_data[i] = static_cast(60 + i); + } + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); + } +} + +TEST(gather_x86, run_test_1dims) { + test_case_1dims(); + test_case_1dims(); +} + +TEST(gather_x86, run_test_2dims) { + test_case_2dims(); + test_case_2dims(); +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(gather, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(gather, kX86, kFloat, kNCHW, int64_in); diff --git a/lite/kernels/x86/layer_norm_compute.cc b/lite/kernels/x86/layer_norm_compute.cc new file mode 100644 index 0000000000..4854a69a1d 
--- /dev/null +++ b/lite/kernels/x86/layer_norm_compute.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/layer_norm_compute.h" + +REGISTER_LITE_KERNEL(layer_norm, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::LayerNormCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Mean", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Variance", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/layer_norm_compute.h b/lite/kernels/x86/layer_norm_compute.h new file mode 100644 index 0000000000..bbbdb91deb --- /dev/null +++ b/lite/kernels/x86/layer_norm_compute.h @@ -0,0 +1,91 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
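// Note: a naive standalone reference (illustration only, not the jitted
// LayerNorm kernel used below) of the computation this kernel performs
// after Flatten2D: the input is viewed as a [left, right] matrix, each row
// is normalized by its own mean and variance, then scaled and shifted per
// column by Scale and Bias.
#include <cmath>

inline void LayerNormRef(const float* x, const float* scale, const float* bias,
                         int left, int right, float epsilon, float* y,
                         float* mean, float* var) {
  for (int i = 0; i < left; ++i) {
    const float* row = x + i * right;
    float m = 0.f, v = 0.f;
    for (int j = 0; j < right; ++j) m += row[j];
    m /= right;
    for (int j = 0; j < right; ++j) v += (row[j] - m) * (row[j] - m);
    v /= right;
    mean[i] = m;
    var[i] = v;
    const float inv_std = 1.f / std::sqrt(v + epsilon);
    for (int j = 0; j < right; ++j) {
      y[i * right + j] = (row[j] - m) * inv_std * scale[j] + bias[j];
    }
  }
}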
+ +#pragma once + +#include "lite/backends/x86/jit/helper.h" +#include "lite/backends/x86/jit/kernel_base.h" +#include "lite/backends/x86/jit/kernels.h" +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" +#include "lite/operators/layer_norm_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class LayerNormCompute : public KernelLite { + public: + using param_t = operators::LayerNormParam; + + void Run() override { + auto ¶m = *param_.get_mutable(); + float epsilon = param.epsilon; + auto Scale = param.Scale; + auto Bias = param.Bias; + auto x = param.X; + + auto y = param.Y; + auto Mean = param.Mean; + auto Var = param.Variance; + auto begin_norm_axis = param.begin_norm_axis; + + auto x_dims = x->dims(); + + y->mutable_data(); + Mean->mutable_data(); + Var->mutable_data(); + + auto matrix_dim = x_dims.Flatten2D(begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + lite::DDim matrix_shape({left, right}); + + lite::Tensor in; + in.ShareDataWith(*x); + in.Resize(matrix_shape); + lite::Tensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + + PADDLE_ENFORCE_EQ(Mean->numel(), left); + PADDLE_ENFORCE_EQ(Var->numel(), left); + PADDLE_ENFORCE_EQ(Scale->numel(), right); + PADDLE_ENFORCE_EQ(Bias->numel(), right); + + auto ker = paddle::lite::jit::KernelFuncs, + lite::fluid::CPUPlace>::Cache() + .At(right); + ker(in.mutable_data(), + out.mutable_data(), + Mean->mutable_data(), + Var->mutable_data(), + Scale->data(), + Bias->data(), + static_cast(left), + static_cast(epsilon), + right); + } + + virtual ~LayerNormCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/layer_norm_compute_test.cc b/lite/kernels/x86/layer_norm_compute_test.cc new file mode 100644 index 0000000000..fbac395052 --- /dev/null +++ b/lite/kernels/x86/layer_norm_compute_test.cc @@ -0,0 +1,169 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
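// Note: a worked shape check for the test below (illustration only). With
// x_shape {1, 2, 3, 1} and begin_norm_axis = 0, Flatten2D gives left = 1
// and right = 1 * 2 * 3 * 1 = 6, so Mean and Variance each hold a single
// value, Scale and Bias hold 6 values, and the whole tensor is normalized
// as one row.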
+ +#include "lite/kernels/x86/layer_norm_compute.h" +#include +#include +#include +#include +#include "lite/backends/x86/jit/helper.h" +#include "lite/backends/x86/jit/kernel_base.h" +#include "lite/backends/x86/jit/kernels.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +std::vector ref(lite::Tensor* x, + lite::Tensor* Scale, + lite::Tensor* Bias, + lite::Tensor* y, + lite::Tensor* Mean, + lite::Tensor* Var, + int begin_norm_axis, + float epsilon) { + auto x_dims = x->dims(); + + y->mutable_data(); + Mean->mutable_data(); + Var->mutable_data(); + + auto matrix_dim = x_dims.Flatten2D(begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + lite::DDim matrix_shape({left, right}); + + x->Resize(matrix_shape); + Tensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + + auto ker = paddle::lite::jit::KernelFuncs, + lite::fluid::CPUPlace>::Cache() + .At(right); + ker(x->mutable_data(), + out.mutable_data(), + Mean->mutable_data(), + Var->mutable_data(), + Scale->data(), + Bias->data(), + static_cast(left), + static_cast(epsilon), + right); + + std::vector ref_data; + auto result = out.mutable_data(); + for (int i = 0; i < y->dims().production(); ++i) { + ref_data.emplace_back(result[i]); + } + return ref_data; +} + +// layer_norm +TEST(layer_norm_x86, retrive_op) { + auto layer_norm = + KernelRegistry::Global().Create( + "layer_norm"); + ASSERT_FALSE(layer_norm.empty()); + ASSERT_TRUE(layer_norm.front()); +} + +TEST(layer_norm_x86, init) { + lite::kernels::x86::LayerNormCompute layer_norm; + ASSERT_EQ(layer_norm.precision(), PRECISION(kFloat)); + ASSERT_EQ(layer_norm.target(), TARGET(kX86)); +} + +TEST(layer_norm_x86, run_test) { + lite::Tensor x; + lite::Tensor Scale; + lite::Tensor Bias; + + lite::Tensor out; + lite::Tensor Mean; + lite::Tensor Var; + + std::vector x_shape({1, 2, 3, 1}); + x.Resize(lite::DDim(x_shape)); + std::vector out_shape({1, 2, 3, 1}); + out.Resize(lite::DDim(out_shape)); + + int begin_norm_axis = 0; + float epsilon = 1e-5; + int pre = 1; + int post = 1; + for (int i = 0; i < begin_norm_axis; ++i) { + pre *= x_shape[i]; + } + for (int i = begin_norm_axis; i < x_shape.size(); ++i) { + post *= x_shape[i]; + } + std::vector scale_shape({post}); + Scale.Resize(scale_shape); + std::vector bias_shape({post}); + Bias.Resize(bias_shape); + + auto x_data = x.mutable_data(); + auto scale_data = Scale.mutable_data(); + auto bias_data = Bias.mutable_data(); + auto out_data = out.mutable_data(); + auto mean_data = Mean.mutable_data(); + auto var_data = Var.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); ++i) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < Scale.dims().production(); ++i) { + scale_data[i] = 1.5; + } + for (int64_t i = 0; i < Bias.dims().production(); ++i) { + bias_data[i] = 0.25; + } + + LayerNormCompute layer_norm; + operators::LayerNormParam param; + + param.X = &x; + param.Y = &out; + param.Scale = &Scale; + param.Bias = &Bias; + param.Mean = &Mean; + param.Variance = &Var; + param.begin_norm_axis = begin_norm_axis; + param.epsilon = epsilon; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + layer_norm.SetContext(std::move(ctx)); + layer_norm.SetParam(param); + layer_norm.Run(); + + std::vector ref_data = + ref(&x, &Scale, &Bias, &out, &Mean, &Var, begin_norm_axis, epsilon); + for (int j = 0; j < out.dims().production(); ++j) { + EXPECT_NEAR(out_data[j], ref_data[j], 1e-5); + // LOG(INFO) << 
out_data[j]; + } + LOG(INFO) << *mean_data; + LOG(INFO) << *var_data; +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(layer_norm, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/lookup_table_compute.cc b/lite/kernels/x86/lookup_table_compute.cc index 364593251e..856a07a94c 100644 --- a/lite/kernels/x86/lookup_table_compute.cc +++ b/lite/kernels/x86/lookup_table_compute.cc @@ -32,3 +32,13 @@ REGISTER_LITE_KERNEL(lookup_table, .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); +REGISTER_LITE_KERNEL(lookup_table_v2, + kX86, + kInt64, + kNCHW, + paddle::lite::kernels::x86::LookupTableCompute, + def) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/lookup_table_compute.h b/lite/kernels/x86/lookup_table_compute.h index e0d7752ca7..d5719f332c 100644 --- a/lite/kernels/x86/lookup_table_compute.h +++ b/lite/kernels/x86/lookup_table_compute.h @@ -30,7 +30,6 @@ class LookupTableCompute : public KernelLite { void Run() override { auto ¶m = *param_.get_mutable(); - // auto& context = context_->As(); auto *ids_t = param.Ids; auto *output_t = param.Out; int64_t padding_idx = param.padding_idx; @@ -41,18 +40,18 @@ class LookupTableCompute : public KernelLite { int64_t row_number = table_t->dims()[0]; int64_t row_width = table_t->dims()[1]; - auto *table = table_t->data(); - auto *output = output_t->mutable_data(); - memset(output, 0, output_t->dims().production() * sizeof(float)); + auto *table = table_t->data(); + auto *output = output_t->mutable_data(); + memset(output, 0, output_t->dims().production() * sizeof(T)); for (int64_t i = 0; i < ids_numel; ++i) { if (padding_idx != -1 && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(float)); + memset(output + i * row_width, 0, row_width * sizeof(T)); } else { CHECK_LT(ids[i], row_number); CHECK_GE(ids[i], 0); memcpy(output + i * row_width, table + ids[i] * row_width, - row_width * sizeof(float)); + row_width * sizeof(T)); } } } diff --git a/lite/kernels/x86/lookup_table_compute_test.cc b/lite/kernels/x86/lookup_table_compute_test.cc new file mode 100644 index 0000000000..86b2d39186 --- /dev/null +++ b/lite/kernels/x86/lookup_table_compute_test.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
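// Note: a minimal standalone sketch (illustration only) of the lookup
// behaviour templated above: each id selects one row of width row_width
// from the table W, and ids equal to padding_idx yield an all-zero row.
// The helper name is hypothetical.
#include <cstdint>
#include <cstring>

template <typename T>
void LookupRows(const T* table, int64_t row_width, const int64_t* ids,
                int64_t ids_num, int64_t padding_idx, T* out) {
  for (int64_t i = 0; i < ids_num; ++i) {
    if (padding_idx != -1 && ids[i] == padding_idx) {
      std::memset(out + i * row_width, 0, row_width * sizeof(T));
    } else {
      std::memcpy(out + i * row_width, table + ids[i] * row_width,
                  row_width * sizeof(T));
    }
  }
}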
+ +#include "lite/kernels/x86/lookup_table_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(lookup_table_x86, compute) { + LookupTableCompute lookup_table; + operators::LookupTableParam param; + lite::Tensor w, ids, out, out_ref; + int64_t padding_idx = -1; + + int vocab_size = 40; + int emb_size = 50; + int ids_h = 30; + int ids_w = 20; + + auto w_dim = DDim({vocab_size, emb_size}); + auto ids_dim = DDim({ids_h, ids_w}); + auto out_dim = DDim({ids_h, ids_w, emb_size}); + + w.Resize(w_dim); + ids.Resize(ids_dim); + out.Resize(out_dim); + out_ref.Resize(out_dim); + + auto* w_data = w.mutable_data(); + auto* ids_data = ids.mutable_data(); + auto* out_data = out.mutable_data(); + auto* out_ref_data = out_ref.mutable_data(); + + int w_num = w_dim.production(); + for (int i = 0; i < w_num; i++) { + w_data[i] = static_cast(i + 1) / (w_num + 1); + } + int ids_num = ids_dim.production(); + for (int i = 0; i < ids_num; i++) { + ids_data[i] = i % vocab_size; + } + int out_num = out_dim.production(); + for (int i = 0; i < out_num; i++) { + out_ref_data[i] = + static_cast((i % (vocab_size * emb_size)) + 1) / (w_num + 1); + } + + param.W = &w; + param.Ids = &ids; + param.Out = &out; + param.padding_idx = padding_idx; + lookup_table.SetParam(param); + lookup_table.Run(); + for (int i = 0; i < out_num; i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(lookup_table, kX86, kInt64, kNCHW, def); diff --git a/lite/kernels/x86/match_matrix_tensor_compute.cc b/lite/kernels/x86/match_matrix_tensor_compute.cc new file mode 100644 index 0000000000..feda180d22 --- /dev/null +++ b/lite/kernels/x86/match_matrix_tensor_compute.cc @@ -0,0 +1,142 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/match_matrix_tensor_compute.h" +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +void MatchMatrixTensorCompute::Run() { + auto& context = ctx_->As(); + auto& param = this->Param(); + auto* x = param.x; + auto* w = param.w; + auto* y = param.y; + auto* out = param.out; + auto* tmp = param.tmp; + int dim_t = param.dim_t; + int dim_in = x->dims()[1]; + + const auto& offset_l = x->lod()[0]; + const auto& offset_r = y->lod()[0]; + + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + top_size += dim_t * len_l * len_r; + top_offset.push_back(top_size); + } + + auto* bottom_l_data = x->template data(); + auto* bottom_r_data = y->template data(); + auto* t_data = w->template data(); + auto* out_data = out->template mutable_data(); + auto* bottom_l_trans_data = tmp->template mutable_data(); + memset(out_data, 0.0, out->dims()[0] * out->dims()[1] * sizeof(T)); + memset(bottom_l_trans_data, 0.0, tmp->dims()[0] * tmp->dims()[1] * sizeof(T)); + + auto blas = lite::x86::math::GetBlas(context); + blas.GEMM(CblasNoTrans, + CblasNoTrans, + x->dims()[0], + dim_t * dim_in, + dim_in, + 1.0f, + bottom_l_data, + dim_in, + t_data, + dim_t * dim_in, + 0.0f, + bottom_l_trans_data, + dim_t * dim_in); + + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + for (int t = 0; t < dim_t; t++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + auto* top_data = out_data + top_offset[b] + t * len_l * len_r; + const auto* l_t_data = + bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in; + const auto* r_data = bottom_r_data + offset_r[b] * dim_in; + + auto blas = lite::x86::math::GetBlas(context); + blas.GEMM(CblasNoTrans, + CblasTrans, + len_l, + len_r, + dim_in, + 1.0f, + l_t_data, + dim_t * dim_in, + r_data, + dim_in, + 0.0f, + top_data, + len_r); + } + } + + int batch_size = x->lod()[0].size() - 1; + int lod_lv1_size = batch_size * dim_t; + int lod_lv2_size = x->lod()[0].back() * dim_t; + std::vector out_lod0(batch_size + 1, 0); + std::vector out_lod1(lod_lv1_size + 1, 0); + std::vector out_lod2(lod_lv2_size + 1, 0); + for (int i = 0; i < batch_size; i++) { + out_lod0[i + 1] = out_lod0[i] + dim_t; + int len_l = offset_l[i + 1] - offset_l[i]; + + for (int j = 0; j < dim_t; j++) { + out_lod1[i * dim_t + j + 1] = out_lod1[i * dim_t + j] + len_l; + int len_r = offset_r[i + 1] - offset_r[i]; + + for (int k = 0; k < len_l; k++) { + out_lod2[offset_l[i] * dim_t + j * len_l + k + 1] = + out_lod2[offset_l[i] * dim_t + j * len_l + k] + len_r; + } + } + } + + LoD out_lod; + out_lod.push_back(top_offset); + out_lod.push_back(offset_l); + out_lod.push_back(offset_r); + out->set_lod(out_lod); +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + match_matrix_tensor, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::MatchMatrixTensorCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Tmp", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/match_matrix_tensor_compute.h b/lite/kernels/x86/match_matrix_tensor_compute.h 
new file mode 100644 index 0000000000..6189676fd8 --- /dev/null +++ b/lite/kernels/x86/match_matrix_tensor_compute.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "lite/backends/x86/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class MatchMatrixTensorCompute + : public KernelLite { + public: + using param_t = operators::MatchMatrixTensorParam; + + void Run() override; + + virtual ~MatchMatrixTensorCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/match_matrix_tensor_compute_test.cc b/lite/kernels/x86/match_matrix_tensor_compute_test.cc new file mode 100644 index 0000000000..0c3f3ad509 --- /dev/null +++ b/lite/kernels/x86/match_matrix_tensor_compute_test.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
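// Note: a worked size check for the unit test below (illustration only).
// With ix = 5, iy = 4, h = dim_in = 2, dim_t = 2, x_lod {0, 2, 5} and
// y_lod {0, 3, 4}, the sequence lengths are len_l = {2, 3} and
// len_r = {3, 1}, so Out needs dim_t * (2 * 3) + dim_t * (3 * 1) = 12 + 6
// = 18 entries and Tmp (the X * W buffer) needs ix * dim_t * h = 20,
// matching the Resize({18, 1}) and Resize({20, 1}) calls in the test.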
+ +#include "lite/kernels/x86/match_matrix_tensor_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(match_matrix_tensor_x86, retrive_op) { + auto kernel = + KernelRegistry::Global().Create( + "match_matrix_tensor"); + ASSERT_FALSE(kernel.empty()); + ASSERT_TRUE(kernel.front()); +} + +TEST(match_matrix_tensor_x86, init) { + MatchMatrixTensorCompute mmtc; + ASSERT_EQ(mmtc.precision(), PRECISION(kFloat)); + ASSERT_EQ(mmtc.target(), TARGET(kX86)); +} + +TEST(match_matrix_tensor_x86, run_test) { + int ix = 5, iy = 4, h = 2, dim_t = 2; + lite::Tensor x, w, y, out, tmp; + x.Resize({ix, h}); + w.Resize({h, dim_t, h}); + y.Resize({iy, h}); + out.Resize({18, 1}); + tmp.Resize({20, 1}); + + LoD x_lod{}; + x_lod.push_back({0, 2, 5}); + x.set_lod(x_lod); + LoD y_lod{}; + y_lod.push_back({0, 3, 4}); + y.set_lod(y_lod); + + auto* x_data = x.mutable_data(); + for (int64_t i = 0; i < x.numel(); i++) { + x_data[i] = static_cast(i); + } + auto* y_data = y.mutable_data(); + for (int64_t i = 0; i < y.numel(); i++) { + y_data[i] = static_cast(i); + } + auto* w_data = w.mutable_data(); + for (int64_t i = 0; i < w.numel(); i++) { + w_data[i] = static_cast(i); + } + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + MatchMatrixTensorCompute mmtc; + mmtc.SetContext(std::move(ctx)); + + operators::MatchMatrixTensorParam param; + param.x = &x; + param.w = &w; + param.y = &y; + param.dim_t = dim_t; + param.out = &out; + param.tmp = &tmp; + + mmtc.SetParam(param); + mmtc.Run(); + + std::vector ref_results = {5, + 23, + 41, + 17, + 75, + 133, + 7, + 33, + 59, + 27, + 125, + 223, + 323, + 455, + 587, + 557, + 793, + 1029}; + auto* out_data = out.mutable_data(); + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_results[i], 1e-3); + // LOG(INFO) << out_data[i]; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(match_matrix_tensor, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/mean_compute.cc b/lite/kernels/x86/mean_compute.cc index b618d2d377..1216d99ad8 100644 --- a/lite/kernels/x86/mean_compute.cc +++ b/lite/kernels/x86/mean_compute.cc @@ -54,29 +54,6 @@ class MeanCompute : public KernelLite { virtual ~MeanCompute() = default; }; -template -class MeanGradCompute : public KernelLite { - public: - using param_t = operators::MeanGradParam; - - void Run() override { - auto& param = *param_.get_mutable(); - auto& context = ctx_->As(); - CHECK_EQ(param.Out_grad->raw_tensor().numel(), 1); - CHECK(context.x86_device_context()); - - param.X_grad->template mutable_data(); - T x_grad_size = static_cast(param.X_grad->raw_tensor().numel()); - Eigen::DSizes bcast(static_cast(x_grad_size)); - EigenVector::Flatten(param.X_grad->raw_tensor()) - .device(*(context.x86_device_context()->eigen_device())) = - (EigenVector::From(param.Out_grad->raw_tensor()) / x_grad_size) - .broadcast(bcast); - } - - virtual ~MeanGradCompute() = default; -}; - } // namespace x86 } // namespace kernels } // namespace lite @@ -93,16 +70,3 @@ REGISTER_LITE_KERNEL(mean, .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); - -REGISTER_LITE_KERNEL(mean_grad, - kX86, - kFloat, - kNCHW, - paddle::lite::kernels::x86::MeanGradCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) - .BindInput(paddle::framework::GradVarName("Out"), - 
{LiteType::GetTensorTy(TARGET(kX86))}) - .BindOutput(paddle::framework::GradVarName("X"), - {LiteType::GetTensorTy(TARGET(kX86))}) - .Finalize(); diff --git a/lite/kernels/x86/mul_compute.cc b/lite/kernels/x86/mul_compute.cc index 64558f6677..3de4340543 100644 --- a/lite/kernels/x86/mul_compute.cc +++ b/lite/kernels/x86/mul_compute.cc @@ -24,21 +24,3 @@ REGISTER_LITE_KERNEL(mul, .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); - -// #ifdef LITE_WITH_TRAIN -// REGISTER_LITE_KERNEL(mul_grad, -// kX86, -// kFloat, -// kNCHW, -// paddle::lite::kernels::x86::MulGradCompute, -// def) -// .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) -// .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) -// .BindInput(paddle::framework::GradVarName("Out"), -// {LiteType::GetTensorTy(TARGET(kX86))}) -// .BindOutput(paddle::framework::GradVarName("X"), -// {LiteType::GetTensorTy(TARGET(kX86))}) -// .BindOutput(paddle::framework::GradVarName("Y"), -// {LiteType::GetTensorTy(TARGET(kX86))}) -// .Finalize(); -// #endif diff --git a/lite/kernels/x86/mul_compute.h b/lite/kernels/x86/mul_compute.h index e204fc81f2..be58f24ba2 100644 --- a/lite/kernels/x86/mul_compute.h +++ b/lite/kernels/x86/mul_compute.h @@ -81,78 +81,6 @@ class MulCompute : public KernelLite { virtual ~MulCompute() = default; }; -#ifdef LITE_WITH_TRAIN -template -class MulGradCompute : public KernelLite { - public: - void Run() override { - auto& context = ctx_->As(); - auto& param = *param_.get_mutable(); - CHECK(context.x86_device_context()); - - auto* x = ¶m.x->raw_tensor(); - auto* y = ¶m.y->raw_tensor(); - - Tensor x_matrix, y_matrix; - - if (x->dims().size() > 2) { - x_matrix = framework::ReshapeToMatrix(*x, param.x_num_col_dims); - } else { - x_matrix = *x; - } - - if (y->dims().size() > 2) { - y_matrix = framework::ReshapeToMatrix(*y, param.y_num_col_dims); - - } else { - y_matrix = *y; - } - - auto* dout = ¶m.output_grad->raw_tensor(); - - Tensor dout_mat; - dout_mat.ShareDataWith(*dout); - dout_mat.Resize( - {framework::flatten_to_2d(x->dims(), param.x_num_col_dims)[0], - framework::flatten_to_2d(y->dims(), param.y_num_col_dims)[1]}); - - auto* dx = ¶m.x_grad->raw_tensor(); - auto* dy = ¶m.y_grad->raw_tensor(); - - if (dx != nullptr) { - dx->set_lod(x->lod()); - } - if (dy != nullptr) { - dy->set_lod(y->lod()); - } - - auto blas = paddle::operators::math::GetBlas( - *context.x86_device_context()); - if (dx) { - // dx->mutable_data(context.x86_device_context->GetPlace()); - param.x_grad->template mutable_data(); - Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix( - *dx, param.x_num_col_dims) - : *dx; - - // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix); - } - if (dy) { - // dy->yutable_data(context.x86_device_context->GetPlace()); - param.y_grad->template mutable_data(); - Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix( - *dy, param.y_num_col_dims) - : *dy; - // dy = x' * dout. 
dy K x N, dout : M x N, x : M x K - blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix); - } - } - - virtual ~MulGradCompute() = default; -}; -#endif - } // namespace x86 } // namespace kernels } // namespace lite diff --git a/lite/kernels/x86/pool_compute.h b/lite/kernels/x86/pool_compute.h index 57bcddcec9..0dccb245b1 100644 --- a/lite/kernels/x86/pool_compute.h +++ b/lite/kernels/x86/pool_compute.h @@ -35,7 +35,6 @@ class PoolCompute : public KernelLite { auto& param = *param_.get_mutable(); if (param.global_pooling) { for (size_t i = 0; i < param.ksize.size(); ++i) { - param.paddings[i] = 0; param.ksize[i] = static_cast(param.x->dims()[i + 2]); } } @@ -52,7 +51,7 @@ class PoolCompute : public KernelLite { param.x, param.ksize, param.strides, - param.paddings, + *param.paddings, pool_process, true, false, @@ -68,7 +67,7 @@ class PoolCompute : public KernelLite { param.x, param.ksize, param.strides, - param.paddings, + *param.paddings, pool_process, param.exclusive, param.adaptive, diff --git a/lite/kernels/x86/pool_compute_test.cc b/lite/kernels/x86/pool_compute_test.cc index 87b75a0760..4ea727cedd 100644 --- a/lite/kernels/x86/pool_compute_test.cc +++ b/lite/kernels/x86/pool_compute_test.cc @@ -60,7 +60,8 @@ TEST(pool2d_x86, run_test) { param.x = &x; param.output = &out; param.strides = {2, 2}; - param.paddings = {0, 0}; + std::vector paddings = {0, 0, 0, 0}; + param.paddings = std::make_shared>(paddings); param.ksize = {2, 2}; param.pooling_type = "max"; std::unique_ptr ctx(new KernelContext); diff --git a/lite/kernels/x86/search_aligned_mat_mul_compute.cc b/lite/kernels/x86/search_aligned_mat_mul_compute.cc new file mode 100644 index 0000000000..956f2a3beb --- /dev/null +++ b/lite/kernels/x86/search_aligned_mat_mul_compute.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_aligned_mat_mul_compute.h" + +REGISTER_LITE_KERNEL( + search_aligned_mat_mul, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchAlignedMatMulCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("_a_addr", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("_b_addr", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("_c_addr", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_aligned_mat_mul_compute.h b/lite/kernels/x86/search_aligned_mat_mul_compute.h new file mode 100644 index 0000000000..ea6b546c2c --- /dev/null +++ b/lite/kernels/x86/search_aligned_mat_mul_compute.h @@ -0,0 +1,83 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/x86/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchAlignedMatMulCompute + : public KernelLite { + public: + using param_t = operators::MatMulParam; + + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + + auto x = param.X; + auto y = param.Y; + auto out = param.Out; + bool x_transpose = param.transpose_X; + bool y_transpose = param.transpose_Y; + float alpha = param.alpha; + const auto x_dims = x->dims(); + const auto y_dims = y->dims(); + const auto& x_lod = x->lod(); + const auto& y_lod = y->lod(); + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose ? x_inner_size : x_batch_size; + int N = y_transpose ? y_batch_size : y_inner_size; + int X_K = x_transpose ? x_batch_size : x_inner_size; + int Y_K = y_transpose ? y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + int K = X_K; + + lite::x86::math::MatDescriptor mat_dim_a; + mat_dim_a.height_ = M; + mat_dim_a.width_ = K; + mat_dim_a.stride_ = x_batch_size * x_inner_size; + mat_dim_a.batch_size_ = seq_num; + mat_dim_a.trans_ = x_transpose; + lite::x86::math::MatDescriptor mat_dim_b; + mat_dim_b.height_ = K; + mat_dim_b.width_ = N; + mat_dim_b.stride_ = y_batch_size * y_inner_size; + mat_dim_b.batch_size_ = seq_num; + mat_dim_b.trans_ = y_transpose; + auto blas = lite::x86::math::GetBlas(context); + blas.MatMul(*x, mat_dim_a, *y, mat_dim_b, static_cast(alpha), out, T(0)); + } + + virtual ~SearchAlignedMatMulCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_fc_compute.cc b/lite/kernels/x86/search_fc_compute.cc new file mode 100644 index 0000000000..cf76113e01 --- /dev/null +++ b/lite/kernels/x86/search_fc_compute.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
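// Note: an illustration (not code from this patch) of how the batched GEMM
// descriptors above can be read, ignoring the transpose flags the real
// kernel also handles. Every sequence in the LoD batch is assumed to have
// the same length, so mat_dim_a.stride_ = x_batch_size * x_inner_size is
// the element offset between consecutive [M, K] blocks of X, and likewise
// for Y. A plain-loop equivalent with a hypothetical name:
#include <cstddef>

inline void BatchedMatMulRef(const float* A, const float* B, float* C,
                             int batch, int M, int N, int K,
                             std::ptrdiff_t stride_a, std::ptrdiff_t stride_b,
                             float alpha) {
  for (int b = 0; b < batch; ++b) {
    const float* a = A + b * stride_a;
    const float* bm = B + b * stride_b;
    float* c = C + static_cast<std::ptrdiff_t>(b) * M * N;
    for (int m = 0; m < M; ++m) {
      for (int n = 0; n < N; ++n) {
        float acc = 0.f;
        for (int k = 0; k < K; ++k) acc += a[m * K + k] * bm[k * N + n];
        c[m * N + n] = alpha * acc;  // beta = 0: output is overwritten
      }
    }
  }
}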
+ +#include "lite/kernels/x86/search_fc_compute.h" + +REGISTER_LITE_KERNEL(search_fc, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_fc_compute.h b/lite/kernels/x86/search_fc_compute.h new file mode 100644 index 0000000000..e0f44de526 --- /dev/null +++ b/lite/kernels/x86/search_fc_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/x86/math/search_fc.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchFcCompute : public KernelLite { + public: + using param_t = operators::SearchFcParam; + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + + param.Out->Resize({param.X->dims()[0], param.out_size}); + lite::x86::math::SearchFcFunctor search_fc; + search_fc(context, *param.X, *param.W, *param.b, param.Out, param.out_size); + } + virtual ~SearchFcCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_fc_compute_test.cc b/lite/kernels/x86/search_fc_compute_test.cc new file mode 100644 index 0000000000..425df2a0f0 --- /dev/null +++ b/lite/kernels/x86/search_fc_compute_test.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/search_fc_compute.h" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +void fc_cpu_base(const lite::Tensor* X, + const lite::Tensor* W, + const lite::Tensor* b, + int out_size, + lite::Tensor* Out) { + const float* data_in = X->data(); + const float* bias = b->data(); + const float* weights = W->data(); + float* data_out = Out->mutable_data(); + int out_rows = X->dims()[0]; + int in_cols = X->numel() / out_rows; + int out_cols = W->numel() / in_cols; + int index_out; + + for (int i = 0; i < out_rows; i++) { + for (int j = 0; j < out_cols; j++) { + index_out = i * out_cols + j; + data_out[index_out] = bias ? bias[j] : 0; + + for (int k = 0; k < in_cols; k++) { + data_out[index_out] += + data_in[i * in_cols + k] * weights[j * in_cols + k]; + } + } + } +} + +TEST(search_fc_x86, retrive_op) { + auto search_fc = + KernelRegistry::Global().Create( + "search_fc"); + ASSERT_FALSE(search_fc.empty()); + ASSERT_TRUE(search_fc.front()); +} + +TEST(search_fc_x86, init) { + SearchFcCompute search_fc; + ASSERT_EQ(search_fc.precision(), PRECISION(kFloat)); + ASSERT_EQ(search_fc.target(), TARGET(kX86)); +} + +TEST(search_fc_x86, run_test) { + lite::Tensor x, w, b, out; + lite::Tensor out_ref; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + std::vector x_shape{1, 4}; + x.Resize(lite::DDim(x_shape)); + std::vector w_shape{3, 4}; + w.Resize(lite::DDim(w_shape)); + std::vector b_shape{3}; + b.Resize(lite::DDim(b_shape)); + std::vector out_shape{1, 4}; + out.Resize(lite::DDim(out_shape)); + out_ref.Resize(lite::DDim(out_shape)); + auto x_data = x.mutable_data(); + auto w_data = w.mutable_data(); + auto b_data = b.mutable_data(); + auto out_data = out.mutable_data(); + auto out_data_ref = out_ref.mutable_data(); + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < w.dims().production(); i++) { + w_data[i] = static_cast(i); + } + for (int64_t i = 0; i < b.dims().production(); i++) { + b_data[i] = static_cast(i); + } + + fc_cpu_base(&x, &w, &b, 4, &out_ref); + + SearchFcCompute fc; + operators::SearchFcParam param; + param.X = &x; + param.W = &w; + param.b = &b; + param.Out = &out; + param.out_size = 4; + fc.SetParam(param); + fc.SetContext(std::move(ctx)); + fc.Run(); + + VLOG(3) << "output vs ref"; + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_data_ref[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_fc, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/search_grnn_compute.cc b/lite/kernels/x86/search_grnn_compute.cc new file mode 100644 index 0000000000..95839ba71b --- /dev/null +++ b/lite/kernels/x86/search_grnn_compute.cc @@ -0,0 +1,332 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
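The search_grnn kernel below first reorders the batch into a time-major layout: sequences are sorted by length (descending) and new_offset[t] marks how many rows cover time steps 0..t-1 across the still-active sequences. An equivalent standalone reformulation of that offset computation, counting active sequences per step (plain C++; illustrative only, not the code used by the kernel):

#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

// offsets[t] = number of rows belonging to time steps < t when sequences
// are laid out step by step in descending-length order.
std::vector<int> TimeMajorOffsets(const std::vector<int>& lengths) {
  std::vector<int> sorted = lengths;
  std::sort(sorted.begin(), sorted.end(), std::greater<int>());
  int max_len = sorted.front();
  std::vector<int> offsets(max_len + 1, 0);
  for (int t = 0; t < max_len; ++t) {
    // sequences still active at time step t
    int active = std::count_if(sorted.begin(), sorted.end(),
                               [t](int len) { return len > t; });
    offsets[t + 1] = offsets[t] + active;
  }
  return offsets;
}

int main() {
  // lod {0, 1, 3} -> lengths {1, 2} -> offsets {0, 2, 3}: step 0 holds one
  // row from each sequence, step 1 only the longer one.
  for (int v : TimeMajorOffsets({1, 2})) std::printf("%d ", v);
  return 0;
}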
+ +#include "lite/kernels/x86/search_grnn_compute.h" +#include +#include +#include "lite/backends/x86/math/blas.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +T sigmoid(T z) { + return 1 / (1 + std::exp(-z)); +} + +template +void CallGemm(const lite::x86::math::BlasT& blas, + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const T* B, + const T beta, + T* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +} + +template +void SearchGrnnCompute::PrepareLayout(const Tensor* input_blob) { + auto& param = this->Param(); + auto* _idx_sorted_by_width = param.idx_sorted_by_width; + auto* _layout_input = param.layout_input; + auto* _input = input_blob; + + // usually total length + int dim0 = _input->dims()[0]; + // if it is id only sequence + int dim1 = 1; + // if its a embedding like sequence (dim1 would be embedding_size) + if (_input->dims().size() > 1) { + dim1 = _input->dims()[1]; + } + + int batch = _input->lod()[0].size() - 1; + auto& offset = _input->lod()[0]; + + Tensor _width; + _width.Resize({batch}); + _idx_sorted_by_width->Resize({batch}); + int* width_data = _width.template mutable_data(); + int* idx_sorted_by_width_data = + _idx_sorted_by_width->template mutable_data(); + // sort sequence by width (descending) and find the largest width in the + // batch + for (int i = 0; i < batch; i++) { + width_data[i] = offset[i + 1] - offset[i]; + idx_sorted_by_width_data[i] = i; + } + std::sort(idx_sorted_by_width_data, + idx_sorted_by_width_data + batch, + [&_width](int a, int b) { + return _width.template data()[a] > + _width.template data()[b]; + }); + int max_width = width_data[idx_sorted_by_width_data[0]]; + + // start of reorganizing the input + std::vector new_offset; + new_offset.resize(max_width + 1); + + new_offset[0] = 0; + int j = batch - 1; + int last_width = 0; + int sub_row = 0; + int sub_col = 0; + + for (int i = 1; i <= max_width;) { + for (int k = j; k >= 0; --k) { + if (width_data[idx_sorted_by_width_data[k]] > last_width) { + sub_row = width_data[idx_sorted_by_width_data[k]] - last_width; + sub_col = k + 1; + + for (int s = 0; s < sub_row; s++) { + new_offset[i] = new_offset[i - 1] + sub_col; + i++; + } + // move on + last_width = width_data[idx_sorted_by_width_data[k]]; + j = k - 1; + break; + } + } + } + + // copying to the reorganized buffer + if (_input->dims().size() == 1) { + // _layout_input.reshape_batch_sequence({dim0}, new_offset); + LOG(FATAL) << "_input->dims().size() = 1, error."; + } else { + // _layout_input.reshape_batch_sequence({dim0, dim1}, new_offset); + LoD new_lod; + new_lod.push_back(new_offset); + _layout_input->set_lod(new_lod); + _layout_input->Resize({dim0, dim1}); + } + + auto* new_emb = _layout_input->template mutable_data(); + for (int i = 0; i < max_width; i++) { + int w = new_offset[i + 1] - new_offset[i]; + auto* emb_start = new_emb + dim1 * new_offset[i]; + for (int j = 0; j < w; ++j) { + memcpy(emb_start + dim1 * j, + _input->template data() + + dim1 * offset[idx_sorted_by_width_data[j]] + dim1 * i, + dim1 * sizeof(T)); + } + } +} + +template +void SearchGrnnCompute::CopyBack(T* from, T* to, int step) { + auto& param = this->Param(); + auto* _input = param.x; + auto* _layout_input = param.layout_input; + auto* _idx_sorted_by_width = param.idx_sorted_by_width; + + const auto& offset = 
_input->lod()[0]; + const auto& new_offset = _layout_input->lod()[0]; + const auto* idx_sorted_by_width_data = + _idx_sorted_by_width->template data(); + for (size_t i = 0; i < _layout_input->lod()[0].size() - 1; ++i) { + int w = new_offset[i + 1] - new_offset[i]; + for (int j = 0; j < w; j++) { + memcpy(to + step * (offset[idx_sorted_by_width_data[j]] + i), + from + (new_offset[i] + j) * step, + step * sizeof(T)); + } + } +} + +template +void SearchGrnnCompute::Run() { + auto& context = ctx_->As(); + auto& param = this->Param(); + auto* bottom = param.x; + auto* wi = param.wi; + auto* wh = param.wh; + auto* top = param.out; + auto* _buffer = param.tmp_buffer; + int _cap_h = param.num_hidden; + int _cap_e = param.num_input; + + int _cap_l = bottom->dims()[0]; + int batch = bottom->lod()[0].size() - 1; + + const auto& offset = bottom->lod()[0]; + LoD top_lod; + top_lod.push_back(offset); + top->set_lod(top_lod); + std::vector top_dims_vec{_cap_l, _cap_h}; + top->Resize(top_dims_vec); + auto* top_hidden = top->template mutable_data(); + + const auto* dense_e2h = wi->template data(); + const auto* dense_h2h = wh->template data(); + + const auto* e2h = dense_e2h; + const auto* e2hr = dense_e2h + 1 * _cap_e * _cap_h; + const auto* e2hz = dense_e2h + 2 * _cap_e * _cap_h; + const auto* h2h = dense_h2h; + const auto* h2hr = dense_h2h + 1 * _cap_h * _cap_h; + const auto* h2hz = dense_h2h + 2 * _cap_h * _cap_h; + + PrepareLayout(bottom); + + auto* _layout_input = param.layout_input; + auto* new_emb = _layout_input->template mutable_data(); + const auto& new_offset = _layout_input->lod()[0]; + int max_width = _layout_input->lod()[0].size() - 1; + + // this buffer is used for book keeping info which will be used in bp + // buffer also needed in bp, so make it larger + _buffer->Resize({20, _cap_l, _cap_h}); + auto* buffer_data = _buffer->template mutable_data(); + auto* w_x_e = buffer_data + 0 * _cap_l * _cap_h; + auto* wr_x_e = buffer_data + 1 * _cap_l * _cap_h; + auto* wz_x_e = buffer_data + 2 * _cap_l * _cap_h; + auto* u_x_h = buffer_data + 3 * _cap_l * _cap_h; + auto* ur_x_h = buffer_data + 4 * _cap_l * _cap_h; + auto* uz_x_h = buffer_data + 5 * _cap_l * _cap_h; + auto* r = buffer_data + 6 * _cap_l * _cap_h; + auto* z = buffer_data + 7 * _cap_l * _cap_h; + auto* tilde = buffer_data + 8 * _cap_l * _cap_h; + // the internal hidden + auto* hidden = buffer_data + 19 * _cap_l * _cap_h; + + auto blas = lite::x86::math::GetBlas(context); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + _cap_l, + _cap_h, + _cap_e, + 1.0f, + new_emb, + e2h, + 0.0f, + w_x_e); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + _cap_l, + _cap_h, + _cap_e, + 1.0f, + new_emb, + e2hr, + 0.0f, + wr_x_e); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + _cap_l, + _cap_h, + _cap_e, + 1.0f, + new_emb, + e2hz, + 0.0f, + wz_x_e); + + // precompute hidden0 + for (int i = 0; i < batch * _cap_h; i++) { + tilde[i] = std::tanh(w_x_e[i]); + z[i] = sigmoid(wz_x_e[i]); + hidden[i] = (1. 
- z[i]) * tilde[i]; + } + + // recurrence + for (int i = 1; i < max_width; i++) { + int w_tm1 = new_offset[i] - new_offset[i - 1]; + int w = new_offset[i + 1] - new_offset[i]; + + // precompute hidden i-1 to hidden i + auto* htm1 = hidden + new_offset[i - 1] * _cap_h; + + CallGemm(blas, + CblasNoTrans, + CblasTrans, + w, + _cap_h, + _cap_h, + 1.0f, + htm1, + h2h, + 0.0f, + u_x_h + new_offset[i] * _cap_h); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + w, + _cap_h, + _cap_h, + 1.0f, + htm1, + h2hr, + 0.0f, + ur_x_h + new_offset[i] * _cap_h); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + w, + _cap_h, + _cap_h, + 1.0f, + htm1, + h2hz, + 0.0f, + uz_x_h + new_offset[i] * _cap_h); + + // compute the gate and hidden + for (size_t j = new_offset[i] * _cap_h; j < (new_offset[i] + w) * _cap_h; + j++) { + r[j] = sigmoid(wr_x_e[j] + ur_x_h[j]); + z[j] = sigmoid(wz_x_e[j] + uz_x_h[j]); + tilde[j] = std::tanh(w_x_e[j] + r[j] * u_x_h[j]); + hidden[j] = z[j] * hidden[j - _cap_h * w_tm1] + (1.0 - z[j]) * tilde[j]; + } + } + + CopyBack(hidden, top_hidden, _cap_h); +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_grnn, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchGrnnCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Wi", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Wh", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("tmp_buffer", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("idx_sorted_by_width", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))}) + .BindOutput("layout_input", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_grnn_compute.h b/lite/kernels/x86/search_grnn_compute.h new file mode 100644 index 0000000000..66866761e1 --- /dev/null +++ b/lite/kernels/x86/search_grnn_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/x86/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchGrnnCompute : public KernelLite { + public: + using param_t = operators::SearchGrnnParam; + + void Run() override; + + virtual ~SearchGrnnCompute() = default; + + private: + void PrepareLayout(const Tensor* input); + void CopyBack(T* from, T* to, int step); +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_grnn_compute_test.cc b/lite/kernels/x86/search_grnn_compute_test.cc new file mode 100644 index 0000000000..b85d97e3f1 --- /dev/null +++ b/lite/kernels/x86/search_grnn_compute_test.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_grnn_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(search_grnn_x86, retrive_op) { + auto kernel = + KernelRegistry::Global().Create( + "search_grnn"); + ASSERT_FALSE(kernel.empty()); + ASSERT_TRUE(kernel.front()); +} + +TEST(search_grnn_x86, init) { + SearchGrnnCompute ssdc; + ASSERT_EQ(ssdc.precision(), PRECISION(kFloat)); + ASSERT_EQ(ssdc.target(), TARGET(kX86)); +} + +TEST(search_grnn_x86, run_test) { + int num_input = 128; + int num_hidden = 128; + int num_batch = 3; + lite::Tensor x, wi, wh, out, idx_sorted_by_width, layout_input, tmp_buffer; + x.Resize({num_batch, num_input}); + wi.Resize({3, num_hidden, num_input}); + wh.Resize({3, num_hidden, num_hidden}); + // out.Resize({num_batch, num_hidden}); + LoD x_lod{}; + x_lod.push_back({0, 1, 3}); + x.set_lod(x_lod); + + auto* x_data = x.mutable_data(); + for (int64_t i = 0; i < x.numel(); i++) { + x_data[i] = static_cast(i); + } + auto* wi_data = wi.mutable_data(); + for (int64_t i = 0; i < wi.numel(); i++) { + wi_data[i] = static_cast(i); + } + auto* wh_data = wh.mutable_data(); + for (int64_t i = 0; i < wh.numel(); i++) { + wh_data[i] = static_cast(i); + } + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + operators::SearchGrnnParam param; + param.x = &x; + param.wi = &wi; + param.wh = &wh; + param.out = &out; + param.idx_sorted_by_width = &idx_sorted_by_width; + param.layout_input = &layout_input; + param.tmp_buffer = &tmp_buffer; + param.num_input = num_input; + param.num_hidden = num_hidden; + + SearchGrnnCompute sgc; + sgc.SetContext(std::move(ctx)); + sgc.SetParam(param); + sgc.Run(); + + // std::vector ref_results = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19}; + auto* out_data = out.mutable_data(); + LOG(INFO) << out.numel(); + for (int i = 0; i < out.numel(); i++) { + // EXPECT_NEAR(out_data[i], ref_results[i], 1e-3); + LOG(INFO) << out_data[i]; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_grnn, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/search_group_padding_compute.cc b/lite/kernels/x86/search_group_padding_compute.cc new file mode 100644 index 0000000000..d1847ac9db --- /dev/null +++ b/lite/kernels/x86/search_group_padding_compute.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_group_padding_compute.h" + +REGISTER_LITE_KERNEL( + search_group_padding, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchGroupPaddingCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out_emb_padding", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out_new", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out_padding", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_group_padding_compute.h b/lite/kernels/x86/search_group_padding_compute.h new file mode 100644 index 0000000000..17244d15d9 --- /dev/null +++ b/lite/kernels/x86/search_group_padding_compute.h @@ -0,0 +1,105 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchGroupPaddingCompute + : public KernelLite { + public: + using param_t = operators::SearchGroupPaddingParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + auto* bottom0 = param.x; + auto* top0 = param.out_emb_padding; + auto* top1 = param.out_new; + auto* top2 = param.out_padding; + + int _pad_id = param.pad_id; + + int batch = bottom0->lod()[0].size() - 1; + int dim0 = bottom0->dims()[0]; + int dim1 = bottom0->dims()[1]; + + const auto offset = bottom0->lod()[0]; + int max_seq = 0; + for (int i = 0; i < batch; ++i) { + if (offset[i + 1] - offset[i] > max_seq) { + max_seq = offset[i + 1] - offset[i]; + } + } + + std::vector new_offset; + new_offset.resize(batch + 1); + for (int i = 0; i < batch + 1; ++i) { + new_offset[i] = i * max_seq; + } + + // for padding data + lite::LoD top0_lod; + top0_lod.push_back(new_offset); + top0->set_lod(top0_lod); + top0->Resize({batch * max_seq, dim1}); + // for origin input id + // already set by ShareLoD in InferShape + lite::LoD top1_lod; + top1_lod.push_back(offset); + top1->set_lod(top1_lod); + top1->Resize({dim0, 1}); + memset(top1->mutable_data(), + 0, + top1->dims()[0] * top1->dims()[1] * sizeof(T)); + // for padding input id + lite::LoD top2_lod; + top2_lod.push_back(new_offset); + top2->set_lod(top2_lod); + top2->Resize({batch * max_seq, 1}); + // copy data + const auto* bottom_data = bottom0->data(); + auto* top_data = top0->mutable_data(); + auto* top_padding_input_data = top2->mutable_data(); + for (int i = 0; i < batch; i++) { + const int copy_step = offset[i + 1] - offset[i]; + const int start = i * max_seq; + memcpy(top_data + start * dim1, + bottom_data + offset[i] * dim1, + copy_step * dim1 * sizeof(T)); + memset(top_data + (start + copy_step) * dim1, + 0, + (max_seq - copy_step) * dim1 * sizeof(T)); + // for padding input id + memset(top_padding_input_data + start, 0, copy_step * 
sizeof(T)); + for (int j = start + copy_step; j < start + max_seq; j++) { + top_padding_input_data[j] = static_cast(_pad_id); + } + } + } + + virtual ~SearchGroupPaddingCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_group_padding_compute_test.cc b/lite/kernels/x86/search_group_padding_compute_test.cc new file mode 100644 index 0000000000..f4c36c2a63 --- /dev/null +++ b/lite/kernels/x86/search_group_padding_compute_test.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_group_padding_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(search_group_padding_x86, retrieve_op) { + auto search_group_padding = + KernelRegistry::Global().Create( + "search_group_padding"); + ASSERT_FALSE(search_group_padding.empty()); + ASSERT_TRUE(search_group_padding.front()); +} + +TEST(search_group_padding_x86, init) { + SearchGroupPaddingCompute search_group_padding; + ASSERT_EQ(search_group_padding.precision(), PRECISION(kFloat)); + ASSERT_EQ(search_group_padding.target(), TARGET(kX86)); +} + +TEST(search_group_padding_x86, run_test) { + lite::Tensor x, out_emb_padding, out_new, out_padding; + x.Resize({2, 3}); + out_emb_padding.Resize({-1, 3}); + out_new.Resize({2, 1}); + out_padding.Resize({-1, 1}); + LoD x_lod{}; + x_lod.push_back({0, 1}); + x.set_lod(x_lod); + + auto* x_data = x.mutable_data(); + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + SearchGroupPaddingCompute sgp_kernel; + operators::SearchGroupPaddingParam param; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + sgp_kernel.SetContext(std::move(ctx)); + + param.x = &x; + param.out_emb_padding = &out_emb_padding; + param.out_new = &out_new; + param.out_padding = &out_padding; + + sgp_kernel.SetParam(param); + sgp_kernel.Run(); + + std::vector out_emb_padding_ref = {0, 1, 2}; + std::vector out_new_ref = {0, 0}; + std::vector out_padding_ref = {0}; + auto* out_emb_padding_data = out_emb_padding.mutable_data(); + auto* out_new_data = out_new.mutable_data(); + auto* out_padding_data = out_padding.mutable_data(); + for (int i = 0; i < out_emb_padding.dims().production(); i++) { + EXPECT_NEAR(out_emb_padding_data[i], out_emb_padding_ref[i], 1e-5); + } + for (int i = 0; i < out_new.dims().production(); i++) { + EXPECT_NEAR(out_new_data[i], out_new_ref[i], 1e-5); + } + for (int i = 0; i < out_padding.dims().production(); i++) { + EXPECT_NEAR(out_padding_data[i], out_padding_ref[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_group_padding, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/search_seq_depadding_compute.cc 
b/lite/kernels/x86/search_seq_depadding_compute.cc new file mode 100644 index 0000000000..db1816fb48 --- /dev/null +++ b/lite/kernels/x86/search_seq_depadding_compute.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_seq_depadding_compute.h" +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +void SearchSeqDepaddingCompute::Run() { + auto& param = this->Param(); + auto* pad = param.pad; + auto* src = param.src; + auto* out = param.out; + + const int pad_batch = pad->lod()[0].size() - 1; + const int src_batch = src->lod()[0].size() - 1; + if (pad_batch % src_batch != 0) { + LOG(FATAL) << "Mismatch batch size."; + } + + const auto& pad_offset = pad->lod()[0]; + const int pad_cap_e = pad->dims()[1]; + const auto& src_offset = src->lod()[0]; + const int src_cap_l = src->dims()[0]; + + LoD out_lod; + out_lod.push_back(src_offset); + out->set_lod(out_lod); + out->Resize({src_cap_l, pad_cap_e}); + + const auto* pad_data = pad->template data(); + auto* out_data = out->template mutable_data(); + for (int i = 0; i < src_batch; ++i) { + const int src_i_l = src_offset[i + 1] - src_offset[i]; + const int pad_i_l = pad_offset[i + 1] - pad_offset[i]; + if (pad_i_l < src_i_l) { + LOG(FATAL) + << "the length of padding seq input is less than source seq input."; + } + memcpy(out_data + src_offset[i] * pad_cap_e, + pad_data + pad_offset[i] * pad_cap_e, + src_i_l * pad_cap_e * sizeof(T)); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + search_seq_depadding, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchSeqDepaddingCompute, + def) + .BindInput("Pad", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Src", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_seq_depadding_compute.h b/lite/kernels/x86/search_seq_depadding_compute.h new file mode 100644 index 0000000000..e48fa92723 --- /dev/null +++ b/lite/kernels/x86/search_seq_depadding_compute.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
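The depadding kernel above keeps, for each sequence, only the first src_len rows of the corresponding padded segment. A standalone sketch of that copy on flat buffers (plain C++, illustrative names, no Lite types):

#include <cstddef>
#include <cstring>
#include <vector>

// Copies the first src_len rows of each padded segment into a dense output,
// mirroring the memcpy loop in SearchSeqDepaddingCompute::Run().
std::vector<float> Depad(const std::vector<float>& pad,
                         const std::vector<size_t>& pad_offset,
                         const std::vector<size_t>& src_offset, int width) {
  std::vector<float> out(src_offset.back() * width);
  for (size_t i = 0; i + 1 < src_offset.size(); ++i) {
    size_t src_len = src_offset[i + 1] - src_offset[i];
    std::memcpy(out.data() + src_offset[i] * width,
                pad.data() + pad_offset[i] * width,
                src_len * width * sizeof(float));
  }
  return out;
}

// Example matching the unit test below: pad lod {0, 4, 6}, src lod {0, 2, 3},
// width 4, pad filled with 0..23 -> the output is pad rows 0, 1 and 4,
// i.e. {0..7, 16..19}.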
+#pragma once + +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchSeqDepaddingCompute + : public KernelLite { + public: + using param_t = operators::SearchSeqDepaddingParam; + + void Run() override; + + virtual ~SearchSeqDepaddingCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_seq_depadding_compute_test.cc b/lite/kernels/x86/search_seq_depadding_compute_test.cc new file mode 100644 index 0000000000..0d978b35ed --- /dev/null +++ b/lite/kernels/x86/search_seq_depadding_compute_test.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_seq_depadding_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(search_seq_depadding_x86, retrive_op) { + auto kernel = + KernelRegistry::Global().Create( + "search_seq_depadding"); + ASSERT_FALSE(kernel.empty()); + ASSERT_TRUE(kernel.front()); +} + +TEST(search_seq_depadding_x86, init) { + SearchSeqDepaddingCompute ssdc; + ASSERT_EQ(ssdc.precision(), PRECISION(kFloat)); + ASSERT_EQ(ssdc.target(), TARGET(kX86)); +} + +TEST(search_seq_depadding_x86, run_test) { + lite::Tensor pad, src, out; + pad.Resize({2 * 3, 4}); + src.Resize({3, 1}); + out.Resize({3, 4}); + LoD pad_lod{}; + pad_lod.push_back({0, 4, 6}); + pad.set_lod(pad_lod); + LoD src_lod{}; + src_lod.push_back({0, 2, 3}); + src.set_lod(src_lod); + + auto* pad_data = pad.mutable_data(); + for (int64_t i = 0; i < pad.dims().production(); i++) { + pad_data[i] = static_cast(i); + } + SearchSeqDepaddingCompute ssdc; + operators::SearchSeqDepaddingParam param; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + ssdc.SetContext(std::move(ctx)); + + param.pad = &pad; + param.src = &src; + param.out = &out; + + ssdc.SetParam(param); + ssdc.Run(); + + std::vector ref_results = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19}; + auto* out_data = out.mutable_data(); + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_results[i], 1e-3); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_seq_depadding, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/search_seq_fc_compute.cc b/lite/kernels/x86/search_seq_fc_compute.cc new file mode 100644 index 0000000000..e0845bd74c --- /dev/null +++ b/lite/kernels/x86/search_seq_fc_compute.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_seq_fc_compute.h" + +REGISTER_LITE_KERNEL(search_seq_fc, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchSeqFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_seq_fc_compute.h b/lite/kernels/x86/search_seq_fc_compute.h new file mode 100644 index 0000000000..80ef54b30b --- /dev/null +++ b/lite/kernels/x86/search_seq_fc_compute.h @@ -0,0 +1,73 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/x86/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchSeqFcCompute : public KernelLite { + public: + using param_t = operators::SearchSeqFcParam; + + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + + auto x = param.x; + auto w = param.w; + auto b = param.b; + auto out = param.out; + auto out_size = param.out_size; + const auto x_dims = x->dims(); + const auto w_dims = w->dims(); + const auto out_dims = out->dims(); + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(out_dims.size(), 2) << "The Output(Out) should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size) << "Wrong shape: w_dims[0] != out_size"; + CHECK_EQ(out_dims[0], x_dims[0]) << "Wrong shape: out_dims[0] != x_dims[0]"; + CHECK_EQ(out_dims[1], out_size) << "Wrong shape: out_dims[1] != out_size"; + + auto blas = lite::x86::math::GetBlas(context); + blas.MatMul(*x, false, *w, true, out); + + if (b != nullptr) { + auto b_dims = b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + int M = x_dims[0]; + int N = w_dims[0]; + for (int i = 0; i < M; i++) { + blas.AXPY( + N, static_cast(1), b->data(), out->mutable_data() + i * N); + } + } + } + + virtual ~SearchSeqFcCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/sequence_arithmetic_compute.cc 
b/lite/kernels/x86/sequence_arithmetic_compute.cc new file mode 100644 index 0000000000..95fa27e3d4 --- /dev/null +++ b/lite/kernels/x86/sequence_arithmetic_compute.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/sequence_arithmetic_compute.h" + +REGISTER_LITE_KERNEL( + sequence_arithmetic, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); +REGISTER_LITE_KERNEL( + search_seq_arithmetic, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/sequence_arithmetic_compute.h b/lite/kernels/x86/sequence_arithmetic_compute.h new file mode 100644 index 0000000000..88510b8b1c --- /dev/null +++ b/lite/kernels/x86/sequence_arithmetic_compute.h @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SequenceArithmeticCompute + : public KernelLite { + public: + using param_t = operators::SequenceArithmeticParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto x = param.X; + auto y = param.Y; + auto out = param.Out; + int op_type = param.op_type; + + out->Resize(x->dims()); + out->set_lod(x->lod()); + + auto x_data = x->data(); + auto y_data = y->data(); + auto out_data = out->mutable_data(); + auto x_seq_offset = x->lod()[0]; + auto y_seq_offset = y->lod()[0]; + int seq_num = x_seq_offset.size() - 1; + int inner_size = (x->numel()) / (x->dims()[0]); + + // sum + if (op_type == 1) { + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + t_out[j] = input_x[j] + input_y[j]; + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(T) * (len_x - len)); + } + } + } + + // sub + if (op_type == 2) { + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + t_out[j] = input_x[j] - input_y[j]; + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(T) * (len_x - len)); + } + } + } + + // mul + if (op_type == 3) { + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + t_out[j] = input_x[j] * input_y[j]; + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(T) * (len_x - len)); + } + } + } + } + + virtual ~SequenceArithmeticCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/sequence_arithmetic_compute_test.cc b/lite/kernels/x86/sequence_arithmetic_compute_test.cc new file mode 100644 index 0000000000..3b41e7d7ce --- /dev/null +++ b/lite/kernels/x86/sequence_arithmetic_compute_test.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
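As the branches above show, op_type selects the elementwise operation (1 add, 2 sub, 3 mul), applied per sequence over the overlapping prefix, with any trailing X elements copied through unchanged. A compact standalone sketch of that per-segment rule (plain C++; illustrative only):

#include <algorithm>
#include <cstddef>

// Applies op_type over the common prefix of one x/y segment; the tail of x
// (if longer) is passed through, as in SequenceArithmeticCompute.
void SegmentArithmetic(const float* x, size_t len_x, const float* y,
                       size_t len_y, int op_type, float* out) {
  size_t len = std::min(len_x, len_y);
  for (size_t j = 0; j < len; ++j) {
    out[j] = op_type == 1 ? x[j] + y[j]
           : op_type == 2 ? x[j] - y[j]
                          : x[j] * y[j];
  }
  std::copy(x + len, x + len_x, out + len);  // trailing x elements, if any
}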
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/sequence_arithmetic_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +void sequence_arithmetic_compute_ref(const Tensor& x, + const Tensor& y, + Tensor* out, + int op_type) { + auto x_data = x.data(); + auto y_data = y.data(); + out->Resize(x.dims()); + out->set_lod(x.lod()); + auto out_data = out->mutable_data(); + auto x_seq_offset = x.lod()[0]; + auto y_seq_offset = y.lod()[0]; + int seq_num = x_seq_offset.size() - 1; + int inner_size = x.numel() / x.dims()[0]; + + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + switch (op_type) { + case 1: + t_out[j] = input_x[j] + input_y[j]; + break; + case 2: + t_out[j] = input_x[j] - input_y[j]; + break; + case 3: + t_out[j] = input_x[j] * input_y[j]; + break; + default: + break; + } + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(float) * (len_x - len)); + } + } +} + +void prepare_input(Tensor* x, const LoD& x_lod) { + x->Resize({static_cast(x_lod[0].back()), 3}); + x->set_lod(x_lod); + auto x_data = x->mutable_data(); + for (int i = 0; i < x->numel(); i++) { + x_data[i] = (i - x->numel() / 2) * 1.1; + } +} + +TEST(sequence_arithmetic_x86, retrive_op) { + auto sequence_arithmetic = + KernelRegistry::Global().Create( + "sequence_arithmetic"); + ASSERT_FALSE(sequence_arithmetic.empty()); + ASSERT_TRUE(sequence_arithmetic.front()); +} + +TEST(sequence_arithmetic_x86, init) { + SequenceArithmeticCompute sequence_arithmetic; + ASSERT_EQ(sequence_arithmetic.precision(), PRECISION(kFloat)); + ASSERT_EQ(sequence_arithmetic.target(), TARGET(kX86)); +} + +TEST(sequence_arithmetic_x86, run_test) { + SequenceArithmeticCompute sequence_arithmetic; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + lite::Tensor x, y, out, out_ref; + lite::LoD x_lod{{0, 2, 5, 9}}, y_lod{{0, 2, 5, 9}}; + prepare_input(&x, x_lod); + prepare_input(&y, y_lod); + + operators::SequenceArithmeticParam param; + param.X = &x; + param.Y = &y; + param.Out = &out; + param.op_type = 1; + + sequence_arithmetic.SetContext(std::move(ctx)); + sequence_arithmetic.SetParam(param); + sequence_arithmetic.Run(); + + sequence_arithmetic_compute_ref(x, y, &out_ref, param.op_type); + auto out_data = out.data(); + auto out_ref_data = out_ref.data(); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-3); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(sequence_arithmetic, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/sequence_concat_compute.cc b/lite/kernels/x86/sequence_concat_compute.cc new file mode 100644 index 0000000000..facdad39d3 --- /dev/null +++ b/lite/kernels/x86/sequence_concat_compute.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/sequence_concat_compute.h" + +REGISTER_LITE_KERNEL(sequence_concat, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SequenceConcatCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/sequence_concat_compute.h b/lite/kernels/x86/sequence_concat_compute.h new file mode 100644 index 0000000000..553e2e8b06 --- /dev/null +++ b/lite/kernels/x86/sequence_concat_compute.h @@ -0,0 +1,84 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +inline LoD ConcatLoD(const std::vector& xs, + std::vector* xs_in_order) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + if (x_lod[i - 1] < x_lod[i]) { + xs_in_order->emplace_back(xs[j]->Slice(x_lod[i - 1], x_lod[i])); + } + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +template +class SequenceConcatCompute + : public KernelLite { + public: + using param_t = operators::SequenceConcatParam; + + void Run() override { + auto& param = *param_.get_mutable(); + // auto& param = Param(); + T* dout = param.Out->mutable_data(); + + std::vector x_in_order; + param.Out->set_lod(ConcatLoD(param.X, &x_in_order)); + + int num = x_in_order.size(); + int out_rows = 1; + + std::vector input_cols(num); + for (int i = 0; i < num; ++i) { + input_cols[i] = x_in_order[i].numel() / out_rows; + } + + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = x_in_order[j].data(); + memcpy(dout + col_idx, input_data, sizeof(T) * col_len); + col_idx += col_len; + } + } + + virtual ~SequenceConcatCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/sequence_concat_compute_test.cc b/lite/kernels/x86/sequence_concat_compute_test.cc new file mode 100644 index 0000000000..be1f86a5c8 --- /dev/null +++ b/lite/kernels/x86/sequence_concat_compute_test.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
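sequence_concat interleaves inputs by sequence: the i-th output sequence is x1's i-th sequence followed by x2's, then x3's, and ConcatLoD above accumulates the per-input offsets accordingly. A standalone sketch of that offset arithmetic (plain C++; illustrative only):

#include <cstdio>
#include <vector>

// Sums the level-0 LoD offsets of several inputs, mirroring ConcatLoD.
std::vector<size_t> ConcatOffsets(
    const std::vector<std::vector<size_t>>& lods) {
  std::vector<size_t> result(lods[0].size(), 0);
  for (size_t i = 1; i < result.size(); ++i) {
    size_t sum = 0;
    for (const auto& lod : lods) sum += lod[i];
    result[i] = sum;
  }
  return result;
}

int main() {
  // lods {0,3,5}, {0,1,2}, {0,2,4} -> {0, 6, 11}: output sequence 0 has
  // 3 + 1 + 2 rows, and sequence 1 ends at 5 + 2 + 4 = 11.
  for (size_t v : ConcatOffsets({{0, 3, 5}, {0, 1, 2}, {0, 2, 4}}))
    std::printf("%zu ", v);
  return 0;
}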
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/sequence_concat_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +namespace { +inline LoD ConcatLoD(const std::vector& xs, + std::vector* xs_in_order) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + if (x_lod[i - 1] < x_lod[i]) { + xs_in_order->emplace_back(xs[j]->Slice(x_lod[i - 1], x_lod[i])); + } + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +static void sequence_concat_ref(const std::vector& xs, + lite::Tensor* out) { + std::vector out_dims; + int64_t batch_size = 0; + int64_t feature_size = 0; + for (const auto& tensor : xs) { + const auto x_dims = tensor->dims(); + if (out_dims.empty()) { + out_dims = x_dims.Vectorize(); + } + batch_size += x_dims[0]; + if (feature_size == 0) { + feature_size = x_dims.production() / x_dims[0]; + } else { + CHECK_EQ(feature_size, x_dims.production() / x_dims[0]) + << "Inputs of sequence concat must have same feature size"; + } + } + out_dims[0] = batch_size; + out->Resize(out_dims); + std::vector x_in_order; + out->set_lod(ConcatLoD(xs, &x_in_order)); + + int num = x_in_order.size(); + std::vector input_cols(num); + for (int i = 0; i < num; ++i) { + input_cols[i] = x_in_order[i].numel(); + } + float* out_data = out->mutable_data(); + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = x_in_order[j].data(); + memcpy(out_data + col_idx, input_data, sizeof(float) * col_len); + col_idx += col_len; + } +} + +#define PREPARE_INPUT(name) \ + name.Resize({name##_lod_len, feature_len}); \ + name.set_lod(lod_info_##name); \ + float* name##_data = name.mutable_data(); \ + for (int i = 0; i < name.numel(); ++i) { \ + name##_data[i] = (i - 2.0) * 1.0; \ + } + +} // namespace + +TEST(sequence_concat_x86, retrive_op) { + auto sequence_concat = + KernelRegistry::Global().Create( + "sequence_concat"); + ASSERT_FALSE(sequence_concat.empty()); + ASSERT_TRUE(sequence_concat.front()); +} + +TEST(sequence_concat_x86, init) { + SequenceConcatCompute sequence_concat; + ASSERT_EQ(sequence_concat.precision(), PRECISION(kFloat)); + ASSERT_EQ(sequence_concat.target(), TARGET(kX86)); +} + +TEST(sequence_concat_x86, run_test) { + SequenceConcatCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + operators::SequenceConcatParam param; + lite::Tensor x1, x2, x3; + lite::Tensor y, y_ref; + + int32_t x1_lod_len = 10, feature_len = 4; + int32_t x2_lod_len = 4, x3_lod_len = 8; + int32_t y_lod_len = x1_lod_len + x2_lod_len + x3_lod_len; + LoD lod_info_x1{{0, 3, 5, 6, 10}}; + LoD lod_info_x2{{0, 1, 2, 3, 4}}; + LoD lod_info_x3{{0, 2, 4, 6, 8}}; + LoD lod_info_y{{0, 0, 0, 0, 0}}; + for (size_t i = 0; i 
< lod_info_x1[0].size(); ++i) { + lod_info_y[0][i] = + lod_info_x1[0][i] + lod_info_x2[0][i] + lod_info_x3[0][i]; + } + + PREPARE_INPUT(x1); + PREPARE_INPUT(x2); + PREPARE_INPUT(x3); + + y_ref.Resize({y_lod_len, feature_len}); + y.Resize({y_lod_len, feature_len}); + y_ref.set_lod(lod_info_y); + y.set_lod(lod_info_y); + + std::vector xs{&x1, &x2, &x3}; + + param.X = xs; + param.Out = &y; + seq_kernel.SetParam(param); + + seq_kernel.SetContext(std::move(ctx)); + seq_kernel.Run(); + + auto* y_data = y.mutable_data(); + sequence_concat_ref(xs, &y_ref); + float* y_ref_data = y_ref.mutable_data(); + + for (int i = 0; i < y.numel(); i++) { + EXPECT_NEAR(y_data[i], y_ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(sequence_concat, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/sequence_reverse_compute.cc b/lite/kernels/x86/sequence_reverse_compute.cc new file mode 100644 index 0000000000..6c391e12ad --- /dev/null +++ b/lite/kernels/x86/sequence_reverse_compute.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/sequence_reverse_compute.h" + +typedef paddle::lite::kernels::x86::SequenceReverseCompute + ReverseFp32; +typedef paddle::lite::kernels::x86::SequenceReverseCompute + ReverseInt64; + +REGISTER_LITE_KERNEL(sequence_reverse, kX86, kFloat, kNCHW, ReverseFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_reverse, kX86, kInt64, kNCHW, ReverseInt64, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/x86/sequence_reverse_compute.h b/lite/kernels/x86/sequence_reverse_compute.h new file mode 100644 index 0000000000..ab93972276 --- /dev/null +++ b/lite/kernels/x86/sequence_reverse_compute.h @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SequenceReverseCompute : public KernelLite { + public: + using param_t = operators::SequenceReverseParam; + + void Run() override { + auto& param = this->template Param(); + auto* output = param.Out; + const auto* din = param.X->template data(); + + T* dout = output->template mutable_data(); + CHECK_NE(din, dout) + << "SequenceReverse Op does not support in-place operation"; + const auto lod = param.X->lod()[param.X->lod().size() - 1]; + const size_t lod_count = lod.size(); + + size_t limit = static_cast(param.X->numel()); + size_t row_numel = static_cast(limit / param.X->dims()[0]); + + for (size_t idx = 0; idx < lod_count - 1; ++idx) { + auto start_pos = lod[idx]; + auto end_pos = lod[idx + 1]; + for (auto pos = start_pos; pos < end_pos; ++pos) { + auto cur_pos = end_pos - pos - 1 + start_pos; + std::memcpy(dout + pos * row_numel, + din + cur_pos * row_numel, + row_numel * sizeof(T)); + } + } + output->set_lod(param.X->lod()); + } + + virtual ~SequenceReverseCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/sequence_reverse_compute_test.cc b/lite/kernels/x86/sequence_reverse_compute_test.cc new file mode 100644 index 0000000000..4b84241c8b --- /dev/null +++ b/lite/kernels/x86/sequence_reverse_compute_test.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
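The kernel above reverses rows within each sequence of the last LoD level while leaving the LoD itself unchanged. A standalone sketch of the index mapping it applies (plain C++; illustrative only):

#include <cstdio>
#include <vector>

// For each row position, returns the source row it is copied from when each
// sequence is reversed, as in SequenceReverseCompute::Run().
std::vector<size_t> ReverseMap(const std::vector<size_t>& lod) {
  std::vector<size_t> map(lod.back());
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    for (size_t pos = lod[i]; pos < lod[i + 1]; ++pos) {
      map[pos] = lod[i + 1] - pos - 1 + lod[i];
    }
  }
  return map;
}

int main() {
  // lod {0, 3, 5}: rows 0,1,2 come from 2,1,0 and rows 3,4 from 4,3.
  for (size_t v : ReverseMap({0, 3, 5})) std::printf("%zu ", v);
  return 0;
}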
+ +#include "lite/kernels/x86/sequence_reverse_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +namespace { +static void sequence_reverse_ref(const lite::Tensor* x, lite::Tensor* y) { + const auto* x_data = x->data(); + auto seq_offset = x->lod()[x->lod().size() - 1]; + int width = x->numel() / x->dims()[0]; + auto* y_data = y->mutable_data(); + for (int i = 0; i < seq_offset.size() - 1; ++i) { + auto start_pos = seq_offset[i]; + auto end_pos = seq_offset[i + 1]; + for (auto pos = start_pos; pos < end_pos; ++pos) { + auto cur_pos = end_pos - pos - 1 + start_pos; + std::memcpy(y_data + pos * width, + x_data + cur_pos * width, + width * sizeof(float)); + } + } +} +} // namespace + +TEST(sequence_reverse_x86, retrive_op) { + auto sequence_reverse = + KernelRegistry::Global().Create( + "sequence_reverse"); + ASSERT_FALSE(sequence_reverse.empty()); + ASSERT_TRUE(sequence_reverse.front()); +} + +TEST(sequence_reverse_x86, init) { + SequenceReverseCompute sequence_reverse; + ASSERT_EQ(sequence_reverse.precision(), PRECISION(kFloat)); + ASSERT_EQ(sequence_reverse.target(), TARGET(kX86)); +} + +TEST(sequence_reverse_x86, run_test) { + SequenceReverseCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + + operators::SequenceReverseParam param; + lite::Tensor x, x_ref; + lite::Tensor y, y_ref; + + int32_t lod_len = 10, feature_len = 4; + LoD lod_info{{0, 2, 4}, {0, 3, 5, 6, 10}}; + + x.Resize({lod_len, feature_len}); + x_ref.Resize({lod_len, feature_len}); + y.Resize({lod_len, feature_len}); + y_ref.Resize({lod_len, feature_len}); + x.set_lod(lod_info); + x_ref.set_lod(lod_info); + y.set_lod(lod_info); + y_ref.set_lod(lod_info); + + auto* y_data = y.mutable_data(); + float* x_data = x.mutable_data(); + float* x_ref_data = x_ref.mutable_data(); + float* y_ref_data = y_ref.mutable_data(); + + for (int i = 0; i < x.numel(); ++i) { + x_ref_data[i] = (i - 2.0) * 1.0; + x_data[i] = (i - 2.0) * 1.0; + } + + param.X = &x; + param.Out = &y; + seq_kernel.SetParam(param); + + seq_kernel.SetContext(std::move(ctx)); + seq_kernel.Run(); + + sequence_reverse_ref(&x_ref, &y_ref); + for (int i = 0; i < y.numel(); i++) { + EXPECT_NEAR(y_data[i], y_ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(sequence_reverse, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/sequence_topk_avg_pooling_compute.cc b/lite/kernels/x86/sequence_topk_avg_pooling_compute.cc new file mode 100644 index 0000000000..9bd8b28750 --- /dev/null +++ b/lite/kernels/x86/sequence_topk_avg_pooling_compute.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/sequence_topk_avg_pooling_compute.h" + +REGISTER_LITE_KERNEL( + sequence_topk_avg_pooling, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SequenceTopkAvgPoolingCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("ROW", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("COLUMN", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("pos", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/sequence_topk_avg_pooling_compute.h b/lite/kernels/x86/sequence_topk_avg_pooling_compute.h new file mode 100644 index 0000000000..724415288a --- /dev/null +++ b/lite/kernels/x86/sequence_topk_avg_pooling_compute.h @@ -0,0 +1,50 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/x86/math/sequence_topk_avg_pooling.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SequenceTopkAvgPoolingCompute + : public KernelLite { + public: + using param_t = operators::SequenceTopkAvgPoolingParam; + + void Run() override { + auto& param = *param_.get_mutable(); + lite::x86::math::SequenceTopkAvgPoolingFunctor + sequence_topk_avg_pooling; + sequence_topk_avg_pooling(*param.X, + *param.ROW, + *param.COLUMN, + param.Out, + param.pos, + param.channel_num, + param.topks); + }; + virtual ~SequenceTopkAvgPoolingCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/softmax_compute.cc b/lite/kernels/x86/softmax_compute.cc index a00aa6d566..3a2cdc29ed 100644 --- a/lite/kernels/x86/softmax_compute.cc +++ b/lite/kernels/x86/softmax_compute.cc @@ -23,3 +23,13 @@ REGISTER_LITE_KERNEL(softmax, .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); +REGISTER_LITE_KERNEL(search_seq_softmax, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SoftmaxCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out_log", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/stack_compute.cc b/lite/kernels/x86/stack_compute.cc new file mode 100644 index 0000000000..5f69319a6c --- /dev/null +++ b/lite/kernels/x86/stack_compute.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/stack_compute.h" + +REGISTER_LITE_KERNEL(stack, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::StackCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/stack_compute.h b/lite/kernels/x86/stack_compute.h new file mode 100644 index 0000000000..12a6c3490e --- /dev/null +++ b/lite/kernels/x86/stack_compute.h @@ -0,0 +1,72 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" +#include "lite/operators/stack_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class StackCompute : public KernelLite { + public: + using param_t = operators::StackParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto x = param.X; + auto y = param.Out; + + int axis = param.axis; + if (axis < 0) axis += (x[0]->dims().size() + 1); + + int n = static_cast(x.size()); + auto y_data = y->mutable_data(); + std::vector x_datas(n); + for (int i = 0; i < n; ++i) x_datas[i] = x[i]->data(); + + int pre = 1, post = 1; + auto dim = x[0]->dims(); + for (int i = 0; i < axis; ++i) pre *= dim[i]; + for (int i = axis; i < dim.size(); ++i) post *= dim[i]; + + auto x_data_arr = x_datas.data(); + + size_t x_offset = 0; + size_t y_offset = 0; + for (int i = 0; i < pre; i++) { + for (int j = 0; j < n; j++) { + std::memcpy( + y_data + y_offset, x_data_arr[j] + x_offset, post * sizeof(T)); + y_offset += post; + } + x_offset += post; + } + } + + virtual ~StackCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/stack_compute_test.cc b/lite/kernels/x86/stack_compute_test.cc new file mode 100644 index 0000000000..d105165a98 --- /dev/null +++ b/lite/kernels/x86/stack_compute_test.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/stack_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +// stack +TEST(stack_x86, retrive_op) { + auto stack = + KernelRegistry::Global().Create("stack"); + ASSERT_FALSE(stack.empty()); + ASSERT_TRUE(stack.front()); +} + +TEST(stack_x86, init) { + lite::kernels::x86::StackCompute stack; + ASSERT_EQ(stack.precision(), PRECISION(kFloat)); + ASSERT_EQ(stack.target(), TARGET(kX86)); +} + +TEST(stack_x86, run_test) { + lite::Tensor x; + lite::Tensor out; + int num_input = 5; + + std::vector x_shape({10, 20, 10}); + x.Resize(lite::DDim(x_shape)); + + std::vector out_shape({5, 10, 20, 10}); + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); ++i) { + x_data[i] = static_cast(i); + } + std::vector input; + for (int i = 0; i < num_input; ++i) { + input.emplace_back(&x); + } + + // StackCompute stack; + StackCompute stack; + operators::StackParam param; + + param.X = input; + param.Out = &out; + int axis = 0; + param.axis = axis; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + stack.SetContext(std::move(ctx)); + stack.SetParam(param); + stack.Run(); + + int ref_data = 0; + for (int j = 0; j < out.dims().production(); ++j) { + EXPECT_NEAR(out_data[j], ref_data, 1e-5); + ref_data++; + ref_data = (ref_data >= 2000) ? (ref_data - 2000) : ref_data; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(stack, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/var_conv_2d_compute.cc b/lite/kernels/x86/var_conv_2d_compute.cc new file mode 100644 index 0000000000..48ae1b055e --- /dev/null +++ b/lite/kernels/x86/var_conv_2d_compute.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
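StackCompute above stacks n equally shaped inputs along axis by splitting each input into pre outer blocks (product of the dims before the axis) of post contiguous elements (product of the remaining dims) and interleaving one post-sized chunk from every input per outer block. A hedged standalone sketch of that copy loop, assuming raw float buffers (the function name is illustrative):

#include <cstring>
#include <vector>

// Stack `xs.size()` equally shaped inputs along an axis characterised by
// `pre` (product of dims before the axis) and `post` (product of the rest).
void stack_sketch(const std::vector<const float*>& xs, float* y,
                  int pre, int post) {
  const int n = static_cast<int>(xs.size());
  size_t x_offset = 0;
  size_t y_offset = 0;
  for (int i = 0; i < pre; ++i) {
    for (int j = 0; j < n; ++j) {
      std::memcpy(y + y_offset, xs[j] + x_offset, post * sizeof(float));
      y_offset += post;
    }
    x_offset += post;
  }
}

With axis = 0, as in the test above, pre = 1 and post = 10 * 20 * 10 = 2000, so the five identical inputs are simply laid out back to back, which is why the reference value in the test wraps around every 2000 elements.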
+ +#include "lite/kernels/x86/var_conv_2d_compute.h" + +REGISTER_LITE_KERNEL(var_conv_2d, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::VarConv2DCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Col", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/var_conv_2d_compute.h b/lite/kernels/x86/var_conv_2d_compute.h new file mode 100644 index 0000000000..c94cb2ca2d --- /dev/null +++ b/lite/kernels/x86/var_conv_2d_compute.h @@ -0,0 +1,213 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "lite/backends/x86/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class VarConv2DCompute : public KernelLite { + public: + using param_t = operators::VarConv2DParam; + + void Im2Col(const lite::Tensor& input, lite::Tensor* col) const { + auto& param = *param_.get_mutable(); + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + // auto* in_row = param.ROW; + // auto* in_col = param.COLUMN; + + int batch = input.lod()[0].size() - 1; + const auto& bottom_offset = input.lod()[0]; + // 2-D lod info. 
+ // const auto& offset_x = in_col->lod()[0]; + // const auto& offset_y = in_row->lod()[0]; + const auto& offset_y = param.X->lod()[1]; + const auto& offset_x = param.X->lod()[2]; + + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_x * top_im_y; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + // std::vector col_lod_vec; + // col_lod_vec.push_back(top_offset); + LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + col->Resize(col_dims_vec); + auto* top_data = col->mutable_data(); + const auto* bottom_data = input.data(); + + int kernel_win_size = kernel_h * kernel_w; + int half_kernel_h = kernel_h / 2; + int half_kernel_w = kernel_w / 2; + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int top_im_x = (width - 1) / stride_w + 1; + int top_im_y = (height - 1) / stride_h + 1; + int top_x = top_im_y * top_im_x; + for (int z = 0; z < input_channel; ++z) { + int row_offset = kernel_win_size * z; + int im_offset = z * width * height; + for (int y = 0; y < height; y += stride_h) { + for (int x = 0; x < width; x += stride_w) { + int col_offset = x / stride_w + y / stride_h * top_im_x; + for (int ky = 0; ky < kernel_h; ++ky) { + for (int kx = 0; kx < kernel_w; ++kx) { + int im_y = y + ky - half_kernel_h; + int im_x = x + kx - half_kernel_w; + if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) { + top_data[t_offset + + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = + bottom_data[b_offset + im_offset + im_y * width + im_x]; + } else { + top_data[t_offset + + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = 0; + } + } + } + } + } + } + } + } + + void Run() override { + auto& param = *param_.get_mutable(); + auto& context = ctx_->As(); + auto* bottom = param.X; + // auto* in_row = param.ROW; + // auto* in_col = param.COLUMN; + auto* w = param.W; + auto* top = param.Out; + auto* col = param.Col; + + int output_channel = param.output_channel; + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + + Im2Col(*bottom, col); + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + // const auto& offset_x = in_col->lod()[0]; + // const auto& offset_y = in_row->lod()[0]; + const auto& offset_y = param.X->lod()[1]; + const auto& offset_x = param.X->lod()[2]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } 
else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); + std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + auto* top_data = top->mutable_data(); + const auto* w_data = w->data(); + const auto* col_data = col->data(); + + auto blas = lite::x86::math::GetBlas(context); + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } + + blas.GEMM(false, + false, + output_channel, + top_im_size, + input_channel * kernel_h * kernel_w, + 1.0, + w_data, + input_channel * kernel_h * kernel_w, + col_data + col_offset[b], + top_im_size, + 0.0, + top_data + top_offset[b], + top_im_size); + } + } + + virtual ~VarConv2DCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/var_conv_2d_compute_test.cc b/lite/kernels/x86/var_conv_2d_compute_test.cc new file mode 100644 index 0000000000..d6ae5a67bf --- /dev/null +++ b/lite/kernels/x86/var_conv_2d_compute_test.cc @@ -0,0 +1,315 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/var_conv_2d_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +static void im2col_ref(const lite::Tensor& input, + const lite::Tensor* in_row, + const lite::Tensor* in_col, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int input_channel, + lite::Tensor* col) { + int batch = input.lod()[0].size() - 1; + const auto& bottom_offset = input.lod()[0]; + // 2-D lod info. 
+ const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_x * top_im_y; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + col->Resize(col_dims_vec); + auto* top_data = col->mutable_data(); + const auto* bottom_data = input.data(); + + int kernel_win_size = kernel_h * kernel_w; + int half_kernel_h = kernel_h / 2; + int half_kernel_w = kernel_w / 2; + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int top_im_x = (width - 1) / stride_w + 1; + int top_im_y = (height - 1) / stride_h + 1; + int top_x = top_im_y * top_im_x; + for (int z = 0; z < input_channel; ++z) { + int row_offset = kernel_win_size * z; + int im_offset = z * width * height; + for (int y = 0; y < height; y += stride_h) { + for (int x = 0; x < width; x += stride_w) { + int col_offset = x / stride_w + y / stride_h * top_im_x; + for (int ky = 0; ky < kernel_h; ++ky) { + for (int kx = 0; kx < kernel_w; ++kx) { + int im_y = y + ky - half_kernel_h; + int im_x = x + kx - half_kernel_w; + if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) { + top_data[t_offset + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = + bottom_data[b_offset + im_offset + im_y * width + im_x]; + } else { + top_data[t_offset + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = 0; + } + } + } + } + } + } + } +} + +static void var_conv_2d_ref(const lite::Tensor* bottom, + const lite::Tensor* w, + const lite::Tensor* in_row, + const lite::Tensor* in_col, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int input_channel, + const int output_channel, + lite::Tensor* top, + lite::Tensor* col) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + im2col_ref(*bottom, + in_row, + in_col, + kernel_h, + kernel_w, + stride_h, + stride_w, + input_channel, + col); + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); 
+ std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + auto* top_data = top->mutable_data(); + const auto* w_data = w->data(); + const auto* col_data = col->data(); + + auto blas = lite::x86::math::GetBlas(context); + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } + + blas.GEMM(false, + false, + output_channel, + top_im_size, + input_channel * kernel_h * kernel_w, + 1.0, + w_data, + input_channel * kernel_h * kernel_w, + col_data + col_offset[b], + top_im_size, + 0.0, + top_data + top_offset[b], + top_im_size); + } +} + +TEST(var_conv_2d_x86, retrive_op) { + auto var_conv_2d = + KernelRegistry::Global().Create( + "var_conv_2d"); + ASSERT_FALSE(var_conv_2d.empty()); + ASSERT_TRUE(var_conv_2d.front()); +} + +TEST(var_conv_2d_x86, init) { + VarConv2DCompute var_conv_2d; + ASSERT_EQ(var_conv_2d.precision(), PRECISION(kFloat)); + ASSERT_EQ(var_conv_2d.target(), TARGET(kX86)); +} + +TEST(var_conv_2d_x86, run_test) { + VarConv2DCompute var_conv_2d; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + operators::VarConv2DParam param; + + lite::Tensor X, W, ROW, COLUMN; + lite::Tensor Out, Col; + int kernel_h, kernel_w; + int stride_h, stride_w; + int input_channel, output_channel; + + output_channel = 5; + input_channel = 5; + kernel_h = 5; + kernel_w = 5; + stride_h = 1; + stride_w = 1; + std::vector w_dims_vec; + w_dims_vec.push_back(output_channel); + w_dims_vec.push_back(input_channel * kernel_h * kernel_w); + W.Resize(w_dims_vec); + auto* w_data = W.mutable_data(); + for (int i = 0; i < W.numel(); ++i) { + w_data[i] = i - 1.f; + } + + std::vector row_lod_vec{0, 10, 20}; + LoD row_lod; + row_lod.push_back(row_lod_vec); + ROW.set_lod(row_lod); + + std::vector column_lod_vec{0, 10, 20}; + LoD column_lod; + column_lod.push_back(column_lod_vec); + COLUMN.set_lod(column_lod); + + int x_size = 0; + std::vector x_lod_vec; + x_lod_vec.push_back(0); + for (size_t i = 0; i < row_lod_vec.size() - 1; ++i) { + int height = row_lod_vec[i + 1] - row_lod_vec[i]; + int width = column_lod_vec[i + 1] - column_lod_vec[i]; + x_lod_vec.push_back(height * width * input_channel); + x_size += height * width * input_channel; + } + std::vector x_dims_vec{x_size, 1}; + LoD x_lod; + x_lod.push_back(x_lod_vec); + x_lod.push_back(row_lod_vec); + x_lod.push_back(column_lod_vec); + X.Resize(x_dims_vec); + X.set_lod(x_lod); + auto* x_data = X.mutable_data(); + for (int i = 0; i < X.numel(); ++i) { + x_data[i] = i % 20 * 1.f; + } + + param.X = &X; + param.W = &W; + // param.ROW = &ROW; + // param.COLUMN = &COLUMN; + param.Out = &Out; + param.Col = &Col; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.kernel_h = kernel_h; + param.kernel_w = kernel_w; + param.input_channel = input_channel; + param.output_channel = output_channel; + var_conv_2d.SetParam(param); + var_conv_2d.SetContext(std::move(ctx)); + var_conv_2d.Run(); + + lite::Tensor top_ref, col_ref; + var_conv_2d_ref(&X, + &W, + &ROW, + &COLUMN, + kernel_h, + kernel_w, + stride_h, + stride_w, + input_channel, + output_channel, + &top_ref, + &col_ref); + + for (int i = 0; i < Out.numel(); ++i) { + EXPECT_NEAR(Out.data()[i], top_ref.data()[i], 1e-5); + } + for (int i = 0; i < Col.numel(); ++i) { + EXPECT_NEAR(Col.data()[i], col_ref.data()[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(var_conv_2d, kX86, kFloat, kNCHW, def); diff 
--git a/lite/kernels/xpu/bridges/conv_op.cc b/lite/kernels/xpu/bridges/conv_op.cc index 2c758cf950..d6fc806ad4 100644 --- a/lite/kernels/xpu/bridges/conv_op.cc +++ b/lite/kernels/xpu/bridges/conv_op.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "lite/operators/conv_op.h" #include "lite/backends/xpu/builder.h" #include "lite/kernels/xpu/bridges/registry.h" @@ -46,14 +47,36 @@ node_map_type ConvConverter(const std::shared_ptr op, auto groups = op_info->GetAttr("groups"); auto dilations = op_info->GetAttr>("dilations"); auto fuse_relu = op_info->GetAttr("fuse_relu"); - CHECK_EQ(strides.size(), 2); - CHECK_EQ(paddings.size(), 2); - CHECK_EQ(dilations.size(), 2); + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "Paddings size should be the same or twice as the input size."; + + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); + std::vector output_shape({bs, oc}); for (size_t i = 0; i < 2; i++) { const int dkernel = dilations[i] * (filter_dims[2 + i] - 1) + 1; output_shape.push_back( - (input_dims[i + 2] + 2 * paddings[i] - dkernel) / strides[i] + 1); + (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) / + strides[i] + + 1); } DDim output_dims(output_shape); diff --git a/lite/kernels/xpu/bridges/conv_op_test.cc b/lite/kernels/xpu/bridges/conv_op_test.cc index ebdb67bd0d..70929ffcd5 100644 --- a/lite/kernels/xpu/bridges/conv_op_test.cc +++ b/lite/kernels/xpu/bridges/conv_op_test.cc @@ -54,7 +54,7 @@ void conv_ref(const std::shared_ptr op) { int stride_h = strides[0]; int dila_w = dilations[1]; int dila_h = dilations[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int pad_h = paddings[0]; int batch_size = input_dims[0]; int in_ch_size = input_dims[1]; @@ -175,7 +175,8 @@ void test_conv(int bs, opdesc.SetOutput("Output", {output_var_name}); opdesc.SetAttr("dilations", std::vector({dilation, dilation})); opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); opdesc.SetAttr("groups", groups); opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); if (has_bias) { diff --git a/lite/kernels/xpu/bridges/pool_op_test.cc b/lite/kernels/xpu/bridges/pool_op_test.cc index ed5f922d59..7efc6b464c 100644 --- a/lite/kernels/xpu/bridges/pool_op_test.cc +++ b/lite/kernels/xpu/bridges/pool_op_test.cc @@ -60,7 +60,7 @@ void pool_ref(const std::shared_ptr op) { int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; if (global_pooling == true) { for (int n = 0; n < in_n; ++n) { @@ -162,7 +162,8 @@ void test_pool(int bs, opdesc.SetAttr("global_pooling", global_pooling); opdesc.SetAttr("exclusive", exclusive); opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); 
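The XPU conv bridge above now accepts both the 2-value and the 4-value paddings attribute: a [pad_h, pad_w] pair is expanded in place to [pad_h, pad_h, pad_w, pad_w] (top/bottom/left/right, which is why the updated tests read pad_w from paddings[2]), and the output extent then uses both sides of each padding. A hedged sketch of the two pieces, with illustrative helper names:

#include <vector>

// Expand a one-value-per-dimension padding to the (before, after) form.
inline void expand_paddings_2_to_4(std::vector<int>* paddings,
                                   size_t spatial_rank /* = strides.size() */) {
  if (paddings->size() != spatial_rank) return;  // already in 4-value form
  for (size_t i = 0; i < spatial_rank; ++i) {
    const int copy_pad = *(paddings->begin() + 2 * i);
    paddings->insert(paddings->begin() + 2 * i + 1, copy_pad);
  }
}

// Convolution output extent with possibly asymmetric padding.
inline int conv_out_size(int in, int filter, int dilation,
                         int pad_before, int pad_after, int stride) {
  const int dkernel = dilation * (filter - 1) + 1;
  return (in + pad_before + pad_after - dkernel) / stride + 1;
}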
opdesc.SetAttr("ceil_mode", ceil_mode); // create and convert op to XPU model, then run it on XPU diff --git a/lite/model_parser/model_parser.cc b/lite/model_parser/model_parser.cc index 13b6cb5b77..ed3f45c598 100644 --- a/lite/model_parser/model_parser.cc +++ b/lite/model_parser/model_parser.cc @@ -568,7 +568,7 @@ void SaveModelNaive(const std::string &model_dir, SaveParamNaive(path, exec_scope, var.Name()); } } - VLOG(4) << "Save naive buffer model in '" << model_dir << "'' successfully"; + LOG(INFO) << "Save naive buffer model in '" << model_dir << "' successfully"; } #endif diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index 49badbb27b..7c4048c204 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -2,11 +2,10 @@ set(op_DEPS tensor op op_params scope memory) lite_cc_library(op_params SRCS op_params.cc DEPS tensor any) +# 1.baisc ops used in basic models add_operator(conv_op basic SRCS conv_op.cc DEPS ${op_DEPS}) add_operator(pool_op basic SRCS pool_op.cc DEPS ${op_DEPS}) add_operator(fc_op basic SRCS fc_op.cc DEPS ${op_DEPS}) -add_operator(assign_op extra SRCS assign_op.cc DEPS ${op_DEPS}) -add_operator(relu_op basic SRCS relu_op.cc DEPS ${op_DEPS}) add_operator(mul_op basic SRCS mul_op.cc DEPS ${op_DEPS}) add_operator(matmul_op basic SRCS matmul_op.cc DEPS ${op_DEPS}) add_operator(scale_op basic SRCS scale_op.cc DEPS ${op_DEPS}) @@ -15,57 +14,64 @@ add_operator(reshape_op basic SRCS reshape_op.cc DEPS ${op_DEPS} ) add_operator(batch_norm_op basic SRCS batch_norm_op.cc DEPS ${op_DEPS}) add_operator(feed_op basic SRCS feed_op.cc DEPS ${op_DEPS}) add_operator(fetch_op basic SRCS fetch_op.cc DEPS ${op_DEPS}) -add_operator(io_copy_op basic SRCS io_copy_op.cc DEPS ${op_DEPS}) -add_operator(io_copy_once_op basic SRCS io_copy_once_op.cc DEPS io_copy_op ${op_DEPS}) add_operator(activation_ops basic SRCS activation_ops.cc DEPS ${op_DEPS}) add_operator(elementwise_ops basic SRCS elementwise_ops.cc DEPS ${op_DEPS}) -add_operator(lrn_op_lite basic SRCS lrn_op.cc DEPS ${op_DEPS}) -add_operator(decode_bboxes_op_lite basic SRCS decode_bboxes_op.cc DEPS ${op_DEPS}) add_operator(box_coder_op_lite basic SRCS box_coder_op.cc DEPS ${op_DEPS}) add_operator(multiclass_nms_op_lite basic SRCS multiclass_nms_op.cc DEPS ${op_DEPS}) -add_operator(fusion_elementwise_activation_ops basic SRCS fusion_elementwise_activation_ops.cc DEPS elementwise_ops ${op_DEPS}) add_operator(mean_op basic SRCS mean_op.cc DEPS ${op_DEPS}) add_operator(fill_constant_op basic SRCS fill_constant_op.cc DEPS ${op_DEPS}) -#add_operator(sgd_op basic SRCS sgd_op.cc DEPS ${op_DEPS}) -add_operator(uniform_random_op basic SRCS uniform_random_op.cc DEPS ${op_DEPS}) -add_operator(power_op basic SRCS power_op.cc DEPS ${op_DEPS}) add_operator(shuffle_channel_op basic SRCS shuffle_channel_op.cc DEPS ${op_DEPS}) add_operator(yolo_box_op basic SRCS yolo_box_op.cc DEPS ${op_DEPS}) add_operator(interpolate_op basic SRCS interpolate_op.cc DEPS ${op_DEPS}) add_operator(argmax_op basic SRCS argmax_op.cc DEPS ${op_DEPS}) -add_operator(axpy_op basic SRCS axpy_op.cc DEPS ${op_DEPS}) -add_operator(gru_unit_op basic SRCS gru_unit_op.cc DEPS ${op_DEPS}) -add_operator(gru_op basic SRCS gru_op.cc DEPS ${op_DEPS}) -add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS}) -add_operator(layout_once_op basic SRCS layout_once_op.cc DEPS ${op_DEPS}) add_operator(prior_box_op basic SRCS prior_box_op.cc DEPS ${op_DEPS}) -add_operator(density_prior_box_op basic SRCS density_prior_box_op.cc DEPS ${op_DEPS}) 
-add_operator(dropout_op basic SRCS dropout_op.cc DEPS ${op_DEPS}) add_operator(concat_op basic SRCS concat_op.cc DEPS ${op_DEPS}) add_operator(pad2d_op basic SRCS pad2d_op.cc DEPS ${op_DEPS}) -add_operator(negative_op basic SRCS negative_op.cc DEPS ${op_DEPS}) -add_operator(crop_op basic SRCS crop_op.cc DEPS ${op_DEPS}) add_operator(calib_op basic SRCS calib_op.cc DEPS ${op_DEPS}) -add_operator(calib_once_op basic SRCS calib_once_op.cc DEPS ${op_DEPS}) add_operator(split_op basic SRCS split_op.cc DEPS ${op_DEPS}) add_operator(transpose_op basic SRCS transpose_op.cc DEPS ${op_DEPS}) add_operator(fake_quant basic SRCS fake_quantize_moving_avg_max_abs.cc DEPS ${op_DEPS}) add_operator(fake_dequant basic SRCS fake_dequantize_max_abs.cc DEPS ${op_DEPS}) add_operator(conv_transpose_op basic SRCS conv_transpose_op.cc DEPS ${op_DEPS}) -add_operator(graph_op basic SRCS graph_op.cc DEPS ${op_DEPS}) add_operator(expand_op_lite basic SRCS expand_op.cc DEPS ${op_DEPS}) -add_operator(reduce_max_op_lite basic SRCS reduce_max_op.cc DEPS ${op_DEPS}) -add_operator(norm_op basic SRCS norm_op.cc DEPS ${op_DEPS}) -add_operator(shape_op_lite basic SRCS shape_op.cc DEPS ${op_DEPS}) -add_operator(sequence_expand_op_lite basic SRCS sequence_expand_op.cc DEPS ${op_DEPS}) add_operator(squeeze_op_lite basic SRCS squeeze_op.cc DEPS ${op_DEPS}) -add_operator(unsqueeze_op_lite extra SRCS unsqueeze_op.cc DEPS ${op_DEPS}) -add_operator(im2sequence_op basic SRCS im2sequence_op.cc DEPS ${op_DEPS}) +add_operator(unsqueeze_op_lite basic SRCS unsqueeze_op.cc DEPS ${op_DEPS}) +add_operator(stack_op basic SRCS stack_op.cc DEPS ${op_DEPS}) +add_operator(cast_op_lite basic SRCS cast_op.cc DEPS ${op_DEPS}) +add_operator(affine_channel_op basic SRCS affine_channel_op.cc DEPS ${op_DEPS}) +add_operator(range_op basic SRCS range_op.cc DEPS ${op_DEPS}) +add_operator(reduce_mean_op basic SRCS reduce_mean_op.cc DEPS ${op_DEPS}) +add_operator(relu_op basic SRCS relu_op.cc DEPS ${op_DEPS}) +add_operator(io_copy_op basic SRCS io_copy_op.cc DEPS ${op_DEPS}) +add_operator(fusion_elementwise_activation_ops basic SRCS fusion_elementwise_activation_ops.cc DEPS elementwise_ops ${op_DEPS}) +add_operator(io_copy_once_op basic SRCS io_copy_once_op.cc DEPS io_copy_op ${op_DEPS}) +add_operator(dropout_op basic SRCS dropout_op.cc DEPS ${op_DEPS}) +add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS}) +add_operator(graph_op basic SRCS graph_op.cc DEPS ${op_DEPS}) + +# 2.basic ops not used in basic models +add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS}) +add_operator(crop_op extra SRCS crop_op.cc DEPS ${op_DEPS}) +add_operator(assign_op extra SRCS assign_op.cc DEPS ${op_DEPS}) +add_operator(power_op extra SRCS power_op.cc DEPS ${op_DEPS}) +add_operator(norm_op extra SRCS norm_op.cc DEPS ${op_DEPS}) + +# 3.extra ops +add_operator(search_group_padding extra SRCS search_group_padding_op.cc DEPS ${op_DEPS}) +add_operator(lrn_op_lite extra SRCS lrn_op.cc DEPS ${op_DEPS}) +add_operator(decode_bboxes_op_lite extra SRCS decode_bboxes_op.cc DEPS ${op_DEPS}) +add_operator(uniform_random_op extra SRCS uniform_random_op.cc DEPS ${op_DEPS}) +add_operator(axpy_op extra SRCS axpy_op.cc DEPS ${op_DEPS}) +add_operator(gru_unit_op extra SRCS gru_unit_op.cc DEPS ${op_DEPS}) +add_operator(gru_op extra SRCS gru_op.cc DEPS ${op_DEPS}) +add_operator(layout_once_op extra SRCS layout_once_op.cc DEPS ${op_DEPS}) +add_operator(density_prior_box_op extra SRCS density_prior_box_op.cc DEPS ${op_DEPS}) +add_operator(calib_once_op extra SRCS 
calib_once_op.cc DEPS ${op_DEPS}) +add_operator(reduce_max_op_lite extra SRCS reduce_max_op.cc DEPS ${op_DEPS}) +add_operator(shape_op_lite extra SRCS shape_op.cc DEPS ${op_DEPS}) +add_operator(sequence_expand_op_lite extra SRCS sequence_expand_op.cc DEPS ${op_DEPS}) +add_operator(im2sequence_op extra SRCS im2sequence_op.cc DEPS ${op_DEPS}) add_operator(gather_op extra SRCS gather_op.cc DEPS ${op_DEPS}) -add_operator(reduce_mean_op extra SRCS reduce_mean_op.cc DEPS ${op_DEPS}) -add_operator(stack_op extra SRCS stack_op.cc DEPS ${op_DEPS}) -add_operator(cast_op_lite extra SRCS cast_op.cc DEPS ${op_DEPS}) -add_operator(affine_channel_op extra SRCS affine_channel_op.cc DEPS ${op_DEPS}) add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEPS}) add_operator(generate_proposals_op extra SRCS generate_proposals_op.cc DEPS ${op_DEPS}) add_operator(roi_align_op extra SRCS roi_align_op.cc DEPS ${op_DEPS}) @@ -73,16 +79,26 @@ add_operator(box_clip_op extra SRCS box_clip_op.cc DEPS ${op_DEPS}) add_operator(flatten_op extra SRCS flatten_op.cc DEPS ${op_DEPS}) add_operator(fake_quantize_range_abs_max_op extra SRCS fake_quantize_range_abs_max.cc DEPS ${op_DEPS}) add_operator(sequence_expand_as_op_lite extra SRCS sequence_expand_as_op.cc DEPS ${op_DEPS}) -add_operator(range_op extra SRCS range_op.cc DEPS ${op_DEPS}) add_operator(assign_value_op extra SRCS assign_value_op.cc DEPS ${op_DEPS}) + add_operator(fake_quantize_dequantize_moving_avg_abs_max_op extra SRCS fake_quantize_dequantize_moving_avg_max_abs.cc DEPS ${op_DEPS}) add_operator(fake_channel_wise_dequantize_max_abs_op extra SRCS fake_channel_wise_dequantize_max_abs.cc DEPS ${op_DEPS}) add_operator(sequence_reshape_op_lite extra SRCS sequence_reshape_op.cc DEPS ${op_DEPS}) +add_operator(sequence_reverse_op_lite extra SRCS sequence_reverse_op.cc DEPS ${op_DEPS}) add_operator(reduce_sum_op_lite extra SRCS reduce_ops.cc DEPS ${op_DEPS}) +add_operator(match_matrix_tensor_op_lite extra SRCS match_matrix_tensor_op.cc DEPS ${op_DEPS}) +add_operator(search_seq_depadding_op_lite extra SRCS search_seq_depadding_op.cc DEPS ${op_DEPS}) +add_operator(search_grnn_op_lite extra SRCS search_grnn_op.cc DEPS ${op_DEPS}) +add_operator(search_seq_softmax_op_lite extra SRCS search_seq_softmax_op.cc DEPS ${op_DEPS}) +add_operator(sequence_concat_op_lite extra SRCS sequence_concat_op.cc DEPS ${op_DEPS}) +add_operator(var_conv_2d_op_lite extra SRCS var_conv_2d_op.cc DEPS ${op_DEPS}) +add_operator(attention_padding_mask_op_lite extra SRCS attention_padding_mask_op.cc DEPS ${op_DEPS}) +add_operator(sequence_arithmetic_op_lite extra SRCS sequence_arithmetic_op.cc DEPS ${op_DEPS}) # for OCR specific add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS}) add_operator(lookup_table_op extra SRCS lookup_table_op.cc DEPS ${op_DEPS}) +add_operator(lookup_table_v2_op extra SRCS lookup_table_v2_op.cc DEPS ${op_DEPS}) add_operator(beam_search_decode_op extra SRCS beam_search_decode_op.cc DEPS ${op_DEPS}) add_operator(graph_op_lite extra SRCS graph_op.cc DEPS ${op_DEPS}) add_operator(logical_xor extra SRCS logical_op.cc DEPS ${op_DEPS}) @@ -106,7 +122,11 @@ add_operator(topk_op extra SRCS topk_op.cc DEPS ${op_DEPS}) add_operator(increment_op extra SRCS increment_op.cc DEPS ${op_DEPS}) add_operator(layer_norm_op extra SRCS layer_norm_op.cc DEPS ${op_DEPS}) add_operator(sequence_softmax_op extra SRCS sequence_softmax_op.cc DEPS ${op_DEPS}) - +# for content-dnn specific +add_operator(search_aligned_mat_mul_op extra SRCS search_aligned_mat_mul_op.cc DEPS 
${op_DEPS}) +add_operator(search_seq_fc_op extra SRCS search_seq_fc_op.cc DEPS ${op_DEPS}) +add_operator(sequence_topk_avg_pooling_op basic SRCS sequence_topk_avg_pooling_op.cc DEPS ${op_DEPS}) +add_operator(search_fc_op basic SRCS search_fc_op.cc DEPS ${op_DEPS}) if (NOT LITE_WITH_X86) lite_cc_test(test_fc_op SRCS fc_op_test.cc @@ -122,8 +142,8 @@ if (NOT LITE_WITH_X86) lite_cc_test(test_batch_norm_op SRCS batch_norm_op_test.cc DEPS batch_norm_op memory) lite_cc_test(test_concat_op SRCS concat_op_test.cc DEPS concat_op memory scope) lite_cc_test(test_calib_op SRCS calib_op_test.cc DEPS calib_op memory ARM_DEPS calib_compute_arm) - lite_cc_test(test_fusion_elementwise_activation_ops - SRCS fusion_elementwise_activation_ops_test.cc - DEPS fusion_elementwise_activation_ops memory) lite_cc_test(test_transpose_op SRCS transpose_op_test.cc DEPS transpose_op memory) + lite_cc_test(test_fusion_elementwise_activation_ops + SRCS fusion_elementwise_activation_ops_test.cc + DEPS fusion_elementwise_activation_ops memory) endif() diff --git a/lite/operators/activation_ops.cc b/lite/operators/activation_ops.cc index c3c5de311f..6ddcee0cb9 100644 --- a/lite/operators/activation_ops.cc +++ b/lite/operators/activation_ops.cc @@ -117,6 +117,7 @@ REGISTER_LITE_OP(log, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(exp, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(floor, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(hard_sigmoid, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(sqrt, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(rsqrt, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(softsign, paddle::lite::operators::ActivationOp); diff --git a/lite/operators/attention_padding_mask_op.cc b/lite/operators/attention_padding_mask_op.cc new file mode 100644 index 0000000000..a88df0e7a9 --- /dev/null +++ b/lite/operators/attention_padding_mask_op.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/attention_padding_mask_op.h" +#include "lite/core/op_registry.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool AttentionPaddingMaskOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + CHECK_OR_FALSE(param_.Out); + CHECK_OR_FALSE(param_.pad_begin); + return true; +} + +bool AttentionPaddingMaskOp::InferShape() const { + auto src_len = param_.X->lod()[0][1]; + CHECK_EQ(src_len, param_.X->dims()[1]) + << "Mismatch source length, expect: " << src_len + << ", get: " << param_.X->lod()[0][1]; + auto att_batch = param_.X->lod()[0].size() - 1; + auto src_batch = param_.Y->lod()[0].size() - 1; + CHECK_EQ(att_batch % src_batch, 0) + << "Mismatch batch size, bottom0: " << att_batch + << ", bottom1: " << src_batch; + + param_.pad_begin->Resize({static_cast(src_batch)}); + param_.Out->Resize(param_.X->dims()); + param_.Out->set_lod(param_.X->lod()); + + return true; +} + +bool AttentionPaddingMaskOp::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + param_.X = scope->FindTensor(op_desc.Input("X").front()); + param_.Y = scope->FindTensor(op_desc.Input("Y").front()); + param_.Out = scope->FindMutableTensor(op_desc.Output("Out").front()); + param_.pad_begin = + scope->FindMutableTensor(op_desc.Output("pad_begin").front()); + + param_.pad_id = op_desc.GetAttr("pad_id"); + param_.mask = op_desc.GetAttr("mask"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(attention_padding_mask, + paddle::lite::operators::AttentionPaddingMaskOp); +REGISTER_LITE_OP(search_attention_padding_mask, + paddle::lite::operators::AttentionPaddingMaskOp); diff --git a/lite/operators/attention_padding_mask_op.h b/lite/operators/attention_padding_mask_op.h new file mode 100644 index 0000000000..894d68f622 --- /dev/null +++ b/lite/operators/attention_padding_mask_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class AttentionPaddingMaskOp : public OpLite { + public: + AttentionPaddingMaskOp() {} + + explicit AttentionPaddingMaskOp(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "attention_padding_mask"; } + + private: + mutable AttentionPaddingMaskParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/conv_op.cc b/lite/operators/conv_op.cc index ceca1a61ce..6dab55ff3b 100644 --- a/lite/operators/conv_op.cc +++ b/lite/operators/conv_op.cc @@ -39,56 +39,38 @@ bool ConvOpLite::CheckShape() const { return true; } -inline int ConvOutputSize( - int input_size, int filter_size, int dilation, int padding, int stride) { +inline int ConvOutputSize(int input_size, + int filter_size, + int dilation, + int pad_left, + int pad_right, + int stride) { const int dkernel = dilation * (filter_size - 1) + 1; - int output_size = (input_size + 2 * padding - dkernel) / stride + 1; - // CHECK_GT_OR_FALSE(output_size, 0); + int output_size = + (input_size + (pad_left + pad_right) - dkernel) / stride + 1; return output_size; } -inline void UpdatePaddingAndDilation(std::vector* paddings, - std::vector* dilations, - const std::vector& strides, - const std::string padding_algorithm, - const lite::DDim data_dims, - const lite::DDim& ksize) { - // when padding_desc is "VALID" or "SAME" - if (padding_algorithm == "SAME") { - for (size_t i = 0; i < strides.size(); ++i) { - int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; - int pad_sum = - std::max((out_size - 1) * strides[i] + ksize[i] - data_dims[i + 2], - (int64_t)0); - // pad - *(paddings->begin() + i) = pad_sum / 2; - // dilation - *(dilations->begin() + i) = 1; - } - } else if (padding_algorithm == "VALID") { - for (auto& it : *paddings) { - it = 0; - } - } -} - bool ConvOpLite::InferShape() const { const auto in_dims = param_.x->dims(); const auto filter_dims = param_.filter->dims(); - UpdatePaddingAndDilation(¶m_.paddings, - ¶m_.dilations, + UpdatePaddingAndDilation(param_.paddings.get(), + param_.dilations.get(), param_.strides, padding_algorithm_, in_dims, filter_dims); std::vector output_shape({in_dims[0], filter_dims[0]}); + auto paddings = *param_.paddings; + auto dilations = *param_.dilations; for (size_t i = 0; i < param_.strides.size(); ++i) { output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - param_.dilations[i], - param_.paddings[i], + dilations[i], + paddings[i * 2], + paddings[i * 2 + 1], param_.strides[i])); } diff --git a/lite/operators/conv_op.h b/lite/operators/conv_op.h index e764819f63..3ab34bc1d0 100644 --- a/lite/operators/conv_op.h +++ b/lite/operators/conv_op.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once +#include #include #include #include "lite/core/kernel.h" @@ -47,9 +48,10 @@ class ConvOpLite : public OpLite { param_.output = scope->FindVar(Out)->GetMutable(); param_.strides = op_desc.GetAttr>("strides"); - param_.paddings = op_desc.GetAttr>("paddings"); + auto paddings = op_desc.GetAttr>("paddings"); param_.groups = op_desc.GetAttr("groups"); - param_.dilations = op_desc.GetAttr>("dilations"); + auto dilations = op_desc.GetAttr>("dilations"); + param_.dilations = std::make_shared>(dilations); // optional params std::vector input_arg_names = op_desc.InputArgumentNames(); @@ -109,12 +111,24 @@ class ConvOpLite : public OpLite { param_.output_scale = op_desc.GetAttr("output_scale"); } } + + // 2-pad to 4-pad + if (paddings.size() == 2L) { + for (size_t i = 0; i < param_.strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } else { + if (paddings.size() != 4L) { + LOG(FATAL) + << "Paddings size should be the same or twice as the input size."; + } + } + param_.paddings = std::make_shared>(paddings); return true; } - void AttachKernel(KernelBase* kernel) override { - kernel->SetParam(param_); - } + void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } std::string DebugString() const override { return "conv2d"; } @@ -123,6 +137,34 @@ class ConvOpLite : public OpLite { std::string padding_algorithm_{""}; }; +inline void UpdatePaddingAndDilation(std::vector* paddings, + std::vector* dilations, + const std::vector& strides, + const std::string padding_algorithm, + const lite::DDim data_dims, + const lite::DDim& ksize) { + // when padding_desc is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (size_t i = 0; i < strides.size(); ++i) { + int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; + int pad_sum = std::max( + (out_size - 1) * strides[i] + ksize[i + 2] - data_dims[i + 2], + (int64_t)0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + // pad + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + // dilation + *(dilations->begin() + i) = 1; + } + } else if (padding_algorithm == "VALID") { + for (auto& it : *paddings) { + it = 0; + } + } +} + } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/conv_transpose_op.cc b/lite/operators/conv_transpose_op.cc index fb6b431fff..a472ae0745 100644 --- a/lite/operators/conv_transpose_op.cc +++ b/lite/operators/conv_transpose_op.cc @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
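The UpdatePaddingAndDilation helper now defined in conv_op.h implements the "SAME" rule with a possibly asymmetric split: pad_sum = max((out - 1) * stride + ksize - in, 0) with out = ceil(in / stride), then pad_0 = pad_sum / 2, pad_1 = pad_sum - pad_0, and the dilation is reset to 1. A quick worked example for one spatial dimension: with in = 7, ksize = 3, stride = 2 we get out = 4 and pad_sum = 2, so pad_0 = pad_1 = 1; with in = 8 the same kernel and stride give pad_sum = 1, i.e. pad_0 = 0 and pad_1 = 1, the extra row of padding going to the bottom/right side. Under "VALID" all paddings are simply zeroed.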
- #include "lite/operators/conv_transpose_op.h" +#include #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" @@ -32,24 +32,75 @@ bool ConvTransposeOpLite::CheckShape() const { CHECK_EQ_OR_FALSE(in_dims.size(), filter_dims.size()); CHECK_OR_FALSE(in_dims.size() - param_.strides.size() == 2U); - CHECK_EQ_OR_FALSE(param_.paddings.size(), param_.strides.size()); CHECK_OR_FALSE(in_dims[1] % param_.groups == 0); + CHECK_EQ_OR_FALSE(filter_dims.size(), 4UL); return true; } +inline int ConvTransposeOutputSize(int input_size, + int filter_size, + int dilation, + int pad_left, + int pad_right, + int stride) { + const int dkernel = dilation * (filter_size - 1) + 1; + int output_size = (input_size - 1) * stride - pad_left - pad_right + dkernel; + + return output_size; +} + +inline void UpdatePaddingAndDilation(std::vector* paddings, + std::vector* dilations, + const std::vector& strides, + const std::string padding_algorithm, + const lite::DDim data_dims, + const lite::DDim& ksize) { + // when padding_desc is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (size_t i = 0; i < strides.size(); ++i) { + int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; + int pad_sum = std::max( + (out_size - 1) * strides[i] + ksize[i + 2] - data_dims[i + 2], + (int64_t)0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + // pad + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + // dilation + *(dilations->begin() + i) = 1; + } + } else if (padding_algorithm == "VALID") { + for (auto& it : *paddings) { + it = 0; + } + } +} + bool ConvTransposeOpLite::InferShape() const { const auto in_dims = param_.x->dims(); const auto filter_dims = param_.filter->dims(); + UpdatePaddingAndDilation(param_.paddings.get(), + param_.dilations.get(), + param_.strides, + padding_algorithm_, + in_dims, + filter_dims); + auto paddings = *param_.paddings; + auto dilations = *param_.dilations; + std::vector output_shape; output_shape.push_back(in_dims[0]); output_shape.push_back(filter_dims[1] * param_.groups); - for (int i = 0; i < param_.strides.size(); i++) { - int kernel_extent = param_.dilations[i] * (filter_dims[i + 2] - 1) + 1; - int output_len = (in_dims[i + 2] - 1) * param_.strides[i] + kernel_extent - - 2 * param_.paddings[i]; - output_shape.push_back(output_len); + for (size_t i = 0; i < param_.strides.size(); ++i) { + output_shape.push_back(ConvTransposeOutputSize(in_dims[i + 2], + filter_dims[i + 2], + dilations[i], + paddings[i * 2], + paddings[i * 2 + 1], + param_.strides[i])); } // Set output dims @@ -58,8 +109,8 @@ bool ConvTransposeOpLite::InferShape() const { } // TODO(Superjomn) replace framework::OpDesc with a lite one. 
-bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc &op_desc, - lite::Scope *scope) { +bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { auto X = op_desc.Input("Input").front(); auto Filter = op_desc.Input("Filter").front(); auto Out = op_desc.Output("Output").front(); @@ -68,9 +119,27 @@ bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc &op_desc, param_.output = scope->FindVar(Out)->GetMutable(); param_.strides = op_desc.GetAttr>("strides"); - param_.paddings = op_desc.GetAttr>("paddings"); + auto paddings = op_desc.GetAttr>("paddings"); param_.groups = op_desc.GetAttr("groups"); - param_.dilations = op_desc.GetAttr>("dilations"); + auto dilations = op_desc.GetAttr>("dilations"); + + if (op_desc.HasAttr("padding_algorithm")) { + padding_algorithm_ = op_desc.GetAttr("padding_algorithm"); + } + // 2-pad to 4-pad + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } else { + if (paddings.size() != 4L) { + LOG(FATAL) + << "Paddings size should be the same or twice as the input size."; + } + } + param_.paddings = std::make_shared>(paddings); + param_.dilations = std::make_shared>(dilations); // optional params std::vector input_arg_names = op_desc.InputArgumentNames(); @@ -81,7 +150,7 @@ bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc &op_desc, auto bias_var = scope->FindVar(bias_arguments.front()); if (bias_var != nullptr) { param_.bias = - const_cast(&(bias_var->Get())); + const_cast(&(bias_var->Get())); } } } diff --git a/lite/operators/conv_transpose_op.h b/lite/operators/conv_transpose_op.h index d8b64c78ef..fb25c022f9 100644 --- a/lite/operators/conv_transpose_op.h +++ b/lite/operators/conv_transpose_op.h @@ -44,6 +44,7 @@ class ConvTransposeOpLite : public OpLite { private: mutable ConvParam param_; + std::string padding_algorithm_{""}; }; } // namespace operators diff --git a/lite/operators/fill_constant_op.cc b/lite/operators/fill_constant_op.cc index 6e4bee4da8..acf9701cbd 100644 --- a/lite/operators/fill_constant_op.cc +++ b/lite/operators/fill_constant_op.cc @@ -29,6 +29,12 @@ class FillConstantOp : public OpLite { } bool InferShape() const override { + lite::Tensor* shape_tensor_ = param_.shape_tensor; + if (param_.shape.empty() && shape_tensor_ != nullptr) { + param_.Out->Resize(shape_tensor_->dims()); + return true; + } + param_.Out->Resize(param_.shape); return true; } @@ -41,6 +47,23 @@ class FillConstantOp : public OpLite { param_.shape = opdesc.GetAttr>("shape"); param_.value = opdesc.GetAttr("value"); param_.force_cpu = opdesc.GetAttr("force_cpu"); + param_.shape_tensor = nullptr; + param_.shape_tensor_list = {}; + + std::vector input_arg_names = opdesc.InputArgumentNames(); + if (std::find(input_arg_names.begin(), + input_arg_names.end(), + "ShapeTensor") != input_arg_names.end()) { + auto args = opdesc.Input("ShapeTensor"); + auto* var = scope->FindVar(args.front()); + param_.shape_tensor = var->GetMutable(); + } + if (opdesc.HasAttr("ShapeTensorList")) { + auto args = opdesc.Input("ShapeTensorList"); + auto* var = scope->FindVar(args.front()); + param_.shape_tensor_list = + *(var->GetMutable>()); + } return true; } diff --git a/lite/operators/interpolate_op.cc b/lite/operators/interpolate_op.cc index b98240ba4f..936da73d89 100644 --- a/lite/operators/interpolate_op.cc +++ b/lite/operators/interpolate_op.cc @@ -45,23 +45,42 @@ bool InterpolateOp::InferShape() const { int out_h; int out_w; - if 
(OutSize != nullptr) { - auto outsize_data = OutSize->data(); - int h_out = outsize_data[0]; // HW - int w_out = outsize_data[1]; // HW - param_.Out->Resize({n, c, h_out, w_out}); + auto SizeTensor = param_.SizeTensor; + if (!SizeTensor.empty()) { + CHECK(SizeTensor.size() == 2) + << "Input(SizeTensor)'size of Op(interpolate) must be 2. " + "Attr(out_shape)'s length must be 2 for 4-D input tensor."; + out_h = param_.out_h; + out_w = param_.out_w; + param_.Out->Resize({n, c, out_h, out_w}); + return true; + } + + auto Scale = param_.Scale; + if (Scale) { + auto scale_dims = Scale->dims(); + CHECK(scale_dims.size() == 1) << "Scale's dimension size must be 1."; + out_h = -1; + out_w = -1; } else { - if (0 >= param_.out_h && 0 >= param_.out_w) { - out_h = h * param_.scale; - out_w = w * param_.scale; + auto scale = param_.scale; + if (scale > 0) { + out_h = static_cast(h * scale); + out_w = static_cast(w * scale); out_h = out_h > 0 ? out_h : -1; out_w = out_w > 0 ? out_w : -1; } else { out_h = param_.out_h; out_w = param_.out_w; } - param_.Out->Resize({n, c, out_h, out_w}); } + + if (OutSize != nullptr) { + auto out_lod = param_.Out->mutable_lod(); + *out_lod = param_.X->lod(); + } + param_.Out->Resize({n, c, out_h, out_w}); + return true; } @@ -76,6 +95,24 @@ bool InterpolateOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { } else { param_.OutSize = nullptr; } + + if (op_desc.HasInput("SizeTensor")) { + auto size_tensor = op_desc.Input("SizeTensor"); + for (auto var : size_tensor) { + param_.SizeTensor.push_back( + scope->FindVar(var)->GetMutable()); + } + } + + if (op_desc.HasInput("Scale")) { + auto scale_var_names = op_desc.Input("Scale"); + if (scale_var_names.size() > 0) { + param_.Scale = + scope->FindVar(scale_var_names.front())->GetMutable(); + } + } else { + param_.Scale = nullptr; + } auto Out = op_desc.Output("Out").front(); param_.X = scope->FindVar(X)->GetMutable(); param_.Out = scope->FindVar(Out)->GetMutable(); diff --git a/lite/operators/lookup_table_v2_op.cc b/lite/operators/lookup_table_v2_op.cc new file mode 100644 index 0000000000..c783695163 --- /dev/null +++ b/lite/operators/lookup_table_v2_op.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
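The rewritten InterpolateOp::InferShape above resolves the output extent in priority order: an explicit SizeTensor pair, a runtime Scale tensor (which leaves the static shape at -1), the scalar scale attribute, and finally the out_h/out_w attributes. A minimal sketch of the scalar-scale branch, assuming a 4-D NCHW input (the helper name is illustrative):

// Derive the interpolate output H/W from a scalar scale attribute,
// falling back to the explicit out_h/out_w attributes when scale <= 0.
inline void interp_out_hw(int h, int w, float scale,
                          int attr_out_h, int attr_out_w,
                          int* out_h, int* out_w) {
  if (scale > 0.f) {
    *out_h = static_cast<int>(h * scale);
    *out_w = static_cast<int>(w * scale);
    if (*out_h <= 0) *out_h = -1;
    if (*out_w <= 0) *out_w = -1;
  } else {
    *out_h = attr_out_h;
    *out_w = attr_out_w;
  }
}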
+ +#include "lite/operators/lookup_table_v2_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool LookupTableV2OpLite::CheckShape() const { + CHECK_OR_FALSE(param_.W) + CHECK_OR_FALSE(param_.Ids) + CHECK_OR_FALSE(param_.Out) + + auto table_dims = param_.W->dims(); + + CHECK_EQ_OR_FALSE(table_dims.size(), 2) + + return true; +} + +bool LookupTableV2OpLite::InferShape() const { + auto table_dims = param_.W->dims(); + auto ids_dims = param_.Ids->dims(); + + std::vector out_dims; + for (int i = 0; i < ids_dims.size(); ++i) { + out_dims.push_back(ids_dims[i]); + } + out_dims.push_back(table_dims[1]); + param_.Out->Resize(lite::DDim{out_dims}); + param_.Out->set_lod(param_.Ids->lod()); + return true; +} + +bool LookupTableV2OpLite::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + auto input = op_desc.Input("W").front(); + auto ids = op_desc.Input("Ids").front(); + auto out = op_desc.Output("Out").front(); + + param_.W = scope->FindVar(input)->GetMutable(); + param_.Ids = scope->FindVar(ids)->GetMutable(); + param_.Out = scope->FindVar(out)->GetMutable(); + + param_.padding_idx = op_desc.GetAttr("padding_idx"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(lookup_table_v2, paddle::lite::operators::LookupTableV2OpLite) diff --git a/lite/operators/lookup_table_v2_op.h b/lite/operators/lookup_table_v2_op.h new file mode 100644 index 0000000000..dabff3f0ca --- /dev/null +++ b/lite/operators/lookup_table_v2_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class LookupTableV2OpLite : public OpLite { + public: + LookupTableV2OpLite() {} + explicit LookupTableV2OpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "LookupTable"; } + + private: + mutable LookupTableParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/lrn_op.cc b/lite/operators/lrn_op.cc index 34b00653f9..aff3e5af55 100644 --- a/lite/operators/lrn_op.cc +++ b/lite/operators/lrn_op.cc @@ -37,11 +37,13 @@ bool LrnOpLite::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { auto Out_name = opdesc.Output("Out").front(); param_.X = GetVar(scope, X_name); param_.Out = GetMutableVar(scope, Out_name); - param_.local_size = opdesc.GetAttr("local_size"); + param_.n = opdesc.GetAttr("n"); param_.alpha = opdesc.GetAttr("alpha"); param_.beta = opdesc.GetAttr("beta"); param_.k = opdesc.GetAttr("k"); - param_.norm_region = opdesc.GetAttr("norm_region"); + if (opdesc.HasAttr("norm_region")) { + param_.norm_region = opdesc.GetAttr("norm_region"); + } return true; } diff --git a/lite/operators/match_matrix_tensor_op.cc b/lite/operators/match_matrix_tensor_op.cc new file mode 100644 index 0000000000..a8095a94bf --- /dev/null +++ b/lite/operators/match_matrix_tensor_op.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
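
Note on the lrn_op change above: the window size is now read from the "n" attribute instead of "local_size", and norm_region is read only when present so the LrnParam default survives otherwise. A trivial sketch of the guarded read, using a hypothetical attribute map in place of cpp::OpDesc:

#include <map>
#include <string>

// Hypothetical stand-in for the op descriptor's attribute lookup.
std::string GetNormRegion(const std::map<std::string, std::string>& attrs) {
  std::string norm_region = "AcrossChannels";  // default kept from LrnParam
  auto it = attrs.find("norm_region");
  if (it != attrs.end()) {     // analogous to opdesc.HasAttr("norm_region")
    norm_region = it->second;  // analogous to opdesc.GetAttr(...)
  }
  return norm_region;
}
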
+ +#include "lite/operators/match_matrix_tensor_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool MatchMatrixTensorOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.y); + CHECK_OR_FALSE(param_.w); + CHECK_OR_FALSE(param_.out); + CHECK_OR_FALSE(param_.tmp); + + DDim x_dims = param_.x->dims(); + DDim y_dims = param_.y->dims(); + DDim w_dims = param_.w->dims(); + int dim_t = param_.dim_t; + + CHECK_OR_FALSE(x_dims.size() == 2); + CHECK_OR_FALSE(y_dims.size() == 2); + CHECK_OR_FALSE(w_dims.size() == 3); + + CHECK_OR_FALSE(x_dims[1] == w_dims[0] && y_dims[1] == w_dims[2] && + w_dims[1] == dim_t); + + return true; +} + +bool MatchMatrixTensorOpLite::InferShape() const { + const Tensor* x = param_.x; + const Tensor* y = param_.y; + DDim x_dims = param_.x->dims(); + DDim y_dims = param_.y->dims(); + DDim w_dims = param_.w->dims(); + int dim_t = param_.dim_t; + + const auto& x_lod = x->lod(); + CHECK_OR_FALSE(!x_lod.empty()); + const auto& x_lod_0 = x_lod[0]; + CHECK_OR_FALSE(x_lod_0.size() >= 2); + CHECK_OR_FALSE(x_dims[0] == x_lod_0.back()); + + const auto& y_lod = y->lod(); + CHECK_OR_FALSE(!y_lod.empty()); + const auto& y_lod_0 = y_lod[0]; + CHECK_OR_FALSE(y_lod_0.size() >= 2); + CHECK_OR_FALSE(y_dims[0] == y_lod_0.back()); + + CHECK_OR_FALSE(x_lod_0.size() == y_lod_0.size()); + + int out_dim_0 = 0; + for (size_t i = 1; i < x_lod_0.size(); i++) { + int x_len = x_lod_0[i] - x_lod_0[i - 1]; + int y_len = y_lod_0[i] - y_lod_0[i - 1]; + out_dim_0 += (x_len * y_len); + } + out_dim_0 *= dim_t; + int tmp_dim_0 = x_dims[0] * dim_t * x_dims[1]; + + param_.out->Resize({out_dim_0, 1}); + param_.tmp->Resize({tmp_dim_0, 1}); + return true; +} + +bool MatchMatrixTensorOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + auto x = op_desc.Input("X").front(); + auto w = op_desc.Input("W").front(); + auto y = op_desc.Input("Y").front(); + auto out = op_desc.Output("Out").front(); + auto tmp = op_desc.Output("Tmp").front(); + + param_.x = scope->FindVar(x)->GetMutable(); + param_.w = scope->FindVar(w)->GetMutable(); + param_.y = scope->FindVar(y)->GetMutable(); + param_.out = scope->FindVar(out)->GetMutable(); + param_.tmp = scope->FindVar(tmp)->GetMutable(); + + param_.dim_t = op_desc.GetAttr("dim_t"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(match_matrix_tensor, + paddle::lite::operators::MatchMatrixTensorOpLite); diff --git a/lite/operators/match_matrix_tensor_op.h b/lite/operators/match_matrix_tensor_op.h new file mode 100644 index 0000000000..404183ea5b --- /dev/null +++ b/lite/operators/match_matrix_tensor_op.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class MatchMatrixTensorOpLite : public OpLite { + public: + MatchMatrixTensorOpLite() {} + + explicit MatchMatrixTensorOpLite(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "match_matrix_tensor"; } + + private: + mutable MatchMatrixTensorParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index 8609f17888..4f0c707484 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include #include #include #include @@ -89,11 +90,21 @@ struct FcParam { WITH_INT8_CONFIG }; +struct SearchSeqFcParam { + lite::Tensor* x{nullptr}; + lite::Tensor* w{nullptr}; + lite::Tensor* b{nullptr}; + lite::Tensor* out{nullptr}; + int out_size; +}; + // For Interpolate Op struct InterpolateParam { lite::Tensor* X{}; lite::Tensor* OutSize{}; lite::Tensor* Out{}; + std::vector SizeTensor; + lite::Tensor* Scale{}; float scale{0.f}; int out_h{-1}; @@ -101,6 +112,7 @@ struct InterpolateParam { bool align_corners{true}; int align_mode{1}; std::string interp_method{"Nearest"}; + DataLayoutType data_layout{DATALAYOUT(kNCHW)}; }; // For Mul Op @@ -242,9 +254,19 @@ struct ConvParam { lite::Tensor* residualData{nullptr}; lite::Tensor* output{}; std::vector strides{1, 1}; - std::vector paddings{0, 0}; + /* paddings type change + * from std::vector to std::shared_ptr> + * to support dynamically modify padding + * let kernel param and operator param Synchronous update + */ + std::shared_ptr> paddings; int groups{1}; - std::vector dilations{1, 1}; + /* dilations type change + * from std::vector to std::shared_ptr> + * to support dynamically modify padding + * let kernel param and operator param Synchronous update + */ + std::shared_ptr> dilations; bool fuse_relu_before_depthwise_conv{false}; bool use_mkldnn{false}; bool fuse_relu{false}; // only used in mkldnn kernel @@ -291,7 +313,12 @@ struct PoolParam { bool global_pooling{ false}; // if true, knernel size and paddings will be ignored std::vector strides{1, 1}; - std::vector paddings{0, 0}; + /* paddings type change + * from std::vector to std::shared_ptr> + * to support dynamically modify padding + * let kernel param and operator param Synchronous update + */ + std::shared_ptr> paddings; bool exclusive{true}; bool adaptive{false}; bool ceil_mode{false}; @@ -317,6 +344,9 @@ struct DropoutParam { struct SplitParam { lite::Tensor* x{}; std::vector output{}; + lite::Tensor* axis_tensor; + std::vector sections_tensor_list{}; + int axis{-1}; int num{0}; std::vector sections; @@ -378,6 +408,9 @@ struct MeanGradParam { struct FillConstantParam { int dtype{static_cast(VarDescAPI::VarDataType::FP32)}; std::vector shape{}; + lite::Tensor* shape_tensor; + std::vector shape_tensor_list{}; + float value{0.0f}; // useless for x86, keep it for compatibility bool force_cpu{false}; @@ -511,8 +544,8 @@ struct GRUUnitParam { struct LrnParam { const lite::Tensor* X{}; lite::Tensor* Out{}; - int 
local_size{5}; - float alpha{1.}; + int n{5}; + float alpha{1e-4}; float beta{0.75}; float k{1.}; std::string norm_region{"AcrossChannels"}; @@ -729,6 +762,14 @@ struct SequencePoolParam { #endif }; +struct SearchGroupPaddingParam { + lite::Tensor* x{}; + lite::Tensor* out_emb_padding{}; + lite::Tensor* out_new{}; + lite::Tensor* out_padding{}; + int pad_id; +}; + struct SequenceReshapeParam { lite::Tensor* x{}; lite::Tensor* output{}; @@ -748,6 +789,32 @@ struct SequenceExpandAsParam { lite::Tensor* out{nullptr}; }; +struct SequenceReverseParam { + const lite::Tensor* X{}; + lite::Tensor* Out{}; +}; + +struct SequenceConcatParam { + std::vector X{}; + lite::Tensor* Out{}; +}; + +struct AttentionPaddingMaskParam { + const lite::Tensor* X{}; + const lite::Tensor* Y{}; + int pad_id; + float mask; + lite::Tensor* Out{}; + lite::Tensor* pad_begin{}; +}; + +struct SequenceArithmeticParam { + const lite::Tensor* X{}; + const lite::Tensor* Y{}; + int op_type{1}; + lite::Tensor* Out{}; +}; + struct ReduceMaxParam { const lite::Tensor* X{}; lite::Tensor* Out{}; @@ -776,6 +843,22 @@ struct ReduceParam { bool reduce_all{false}; }; +struct VarConv2DParam { + const lite::Tensor* X{}; + const lite::Tensor* ROW{}; + const lite::Tensor* COLUMN{}; + const lite::Tensor* W{}; + lite::Tensor* Out{}; + lite::Tensor* Col{}; + + int input_channel; + int output_channel; + int stride_h; + int stride_w; + int kernel_h; + int kernel_w; +}; + /// ----------------------- shape operators ---------------------- struct ShapeParam { const lite::Tensor* X{}; @@ -856,7 +939,7 @@ struct UnsqueezeParam { lite::Tensor* XShape{}; std::vector axes{}; const lite::Tensor* axes_tensor{}; - std::vector* axes_tensor_vct{}; + std::vector axes_tensor_vct{}; }; /// ----------------------- expand operators ---------------------- @@ -922,6 +1005,57 @@ struct AssignValueParam { lite::Tensor* Out{}; }; +/// --------------- sequence_topk_avg_pooling operators ------------------ +struct SequenceTopkAvgPoolingParam { + const lite::Tensor* X{}; + const lite::Tensor* ROW{}; + const lite::Tensor* COLUMN{}; + lite::Tensor* Out{}; + lite::Tensor* pos{}; + int channel_num{}; + std::vector topks{}; +}; + +/// --------------- search_fc operators ------------------ +struct SearchFcParam { + const lite::Tensor* X{}; + const lite::Tensor* W{}; + const lite::Tensor* b{}; + lite::Tensor* Out{}; + int out_size{}; +}; +/// --------------------- match_matrix_tensor operators -------------------- +struct MatchMatrixTensorParam { + const lite::Tensor* x{}; + const lite::Tensor* y{}; + const lite::Tensor* w{}; + lite::Tensor* out{}; + lite::Tensor* tmp{}; + + int dim_t; +}; + +/// --------------------- search_seq_depadding operators -------------------- +struct SearchSeqDepaddingParam { + const lite::Tensor* pad{}; + const lite::Tensor* src{}; + lite::Tensor* out{}; +}; + +/// --------------------- search_grnn operators -------------------- +struct SearchGrnnParam { + const lite::Tensor* x{}; + const lite::Tensor* wi{}; + const lite::Tensor* wh{}; + int num_input; + int num_hidden; + + lite::Tensor* out{}; + lite::Tensor* tmp_buffer{}; + lite::Tensor* idx_sorted_by_width{}; + lite::Tensor* layout_input{}; +}; + } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/pool_op.cc b/lite/operators/pool_op.cc index 1ebbc059b7..c6f6eed28f 100644 --- a/lite/operators/pool_op.cc +++ b/lite/operators/pool_op.cc @@ -13,6 +13,7 @@ // limitations under the License. 
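
Note on the ConvParam / PoolParam comments above: paddings and dilations move from a plain vector to a shared pointer to a vector of ints, so the operator and the kernel hold the very same vector and a later rewrite of the four pad values (for example by UpdatePadding) is seen by the kernel without re-attaching the param. A minimal sketch of that sharing, with hypothetical OpSide / KernelSide structs standing in for the real params:

#include <cassert>
#include <memory>
#include <vector>

struct OpSide {    // hypothetical stand-in for the operator-side param
  std::shared_ptr<std::vector<int>> paddings;
};
struct KernelSide {  // hypothetical stand-in for the kernel-side param
  std::shared_ptr<std::vector<int>> paddings;
};

int main() {
  OpSide op;
  op.paddings = std::make_shared<std::vector<int>>(std::vector<int>{1, 1, 1, 1});
  KernelSide kernel;
  kernel.paddings = op.paddings;  // attaching copies only the shared_ptr

  // Later, shape inference rewrites the pads (e.g. for "SAME" padding):
  (*op.paddings)[0] = 0;
  (*op.paddings)[1] = 1;

  // The kernel observes the new values with no extra synchronisation step.
  assert((*kernel.paddings)[0] == 0 && (*kernel.paddings)[1] == 1);
  return 0;
}
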
#include "lite/operators/pool_op.h" +#include #include "lite/core/op_registry.h" namespace paddle { @@ -26,7 +27,7 @@ bool PoolOpLite::CheckShape() const { const auto& x_dims = param_.x->dims(); const auto& ksize = param_.ksize; const auto& strides = param_.strides; - const auto& paddings = param_.paddings; + const auto& paddings = *param_.paddings; // "Pooling intput should be 4-D or 5-D tensor." CHECK_OR_FALSE(x_dims.size() == 4 || x_dims.size() == 5); @@ -34,20 +35,27 @@ bool PoolOpLite::CheckShape() const { CHECK_OR_FALSE(x_dims.size() - ksize.size() == 2U); // Strides size and pooling size should be the same. CHECK_OR_FALSE(ksize.size() == strides.size()); - // Paddings size and pooling size should be the same. - CHECK_OR_FALSE(ksize.size() == paddings.size()); + // Paddings size must be 4. + CHECK_OR_FALSE(paddings.size() == 4L); return true; } -int PoolOutputSize( - int input_size, int filter_size, int padding, int stride, bool ceil_mode) { +int PoolOutputSize(int input_size, + int filter_size, + int pad_left, + int pad_right, + int stride, + bool ceil_mode) { int output_size; if (!ceil_mode) { - output_size = (input_size - filter_size + 2 * padding) / stride + 1; + output_size = + (input_size - filter_size + pad_left + pad_right) / stride + 1; } else { output_size = - (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; + (input_size - filter_size + pad_left + pad_right + stride - 1) / + stride + + 1; } return output_size; } @@ -55,14 +63,21 @@ int PoolOutputSize( bool PoolOpLite::InferShape() const { const auto x_dims = param_.x->dims(); std::vector& ksize = param_.ksize; + // dynamic update 4-pad + UpdatePadding(param_.paddings.get(), + param_.global_pooling, + param_.adaptive, + padding_algorithm_, + x_dims, + param_.strides, + ksize); if (param_.global_pooling) { ksize.resize(static_cast(x_dims.size()) - 2); for (size_t i = 0; i < ksize.size(); ++i) { - param_.paddings[i] = 0; ksize[i] = static_cast(x_dims[i + 2]); } } - + auto paddings = *param_.paddings; std::vector output_shape({x_dims[0], x_dims[1]}); if (param_.adaptive) { output_shape.insert( @@ -71,15 +86,14 @@ bool PoolOpLite::InferShape() const { for (size_t i = 0; i < param_.ksize.size(); ++i) { output_shape.push_back(PoolOutputSize(x_dims[i + 2], param_.ksize[i], - param_.paddings[i], + paddings[2 * i], + paddings[2 * i + 1], param_.strides[i], param_.ceil_mode)); } } param_.output->Resize(lite::DDim(output_shape)); - // ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); - // ctx->ShareLoD("X", "Out"); return true; } diff --git a/lite/operators/pool_op.h b/lite/operators/pool_op.h index aecec4c619..c44875ff95 100644 --- a/lite/operators/pool_op.h +++ b/lite/operators/pool_op.h @@ -14,6 +14,8 @@ #pragma once +#include +#include #include #include #include "lite/core/kernel.h" @@ -51,7 +53,7 @@ class PoolOpLite : public OpLite { param_.ksize = op_desc.GetAttr>("ksize"); param_.global_pooling = op_desc.GetAttr("global_pooling"); param_.strides = op_desc.GetAttr>("strides"); - param_.paddings = op_desc.GetAttr>("paddings"); + auto paddings = op_desc.GetAttr>("paddings"); if (op_desc.HasAttr("exclusive")) { param_.exclusive = op_desc.GetAttr("exclusive"); @@ -65,7 +67,23 @@ class PoolOpLite : public OpLite { if (op_desc.HasAttr("use_quantizer")) { param_.use_quantizer = op_desc.GetAttr("use_quantizer"); } - // param_.data_format = op_desc.GetAttr("data_format"); + if (op_desc.HasAttr("padding_algorithm")) { + padding_algorithm_ = op_desc.GetAttr("padding_algorithm"); + } + // 2-pad to 4-pad + if 
(paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } else { + if (paddings.size() != 4L) { + LOG(FATAL) + << "Paddings size should be the same or twice as the inputs size."; + } + } + param_.paddings = std::make_shared>(paddings); + return true; } @@ -75,8 +93,42 @@ class PoolOpLite : public OpLite { private: mutable PoolParam param_; + std::string padding_algorithm_{""}; }; +inline void UpdatePadding(std::vector *paddings, + const bool global_pooling, + const bool adaptive, + const std::string padding_algorithm, + const lite::DDim data_dims, + const std::vector &strides, + const std::vector &ksize) { + // when padding_algorithm is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (int i = 0; i < strides.size(); ++i) { + int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; + int pad_sum = + std::max((out_size - 1) * strides[i] + ksize[i] - data_dims[i + 2], + (int64_t)0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + } + } else if (padding_algorithm == "VALID") { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } + + // if global_pooling == true or adaptive == true, padding will be ignore + if (global_pooling || adaptive) { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } +} + } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/search_aligned_mat_mul_op.cc b/lite/operators/search_aligned_mat_mul_op.cc new file mode 100644 index 0000000000..43a276e3c7 --- /dev/null +++ b/lite/operators/search_aligned_mat_mul_op.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
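
Note on the pool_op changes above: AttachImpl expands a 2-value paddings attribute into the 4-value [top, bottom, left, right] form (so {1, 2} becomes {1, 1, 2, 2}), UpdatePadding then recomputes those four values for the SAME / VALID padding_algorithm, and PoolOutputSize now takes pad_left and pad_right separately. Worked SAME example with assumed numbers, input height 224, kernel 3, stride 2: out = ceil(224 / 2) = 112, pad_sum = max((112 - 1) * 2 + 3 - 224, 0) = 1, hence pad_top = 0, pad_bottom = 1, and PoolOutputSize gives (224 - 3 + 0 + 1) / 2 + 1 = 112. The same arithmetic as a standalone sketch for one spatial dimension:

#include <algorithm>
#include <cassert>

// Sketch mirroring UpdatePadding (SAME branch) and the two-sided output size.
void SamePad1D(int in, int ksize, int stride, int* pad_0, int* pad_1) {
  int out = (in + stride - 1) / stride;                        // ceil(in / stride)
  int pad_sum = std::max((out - 1) * stride + ksize - in, 0);  // total pad needed
  *pad_0 = pad_sum / 2;
  *pad_1 = pad_sum - *pad_0;
}

int PoolOut1D(int in, int ksize, int pad_0, int pad_1, int stride) {
  return (in - ksize + pad_0 + pad_1) / stride + 1;  // ceil_mode == false case
}

int main() {
  int p0 = 0, p1 = 0;
  SamePad1D(224, 3, 2, &p0, &p1);
  assert(p0 == 0 && p1 == 1);
  assert(PoolOut1D(224, 3, p0, p1, 2) == 112);
  return 0;
}
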
+ +#include "lite/operators/search_aligned_mat_mul_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchAlignedMatMulOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + CHECK_OR_FALSE(param_.Out); + + return true; +} + +bool SearchAlignedMatMulOpLite::InferShape() const { + const auto x_dims = param_.X->dims(); + const auto y_dims = param_.Y->dims(); + const auto& x_lod = param_.X->lod(); + const auto& y_lod = param_.Y->lod(); + bool x_transpose = param_.transpose_X; + bool y_transpose = param_.transpose_Y; + + CHECK_EQ(x_dims.size(), 2) << "X should be 2-D tensor"; + CHECK_EQ(y_dims.size(), 2) << "Y should be 2-D tensor"; + CHECK(!x_lod.empty()) << "The Input(X) must hold lod info."; + CHECK(!y_lod.empty()) << "The Input(Y) must hold lod info."; + + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + CHECK_GE(x_lod_0.size(), 2) << "The Input(X)'s lod info is corrupted."; + CHECK_GE(y_lod_0.size(), 2) << "The Input(Y)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod_0.back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(y_dims[0], static_cast(y_lod_0.back())) + << "The Input(Y)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(x_lod_0.size(), y_lod_0.size()) + << "The Length of X and Y must be equal."; + + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose ? x_inner_size : x_batch_size; + int N = y_transpose ? y_batch_size : y_inner_size; + int X_K = x_transpose ? x_batch_size : x_inner_size; + int Y_K = y_transpose ? y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + + LoD out_lod; + std::vector out_lod_0(seq_num + 1); + out_lod_0[0] = 0; + for (int i = 0; i < seq_num; i++) { + out_lod_0[i + 1] = out_lod_0[i] + M; + } + out_lod.push_back(out_lod_0); + DDim out_dims( + {static_cast(out_lod_0.back()), static_cast(N)}); + param_.Out->set_lod(out_lod); + param_.Out->Resize(out_dims); + return true; +} + +bool SearchAlignedMatMulOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + CHECK(!op_desc.Input("X").empty()); + CHECK(!op_desc.Input("Y").empty()); + CHECK(!op_desc.Output("Out").empty()); + auto X = op_desc.Input("X").front(); + auto Y = op_desc.Input("Y").front(); + auto Out = op_desc.Output("Out").front(); + param_.X = GetVar(scope, X); + param_.Y = GetVar(scope, Y); + param_.Out = GetMutableVar(scope, Out); + param_.transpose_X = op_desc.GetAttr("transpose_X"); + param_.transpose_Y = op_desc.GetAttr("transpose_Y"); + param_.alpha = op_desc.GetAttr("alpha"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_aligned_mat_mul, + paddle::lite::operators::SearchAlignedMatMulOpLite); diff --git a/lite/operators/search_aligned_mat_mul_op.h b/lite/operators/search_aligned_mat_mul_op.h new file mode 100644 index 0000000000..7321b7e9d1 --- /dev/null +++ b/lite/operators/search_aligned_mat_mul_op.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchAlignedMatMulOpLite : public OpLite { + public: + SearchAlignedMatMulOpLite() {} + + explicit SearchAlignedMatMulOpLite(const std::string &type) : OpLite(type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; + std::string DebugString() const override { return "search_aligned_mat_mul"; } + + private: + mutable MatMulParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_fc_op.cc b/lite/operators/search_fc_op.cc new file mode 100644 index 0000000000..2e77e36162 --- /dev/null +++ b/lite/operators/search_fc_op.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/search_fc_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchFcOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.W); + CHECK_OR_FALSE(param_.b); + CHECK_OR_FALSE(param_.Out); + + auto x_dims = param_.X->dims(); + CHECK_EQ(x_dims.size(), 2) << "The rank of X(Input) should be 2."; + auto w_dims = param_.W->dims(); + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + auto b_dims = param_.b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(w_dims[1], x_dims[1]) << "wrong shape: w_dims[1] != x_dims[1]"; + return true; +} + +bool SearchFcOpLite::InferShape() const { + auto out_size = param_.out_size; + lite::DDim dims(std::vector({-1, out_size})); + param_.Out->Resize(dims); + return true; +} + +bool SearchFcOpLite::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + auto X = op_desc.Input("X").front(); + auto W = op_desc.Input("W").front(); + auto b = op_desc.Input("b").front(); + auto Out = op_desc.Output("Out").front(); + + param_.X = scope->FindVar(X)->GetMutable(); + param_.W = scope->FindVar(W)->GetMutable(); + param_.b = scope->FindVar(b)->GetMutable(); + param_.Out = scope->FindVar(Out)->GetMutable(); + param_.out_size = op_desc.GetAttr("out_size"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_fc, paddle::lite::operators::SearchFcOpLite); diff --git a/lite/operators/search_fc_op.h b/lite/operators/search_fc_op.h new file mode 100644 index 0000000000..a871cadd33 --- /dev/null +++ b/lite/operators/search_fc_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchFcOpLite : public OpLite { + public: + SearchFcOpLite() {} + explicit SearchFcOpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "search_fc"; } + + private: + mutable SearchFcParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_grnn_op.cc b/lite/operators/search_grnn_op.cc new file mode 100644 index 0000000000..b56ae820bf --- /dev/null +++ b/lite/operators/search_grnn_op.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/search_grnn_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchGrnnOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.wi); + CHECK_OR_FALSE(param_.wh); + CHECK_OR_FALSE(param_.out); + CHECK_OR_FALSE(param_.tmp_buffer); + CHECK_OR_FALSE(param_.idx_sorted_by_width); + CHECK_OR_FALSE(param_.layout_input); + + int _cap_h = param_.num_hidden; + int _cap_e = param_.num_input; + + const auto& x_dims = param_.x->dims(); + CHECK_OR_FALSE(x_dims.size() == 2); + CHECK_OR_FALSE(x_dims[1] == _cap_e); + + const auto& wi_dims = param_.wi->dims(); + CHECK_OR_FALSE(wi_dims.size() == 3); + CHECK_OR_FALSE(wi_dims[0] == 3); + CHECK_OR_FALSE(wi_dims[1] == _cap_h); + CHECK_OR_FALSE(wi_dims[2] == _cap_e); + + const auto& wh_dims = param_.wh->dims(); + CHECK_OR_FALSE(wh_dims.size() == 3); + CHECK_OR_FALSE(wh_dims[0] == 3); + CHECK_OR_FALSE(wh_dims[1] == _cap_h); + CHECK_OR_FALSE(wh_dims[2] == _cap_h); + + return true; +} + +bool SearchGrnnOpLite::InferShape() const { + const auto& x_dims = param_.x->dims(); + const auto& x_lod = param_.x->lod(); + CHECK_OR_FALSE(!x_lod.empty()); + CHECK_OR_FALSE(x_dims[0] == x_lod[0].back()); + param_.out->set_lod(x_lod); + + return true; +} + +bool SearchGrnnOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + auto x = op_desc.Input("X").front(); + auto wi = op_desc.Input("Wi").front(); + auto wh = op_desc.Input("Wh").front(); + param_.x = scope->FindVar(x)->GetMutable(); + param_.wi = scope->FindVar(wi)->GetMutable(); + param_.wh = scope->FindVar(wh)->GetMutable(); + + param_.num_input = op_desc.GetAttr("num_input"); + param_.num_hidden = op_desc.GetAttr("num_hidden"); + + auto out = op_desc.Output("Out").front(); + auto tmp_buffer = op_desc.Output("tmp_buffer").front(); + auto idx_sorted_by_width = op_desc.Output("idx_sorted_by_width").front(); + auto layout_input = op_desc.Output("layout_input").front(); + param_.out = scope->FindVar(out)->GetMutable(); + param_.tmp_buffer = scope->FindVar(tmp_buffer)->GetMutable(); + param_.idx_sorted_by_width = + scope->FindVar(idx_sorted_by_width)->GetMutable(); + param_.layout_input = + scope->FindVar(layout_input)->GetMutable(); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_grnn, paddle::lite::operators::SearchGrnnOpLite); diff --git a/lite/operators/search_grnn_op.h b/lite/operators/search_grnn_op.h new file mode 100644 index 0000000000..670af8a6c9 --- /dev/null +++ b/lite/operators/search_grnn_op.h @@ -0,0 +1,48 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchGrnnOpLite : public OpLite { + public: + SearchGrnnOpLite() {} + + explicit SearchGrnnOpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "search_grnn"; } + + private: + mutable SearchGrnnParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_group_padding_op.cc b/lite/operators/search_group_padding_op.cc new file mode 100644 index 0000000000..5ba4dde275 --- /dev/null +++ b/lite/operators/search_group_padding_op.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/search_group_padding_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchGroupPaddingOp::CheckShape() const { + CHECK_EQ(param_.x->dims().size(), 2) << "The rank of X(Input) should be 2."; + CHECK_EQ(param_.x->lod().empty(), false) + << "Input Tensor of X does not contain LoD information."; + CHECK_GE(param_.x->lod()[0].size(), 2) + << "The Input(X)'s lod info is corrupted."; + CHECK_EQ(param_.x->dims()[0], static_cast(param_.x->lod()[0].back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + + return true; +} + +bool SearchGroupPaddingOp::InferShape() const { + std::vector x_dims = param_.x->dims().Vectorize(); + + param_.out_emb_padding->Resize({-1, x_dims[1]}); + param_.out_new->Resize({x_dims[0], 1}); + param_.out_padding->Resize({-1, 1}); + return true; +} + +bool SearchGroupPaddingOp::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + auto x = op_desc.Input("X").front(); + auto out_emb_padding = op_desc.Output("Out_emb_padding").front(); + auto out_new = op_desc.Output("Out_new").front(); + auto out_padding = op_desc.Output("Out_padding").front(); + + param_.x = scope->FindVar(x)->GetMutable(); + param_.out_emb_padding = + scope->FindVar(out_emb_padding)->GetMutable(); + param_.out_new = scope->FindVar(out_new)->GetMutable(); + param_.out_padding = scope->FindVar(out_padding)->GetMutable(); + param_.pad_id = op_desc.GetAttr("pad_id"); + + CHECK(param_.out_emb_padding) + << "Output(Out_emb_padding) of SearchGroupPadding Op should not be null."; + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_group_padding, + paddle::lite::operators::SearchGroupPaddingOp); diff --git a/lite/operators/search_group_padding_op.h b/lite/operators/search_group_padding_op.h new file mode 100644 index 0000000000..a8e96c9697 --- /dev/null +++ b/lite/operators/search_group_padding_op.h @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchGroupPaddingOp : public OpLite { + public: + SearchGroupPaddingOp() {} + explicit SearchGroupPaddingOp(const std::string &op_type) : OpLite(op_type) {} + bool CheckShape() const override; + bool InferShape() const override; + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "search_group_padding"; } + + private: + mutable SearchGroupPaddingParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_seq_depadding_op.cc b/lite/operators/search_seq_depadding_op.cc new file mode 100644 index 0000000000..12d5123e05 --- /dev/null +++ b/lite/operators/search_seq_depadding_op.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/search_seq_depadding_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchSeqDepaddingOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.pad); + CHECK_OR_FALSE(param_.src); + CHECK_OR_FALSE(param_.out); + + DDim pad_dims = param_.pad->dims(); + DDim src_dims = param_.src->dims(); + CHECK_OR_FALSE(pad_dims.size() == 2); + CHECK_OR_FALSE(src_dims.size() == 2); + + const auto& pad_lod = param_.pad->lod(); + CHECK_OR_FALSE(!pad_lod.empty()); + const auto& pad_lod_0 = pad_lod[0]; + CHECK_OR_FALSE(pad_lod_0.size() >= 2); + CHECK_OR_FALSE(pad_dims[0] == pad_lod_0.back()); + + const auto& src_lod = param_.src->lod(); + CHECK_OR_FALSE(!src_lod.empty()); + const auto& src_lod_0 = src_lod[0]; + CHECK_OR_FALSE(src_lod_0.size() >= 2); + CHECK_OR_FALSE(src_dims[0] == src_lod_0.back()); + return true; +} + +bool SearchSeqDepaddingOpLite::InferShape() const { + DDim pad_dims = param_.pad->dims(); + param_.out->Resize({-1, pad_dims[1]}); + return true; +} + +bool SearchSeqDepaddingOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + auto pad = op_desc.Input("Pad").front(); + auto src = op_desc.Input("Src").front(); + auto out = op_desc.Output("Out").front(); + + param_.pad = scope->FindVar(pad)->GetMutable(); + param_.src = scope->FindVar(src)->GetMutable(); + param_.out = scope->FindVar(out)->GetMutable(); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_seq_depadding, + paddle::lite::operators::SearchSeqDepaddingOpLite); diff --git a/lite/operators/search_seq_depadding_op.h b/lite/operators/search_seq_depadding_op.h new file mode 100644 index 0000000000..445d9e0f3b --- /dev/null +++ b/lite/operators/search_seq_depadding_op.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchSeqDepaddingOpLite : public OpLite { + public: + SearchSeqDepaddingOpLite() {} + + explicit SearchSeqDepaddingOpLite(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "search_seq_depadding"; } + + private: + mutable SearchSeqDepaddingParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_seq_fc_op.cc b/lite/operators/search_seq_fc_op.cc new file mode 100644 index 0000000000..c5cca5331a --- /dev/null +++ b/lite/operators/search_seq_fc_op.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/search_seq_fc_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchSeqFcOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.w); + CHECK_OR_FALSE(param_.out); + return true; +} + +bool SearchSeqFcOpLite::InferShape() const { + const auto x_dims = param_.x->dims(); + const auto w_dims = param_.w->dims(); + const auto& x_lod = param_.x->lod(); + auto out_size = param_.out_size; + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK(!x_lod.empty()) << "The Input(X) must hold lod info."; + const auto& x_lod_0 = x_lod[0]; + CHECK_GE(x_lod_0.size(), 2) << "The Input(X)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod_0.back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size) << "Wrong shape: w_dims[0] != out_size"; + + if (param_.b != nullptr) { + const auto b_dims = param_.b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + } + + param_.out->set_lod(x_lod); + param_.out->Resize({x_dims[0], w_dims[0]}); + return true; +} + +bool SearchSeqFcOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + CHECK(!op_desc.Input("X").empty()); + CHECK(!op_desc.Input("W").empty()); + CHECK(!op_desc.Output("Out").empty()); + auto x = op_desc.Input("X").front(); + auto w = op_desc.Input("W").front(); + auto out = op_desc.Output("Out").front(); + param_.x = scope->FindVar(x)->GetMutable(); + param_.w = scope->FindVar(w)->GetMutable(); + param_.out = scope->FindVar(out)->GetMutable(); + param_.out_size = op_desc.GetAttr("out_size"); + bool has_bias = op_desc.GetAttr("has_bias"); + if (has_bias) { + CHECK(!op_desc.Input("b").empty()); + auto b = op_desc.Input("b").front(); + param_.b = scope->FindVar(b)->GetMutable(); + } + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_seq_fc, paddle::lite::operators::SearchSeqFcOpLite); diff --git a/lite/operators/search_seq_fc_op.h b/lite/operators/search_seq_fc_op.h new file mode 100644 index 0000000000..3c4f7d82bf --- /dev/null +++ b/lite/operators/search_seq_fc_op.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchSeqFcOpLite : public OpLite { + public: + SearchSeqFcOpLite() {} + + explicit SearchSeqFcOpLite(const std::string &type) : OpLite(type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; + std::string DebugString() const override { return "search_seq_fc"; } + + private: + mutable SearchSeqFcParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_seq_softmax_op.cc b/lite/operators/search_seq_softmax_op.cc new file mode 100644 index 0000000000..973ffa04c4 --- /dev/null +++ b/lite/operators/search_seq_softmax_op.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/search_seq_softmax_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchSeqSoftmaxOp::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + return true; +} + +bool SearchSeqSoftmaxOp::InferShape() const { + param_.output->Resize(param_.x->dims()); + param_.output->set_lod(param_.x->lod()); + return true; +} + +bool SearchSeqSoftmaxOp::AttachImpl(const cpp::OpDesc &opdesc, + lite::Scope *scope) { + param_.x = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + param_.output = + scope->FindVar(opdesc.Output("Out").front())->GetMutable(); + param_.axis = 1; + + CHECK(param_.x); + CHECK(param_.output); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_seq_softmax, + paddle::lite::operators::SearchSeqSoftmaxOp); diff --git a/lite/operators/search_seq_softmax_op.h b/lite/operators/search_seq_softmax_op.h new file mode 100644 index 0000000000..f97e8ddd3a --- /dev/null +++ b/lite/operators/search_seq_softmax_op.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
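
Note on search_seq_softmax above: it reuses SoftmaxParam and hard-codes axis = 1, i.e. the normalisation runs over the columns of the 2-D sequence tensor while dims and LoD are passed through unchanged. For reference only, a generic row-wise softmax over a row-major [rows, cols] buffer (a sketch of what axis = 1 means, not the Lite kernel):

#include <algorithm>
#include <cmath>
#include <vector>

// Reference sketch: softmax along axis 1 of a row-major [rows, cols] buffer.
void RowSoftmax(std::vector<float>* data, int rows, int cols) {
  for (int r = 0; r < rows; ++r) {
    float* row = data->data() + r * cols;
    float max_v = *std::max_element(row, row + cols);  // for numerical stability
    float sum = 0.f;
    for (int c = 0; c < cols; ++c) {
      row[c] = std::exp(row[c] - max_v);
      sum += row[c];
    }
    for (int c = 0; c < cols; ++c) row[c] /= sum;
  }
}
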
+ +#pragma once + +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchSeqSoftmaxOp : public OpLite { + public: + SearchSeqSoftmaxOp() {} + explicit SearchSeqSoftmaxOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "search_seq_softmax_op"; } + + private: + mutable SoftmaxParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_arithmetic_op.cc b/lite/operators/sequence_arithmetic_op.cc new file mode 100644 index 0000000000..29c39ebc23 --- /dev/null +++ b/lite/operators/sequence_arithmetic_op.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/sequence_arithmetic_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceArithmeticOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + CHECK_EQ(param_.X->dims().size(), 2) << "Input X should a 2-D Tensor"; + CHECK_EQ(param_.Y->dims().size(), 2) << "Input Y should a 2-D Tensor"; + CHECK_OR_FALSE(param_.Out); + return true; +} + +bool SequenceArithmeticOp::InferShape() const { + param_.Out->Resize(param_.X->dims()); + param_.Out->set_lod(param_.X->lod()); + return true; +} + +bool SequenceArithmeticOp::AttachImpl(const cpp::OpDesc &opdesc, + lite::Scope *scope) { + param_.X = scope->FindTensor(opdesc.Input("X").front()); + param_.Y = scope->FindTensor(opdesc.Input("Y").front()); + param_.Out = scope->FindMutableTensor(opdesc.Output("Out").front()); + + param_.op_type = opdesc.GetAttr("op_type"); + + CHECK(param_.X); + CHECK(param_.Y); + CHECK(param_.Out); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_arithmetic, + paddle::lite::operators::SequenceArithmeticOp); +REGISTER_LITE_OP(search_seq_arithmetic, + paddle::lite::operators::SequenceArithmeticOp); diff --git a/lite/operators/sequence_arithmetic_op.h b/lite/operators/sequence_arithmetic_op.h new file mode 100644 index 0000000000..9f844dfbf4 --- /dev/null +++ b/lite/operators/sequence_arithmetic_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceArithmeticOp : public OpLite { + public: + SequenceArithmeticOp() {} + explicit SequenceArithmeticOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "sequence_arithmetic"; } + + private: + mutable SequenceArithmeticParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_concat_op.cc b/lite/operators/sequence_concat_op.cc new file mode 100644 index 0000000000..2a54df890c --- /dev/null +++ b/lite/operators/sequence_concat_op.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/sequence_concat_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceConcatOp::CheckShape() const { + CHECK_GT(param_.X.size(), 1) + << "The number of input sequences is at least two."; + CHECK_OR_FALSE(param_.Out); + size_t lod_size = 0; + for (const auto &t : param_.X) { + CHECK_EQ(t->lod().empty(), false) + << "Input Tensor of X does not contain LoD information."; + // CHECK_EQ(t->lod().size(), 1) << "Only support one level sequence now."; + if (lod_size == 0) { + lod_size = t->lod()[0].size(); + } else { + CHECK_EQ(t->lod()[0].size(), lod_size) + << "The number of sequence must be same between each input"; + } + } + CHECK_NE(lod_size, 0) << "Each input must have sequence information"; + return true; +} + +bool SequenceConcatOp::InferShape() const { + int64_t batch_size = 0; + int64_t feature_size = 0; + std::vector out_dims; + for (const auto &tensor : param_.X) { + const auto x_dims = tensor->dims(); + if (out_dims.empty()) { + out_dims = x_dims.Vectorize(); + } + batch_size += x_dims[0]; + if (feature_size == 0) { + feature_size = x_dims.production() / x_dims[0]; + } else { + CHECK_EQ(feature_size, x_dims.production() / x_dims[0]) + << "Inputs of sequence concat must have same feature size"; + } + } + if (batch_size < 0) { + batch_size = -1; // Normalize batch size for compile time. + } + out_dims[0] = batch_size; + param_.Out->Resize(out_dims); + // LoD info will be computed in Kernel. 
+ return true; +} + +bool SequenceConcatOp::AttachImpl(const cpp::OpDesc &opdesc, + lite::Scope *scope) { + auto input_list = opdesc.Input("X"); + param_.X.clear(); + for (auto var : input_list) { + param_.X.push_back(scope->FindVar(var)->GetMutable()); + } + param_.Out = + scope->FindVar(opdesc.Output("Out").front())->GetMutable(); + CHECK(param_.Out) << "Output(Out) of Sequence Concat Op should not be null."; + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_concat, paddle::lite::operators::SequenceConcatOp); diff --git a/lite/operators/sequence_concat_op.h b/lite/operators/sequence_concat_op.h new file mode 100644 index 0000000000..8cdc07ebca --- /dev/null +++ b/lite/operators/sequence_concat_op.h @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceConcatOp : public OpLite { + public: + SequenceConcatOp() {} + explicit SequenceConcatOp(const std::string &op_type) : OpLite(op_type) {} + bool CheckShape() const override; + bool InferShape() const override; + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "sequence_concat"; } + + private: + mutable SequenceConcatParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_reverse_op.cc b/lite/operators/sequence_reverse_op.cc new file mode 100644 index 0000000000..dd8fa2e8fd --- /dev/null +++ b/lite/operators/sequence_reverse_op.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
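// The SequenceConcatOp above defers LoD computation to the kernel. As a
// hedged sketch of the intended semantics (assuming the i-th output sequence
// is the i-th sequence of every input joined together), the output offsets
// would accumulate per-sequence lengths like this:

#include <cstdint>
#include <vector>

std::vector<uint64_t> ConcatLod(
    const std::vector<std::vector<uint64_t>>& in_lods) {
  size_t seq_num = in_lods.front().size() - 1;  // every input has this many
  std::vector<uint64_t> out(seq_num + 1, 0);
  for (size_t i = 0; i < seq_num; ++i) {
    uint64_t len = 0;
    for (const auto& lod : in_lods) len += lod[i + 1] - lod[i];
    out[i + 1] = out[i] + len;
  }
  return out;  // e.g. {0, 2, 4} and {0, 1, 3} -> {0, 3, 7}
}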
+ +#include "lite/operators/sequence_reverse_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceReverseOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Out); + CHECK_EQ(param_.X->lod().empty(), false) + << "Input(X) Tensor of SequenceReverseOp does not contain " + "LoD information."; + CHECK_GE(param_.X->dims().size(), 2) + << "Rank of Input(X) must be not less than 2."; + return true; +} + +bool SequenceReverseOp::InferShape() const { + const auto *input = param_.X; + auto out_dims = input->dims(); + param_.Out->Resize(out_dims); + return true; +} + +bool SequenceReverseOp::AttachImpl(const cpp::OpDesc &opdesc, + lite::Scope *scope) { + param_.X = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + param_.Out = + scope->FindVar(opdesc.Output("Y").front())->GetMutable(); + CHECK(param_.X); + CHECK(param_.Out); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_reverse, paddle::lite::operators::SequenceReverseOp); diff --git a/lite/operators/sequence_reverse_op.h b/lite/operators/sequence_reverse_op.h new file mode 100644 index 0000000000..326d0f6892 --- /dev/null +++ b/lite/operators/sequence_reverse_op.h @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceReverseOp : public OpLite { + public: + SequenceReverseOp() {} + explicit SequenceReverseOp(const std::string &op_type) : OpLite(op_type) {} + bool CheckShape() const override; + bool InferShape() const override; + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "sequence_reverse"; } + + private: + mutable SequenceReverseParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_topk_avg_pooling_op.cc b/lite/operators/sequence_topk_avg_pooling_op.cc new file mode 100644 index 0000000000..6f5cbeeeee --- /dev/null +++ b/lite/operators/sequence_topk_avg_pooling_op.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/sequence_topk_avg_pooling_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceTopkAvgPoolingOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.ROW); + CHECK_OR_FALSE(param_.COLUMN); + CHECK_OR_FALSE(param_.Out); + CHECK_OR_FALSE(param_.pos); + return true; +} + +bool SequenceTopkAvgPoolingOpLite::InferShape() const { + int channel_num = param_.channel_num; + std::vector topks = param_.topks; + auto row_dim = param_.ROW->dims(); + auto num_k = topks.size(); + auto row_shape_0 = row_dim[0]; + std::vector vec_out_shape; + vec_out_shape.push_back(row_shape_0); + vec_out_shape.push_back(channel_num * num_k); + + param_.Out->Resize(lite::DDim(vec_out_shape)); + param_.Out->set_lod(param_.ROW->lod()); + return true; +} + +bool SequenceTopkAvgPoolingOpLite::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + auto X = op_desc.Input("X").front(); + auto ROW = op_desc.Input("ROW").front(); + auto COLUMN = op_desc.Input("COLUMN").front(); + auto Out = op_desc.Output("Out").front(); + auto pos = op_desc.Output("pos").front(); + + param_.X = scope->FindVar(X)->GetMutable(); + param_.ROW = scope->FindVar(ROW)->GetMutable(); + param_.COLUMN = scope->FindVar(COLUMN)->GetMutable(); + param_.Out = scope->FindVar(Out)->GetMutable(); + param_.pos = scope->FindVar(pos)->GetMutable(); + param_.channel_num = op_desc.GetAttr("channel_num"); + param_.topks = op_desc.GetAttr>("topks"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_topk_avg_pooling, + paddle::lite::operators::SequenceTopkAvgPoolingOpLite); diff --git a/lite/operators/sequence_topk_avg_pooling_op.h b/lite/operators/sequence_topk_avg_pooling_op.h new file mode 100644 index 0000000000..1c1cfe3a9c --- /dev/null +++ b/lite/operators/sequence_topk_avg_pooling_op.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
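// In isolation, the InferShape above sizes Out from ROW and the attributes:
// {ROW.dims()[0], channel_num * topks.size()}, with ROW's LoD copied over.

#include <cstdint>
#include <vector>

std::vector<int64_t> TopkAvgPoolOutShape(int64_t row_dim0,
                                         int channel_num,
                                         size_t num_topks) {
  return {row_dim0,
          static_cast<int64_t>(channel_num) * static_cast<int64_t>(num_topks)};
}
// e.g. TopkAvgPoolOutShape(6, 4, /*topks = {1, 3, 5}*/ 3) -> {6, 12}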
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceTopkAvgPoolingOpLite : public OpLite { + public: + SequenceTopkAvgPoolingOpLite() {} + explicit SequenceTopkAvgPoolingOpLite(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { + return "sequence_topk_avg_pooling"; + } + + private: + mutable SequenceTopkAvgPoolingParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/split_op.cc b/lite/operators/split_op.cc index 18280616aa..ec98a0d6c3 100644 --- a/lite/operators/split_op.cc +++ b/lite/operators/split_op.cc @@ -39,8 +39,16 @@ bool SplitOp::InferShape() const { const int outs_number = outs.size(); std::vector outs_dims; outs_dims.reserve(outs_number); - - if (num > 0) { + std::vector sections_tensor_list_ = + param_.sections_tensor_list; + if (sections.size() > 0 && sections_tensor_list_.size() > 0) { + std::vector vec_sections; + for (size_t i = 0; i < sections_tensor_list_.size(); ++i) { + auto dim = in_dims; + dim[axis] = sections_tensor_list_[i]->data()[0]; + outs_dims.push_back(dim); + } + } else if (num > 0) { int out_axis_dim = in_dims[axis] / num; for (int i = 0; i < outs_number; ++i) { auto dim = in_dims; @@ -55,6 +63,10 @@ bool SplitOp::InferShape() const { } } + if (param_.axis_tensor != nullptr) { + axis = param_.axis_tensor->data()[0]; + } + for (int j = 0; j < outs_dims.size(); ++j) { outs[j]->Resize(outs_dims[j]); } @@ -73,6 +85,21 @@ bool SplitOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { for (auto var : outs) { param_.output.push_back(scope->FindVar(var)->GetMutable()); } + std::vector input_arg_names = opdesc.InputArgumentNames(); + if (std::find(input_arg_names.begin(), input_arg_names.end(), "AxisTensor") != + input_arg_names.end()) { + auto args = opdesc.Input("AxisTensor"); + auto *var = scope->FindVar(args.front()); + param_.axis_tensor = var->GetMutable(); + } + if (std::find(input_arg_names.begin(), + input_arg_names.end(), + "SectionsTensorList") != input_arg_names.end()) { + auto args = opdesc.Input("SectionsTensorList"); + auto *var = scope->FindVar(args.front()); + param_.sections_tensor_list = + *(var->GetMutable>()); + } return true; } diff --git a/lite/operators/unsqueeze_op.cc b/lite/operators/unsqueeze_op.cc index 8db14d0660..39b275b7b5 100644 --- a/lite/operators/unsqueeze_op.cc +++ b/lite/operators/unsqueeze_op.cc @@ -66,10 +66,7 @@ bool UnsqueezeOp::InferShape() const { std::vector final_axes; auto axes = param_.axes; auto *axes_tensor = param_.axes_tensor; - std::vector axes_tensor_vct; - if (param_.axes_tensor_vct) { - axes_tensor_vct = *(param_.axes_tensor_vct); - } + auto axes_tensor_vct = param_.axes_tensor_vct; if (!axes.empty()) { final_axes = axes; @@ -79,7 +76,7 @@ bool UnsqueezeOp::InferShape() const { axes_tensor_data + axes_tensor->numel()); } else if (!axes_tensor_vct.empty()) { for (int i = 0; i < axes_tensor_vct.size(); i++) { - final_axes.push_back(axes_tensor_vct[i].data()[0]); + final_axes.push_back(axes_tensor_vct[i]->data()[0]); } } else { LOG(FATAL) << "Input axis error"; @@ -114,16 +111,12 @@ bool UnsqueezeOp::AttachImpl(const 
cpp::OpDesc &opdesc, lite::Scope *scope) { if (opdesc.HasInput("AxesTensorList") && opdesc.Input("AxesTensorList").size() > 0) { auto args = opdesc.Input("AxesTensorList"); - /* for (auto arg : args) { auto *var = scope->FindVar(arg); if (var != nullptr) { param_.axes_tensor_vct.push_back(var->GetMutable()); } } - */ - auto *var = scope->FindVar(args.front()); - param_.axes_tensor_vct = var->GetMutable>(); } CHECK(param_.X) << "Input(X) of UnsqueezeOp should not be null."; CHECK(param_.Out) << "Output(Out) of UnsqueezeOp should not be null."; diff --git a/lite/operators/var_conv_2d_op.cc b/lite/operators/var_conv_2d_op.cc new file mode 100644 index 0000000000..5c7fe374fc --- /dev/null +++ b/lite/operators/var_conv_2d_op.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/var_conv_2d_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool VarConv2dOp::CheckShape() const { + auto x_dims = param_.X->dims(); + CHECK_EQ(x_dims.size(), 2) << "The rank of X(Input) can't be less than 2."; + auto w_dims = param_.W->dims(); + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor"; + CHECK_EQ(w_dims[0], param_.output_channel) + << "W dim[0] should be equal to OutputChannel"; + CHECK_EQ(w_dims[1], param_.input_channel * param_.kernel_h * param_.kernel_w) + << "W dim[1] should be equal to InputChannel * KernelH * KernelW"; + LoD x_lod = param_.X->lod(); + CHECK_EQ(x_lod.empty(), false) << "The Input(X) must hold lod info."; + // CHECK_GE(x_lod.size(), 1) << "The Input(X)'s lod info is corrupted."; + CHECK_GE(x_lod.size(), 3) << "The Input(X)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod[0].back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + // LoD row_lod = param_.ROW->lod(); + // CHECK_EQ(row_lod.empty(), false) << "The Input(ROW) must hold lod info."; + // LoD col_lod = param_.COLUMN->lod(); + // CHECK_EQ(col_lod.empty(), false) << "The Input(COLUMN) must hold lod + // info."; + return true; +} + +bool VarConv2dOp::InferShape() const { return true; } + +bool VarConv2dOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { + param_.X = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + // param_.ROW = const_cast( + // &scope->FindVar(opdesc.Input("ROW").front())->Get()); + // param_.COLUMN = const_cast( + // &scope->FindVar(opdesc.Input("COLUMN").front())->Get()); + param_.W = const_cast( + &scope->FindVar(opdesc.Input("W").front())->Get()); + param_.Out = + scope->FindVar(opdesc.Output("Out").front())->GetMutable(); + param_.Col = + scope->FindVar(opdesc.Output("Col").front())->GetMutable(); + CHECK(param_.X) << "X(Input) of VarConv2dOP should not be null."; + // CHECK(param_.ROW) << "Input(ROW) of VarConv2dOP should not be null."; + // CHECK(param_.COLUMN) << "Input(COLUMN) of VarConv2dOP should not be null."; + CHECK(param_.W) << "W(Input) of VarConv2dOP 
should not be null."; + CHECK(param_.Out) << "Out(Output) of VarConv2dOP should not be null."; + CHECK(param_.Col) << "Col(Output) of VarConv2dOP should not be null."; + param_.output_channel = opdesc.GetAttr("OutputChannel"); + param_.input_channel = opdesc.GetAttr("InputChannel"); + param_.kernel_h = opdesc.GetAttr("KernelH"); + param_.kernel_w = opdesc.GetAttr("KernelW"); + param_.stride_h = opdesc.GetAttr("StrideH"); + param_.stride_w = opdesc.GetAttr("StrideW"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(var_conv_2d, paddle::lite::operators::VarConv2dOp); diff --git a/lite/operators/var_conv_2d_op.h b/lite/operators/var_conv_2d_op.h new file mode 100644 index 0000000000..ce6309419c --- /dev/null +++ b/lite/operators/var_conv_2d_op.h @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class VarConv2dOp : public OpLite { + public: + VarConv2dOp() {} + explicit VarConv2dOp(const std::string &op_type) : OpLite(op_type) {} + bool CheckShape() const override; + bool InferShape() const override; + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "var_conv_2d"; } + + private: + mutable VarConv2DParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/tests/cv/image_convert_test.cc b/lite/tests/cv/image_convert_test.cc index 7c0f867fae..eefd30f74f 100644 --- a/lite/tests/cv/image_convert_test.cc +++ b/lite/tests/cv/image_convert_test.cc @@ -17,8 +17,8 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/tests/cv/cv_basic.h" -#include "lite/tests/utils/timer.h" #include "lite/utils/cv/paddle_image_preprocess.h" DEFINE_int32(cluster, 3, "cluster id"); @@ -46,7 +46,7 @@ typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess; typedef paddle::lite_api::Tensor Tensor_api; typedef paddle::lite::Tensor Tensor; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; void fill_tensor_host_rand(uint8_t* dio, int64_t size) { uint seed = 256; @@ -285,8 +285,8 @@ void test_img(const std::vector& cluster_id, ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); + t1.Reset(); + t1.Start(); LOG(INFO) << "image convert saber compute"; // 方法一: image_preprocess.imageCovert(src, lite_dst); @@ -329,8 +329,8 @@ void test_img(const std::vector& cluster_id, means, scales); - t1.end(); - double tdiff = t1.get_average_ms(); + t1.Stop(); + double tdiff = t1.LapTimes().Avg(); to += tdiff; if (tdiff < min_time) { min_time = tdiff; diff --git 
a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index 02d40ce6cc..549fabab5a 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -39,6 +39,8 @@ if(LITE_BUILD_EXTRA) lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) diff --git a/lite/tests/kernels/bilinear_interp_compute_test.cc b/lite/tests/kernels/bilinear_interp_compute_test.cc index 0779caf67a..7ea4293f08 100644 --- a/lite/tests/kernels/bilinear_interp_compute_test.cc +++ b/lite/tests/kernels/bilinear_interp_compute_test.cc @@ -22,6 +22,27 @@ namespace paddle { namespace lite { +inline std::vector get_new_shape( + std::vector list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + vec_new_shape.push_back(static_cast(*(tensor->data()))); + } + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + vec_new_data = + std::vector(new_data, new_data + new_data_tensor->dims().production()); + return vec_new_data; +} + template void resize_bilinear_align(std::vector inputs, lite::Tensor* output) { @@ -149,6 +170,9 @@ class BilinearInterpComputeTester : public arena::TestCase { protected: // common attributes for this op. 
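// A hedged sketch of how the interp testers resolve the target size from the
// new optional inputs (the helpers above read one value per SizeTensor
// element and a scalar from Scale); plain ints stand in for the tensors, and
// the name InterpOutSize is illustrative only:

#include <utility>
#include <vector>

std::pair<int, int> InterpOutSize(
    int in_h, int in_w,
    const std::vector<int>& size_tensor,  // SizeTensor: {out_h, out_w}
    float scale,                          // Scale, <= 0 when absent
    const std::vector<int>& out_size) {   // OutSize: {out_h, out_w}
  if (!size_tensor.empty()) return {size_tensor[0], size_tensor[1]};
  int h = in_h, w = in_w;
  if (scale > 0) {
    h = static_cast<int>(in_h * scale);
    w = static_cast<int>(in_w * scale);
  }
  if (!out_size.empty()) {
    h = out_size[0];
    w = out_size[1];
  }
  return {h, w};
}
// e.g. InterpOutSize(16, 16, {}, 2.f, {}) -> {32, 32}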
std::string input0_ = "X"; + std::string sizetensor0_ = "SizeTensor0"; + std::string sizetensor1_ = "SizeTensor1"; + std::string input_scale_ = "Scale"; std::string input1_ = "OutSize"; std::string output_ = "Out"; @@ -162,6 +186,8 @@ class BilinearInterpComputeTester : public arena::TestCase { std::string interp_method_ = "Bilinear"; DDim _dims0_{{1, 1, 16, 16}}; DDim _dims1_{{2}}; + DDim sizetensor_dims_{{1}}; + DDim scale_dims_{{1}}; public: BilinearInterpComputeTester(const Place& place, @@ -190,33 +216,48 @@ class BilinearInterpComputeTester : public arena::TestCase { if (outsize_height_ > 0 && outsize_width_ > 0) { inputs.emplace_back(scope->FindTensor(input1_)); } + std::vector SizeTensor; + if (outsize_height_ > 0 && outsize_width_ > 0) { + SizeTensor.emplace_back(scope->FindTensor(sizetensor0_)); + SizeTensor.emplace_back(scope->FindTensor(sizetensor1_)); + } + const lite::Tensor* input_scale = scope->FindTensor(input_scale_); + float scale = height_scale_; + int in_h = inputs[0]->dims()[2]; + int in_w = inputs[0]->dims()[3]; + if (SizeTensor.size() > 0) { + auto new_size = get_new_shape(SizeTensor); + out_height_ = new_size[0]; + out_width_ = new_size[1]; + } else { + auto scale_tensor = input_scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) { + out_height_ = static_cast(in_h * scale); + out_width_ = static_cast(in_w * scale); + } + if (inputs.size() > 1) { + auto out_size = inputs[1]; + auto out_size_data = get_new_data_from_tensor(out_size); + out_height_ = out_size_data[0]; + out_width_ = out_size_data[1]; + } + } + height_scale_ = scale; + width_scale_ = scale; + if (out_width_ != -1 && out_height_ != -1) { height_scale_ = static_cast(out_height_ / inputs[0]->dims()[2]); width_scale_ = static_cast(out_width_ / inputs[0]->dims()[3]); } auto* outputs = scope->NewTensor(output_); CHECK(outputs); - if (inputs.size() > 1) { - auto outsize_data = inputs[1]->data(); - int h_out = outsize_data[0]; // HW - int w_out = outsize_data[1]; // HW - int num_cout = inputs[0]->dims()[0]; - int c_cout = inputs[0]->dims()[1]; - outputs->Resize({num_cout, c_cout, h_out, w_out}); - } else { - int out_h; - int out_w; - if (-1 == out_height_ && -1 == out_width_) { - out_h = inputs[0]->dims()[2] * height_scale_; - out_w = inputs[0]->dims()[3] * width_scale_; - } else { - out_h = out_height_; - out_w = out_width_; - } - outputs->Resize( - {inputs[0]->dims()[0], inputs[0]->dims()[1], out_h, out_w}); - } - + int num_cout = inputs[0]->dims()[0]; + int c_cout = inputs[0]->dims()[1]; + outputs->Resize({num_cout, c_cout, out_height_, out_width_}); if (align_corners_) { resize_bilinear_align(inputs, outputs); } else { @@ -229,6 +270,10 @@ class BilinearInterpComputeTester : public arena::TestCase { op_desc->SetInput("X", {input0_}); if (outsize_height_ > 0 && outsize_width_ > 0) { op_desc->SetInput("OutSize", {input1_}); + op_desc->SetInput("SizeTensor", {sizetensor0_, sizetensor1_}); + } + if (height_scale_ > 0) { + op_desc->SetInput("Scale", {input_scale_}); } op_desc->SetOutput("Out", {output_}); op_desc->SetAttr("scale", height_scale_); @@ -250,6 +295,19 @@ class BilinearInterpComputeTester : public arena::TestCase { data1[0] = outsize_height_; data1[1] = outsize_width_; SetCommonTensor(input1_, _dims1_, data1.data()); + + std::vector sizetensor_data(1); + sizetensor_data[0] = outsize_height_; + SetCommonTensor(sizetensor0_, sizetensor_dims_, sizetensor_data.data()); + + sizetensor_data[0] = outsize_width_; + 
SetCommonTensor(sizetensor1_, sizetensor_dims_, sizetensor_data.data()); + } + + if (height_scale_ > 0) { + std::vector scale_data(1); + scale_data[0] = height_scale_; + SetCommonTensor(input_scale_, scale_dims_, scale_data.data()); } } }; diff --git a/lite/tests/kernels/conv2d_transpose_compute_test.cc b/lite/tests/kernels/conv2d_transpose_compute_test.cc index a287f0bb66..6c348076ba 100644 --- a/lite/tests/kernels/conv2d_transpose_compute_test.cc +++ b/lite/tests/kernels/conv2d_transpose_compute_test.cc @@ -31,8 +31,10 @@ void col2im(const Dtype* data_col, const int width, const int kernel_h, const int kernel_w, - const int pad_h, - const int pad_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, const int stride_h, const int stride_w, const int dilation_h, @@ -40,19 +42,22 @@ void col2im(const Dtype* data_col, Dtype* data_im) { memset(data_im, 0, height * width * channels * sizeof(float)); const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / stride_w + + 1; const int channel_size = height * width; for (int channel = channels; channel--; data_im += channel_size) { for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; + int input_row = -pad_h0 + kernel_row * dilation_h; for (int output_rows = output_h; output_rows; output_rows--) { if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { data_col += output_w; } else { - int input_col = -pad_w + kernel_col * dilation_w; + int input_col = -pad_w0 + kernel_col * dilation_w; for (int output_col = output_w; output_col; output_col--) { if (is_a_ge_zero_and_a_lt_b(input_col, width)) { data_im[input_row * width + input_col] += *data_col; @@ -104,6 +109,34 @@ void fill_bias_relu(float* tensor, } } +inline void UpdatePaddingAndDilation(std::vector* paddings, + std::vector* dilations, + const std::vector& strides, + const std::string padding_algorithm, + const DDim data_dims, + const std::vector& ksize) { + // when padding_desc is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (size_t i = 0; i < strides.size(); ++i) { + int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; + int pad_sum = std::max( + (out_size - 1) * strides[i] + ksize[i + 2] - data_dims[i + 2], + (int64_t)0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + // pad + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + // dilation + *(dilations->begin() + i) = 1; + } + } else if (padding_algorithm == "VALID") { + for (auto& it : *paddings) { + it = 0; + } + } +} + template static void basic_gemm(int m, int n, @@ -172,8 +205,10 @@ bool deconv_basic(const Dtype1* din, int stride_h, int dila_w, int dila_h, - int pad_w, - int pad_h, + int pad_w0, + int pad_w1, + int pad_h0, + int pad_h1, bool flag_bias, bool flag_relu) { int m = chout * kernel_w * kernel_h / group; @@ -193,8 +228,9 @@ bool deconv_basic(const Dtype1* din, int group_size_coldata = m * n; int group_size_weights = chin * chout * kernel_w * kernel_h / (group * group); bool flag_1x1s1p1 = (kernel_w == 1) && (kernel_h == 1) && (stride_h == 1) && - (stride_w == 1) && (pad_w == 1) && (pad_h == 1) && - (dila_w == 1) && (dila_h == 
1); + (stride_w == 1) && (pad_w0 == 0) && (pad_h0 == 0) && + (pad_w1 == 0) && (pad_h1 == 0) && (dila_w == 1) && + (dila_h == 1); for (int i = 0; i < num; ++i) { const Dtype1* din_batch = din + i * chin * hin * win; @@ -204,7 +240,7 @@ bool deconv_basic(const Dtype1* din, if (flag_1x1s1p1) { col_data = dout_batch; } - memset(col_data, 0, sizeof(Dtype2) * group_size_coldata); + memset(col_data, 0, sizeof(Dtype2) * group_size_coldata * group); for (int g = 0; g < group; ++g) { const Dtype1* din_group = din_batch + g * group_size_in; const Dtype1* weights_group = weights + g * group_size_weights; @@ -230,8 +266,10 @@ bool deconv_basic(const Dtype1* din, wout, kernel_h, kernel_w, - pad_h, - pad_w, + pad_h0, + pad_h1, + pad_w0, + pad_w1, stride_h, stride_w, dila_h, @@ -253,9 +291,10 @@ class Conv2DTransposeComputeTester : public arena::TestCase { std::string output_ = "out"; std::string filter_ = "filter"; std::string bias_ = "bias"; + std::string padding_algorithm_ = ""; std::vector strides_{1, 1}; - std::vector paddings_{0, 0}; + std::vector paddings_{0, 0, 0, 0}; int groups_{1}; std::vector dilations_{1, 1}; bool flag_relu_{false}; @@ -280,9 +319,13 @@ class Conv2DTransposeComputeTester : public arena::TestCase { bool flag_relu, int dilation, int stride, - int padding, + int pad_h0, + int pad_h1, + int pad_w0, + int pad_w1, int ks, - int groups) + int groups, + std::string padding_algorithm) : TestCase(place, alias) { n_ = n; ic_ = ic; @@ -291,20 +334,29 @@ class Conv2DTransposeComputeTester : public arena::TestCase { iw_ = iw; ks_ = ks; flag_bias_ = flag_bias; - + padding_algorithm_ = padding_algorithm; strides_ = std::vector({stride, stride}); - paddings_ = std::vector({padding, padding}); - groups_ = groups; + paddings_ = std::vector({pad_h0, pad_h1, pad_w0, pad_w1}); dilations_ = std::vector({dilation, dilation}); + groups_ = groups; flag_relu_ = flag_relu; } void RunBaseline(Scope* scope) override { auto* out = scope->NewTensor(output_); CHECK(out); - int oh = (ih_ - 1) * strides_[0] - 2 * paddings_[0] + + auto* x = scope->FindTensor(x_); + auto input_dim = x->dims(); + std::vector ksize({1, 1, ks_, ks_}); + UpdatePaddingAndDilation(&paddings_, + &dilations_, + strides_, + padding_algorithm_, + input_dim, + ksize); + int oh = (ih_ - 1) * strides_[0] - paddings_[0] - paddings_[1] + dilations_[0] * (ks_ - 1) + 1; - int ow = (iw_ - 1) * strides_[1] - 2 * paddings_[1] + + int ow = (iw_ - 1) * strides_[1] - paddings_[2] - paddings_[3] + dilations_[1] * (ks_ - 1) + 1; CHECK(oh > 0 || ow > 0); @@ -313,7 +365,6 @@ class Conv2DTransposeComputeTester : public arena::TestCase { out->Resize(output_dims); auto* output_data = out->mutable_data(); - auto* x = scope->FindTensor(x_); const auto* x_data = x->data(); auto* filter = scope->FindTensor(filter_); const auto* filter_data = filter->data(); @@ -341,8 +392,10 @@ class Conv2DTransposeComputeTester : public arena::TestCase { strides_[0], dilations_[1], dilations_[0], - paddings_[1], + paddings_[2], + paddings_[3], paddings_[0], + paddings_[1], flag_bias_, flag_relu_); } @@ -360,6 +413,7 @@ class Conv2DTransposeComputeTester : public arena::TestCase { op_desc->SetInput("Bias", {bias_}); } op_desc->SetAttr("fuse_relu", flag_relu_); + op_desc->SetAttr("padding_algorithm", padding_algorithm_); } void PrepareData() override { @@ -402,49 +456,66 @@ TEST(conv2d_transpose, precision) { LOG(INFO) << "test conv2d_transpose op"; #ifdef LITE_WITH_ARM Place place(TARGET(kARM)); - for (auto n : {1, 2}) { + for (auto n : {2}) { for (auto ic : {1, 4 /*, 128*/}) { 
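// A hedged, standalone sketch of the two formulas exercised above: the "SAME"
// rule from UpdatePaddingAndDilation, and the transposed-conv output extent
// with asymmetric padding used by RunBaseline.

#include <algorithm>
#include <cstdio>

// "SAME": pad so that out = ceil(in / stride); an odd total splits as
// (pad0, pad1) = (sum / 2, sum - sum / 2).
void SamePadding(int in, int stride, int ksize, int* pad0, int* pad1) {
  int out = (in + stride - 1) / stride;
  int pad_sum = std::max((out - 1) * stride + ksize - in, 0);
  *pad0 = pad_sum / 2;
  *pad1 = pad_sum - *pad0;
}

// oh = (ih - 1) * stride - pad0 - pad1 + dilation * (ks - 1) + 1
int DeconvOutSize(int in, int stride, int pad0, int pad1, int dilation, int ks) {
  return (in - 1) * stride - pad0 - pad1 + dilation * (ks - 1) + 1;
}

int main() {
  int p0 = 0, p1 = 0;
  SamePadding(/*in=*/8, /*stride=*/2, /*ksize=*/3, &p0, &p1);
  std::printf("SAME pads: %d, %d\n", p0, p1);                          // 0, 1
  std::printf("deconv out: %d\n", DeconvOutSize(8, 2, p0, p1, 1, 3));  // 16
  return 0;
}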
for (auto oc : {1, 4 /*, 128*/}) { LOG(INFO) << "n:" << n << ",ic:" << ic << ",oc:" << oc; - for (auto ih : {8, 16 /*, 56 , 112, 224, 512*/}) { + for (auto ih : {8, 8 /*, 56 , 112, 224, 512*/}) { for (auto iw : {8, 16 /*, 56, 112, 224, 512*/}) { for (auto flag_bias : {false, true}) { for (auto flag_relu : {false, true}) { for (auto dilation : {1, 2}) { for (auto stride : {1, 2}) { - for (auto padding : {0, 2}) { - for (auto ks : {2, 5}) { - for (auto group : {1, 2}) { - // obtain shape - // LOG(INFO) << "n:" << n << ",ic:" << ic << ",oc:" << - // oc - // << ",ih:" << ih << ",iw:" << iw - // << ",flag_bias:" << flag_bias - // << ",flag_relu:" << flag_relu - // << ",dila:" << dilation - // << ",stride:" << stride - // << ",padding:" << padding << ",ks:" << ks - // << ",group:" << group; - if (ic % group != 0 || oc % group != 0) { - group = 1; + for (auto pad_h0 : {0, 1}) { + for (auto pad_h1 : {0, 1}) { + for (auto pad_w0 : {0, 1}) { + for (auto pad_w1 : {0, 1}) { + for (auto ks : {1, 4}) { + for (auto group : {1, 2}) { + for (auto padding_algorithm : + {"", "SAME", "VALID"}) { + // obtain shape + // LOG(INFO) << "n:" << n << ",ic:" << ic << + // ",oc:" << + // oc + // << ",ih:" << ih << ",iw:" << iw + // << ",flag_bias:" << flag_bias + // << ",flag_relu:" << flag_relu + // << ",dila:" << dilation + // << ",stride:" << stride + // << ",padding:" << padding << + // ",ks:" << ks + // << ",group:" << group; + if (ic % group != 0 || oc % group != 0) { + group = 1; + } + std::unique_ptr tester( + new Conv2DTransposeComputeTester( + place, + "def", + n, + ic, + oc, + ih, + iw, + flag_bias, + flag_relu, + dilation, + stride, + pad_h0, + pad_h1, + pad_w0, + pad_w1, + ks, + group, + padding_algorithm)); + arena::Arena arena( + std::move(tester), place, 2e-5); + arena.TestPrecision(); + } + } + } } - std::unique_ptr tester( - new Conv2DTransposeComputeTester(place, - "def", - n, - ic, - oc, - ih, - iw, - flag_bias, - flag_relu, - dilation, - stride, - padding, - ks, - group)); - arena::Arena arena(std::move(tester), place, 2e-5); - arena.TestPrecision(); } } } diff --git a/lite/tests/kernels/fill_constant_compute_test.cc b/lite/tests/kernels/fill_constant_compute_test.cc new file mode 100644 index 0000000000..e211582b04 --- /dev/null +++ b/lite/tests/kernels/fill_constant_compute_test.cc @@ -0,0 +1,178 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" + +namespace paddle { +namespace lite { + +class FillConstantComputeTester : public arena::TestCase { + protected: + // common attributes for this op. 
+ std::string out_ = "out"; + int dtype_{static_cast(VarDescAPI::VarDataType::FP32)}; + std::vector shape_{}; + std::string shape_tensor_ = "ShapeTensor"; + std::vector shape_tensor_list_; + bool is_use_shape_tensor_{false}; + bool is_use_shape_tensor_list_{false}; + + float value_{0.0f}; + // useless for x86, keep it for compatibility + bool force_cpu_{false}; + // DDim shape_tensor_data{{5, 3}}; + std::vector shape_tensor_data; + DDim shape_test{{1, 2}}; + + public: + FillConstantComputeTester(const Place& place, + const std::string& alias, + std::vector shape, + const bool is_use_shape_tensor, + const bool is_use_shape_tensor_list, + float value, + bool force_cpu) + : TestCase(place, alias) { + shape_ = shape; + value_ = value; + force_cpu_ = force_cpu; + is_use_shape_tensor_ = is_use_shape_tensor; + is_use_shape_tensor_list_ = is_use_shape_tensor_list; + + for (int i = 0; i < shape_test.size(); i++) { + shape_tensor_data.push_back(i + 1); + } + } + + void RunBaseline(Scope* scope) override { + auto* out = scope->NewTensor(out_); + DDim output_dims{shape_}; + if (is_use_shape_tensor_) { + auto* temp_shape = scope->FindTensor(shape_tensor_); + auto* shape_data = temp_shape->data(); + auto vec_shape = + std::vector(shape_data, shape_data + temp_shape->numel()); + output_dims.ConstructFrom(vec_shape); + } + if (is_use_shape_tensor_list_) { + std::vector vec_shape; + for (int i = 0; i < shape_tensor_list_.size(); i++) { + auto* temp_shape = scope->FindTensor(shape_tensor_list_[i]); + vec_shape.push_back(*temp_shape->data()); + } + + output_dims.ConstructFrom(vec_shape); + } + out->Resize(output_dims); + + auto* output_data = out->mutable_data(); + for (int i = 0; i < out->numel(); i++) { + output_data[i] = value_; + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + LOG(INFO) << "PrepareOpDesc"; + + op_desc->SetType("fill_constant"); + op_desc->SetAttr("dtype", dtype_); + op_desc->SetAttr("shape", shape_); + op_desc->SetAttr("value", value_); + op_desc->SetAttr("force_cpu", force_cpu_); + if (is_use_shape_tensor_) { + op_desc->SetInput("ShapeTensor", {shape_tensor_}); + } + if (is_use_shape_tensor_list_) { + // std::vector shape_tensor_list_; + for (int i = 0; i < shape_test.size(); ++i) { + shape_tensor_list_.push_back("shape_tensor_list_" + std::to_string(i)); + } + op_desc->SetInput("ShapeTensorList", {shape_tensor_list_}); + } + op_desc->SetOutput("Out", {out_}); + } + + void PrepareData() override { + if (is_use_shape_tensor_) { + // std::vector temp = x_dims_.data(); + // int64_t* data = temp.data(); + SetCommonTensor(shape_tensor_, shape_test, shape_tensor_data.data()); + } + if (is_use_shape_tensor_list_) { + Scope& scope_ = this->scope(); + for (int i = 0; i < shape_test.size(); ++i) { + auto* tensor = + scope_.NewTensor("shape_tensor_list_" + std::to_string(i)); + tensor->Resize(DDim({1})); + auto* d = tensor->mutable_data(); + d[0] = shape_tensor_data[i]; + } + } + } +}; + +TEST(fill_constant, precision) { + LOG(INFO) << "test fill_constant op, kARM"; +#ifdef LITE_WITH_ARM + Place place(TARGET(kARM)); + std::vector shape{1, 2}; + + for (int dtype : {static_cast(VarDescAPI::VarDataType::INT32)}) { + for (float value : {1, 2}) { + for (bool is_use_shape_tensor_list : {false, true}) { + for (bool is_use_shape_tensor : {false, true}) { + if (is_use_shape_tensor && is_use_shape_tensor_list) break; + LOG(INFO) << "value:" << value + << ", is_use_shape_tensor:" << is_use_shape_tensor + << ", is_use_shape_tensor_list:" + << is_use_shape_tensor_list; + + std::unique_ptr tester( + new 
FillConstantComputeTester(place, + "def", + shape, + is_use_shape_tensor, + is_use_shape_tensor_list, + value, + false)); + arena::Arena arena(std::move(tester), place, 2e-5); + arena.TestPrecision(); + } + } + } + } +#endif + +#ifdef LITE_WITH_X86 + Place place(TARGET(kX86)); + LOG(INFO) << "test concate op, x86"; + for (int axis : {1, 2}) { + for (bool is_use_axis_tensor : {false, true}) { + LOG(INFO) << "axis:" << axis + << ", is_use_axis_tensor:" << is_use_axis_tensor; + std::unique_ptr tester( + new ConcateComputeTester(place, "def", axis, is_use_axis_tensor)); + arena::Arena arena(std::move(tester), place, 2e-5); + arena.TestPrecision(); + } + } + +#endif +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/lrn_compute_test.cc b/lite/tests/kernels/lrn_compute_test.cc index 9ee43c5c60..e306155514 100644 --- a/lite/tests/kernels/lrn_compute_test.cc +++ b/lite/tests/kernels/lrn_compute_test.cc @@ -158,7 +158,7 @@ class LrnComputeTester : public arena::TestCase { op_desc->SetOutput("Out", {output_}); op_desc->SetAttr("alpha", alpha_); op_desc->SetAttr("beta", beta_); - op_desc->SetAttr("local_size", local_size_); + op_desc->SetAttr("n", local_size_); op_desc->SetAttr("k", k_); op_desc->SetAttr("norm_region", norm_region_); } diff --git a/lite/tests/kernels/nearest_interp_compute_test.cc b/lite/tests/kernels/nearest_interp_compute_test.cc index 3256ababca..894959f909 100644 --- a/lite/tests/kernels/nearest_interp_compute_test.cc +++ b/lite/tests/kernels/nearest_interp_compute_test.cc @@ -22,6 +22,28 @@ namespace paddle { namespace lite { +inline std::vector get_new_shape( + const std::vector& list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + vec_new_shape.push_back(static_cast(*tensor->data())); + } + + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + vec_new_data = + std::vector(new_data, new_data + new_data_tensor->dims().production()); + return vec_new_data; +} + template void resize_nearest_align(std::vector inputs, lite::Tensor* output, @@ -73,6 +95,9 @@ class NearestInterpComputeTester : public arena::TestCase { protected: // common attributes for this op. 
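// The fill_constant tester above takes the output shape from one of three
// sources (exercised one at a time in the test); a sketch of that choice,
// with plain vectors standing in for ShapeTensor / ShapeTensorList -- the
// precedence shown here is an assumption for illustration:

#include <cstdint>
#include <vector>

std::vector<int64_t> FillConstantShape(
    const std::vector<int64_t>& shape_attr,           // "shape" attribute
    const std::vector<int64_t>& shape_tensor,         // ShapeTensor contents
    const std::vector<int64_t>& shape_tensor_list) {  // one value per entry
  if (!shape_tensor.empty()) return shape_tensor;
  if (!shape_tensor_list.empty()) return shape_tensor_list;
  return shape_attr;
}
// e.g. FillConstantShape({1, 2}, {}, {3, 4}) -> {3, 4}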
std::string input0_ = "X"; + std::string sizetensor0_ = "SizeTensor0"; + std::string sizetensor1_ = "SizeTensor1"; + std::string input_scale_ = "Scale"; std::string input1_ = "OutSize"; std::string output_ = "Out"; @@ -85,6 +110,8 @@ class NearestInterpComputeTester : public arena::TestCase { DDim dims_{{2, 3}}; DDim _dims0_{{2, 3, 3, 2}}; DDim _dims1_{{2}}; + DDim sizetensor_dims_{{1}}; + DDim scale_dims_{{1}}; public: NearestInterpComputeTester(const Place& place, @@ -112,24 +139,54 @@ class NearestInterpComputeTester : public arena::TestCase { inputs.emplace_back(scope->FindTensor(input0_)); inputs.emplace_back(scope->FindTensor(input1_)); - auto outsize_data = inputs[1]->data(); + std::vector SizeTensor(2); + SizeTensor[0] = scope->FindTensor(sizetensor0_); + SizeTensor[1] = scope->FindTensor(sizetensor1_); + const lite::Tensor* input_scale = scope->FindTensor(input_scale_); + + float scale = height_scale_; + int in_h = inputs[0]->dims()[2]; + int in_w = inputs[0]->dims()[3]; + if (SizeTensor.size() > 0) { + auto new_size = get_new_shape(SizeTensor); + out_height_ = new_size[0]; + out_width_ = new_size[1]; + } else { + auto scale_tensor = input_scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) { + out_height_ = static_cast(in_h * scale); + out_width_ = static_cast(in_w * scale); + } + auto out_size = inputs[1]; + if (out_size != nullptr) { + auto out_size_data = get_new_data_from_tensor(out_size); + out_height_ = out_size_data[0]; + out_width_ = out_size_data[1]; + } + } + height_scale_ = scale; + width_scale_ = scale; + if (out_width_ != -1 && out_height_ != -1) { height_scale_ = static_cast(out_height_ / inputs[0]->dims()[2]); width_scale_ = static_cast(out_width_ / inputs[0]->dims()[3]); } - if (inputs.size() > 1) { - int h_out = outsize_data[0]; // HW - int w_out = outsize_data[1]; // HW - int num_cout = outputs->dims()[0]; - int c_cout = outputs->dims()[1]; - outputs->Resize({num_cout, c_cout, h_out, w_out}); - } + int num_cout = inputs[0]->dims()[0]; + int c_cout = inputs[0]->dims()[1]; + outputs->Resize({num_cout, c_cout, out_height_, out_width_}); + resize_nearest_align(inputs, outputs, align_corners_); } void PrepareOpDesc(cpp::OpDesc* op_desc) { op_desc->SetType("nearest_interp"); op_desc->SetInput("X", {input0_}); + op_desc->SetInput("SizeTensor", {sizetensor0_, sizetensor1_}); + op_desc->SetInput("Scale", {input_scale_}); op_desc->SetInput("OutSize", {input1_}); op_desc->SetOutput("Out", {output_}); op_desc->SetAttr("scale", height_scale_); @@ -152,6 +209,17 @@ class NearestInterpComputeTester : public arena::TestCase { SetCommonTensor(input0_, _dims0_, data0.data()); SetCommonTensor(input1_, _dims1_, data1.data()); + + std::vector sizetensor_data(1); + sizetensor_data[0] = out_height_; + SetCommonTensor(sizetensor0_, sizetensor_dims_, sizetensor_data.data()); + + sizetensor_data[0] = out_width_; + SetCommonTensor(sizetensor1_, sizetensor_dims_, sizetensor_data.data()); + + std::vector scale_data(1); + scale_data[0] = height_scale_; + SetCommonTensor(input_scale_, scale_dims_, scale_data.data()); } }; diff --git a/lite/tests/kernels/search_aligned_mat_mul_compute_test.cc b/lite/tests/kernels/search_aligned_mat_mul_compute_test.cc new file mode 100644 index 0000000000..cb824931ae --- /dev/null +++ b/lite/tests/kernels/search_aligned_mat_mul_compute_test.cc @@ -0,0 +1,220 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" +#include "lite/tests/utils/naive_math_impl.h" + +namespace paddle { +namespace lite { + +class SearchAlignedMatMulComputeTester : public arena::TestCase { + protected: + // common attributes for this op. + std::string x_ = "X"; + std::string y_ = "Y"; + bool x_transpose_; + bool y_transpose_; + float alpha_; + std::string out_ = "Out"; + DDim x_dims_; + DDim y_dims_; + LoD x_lod_; + LoD y_lod_; + + public: + SearchAlignedMatMulComputeTester(const Place& place, + const std::string& alias, + bool x_transpose, + bool y_transpose, + float alpha, + const DDim& x_dims, + const DDim& y_dims, + const LoD& x_lod, + const LoD& y_lod) + : TestCase(place, alias), + x_transpose_(x_transpose), + y_transpose_(y_transpose), + alpha_(alpha), + x_dims_(x_dims), + y_dims_(y_dims), + x_lod_(x_lod), + y_lod_(y_lod) {} + + void RunBaseline(Scope* scope) override { + auto x = scope->FindTensor(x_); + auto y = scope->FindTensor(y_); + CHECK(x); + CHECK(y); + const auto x_data = x->data(); + const auto y_data = y->data(); + auto out = scope->NewTensor(out_); + CHECK(out); + + const auto x_dims = x->dims(); + const auto y_dims = y->dims(); + const auto& x_lod = x->lod(); + const auto& y_lod = y->lod(); + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose_ ? x_inner_size : x_batch_size; + int N = y_transpose_ ? y_batch_size : y_inner_size; + int X_K = x_transpose_ ? x_batch_size : x_inner_size; + int Y_K = y_transpose_ ? y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + int K = X_K; + int x_stride = x_batch_size * x_inner_size; + int y_stride = y_batch_size * y_inner_size; + int out_stride = M * N; + int lda = x_transpose_ ? M : K; + int ldb = y_transpose_ ? 
K : N; + int ldc = N; + + LoD out_lod; + std::vector out_lod_0(seq_num + 1); + out_lod_0[0] = 0; + for (int i = 0; i < seq_num; i++) { + out_lod_0[i + 1] = out_lod_0[i] + M; + } + out_lod.push_back(out_lod_0); + DDim out_dims( + {static_cast(out_lod_0.back()), static_cast(N)}); + out->set_lod(out_lod); + out->Resize(out_dims); + + auto out_data = out->mutable_data(); + for (int i = 0; i < seq_num; i++) { + basic_gemm(x_transpose_, + y_transpose_, + M, + N, + K, + alpha_, + x_data + i * x_stride, + lda, + y_data + i * y_stride, + ldb, + 0, + out_data + i * out_stride, + ldc, + nullptr, + false, + false); + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType("search_aligned_mat_mul"); + op_desc->SetInput("X", {x_}); + op_desc->SetInput("Y", {y_}); + op_desc->SetOutput("Out", {out_}); + op_desc->SetAttr("transpose_X", x_transpose_); + op_desc->SetAttr("transpose_Y", y_transpose_); + op_desc->SetAttr("alpha", alpha_); + } + + void PrepareData() override { + std::vector x_data(x_dims_.production()); + std::vector y_data(y_dims_.production()); + fill_data_rand(x_data.data(), -1.f, 1.f, x_dims_.production()); + fill_data_rand(y_data.data(), -1.f, 1.f, y_dims_.production()); + SetCommonTensor(x_, x_dims_, x_data.data(), x_lod_); + SetCommonTensor(y_, y_dims_, y_data.data(), y_lod_); + } +}; + +void test_search_aligned_mat_mul(Place place) { + for (int seq_num : {1, 2}) { + for (int x_batch_size : {1, 3}) { + for (int x_inner_size : {1, 5}) { + for (int out_inner_size : {1, 4}) { + for (bool x_transpose : {true, false}) { + for (bool y_transpose : {true, false}) { + for (float alpha : {1., 2.}) { + // infer x_dims and y_dims + int y_batch_size; + int y_inner_size; + if (x_transpose) { + if (y_transpose) { + y_batch_size = out_inner_size; + y_inner_size = x_batch_size; + } else { + y_batch_size = x_batch_size; + y_inner_size = out_inner_size; + } + } else { + if (y_transpose) { + y_batch_size = out_inner_size; + y_inner_size = x_inner_size; + } else { + y_batch_size = x_inner_size; + y_inner_size = out_inner_size; + } + } + std::vector x_lod_0(seq_num + 1); + std::vector y_lod_0(seq_num + 1); + x_lod_0[0] = 0; + y_lod_0[0] = 0; + for (int i = 0; i < seq_num; i++) { + x_lod_0[i + 1] = x_lod_0[i] + x_batch_size; + y_lod_0[i + 1] = y_lod_0[i] + y_batch_size; + } + LoD x_lod; + LoD y_lod; + x_lod.push_back(x_lod_0); + y_lod.push_back(y_lod_0); + DDim x_dims({static_cast(x_lod_0.back()), + static_cast(x_inner_size)}); + DDim y_dims({static_cast(y_lod_0.back()), + static_cast(y_inner_size)}); + + std::unique_ptr tester( + new SearchAlignedMatMulComputeTester(place, + "def", + x_transpose, + y_transpose, + alpha, + x_dims, + y_dims, + x_lod, + y_lod)); + arena::Arena arena(std::move(tester), place, 5e-4); + arena.TestPrecision(); + } + } + } + } + } + } + } +} + +TEST(SearchAlignedMatMul, precision) { +#ifdef LITE_WITH_X86 + Place place(TARGET(kX86)); + test_search_aligned_mat_mul(place); +#endif +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/search_seq_fc_compute_test.cc b/lite/tests/kernels/search_seq_fc_compute_test.cc new file mode 100644 index 0000000000..988d3a27cc --- /dev/null +++ b/lite/tests/kernels/search_seq_fc_compute_test.cc @@ -0,0 +1,177 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" +#include "lite/tests/utils/naive_math_impl.h" + +namespace paddle { +namespace lite { + +class SearchSeqFcOPTest : public arena::TestCase { + protected: + // common attributes for this op. + std::string x_ = "x"; + std::string w_ = "w"; + std::string b_ = "b"; + std::string out_ = "out"; + DDim x_dims_; + DDim w_dims_; + DDim b_dims_; + LoD x_lod_; + bool has_bias_; + int out_size_; + + public: + SearchSeqFcOPTest(const Place& place, + const std::string& alias, + DDim x_dims, + DDim w_dims, + DDim b_dims, + LoD x_lod, + bool has_bias, + int out_size) + : TestCase(place, alias), + x_dims_(x_dims), + w_dims_(w_dims), + b_dims_(b_dims), + x_lod_(x_lod), + has_bias_(has_bias), + out_size_(out_size) {} + + void RunBaseline(Scope* scope) override { + auto x = scope->FindTensor(x_); + auto w = scope->FindTensor(w_); + CHECK(x); + CHECK(w); + auto out = scope->NewTensor(out_); + CHECK(out); + + const auto x_data = x->data(); + const auto w_data = w->data(); + const auto x_dims = x->dims(); + const auto w_dims = w->dims(); + const auto& x_lod = x->lod(); + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK(!x_lod.empty()) << "The Input(X) must hold lod info."; + const auto& x_lod_0 = x_lod[0]; + CHECK_GE(x_lod_0.size(), 2) << "The Input(X)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod_0.back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size_) << "Wrong shape: w_dims[0] != out_size"; + + const float* b_data = nullptr; + if (has_bias_) { + auto b = scope->FindTensor(b_); + CHECK(b); + auto b_dims = b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + b_data = b->data(); + } + + out->set_lod(x_lod); + out->Resize({x_dims[0], w_dims[0]}); + + int M = x_dims[0]; + int K = x_dims[1]; + int N = w_dims[0]; + auto out_data = out->mutable_data(); + basic_gemm(false, + true, + M, + N, + K, + 1.f, + x_data, + K, + w_data, + K, + 0, + out_data, + N, + nullptr, + false, + false); + if (b_data != nullptr) { + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + out_data[i * N + j] += b_data[j]; + } + } + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType("search_seq_fc"); + op_desc->SetInput("X", {x_}); + op_desc->SetInput("W", {w_}); + if (has_bias_) { + op_desc->SetInput("b", {b_}); + } + op_desc->SetAttr("has_bias", has_bias_); + op_desc->SetAttr("out_size", out_size_); + op_desc->SetOutput("Out", {out_}); + } + + void PrepareData() override { + std::vector x_data(x_dims_.production()); + std::vector w_data(w_dims_.production()); + fill_data_rand(x_data.data(), -1.f, 1.f, x_dims_.production()); + fill_data_rand(w_data.data(), -1.f, 1.f, w_dims_.production()); + 
SetCommonTensor(x_, x_dims_, x_data.data(), x_lod_); + SetCommonTensor(w_, w_dims_, w_data.data()); + if (has_bias_) { + std::vector b_data(b_dims_.production()); + fill_data_rand(b_data.data(), -1.f, 1.f, b_dims_.production()); + SetCommonTensor(b_, b_dims_, b_data.data()); + } + } +}; + +void test_search_seq_fc(Place place) { + for (auto x_lod_0 : {std::vector({0, 1, 3}), + std::vector({0, 3, 4, 5})}) { + for (auto feature_size : {2, 9}) { + for (auto out_size : {3, 5}) { + for (auto has_bias : {true, false}) { + DDim x_dims({static_cast(x_lod_0.back()), feature_size}); + DDim w_dims({out_size, feature_size}); + DDim b_dims({has_bias ? out_size : 0}); + LoD x_lod; + x_lod.push_back(x_lod_0); + std::unique_ptr tester(new SearchSeqFcOPTest( + place, "def", x_dims, w_dims, b_dims, x_lod, has_bias, out_size)); + arena::Arena arena(std::move(tester), place, 6e-5); + arena.TestPrecision(); + } + } + } + } +} + +TEST(SearchSeqFcOP, precision) { +#ifdef LITE_WITH_X86 + Place place(TARGET(kX86)); + test_search_seq_fc(place); +#endif +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/shuffle_channel_compute_test.cc b/lite/tests/kernels/shuffle_channel_compute_test.cc index d0e9912e65..66123625fa 100644 --- a/lite/tests/kernels/shuffle_channel_compute_test.cc +++ b/lite/tests/kernels/shuffle_channel_compute_test.cc @@ -12,12 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -// TODO(zhengxi) -// shuffle_channel_test can pass on local compilation -// while on ci compilation, the test will be killed immediately. - -/* -#include +// TODO(FrostML): shaffle_channel cannot pass on CI, but ok in local machine. +// Open this. +/*#include #include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/core/arena/framework.h" @@ -30,8 +27,8 @@ class ShuffleChannelComputeTester : public arena::TestCase { // common attributes for this op. 
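// A plain reference loop for the search_seq_fc baseline above:
// Out = X * W^T (+ b broadcast over rows), with X: [M, K], W: [N, K],
// Out: [M, N]; this mirrors the basic_gemm call with transb = true.

void SearchSeqFcRef(const float* x, const float* w, const float* b,
                    int M, int N, int K, float* out) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = (b != nullptr) ? b[n] : 0.f;  // bias only when has_bias
      for (int k = 0; k < K; ++k) acc += x[m * K + k] * w[n * K + k];
      out[m * N + n] = acc;
    }
  }
}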
std::string input_ = "X"; std::string output_ = "Out"; - int group_ = 1; - DDim dims_{{1, 2}}; + int group_ = 4; + DDim dims_{{10, 16, 4, 4}}; public: ShuffleChannelComputeTester(const Place& place, @@ -87,7 +84,7 @@ class ShuffleChannelComputeTester : public arena::TestCase { }; void test_shuffle_channel(Place place) { - for (int group : {1, 2, 3}) { + for (int group : {4}) { std::unique_ptr tester( new ShuffleChannelComputeTester(place, "def", group)); arena::Arena arena(std::move(tester), place, 2e-5); diff --git a/lite/tests/kernels/unsqueeze_compute_test.cc b/lite/tests/kernels/unsqueeze_compute_test.cc index 9bbf39b70d..22e475672a 100644 --- a/lite/tests/kernels/unsqueeze_compute_test.cc +++ b/lite/tests/kernels/unsqueeze_compute_test.cc @@ -125,8 +125,7 @@ class UnsqueezeComputeTester : public arena::TestCase { for (size_t i = 0; i < axes_.size(); i++) { name = name + std::to_string(i); axes_tensor_list_.push_back(name); - std::vector in_data = {axes_[i]}; - SetCommonTensor(name, DDim({1}), in_data.data()); + SetCommonTensor(name, DDim({1}), &axes_[i]); } } } @@ -230,7 +229,7 @@ void test_unsqueeze(Place place) { for (int C : {3}) { for (int H : {1}) { for (int W : {5}) { - for (int input_axes_flag : {1, 2}) { + for (int input_axes_flag : {1, 2, 3}) { LOG(INFO) << N << " " << C << " " << H << " " << W << " " << input_axes_flag; std::unique_ptr tester( diff --git a/lite/tests/math/CMakeLists.txt b/lite/tests/math/CMakeLists.txt index 87324375e0..7dd4f522db 100644 --- a/lite/tests/math/CMakeLists.txt +++ b/lite/tests/math/CMakeLists.txt @@ -1,9 +1,17 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) lite_cc_test(sgemm_compute_test SRCS sgemm_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(sgemv_compute_test SRCS sgemv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(sgemm_c4_compute_test SRCS sgemm_c4_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(gemm_int8_compute_test SRCS gemm_int8_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(gemv_int8_compute_test SRCS gemv_int8_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(conv_compute_test SRCS conv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(conv_transpose_compute_test SRCS conv_transpose_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(conv_int8_compute_test SRCS conv_int8_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(pool_compute_test SRCS pool_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) + + if(LITE_BUILD_EXTRA) + lite_cc_test(layout_compute_test SRCS layout_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) + endif() + + endif() diff --git a/lite/tests/math/conv_compute_test.cc b/lite/tests/math/conv_compute_test.cc index bfb74e6e0a..bda50d3563 100644 --- a/lite/tests/math/conv_compute_test.cc +++ b/lite/tests/math/conv_compute_test.cc @@ -15,10 +15,10 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/operators/op_params.h" #include "lite/tests/utils/naive_math_impl.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" #ifdef LITE_WITH_ARM #include 
"lite/kernels/arm/conv_compute.h" @@ -59,26 +59,30 @@ DEFINE_bool(flag_bias, true, "with bias"); typedef paddle::lite::DDim DDim; typedef paddle::lite::Tensor Tensor; typedef paddle::lite::operators::ConvParam ConvParam; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DDim compute_out_dim(const DDim& dim_in, const paddle::lite::operators::ConvParam& param) { DDim dim_out = dim_in; + auto paddings = *param.paddings; + auto dilations = *param.dilations; dim_out[1] = param.filter->dims()[0]; auto kernel_h = param.filter->dims()[2]; auto kernel_w = param.filter->dims()[3]; auto h = dim_in[2]; auto w = dim_in[3]; - int dila_h = param.dilations[0]; - int dila_w = param.dilations[1]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int dila_h = dilations[0]; + int dila_w = dilations[1]; + int pad_top = paddings[0]; + int pad_bottom = paddings[1]; + int pad_left = paddings[2]; + int pad_right = paddings[3]; int stride_h = param.strides[0]; int stride_w = param.strides[1]; auto kernel_exten = dila_h * (kernel_h - 1) + 1; - auto hout = (h + 2 * pad_h - kernel_exten) / stride_h + 1; + auto hout = (h + pad_top + pad_bottom - kernel_exten) / stride_h + 1; kernel_exten = dila_w * (kernel_w - 1) + 1; - auto wout = (w + 2 * pad_w - kernel_exten) / stride_w + 1; + auto wout = (w + pad_left + pad_right - kernel_exten) / stride_w + 1; dim_out[2] = hout; dim_out[3] = wout; return dim_out; @@ -110,8 +114,8 @@ void test_conv_fp32(const std::vector& input_dims, param.bias->set_precision(PRECISION(kFloat)); } param.strides = strides; - param.paddings = pads; - param.dilations = dilas; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilas); param.fuse_relu = flag_relu; param.groups = group; @@ -162,7 +166,7 @@ void test_conv_fp32(const std::vector& input_dims, param.output->Resize(dim_out); paddle::lite::fill_tensor_rand(*param.x, -1.f, 1.f); - // paddle::lite::fill_tensor_const(*param.x, 1.f); + // paddle::lite::fill_tensor_const(*param.x, 1.f); auto din = param.x->data(); Tensor tout_basic; @@ -189,7 +193,7 @@ void test_conv_fp32(const std::vector& input_dims, strides[0], dilas[1], dilas[0], - pads[1], + pads[2], pads[0], flag_bias, flag_relu); @@ -201,19 +205,19 @@ void test_conv_fp32(const std::vector& input_dims, /// compute Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); conv.Launch(); - t0.end(); + t0.Stop(); } double gops = 2.0 * dim_out.production() * dim_in[1] * weight_dim[2] * weight_dim[3] / param.groups; LOG(INFO) << "conv fp32: input shape: " << dim_in << ", output shape" - << dim_out << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); if (FLAGS_check_result) { double max_ratio = 0; @@ -235,7 +239,8 @@ void test_conv_fp32(const std::vector& input_dims, LOG(FATAL) << "test fp32 conv: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " + << pads[2] << ", " << pads[3] << ", stride: " << strides[0] << ", " << strides[1] << ", dila_: " << dilas[0] << ", " << 
dilas[1] << ", bias: " << (flag_bias ? "true" : "false") @@ -280,27 +285,33 @@ void test_conv_fp32(const std::vector& input_dims, TEST(TestConv3x3DW, test_conv3x3_depthwise) { if (FLAGS_basic_test) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - for (auto& c : {1, 3, 5, 8, 16, 32}) { - std::vector dims; - DDim weights_dim({c, 1, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 3, 15, 19, 28, 32, 75}) { - dims.push_back(DDim({batch, c, h, h})); + for (auto& pad_left : {0, 1, 2}) { + for (auto& pad_right : {0, 1, 2}) { + for (auto& pad_top : {0, 1, 2}) { + for (auto& pad_bottom : {0, 1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + for (auto& c : {1, 3, 5, 8, 16, 32}) { + std::vector dims; + DDim weights_dim({c, 1, 3, 3}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 3, 15, 19, 28, 32, 75}) { + dims.push_back(DDim({batch, c, h, h})); + } + } + test_conv_fp32(dims, + weights_dim, + c, + {stride, stride}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_fp32(dims, - weights_dim, - c, - {stride, stride}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -329,7 +340,7 @@ TEST(TestConv5x5DW, test_conv5x5_depthwise) { weights_dim, c, {stride, stride}, - {pad, pad}, + {pad, pad, pad, pad}, {1, 1}, flag_bias, flag_relu, @@ -366,7 +377,7 @@ TEST(TestConv1x1s1, test_conv1x1s1) { weights_dim, g, {1, 1}, - {0, 0}, + {0, 0, 0, 0}, {1, 1}, flag_bias, flag_relu, @@ -386,26 +397,32 @@ TEST(TestConv3x3s1, test_conv_3x3s1) { if (FLAGS_basic_test) { for (auto& cin : {1, 3, 8, 32, 48}) { for (auto& cout : {1, 5, 8, 32, 48}) { - for (auto& pad : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - std::vector dims; - DDim weights_dim({cout, cin, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 7, 19, 56, 32}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_left : {1, 2}) { + for (auto& pad_right : {1, 2}) { + for (auto& pad_top : {1, 2}) { + for (auto& pad_bottom : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + std::vector dims; + DDim weights_dim({cout, cin, 3, 3}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 7, 19, 56, 32}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_fp32(dims, + weights_dim, + 1, + {1, 1}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_fp32(dims, - weights_dim, - 1, - {1, 1}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -420,26 +437,32 @@ TEST(TestConv3x3s2, test_conv_3x3s2) { if (FLAGS_basic_test) { for (auto& cin : {1, 3, 8, 32}) { for (auto& cout : {1, 5, 8, 32}) { - for (auto& pad : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - std::vector dims; - DDim weights_dim({cout, cin, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 7, 19, 28, 75, 56, 32}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_left : {1, 2}) { + for (auto& pad_right : {1, 2}) { + for (auto& pad_top : {1, 2}) { + for (auto& pad_bottom : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + std::vector dims; + DDim 
weights_dim({cout, cin, 3, 3}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 7, 19, 28, 75, 56, 32}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_fp32(dims, + weights_dim, + 1, + {2, 2}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_fp32(dims, - weights_dim, - 1, - {2, 2}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -458,30 +481,37 @@ TEST(TestConvRand, test_conv_rand) { for (auto& kw : {1, 2, 3}) { for (auto& kh : {1, 2, 3}) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1, 2}) { - for (auto& dila : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - if (cin % g != 0 || cout % g != 0) { - continue; - } - std::vector dims; - DDim weights_dim({cout, cin / g, kh, kw}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 3, 19, 32, 28}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_left : {0, 1, 2}) { + for (auto& pad_right : {0, 1, 2}) { + for (auto& pad_top : {0, 1, 2}) { + for (auto& pad_bottom : {0, 1, 2}) { + for (auto& dila : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + if (cin % g != 0 || cout % g != 0) { + continue; + } + std::vector dims; + DDim weights_dim({cout, cin / g, kh, kw}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 3, 19, 32, 28}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_fp32( + dims, + weights_dim, + g, + {stride, stride}, + {pad_top, pad_bottom, pad_left, pad_right}, + {dila, dila}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_fp32(dims, - weights_dim, - g, - {stride, stride}, - {pad, pad}, - {dila, dila}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -510,7 +540,7 @@ TEST(TestConvCustom, test_conv_fp32_custom_size) { FLAGS_kernel_w}), FLAGS_group, {FLAGS_stride_h, FLAGS_stride_w}, - {FLAGS_pad_h, FLAGS_pad_w}, + {FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w}, {FLAGS_dila_h, FLAGS_dila_w}, FLAGS_flag_bias, FLAGS_flag_relu, diff --git a/lite/tests/math/conv_int8_compute_test.cc b/lite/tests/math/conv_int8_compute_test.cc index e15b7d22bc..27c186d7ce 100644 --- a/lite/tests/math/conv_int8_compute_test.cc +++ b/lite/tests/math/conv_int8_compute_test.cc @@ -15,10 +15,10 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/operators/op_params.h" #include "lite/tests/utils/naive_math_impl.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" #ifdef LITE_WITH_ARM #include "lite/kernels/arm/conv_compute.h" @@ -59,26 +59,26 @@ DEFINE_bool(flag_bias, true, "with bias"); typedef paddle::lite::DDim DDim; typedef paddle::lite::Tensor Tensor; typedef paddle::lite::operators::ConvParam ConvParam; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DDim compute_out_dim(const DDim& dim_in, const paddle::lite::operators::ConvParam& param) { + auto paddings = *param.paddings; + auto dilations = *param.dilations; DDim dim_out = dim_in; dim_out[1] = param.filter->dims()[0]; auto kernel_h = param.filter->dims()[2]; auto kernel_w = param.filter->dims()[3]; auto h = dim_in[2]; auto w = dim_in[3]; - int dila_h = param.dilations[0]; - int dila_w = param.dilations[1]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int dila_h = dilations[0]; + int dila_w = dilations[1]; int stride_h = 
param.strides[0]; int stride_w = param.strides[1]; auto kernel_exten = dila_h * (kernel_h - 1) + 1; - auto hout = (h + 2 * pad_h - kernel_exten) / stride_h + 1; + auto hout = (h + paddings[0] + paddings[1] - kernel_exten) / stride_h + 1; kernel_exten = dila_w * (kernel_w - 1) + 1; - auto wout = (w + 2 * pad_w - kernel_exten) / stride_w + 1; + auto wout = (w + paddings[2] + paddings[3] - kernel_exten) / stride_w + 1; dim_out[2] = hout; dim_out[3] = wout; return dim_out; @@ -104,8 +104,8 @@ void get_conv_param(const DDim& dim_w, param->bias->set_precision(PRECISION(kFloat)); } param->strides = strides; - param->paddings = pads; - param->dilations = dila; + param->paddings = std::make_shared>(pads); + param->dilations = std::make_shared>(dila); param->fuse_relu = flag_relu; param->groups = g; @@ -288,7 +288,7 @@ void test_conv_int8(const std::vector& input_dims, strides[0], dilas[1], dilas[0], - pads[1], + pads[2], pads[0], flag_bias, flag_relu); @@ -309,30 +309,30 @@ void test_conv_int8(const std::vector& input_dims, /// compute fp32 output Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); conv_int8_fp32.Launch(); - t0.end(); + t0.Stop(); } LOG(INFO) << "int8 conv, fp32 output: output shape" << dim_out - << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); /// compute int8 output - t0.clear(); + t0.Reset(); for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); conv_int8_int8.Launch(); - t0.end(); + t0.Stop(); } LOG(INFO) << "int8 conv, int8 output: output shape" << dim_out - << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); /// compare result fp32 output if (FLAGS_check_result) { @@ -358,7 +358,8 @@ void test_conv_int8(const std::vector& input_dims, LOG(FATAL) << "test int8 conv, fp32 out: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " + << pads[2] << ", " << pads[3] << ", stride: " << strides[0] << ", " << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? "true" : "false") @@ -416,7 +417,8 @@ void test_conv_int8(const std::vector& input_dims, LOG(FATAL) << "test int8 conv, int8 out: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " + << pads[2] << ", " << pads[3] << ", stride: " << strides[0] << ", " << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? 
"true" : "false") @@ -428,9 +430,9 @@ void test_conv_int8(const std::vector& input_dims, } LOG(INFO) << "test int8 conv: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] - << ", stride: " << strides[0] << ", " << strides[1] - << ", dila_: " << dilas[0] << ", " << dilas[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2] + << ", " << pads[3] << ", stride: " << strides[0] << ", " + << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? "true" : "false") << ", relu: " << (flag_relu ? "true" : "false") << ", threads: " << th << ", power_mode: " << cls @@ -473,7 +475,7 @@ TEST(TestConv3x3DWInt8, test_conv3x3_depthwise) { weights_dim, c, {stride, stride}, - {pad, pad}, + {pad, pad, pad, pad}, {1, 1}, flag_bias, flag_relu, @@ -507,7 +509,7 @@ TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) { weights_dim, c, {stride, stride}, - {pad, pad}, + {pad, pad, pad, pad}, {1, 1}, flag_bias, flag_relu, @@ -544,7 +546,7 @@ TEST(TestConv1x1s1Int8, test_conv1x1s1) { weights_dim, g, {1, 1}, - {0, 0}, + {0, 0, 0, 0}, {1, 1}, flag_bias, flag_relu, @@ -564,26 +566,32 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) { if (FLAGS_basic_test) { for (auto& cin : {1, 3, 8, 32, 48}) { for (auto& cout : {1, 5, 8, 32, 48}) { - for (auto& pad : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - std::vector dims; - DDim weights_dim({cout, cin, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 7, 19, 56, 32}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_top : {1, 2}) { + for (auto& pad_bottom : {1, 2}) { + for (auto& pad_left : {1, 2}) { + for (auto& pad_right : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + std::vector dims; + DDim weights_dim({cout, cin, 3, 3}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 7, 19, 56, 32}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_int8(dims, + weights_dim, + 1, + {1, 1}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_int8(dims, - weights_dim, - 1, - {1, 1}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -598,26 +606,32 @@ TEST(TestConv3x3s2Int8, test_conv_3x3s2) { if (FLAGS_basic_test) { for (auto& cin : {1, 3, 8, 32}) { for (auto& cout : {1, 5, 8, 32}) { - for (auto& pad : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - std::vector dims; - DDim weights_dim({cout, cin, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 7, 19, 28, 75, 56, 32}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_top : {1, 2}) { + for (auto& pad_bottom : {1, 2}) { + for (auto& pad_left : {1, 2}) { + for (auto& pad_right : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + std::vector dims; + DDim weights_dim({cout, cin, 3, 3}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 7, 19, 28, 75, 56, 32}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_int8(dims, + weights_dim, + 1, + {2, 2}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_int8(dims, - weights_dim, - 1, - {2, 2}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -636,30 +650,37 @@ 
TEST(TestConvRandInt8, test_conv_rand) { for (auto& kw : {1, 2, 3}) { for (auto& kh : {1, 2, 3}) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1, 2}) { - for (auto& dila : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - if (cin % g != 0 || cout % g != 0) { - continue; - } - std::vector dims; - DDim weights_dim({cout, cin / g, kh, kw}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 3, 19, 32, 28}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_top : {0, 1, 2}) { + for (auto& pad_bottom : {0, 1, 2}) { + for (auto& pad_left : {0, 1, 2}) { + for (auto& pad_right : {0, 1, 2}) { + for (auto& dila : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + if (cin % g != 0 || cout % g != 0) { + continue; + } + std::vector dims; + DDim weights_dim({cout, cin / g, kh, kw}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 3, 19, 32, 28}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_int8( + dims, + weights_dim, + g, + {stride, stride}, + {pad_top, pad_bottom, pad_left, pad_right}, + {dila, dila}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_int8(dims, - weights_dim, - g, - {stride, stride}, - {pad, pad}, - {dila, dila}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -688,7 +709,7 @@ TEST(TestConvCustomInt8, test_conv_custom_size) { FLAGS_kernel_w}), FLAGS_group, {FLAGS_stride_h, FLAGS_stride_w}, - {FLAGS_pad_h, FLAGS_pad_w}, + {FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w}, {FLAGS_dila_h, FLAGS_dila_w}, FLAGS_flag_bias, FLAGS_flag_relu, diff --git a/lite/tests/math/conv_transpose_compute_test.cc b/lite/tests/math/conv_transpose_compute_test.cc index e0da07a534..398e745d94 100644 --- a/lite/tests/math/conv_transpose_compute_test.cc +++ b/lite/tests/math/conv_transpose_compute_test.cc @@ -15,10 +15,10 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/operators/op_params.h" #include "lite/tests/utils/naive_math_impl.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" #ifdef LITE_WITH_ARM #include "lite/kernels/arm/conv_transpose_compute.h" @@ -59,17 +59,19 @@ DEFINE_bool(flag_bias, false, "with bias"); typedef paddle::lite::DDim DDim; typedef paddle::lite::Tensor Tensor; typedef paddle::lite::operators::ConvParam ConvParam; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DDim compute_out_dim(const DDim& dim_in, const paddle::lite::operators::ConvParam& param) { auto filter_dims = param.filter->dims(); DDim output_shape = dim_in; output_shape[1] = filter_dims[1] * param.groups; + auto paddings = *param.paddings; + auto dilations = *param.dilations; for (int i = 0; i < 2; i++) { - int kernel_extent = param.dilations[i] * (filter_dims[i + 2] - 1) + 1; + int kernel_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; int output_len = (dim_in[i + 2] - 1) * param.strides[i] + kernel_extent - - 2 * param.paddings[i]; + (paddings[2 * i] + paddings[2 * i + 1]); output_shape[i + 2] = output_len; } return output_shape; @@ -101,19 +103,19 @@ void test_conv_transpose_fp32(const std::vector& input_dims, param.bias->set_precision(PRECISION(kFloat)); } param.strides = strides; - param.paddings = pads; - param.dilations = dilas; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilas); param.fuse_relu = flag_relu; param.groups = group; param.output = new Tensor; 
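// Editorial note, a sketch that is not part of the original patch: with the
// new 4-value padding convention {pad_top, pad_bottom, pad_left, pad_right},
// compute_out_dim() above sizes each transposed-conv output axis as
//   out = (in - 1) * stride + dilation * (k - 1) + 1 - (pad_begin + pad_end).
// Worked example, assuming in = 16, stride = 2, k = 3, dilation = 1 and
// pads {1, 1} on that axis:
static_assert((16 - 1) * 2 + 1 * (3 - 1) + 1 - (1 + 1) == 31,
              "illustrative check of the deconv output-size formula");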
param.output->set_precision(PRECISION(kFloat)); - // paddle::lite::fill_tensor_rand(*param.filter, -1.f, 1.f); - paddle::lite::fill_tensor_const(*param.filter, 1.f); + paddle::lite::fill_tensor_rand(*param.filter, -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.filter, 1.f); if (flag_bias) { - // paddle::lite::fill_tensor_rand(*param.bias, -1.f, 1.f); - paddle::lite::fill_tensor_const(*param.bias, 1.f); + paddle::lite::fill_tensor_rand(*param.bias, -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.bias, 1.f); } Tensor tmp_weights; tmp_weights.Resize(weight_dim); @@ -128,21 +130,8 @@ void test_conv_transpose_fp32(const std::vector& input_dims, new paddle::lite::KernelContext); auto& ctx = ctx1->As(); ctx.SetRunMode(static_cast(cls), th); - /// set param and context - for (auto& dim_in : input_dims) { - param.x->Resize(dim_in); - DDim out_tmp_dims = compute_out_dim(dim_in, param); - if (out_tmp_dims[2] < 1 || out_tmp_dims[3] < 1) { - continue; - } - param.output->Resize(out_tmp_dims); - break; - } conv_t.SetParam(param); conv_t.SetContext(std::move(ctx1)); - /// prepare for run - conv_t.PrepareForRun(); - for (auto& dim_in : input_dims) { CHECK_EQ(weight_dim[0], dim_in[1]) << "input channel must equal to weights channel"; @@ -152,9 +141,11 @@ void test_conv_transpose_fp32(const std::vector& input_dims, } param.x->Resize(dim_in); param.output->Resize(dim_out); - - // paddle::lite::fill_tensor_rand(*param.x, -1.f, 1.f); - paddle::lite::fill_tensor_const(*param.x, 1.f); + param.filter->CopyDataFrom(tmp_weights); + // prepare for run + conv_t.PrepareForRun(); + paddle::lite::fill_tensor_rand(*param.x, -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.x, 1.f); auto din = param.x->data(); Tensor tout_basic; @@ -182,8 +173,10 @@ void test_conv_transpose_fp32(const std::vector& input_dims, strides[0], dilas[1], dilas[0], - pads[1], + pads[2], + pads[3], pads[0], + pads[1], flag_bias, flag_relu); } @@ -194,19 +187,19 @@ void test_conv_transpose_fp32(const std::vector& input_dims, /// compute Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); conv_t.Launch(); - t0.end(); + t0.Stop(); } float gops = 2.f * tmp_weights.numel() * dim_in[0] * dim_in[2] * dim_in[3]; LOG(INFO) << "conv fp32: input shape: " << dim_in << ", output shape" - << dim_out << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); if (FLAGS_check_result) { double max_ratio = 0; @@ -228,7 +221,8 @@ void test_conv_transpose_fp32(const std::vector& input_dims, LOG(FATAL) << "test fp32 conv: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " + << pads[2] << ", " << pads[3] << ", stride: " << strides[0] << ", " << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? 
"true" : "false") @@ -240,9 +234,9 @@ void test_conv_transpose_fp32(const std::vector& input_dims, } LOG(INFO) << "test fp32 conv: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] - << ", stride: " << strides[0] << ", " << strides[1] - << ", dila_: " << dilas[0] << ", " << dilas[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2] + << ", " << pads[3] << ", stride: " << strides[0] << ", " + << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? "true" : "false") << ", relu: " << (flag_relu ? "true" : "false") << ", threads: " << th << ", power_mode: " << cls @@ -278,30 +272,37 @@ TEST(TestConvRand, test_conv_transpose_rand) { for (auto& kw : {1, 2, 3}) { for (auto& kh : {1, 2, 3}) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1, 2}) { - for (auto& dila : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - if (cin % g != 0 || cout % g != 0) { - continue; - } - std::vector dims; - DDim weights_dim({cin, cout / g, kh, kw}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 3, 19, 32, 28}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_h0 : {0, 1, 2}) { + for (auto& pad_h1 : {0, 1, 2}) { + for (auto& pad_w0 : {0, 1, 2}) { + for (auto& pad_w1 : {0, 1, 2}) { + for (auto& dila : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + if (cin % g != 0 || cout % g != 0) { + continue; + } + std::vector dims; + DDim weights_dim({cin, cout / g, kh, kw}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 3, 19, 32, 28}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_transpose_fp32( + dims, + weights_dim, + g, + {stride, stride}, + {pad_h0, pad_h1, pad_w0, pad_w1}, + {dila, dila}, + flag_bias, + flag_relu, + {1, 4}, + {FLAGS_power_mode}); + } } } - test_conv_transpose_fp32(dims, - weights_dim, - g, - {stride, stride}, - {pad, pad}, - {dila, dila}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -330,7 +331,7 @@ TEST(TestConvCustom, test_conv_transpose_fp32_custom_size) { FLAGS_kernel_w}), FLAGS_group, {FLAGS_stride_h, FLAGS_stride_w}, - {FLAGS_pad_h, FLAGS_pad_w}, + {FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w}, {FLAGS_dila_h, FLAGS_dila_w}, FLAGS_flag_bias, FLAGS_flag_relu, diff --git a/lite/tests/math/gemm_int8_compute_test.cc b/lite/tests/math/gemm_int8_compute_test.cc index 06a1a0a65e..fde5aacb1c 100644 --- a/lite/tests/math/gemm_int8_compute_test.cc +++ b/lite/tests/math/gemm_int8_compute_test.cc @@ -20,12 +20,12 @@ #include "lite/backends/arm/math/funcs.h" #endif // LITE_WITH_ARM #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/core/tensor.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" typedef paddle::lite::Tensor Tensor; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DEFINE_int32(power_mode, 3, @@ -193,7 +193,7 @@ bool test_gemm_int8(bool tra, dbias_int8[l] = dbias[l] / scale_c[0]; } for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); paddle::lite::arm::math::gemm_prepack_int8(tpackedA.data(), db, dbias_int8, @@ -206,21 +206,21 @@ bool test_gemm_int8(bool tra, trb, scale_merge_int8.data(), &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "gemm_int8_int8 output: M: " << m << ", N: " << n << ", K: " << k << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << 
t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; /// fp32 output compute - t0.clear(); + t0.Reset(); for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); paddle::lite::arm::math::gemm_prepack_int8(tpackedA.data(), db, dbias, @@ -233,15 +233,15 @@ bool test_gemm_int8(bool tra, trb, scale_merge_fp32.data(), &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "gemm_int8_fp32 output: M: " << m << ", N: " << n << ", K: " << k << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; if (FLAGS_check_result) { diff --git a/lite/tests/math/gemv_int8_compute_test.cc b/lite/tests/math/gemv_int8_compute_test.cc index c64e78d66a..623615c8da 100644 --- a/lite/tests/math/gemv_int8_compute_test.cc +++ b/lite/tests/math/gemv_int8_compute_test.cc @@ -20,12 +20,12 @@ #include "lite/backends/arm/math/funcs.h" #endif // LITE_WITH_ARM #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/core/tensor.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" typedef paddle::lite::Tensor Tensor; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DEFINE_int32(power_mode, 3, @@ -165,7 +165,7 @@ bool test_gemv_int8( dbias_int8[l] = dbias[l] / scale_c[0]; } for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); paddle::lite::arm::math::gemv_int8(da, db, dc_fp32, @@ -177,21 +177,21 @@ bool test_gemv_int8( dbias, has_relu, &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "gemv_int8_int8 output: M: " << m << ", N: " << n << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; /// fp32 output compute - t0.clear(); + t0.Reset(); for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); paddle::lite::arm::math::gemv_int8(da, db, dc_int8, @@ -203,15 +203,15 @@ bool test_gemv_int8( dbias_int8, has_relu, &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "gemm_int8_fp32 output: M: " << m << ", N: " << n << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min 
time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; if (FLAGS_check_result) { diff --git a/lite/tests/math/layout_compute_test.cc b/lite/tests/math/layout_compute_test.cc new file mode 100644 index 0000000000..a566924548 --- /dev/null +++ b/lite/tests/math/layout_compute_test.cc @@ -0,0 +1,608 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/core/context.h" +#include "lite/core/profile/timer.h" +#include "lite/operators/op_params.h" +#include "lite/tests/utils/naive_math_impl.h" +#include "lite/tests/utils/tensor_utils.h" + +#ifdef LITE_WITH_ARM +#include "lite/kernels/arm/layout_compute.h" +#endif // LITE_WITH_ARM + +DEFINE_int32(power_mode, + 3, + "power mode: " + "0 for POWER_HIGH;" + "1 for POWER_LOW;" + "2 for POWER_FULL;" + "3 for NO_BIND"); +DEFINE_int32(threads, 1, "threads num"); +DEFINE_int32(warmup, 0, "warmup times"); +DEFINE_int32(repeats, 1, "repeats times"); +DEFINE_bool(basic_test, false, "do all tests"); +DEFINE_bool(check_result, true, "check the result"); + +DEFINE_int32(batch, 1, "batch size"); +DEFINE_int32(in_channel, 32, "input channel"); +DEFINE_int32(in_height, 112, "input height"); +DEFINE_int32(in_width, 112, "input width"); + +DEFINE_bool(flag_nchw, true, "do nchw to nhwc"); + +typedef paddle::lite::DDim DDim; +typedef paddle::lite::Tensor Tensor; +typedef paddle::lite::operators::LayoutParam LayoutParam; + +using paddle::lite::profile::Timer; + +#define IN(n, c, h, w) \ + input_data[w + h * input_w + c * input_h * input_w + \ + n * input_c * input_h * input_w] +#define OUT(n, c, h, w) \ + output_data[w + h * output_w + c * output_h * output_w + \ + n * output_c * output_h * output_w] + +template +void nchw2nhwc_ref(const Tensor* input, Tensor* output) { + auto* input_data = input->data(); + auto* output_data = output->mutable_data(); + + int input_n = input->dims()[0]; + int input_c = input->dims()[1]; + int input_h = input->dims()[2]; + int input_w = input->dims()[3]; + int output_c = output->dims()[1]; + int output_h = output->dims()[2]; + int output_w = output->dims()[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + OUT(n, h, w, c) = IN(n, c, h, w); + } + } + } + } +} +#undef IN +#undef OUT + +#define IN(n, h, w, c) \ + input_data[c + w * input_c + h * input_w * input_c + \ + n * input_h * input_w * input_c] +#define OUT(n, h, w, c) \ + output_data[c + w * output_c + h * output_w * output_c + \ + n * output_h * output_w * output_c] +template +void nhwc2nchw_ref(const Tensor* input, Tensor* output) { + auto* input_data = input->data(); + auto* output_data = output->mutable_data(); + + int input_n = input->dims()[0]; + int input_h = input->dims()[1]; + int input_w = input->dims()[2]; + int input_c = input->dims()[3]; + int 
output_h = output->dims()[1]; + int output_w = output->dims()[2]; + int output_c = output->dims()[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + OUT(n, c, h, w) = IN(n, h, w, c); + } + } + } + } +} + +#ifdef LITE_WITH_ARM +void test_layout_fp32_nchw(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + LayoutParam param; + param.x = new Tensor; + const_cast(param.x)->set_precision(PRECISION(kFloat)); + + param.y = new Tensor; + param.y->set_precision(PRECISION(kFloat)); + + for (auto& cls : power_mode) { + for (auto& th : thread_num) { + paddle::lite::kernels::arm::NCHWToNHWCCompute layout; + DDim dim_out({dim_in[0], dim_in[2], dim_in[3], dim_in[1]}); + + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), th); + /// set param and context + const_cast(param.x)->Resize(dim_in); + param.y->Resize(dim_out); + + layout.SetParam(param); + + paddle::lite::fill_tensor_rand( + *(const_cast(param.x)), -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.x, 1.f); + + auto din = param.x->data(); + + Tensor tout_basic; + + if (FLAGS_check_result) { + tout_basic.set_precision(PRECISION(kFloat)); + tout_basic.Resize(dim_out); + fill_tensor_const(tout_basic, 0.f); + auto dout_basic = tout_basic.mutable_data(); + nchw2nhwc_ref(param.x, &tout_basic); + } + /// warm up + for (int i = 0; i < FLAGS_warmup; ++i) { + layout.Run(); + } + /// compute + Timer t0; + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + layout.Run(); + t0.Stop(); + } + double gops = 2.0 * dim_out.production(); + LOG(INFO) << "layout fp32: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tout_basic, *param.y, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-3f) { + if (max_diff > 5e-4f) { + LOG(WARNING) << "din"; + print_tensor(*(const_cast(param.x))); + LOG(WARNING) << "basic result"; + print_tensor(tout_basic); + LOG(WARNING) << "lite result"; + print_tensor(*param.y); + Tensor tdiff; + tdiff.Resize(tout_basic.dims()); + tdiff.set_precision(PRECISION(kFloat)); + tensor_diff(tout_basic, *param.y, tdiff); + print_tensor(tdiff); + LOG(FATAL) << "test fp32 layout: input: " << dim_in + << ", output: " << dim_out << ", flag_nchw: " + << (flag_nchw ? "nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " failed!!\n"; + } + } + LOG(INFO) << "test fp32 layout: input: " << dim_in + << ", output: " << dim_out + << ", flag_nchw: " << (flag_nchw ? 
"nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " successed!!\n"; + } + } + } + + delete param.x; + delete param.y; +} +void test_layout_fp32_nhwc(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + + LayoutParam param; + param.x = new Tensor; + const_cast(param.x)->set_precision(PRECISION(kFloat)); + + param.y = new Tensor; + param.y->set_precision(PRECISION(kFloat)); + + for (auto& cls : power_mode) { + for (auto& th : thread_num) { + paddle::lite::kernels::arm::NHWCToNCHWCompute layout; + // n h w c == n c h w + DDim dim_out({dim_in[0], dim_in[3], dim_in[1], dim_in[2]}); + + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), th); + /// set param and context + const_cast(param.x)->Resize(dim_in); + param.y->Resize(dim_out); + + layout.SetParam(param); + + paddle::lite::fill_tensor_rand( + *(const_cast(param.x)), -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.x, 1.f); + + auto din = param.x->data(); + + Tensor tout_basic; + + if (FLAGS_check_result) { + tout_basic.set_precision(PRECISION(kFloat)); + tout_basic.Resize(dim_out); + fill_tensor_const(tout_basic, 0.f); + auto dout_basic = tout_basic.mutable_data(); + nhwc2nchw_ref(param.x, &tout_basic); + } + /// warm up + for (int i = 0; i < FLAGS_warmup; ++i) { + layout.Run(); + } + /// compute + Timer t0; + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + layout.Run(); + t0.Stop(); + } + double gops = 2.0 * dim_out.production(); + LOG(INFO) << "layout fp32: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tout_basic, *param.y, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-3f) { + if (max_diff > 5e-4f) { + LOG(WARNING) << "din"; + print_tensor(*(const_cast(param.x))); + LOG(WARNING) << "basic result"; + print_tensor(tout_basic); + LOG(WARNING) << "lite result"; + print_tensor(*param.y); + Tensor tdiff; + tdiff.Resize(tout_basic.dims()); + tdiff.set_precision(PRECISION(kFloat)); + tensor_diff(tout_basic, *param.y, tdiff); + print_tensor(tdiff); + LOG(FATAL) << "test fp32 layout: input: " << dim_in + << ", output: " << dim_out << ", flag_nchw: " + << (flag_nchw ? "nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " failed!!\n"; + } + } + LOG(INFO) << "test fp32 layout: input: " << dim_in + << ", output: " << dim_out + << ", flag_nchw: " << (flag_nchw ? 
"nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " successed!!\n"; + } + } + } + + delete param.x; + delete param.y; +} +void test_layout_int8_nchw(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + + LayoutParam param; + param.x = new Tensor; + const_cast(param.x)->set_precision(PRECISION(kInt8)); + + param.y = new Tensor; + param.y->set_precision(PRECISION(kInt8)); + + for (auto& cls : power_mode) { + for (auto& th : thread_num) { + paddle::lite::kernels::arm::NCHWToNHWCCompute layout; + DDim dim_out({dim_in[0], dim_in[2], dim_in[3], dim_in[1]}); + + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), th); + /// set param and context + const_cast(param.x)->Resize(dim_in); + param.y->Resize(dim_out); + + layout.SetParam(param); + + paddle::lite::fill_tensor_rand(*(const_cast(param.x))); + // paddle::lite::fill_tensor_const(*param.x, 1.f); + + auto din = param.x->data(); + + Tensor tout_basic; + + if (FLAGS_check_result) { + tout_basic.set_precision(PRECISION(kInt8)); + tout_basic.Resize(dim_out); + fill_tensor_const(tout_basic, 0); + auto dout_basic = tout_basic.mutable_data(); + nchw2nhwc_ref(param.x, &tout_basic); + } + LOG(INFO) << "saber compute"; + /// warm up + for (int i = 0; i < FLAGS_warmup; ++i) { + layout.Run(); + } + /// compute + Timer t0; + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + layout.Run(); + t0.Stop(); + } + LOG(INFO) << "saber compute end"; + double gops = 2.0 * dim_out.production(); + LOG(INFO) << "layout int8: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tout_basic, *param.y, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-3f) { + if (max_diff > 5e-4f) { + LOG(WARNING) << "din"; + print_tensor(*(const_cast(param.x))); + LOG(WARNING) << "basic result"; + print_tensor(tout_basic); + LOG(WARNING) << "lite result"; + print_tensor(*param.y); + Tensor tdiff; + tdiff.Resize(tout_basic.dims()); + tdiff.set_precision(PRECISION(kInt8)); + tensor_diff(tout_basic, *param.y, tdiff); + print_tensor(tdiff); + LOG(FATAL) << "test int8 layout: input: " << dim_in + << ", output: " << dim_out << ", flag_nchw: " + << (flag_nchw ? "nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " failed!!\n"; + } + } + LOG(INFO) << "test int8 layout: input: " << dim_in + << ", output: " << dim_out + << ", flag_nchw: " << (flag_nchw ? 
"nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " successed!!\n"; + } + } + } + + delete param.x; + delete param.y; +} +void test_layout_int8_nhwc(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + + LayoutParam param; + param.x = new Tensor; + const_cast(param.x)->set_precision(PRECISION(kInt8)); + + param.y = new Tensor; + param.y->set_precision(PRECISION(kInt8)); + + for (auto& cls : power_mode) { + for (auto& th : thread_num) { + paddle::lite::kernels::arm::NHWCToNCHWCompute layout; + // n h w c == n c h w + DDim dim_out({dim_in[0], dim_in[3], dim_in[1], dim_in[2]}); + + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), th); + /// set param and context + const_cast(param.x)->Resize(dim_in); + param.y->Resize(dim_out); + + layout.SetParam(param); + + paddle::lite::fill_tensor_rand(*(const_cast(param.x))); + // paddle::lite::fill_tensor_const(*param.x, 1.f); + + auto din = param.x->data(); + + Tensor tout_basic; + + if (FLAGS_check_result) { + tout_basic.set_precision(PRECISION(kInt8)); + tout_basic.Resize(dim_out); + fill_tensor_const(tout_basic, 0.f); + auto dout_basic = tout_basic.mutable_data(); + nhwc2nchw_ref(param.x, &tout_basic); + } + LOG(INFO) << "saber compute"; + /// warm up + for (int i = 0; i < FLAGS_warmup; ++i) { + layout.Run(); + } + /// compute + Timer t0; + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + layout.Run(); + t0.Stop(); + } + LOG(INFO) << "run"; + double gops = 2.0 * dim_out.production(); + LOG(INFO) << "layout int8: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tout_basic, *param.y, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-3f) { + if (max_diff > 5e-4f) { + LOG(WARNING) << "din"; + print_tensor(*(const_cast(param.x))); + LOG(WARNING) << "basic result"; + print_tensor(tout_basic); + LOG(WARNING) << "lite result"; + print_tensor(*param.y); + Tensor tdiff; + tdiff.Resize(tout_basic.dims()); + tdiff.set_precision(PRECISION(kInt8)); + tensor_diff(tout_basic, *param.y, tdiff); + print_tensor(tdiff); + LOG(FATAL) << "test int8 layout: input: " << dim_in + << ", output: " << dim_out << ", flag_nchw: " + << (flag_nchw ? "nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " failed!!\n"; + } + } + LOG(INFO) << "test int8 layout: input: " << dim_in + << ", output: " << dim_out + << ", flag_nchw: " << (flag_nchw ? 
"nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " successed!!\n"; + } + } + } + + delete param.x; + delete param.y; +} +#else +void test_layout_fp32_nchw(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) {} +void test_layout_fp32_nhwc(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) {} +void test_layout_int8_nchw(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) {} +void test_layout_int8_nhwc(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) {} +#endif // LITE_WITH_ARM + +#if 1 // +TEST(TestLayout, test_Layout_fp32) { + if (FLAGS_basic_test) { + for (auto n : {1, 3}) { + for (auto c : {1, 3, 5, 32}) { + for (auto h : {3, 16, 20, 32}) { + for (auto w : {3, 4, 32, 112}) { + for (auto nchw2nhwc : {true, false}) { + DDim dim_in({n, c, h, w}); + if (nchw2nhwc) { + LOG(INFO) << "NCHW2NHWC"; + test_layout_fp32_nchw( + dim_in, nchw2nhwc, {1, 2, 4}, {FLAGS_power_mode}); + } else { + LOG(INFO) << "NHWC2NCHW"; + test_layout_fp32_nhwc( + dim_in, nchw2nhwc, {1, 2, 4}, {FLAGS_power_mode}); + } + } + } + } + } + } + } +} +#endif +#if 1 +TEST(TestLayout, test_Layout_int8) { + if (FLAGS_basic_test) { + for (auto n : {1, 3}) { + for (auto c : {1, 3, 5, 32}) { + for (auto h : {3, 16, 20, 32}) { + for (auto w : {3, 4, 32, 112}) { + for (auto nchw2nhwc : {true, false}) { + DDim dim_in({n, c, h, w}); + if (nchw2nhwc) { + LOG(INFO) << "NCHW2NHWC int8"; + test_layout_int8_nchw( + dim_in, nchw2nhwc, {1, 2, 4}, {FLAGS_power_mode}); + } else { + LOG(INFO) << "NHWC2NCHW int8"; + test_layout_int8_nhwc( + dim_in, nchw2nhwc, {1, 2, 4}, {FLAGS_power_mode}); + } + } + } + } + } + } + } +} +#endif + +#if 1 /// custom +TEST(TestLayoutCustom, test_Layout_custom_size) { + test_layout_fp32_nchw( + {DDim({FLAGS_batch, FLAGS_in_channel, FLAGS_in_height, FLAGS_in_width})}, + true, + {FLAGS_threads}, + {FLAGS_power_mode}); +} +#endif // custom diff --git a/lite/tests/math/pool_compute_test.cc b/lite/tests/math/pool_compute_test.cc index 9f4a943594..73a5ba5606 100644 --- a/lite/tests/math/pool_compute_test.cc +++ b/lite/tests/math/pool_compute_test.cc @@ -15,10 +15,10 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/operators/op_params.h" #include "lite/tests/utils/naive_math_impl.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" #ifdef LITE_WITH_ARM #include "lite/kernels/arm/pool_compute.h" @@ -60,7 +60,7 @@ DEFINE_string(pooling_type, "max", "do max pooling"); typedef paddle::lite::DDim DDim; typedef paddle::lite::Tensor Tensor; typedef paddle::lite::operators::PoolParam PoolParam; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DDim compute_out_dim(const DDim& dim_in, const paddle::lite::operators::PoolParam& param) { @@ -69,8 +69,7 @@ DDim compute_out_dim(const DDim& dim_in, auto kernel_w = param.ksize[1]; auto h = dim_in[2]; auto w = dim_in[3]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; int stride_h = param.strides[0]; int stride_w = param.strides[1]; bool ceil_mode = param.ceil_mode; @@ -79,11 +78,15 @@ DDim compute_out_dim(const DDim& dim_in, int wout = 1; if (!flag_global) { if (!ceil_mode) { - hout = (h - kernel_h + 2 * pad_h) / stride_h + 1; - wout = (w - kernel_w + 2 * pad_w) / stride_w + 1; + hout = (h - kernel_h + paddings[0] + 
paddings[1]) / stride_h + 1; + wout = (w - kernel_w + paddings[2] + paddings[3]) / stride_w + 1; } else { - hout = (h - kernel_h + 2 * pad_h + stride_h - 1) / stride_h + 1; - wout = (w - kernel_w + 2 * pad_w + stride_w - 1) / stride_w + 1; + hout = + (h - kernel_h + paddings[0] + paddings[1] + stride_h - 1) / stride_h + + 1; + wout = + (w - kernel_w + paddings[2] + paddings[3] + stride_w - 1) / stride_w + + 1; } } dim_out[2] = hout; @@ -116,7 +119,7 @@ void pooling_basic(const float* din, int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int size_channel_in = win * hin; int size_channel_out = wout * hout; if (global_pooling) { @@ -195,18 +198,22 @@ void pooling_basic(const float* din, int bh = kernel_h; int bw = kernel_w; if (ew == win) { - bw = sw + kernel_w >= win + pad_w ? win + pad_w - : sw + kernel_w; + bw = (sw + kernel_w) >= (win + paddings[3]) + ? (win + paddings[3]) + : (sw + kernel_w); bw -= sw; - if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) { + if ((sw - pad_w) < 0 && + (sw + kernel_w) > (win + paddings[3])) { bw += pad_w; } } if (eh == hin) { - bh = sh + kernel_h >= hin + pad_h ? hin + pad_h - : sh + kernel_h; + bh = (sh + kernel_h) >= (hin + paddings[1]) + ? (hin + paddings[1]) + : (sh + kernel_h); bh -= sh; - if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) { + if ((sh - pad_h) < 0 && + (sh + kernel_h) > (hin + paddings[1])) { bh += pad_h; } } @@ -243,7 +250,7 @@ void test_pool_fp32(const std::vector& input_dims, param.ksize = ksize; param.strides = strides; - param.paddings = pads; + param.paddings = std::make_shared>(pads); param.ceil_mode = ceil_mode; param.global_pooling = flag_global; param.pooling_type = pooling_type; @@ -313,18 +320,18 @@ void test_pool_fp32(const std::vector& input_dims, /// compute Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); pool.Launch(); - t0.end(); + t0.Stop(); } double gops = 2.0 * dim_out.production() * ksize[0] * ksize[1]; LOG(INFO) << "pool fp32: input shape: " << dim_in << ", output shape" - << dim_out << ", running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << dim_out << ", running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); if (FLAGS_check_result) { double max_ratio = 0; @@ -399,31 +406,38 @@ TEST(TestPoolRand, test_pool_rand) { for (auto& kw : {1, 2, 3}) { for (auto& kh : {1, 2, 3}) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1, 2}) { - for (auto& flag_global : {false, true}) { - for (auto& exclusive : {false, true}) { - for (auto& ceil_mode : {false, true}) { - for (auto& pooling_type : {"max", "avg"}) { - bool adaptive = false; - bool use_quantizer = false; - std::vector dims; - for (auto& batch : {1, 2}) { - for (auto& h : {1, 2, 3, 4, 11, 19, 32, 28}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_top : {0, 1, 2}) { + for (auto& pad_bottom : {0, 1, 2}) { + for (auto& pad_left : {0, 1, 2}) { + for (auto& pad_right : {0, 1, 2}) { + for (auto& flag_global : {false, true}) { + for (auto& exclusive : {false, true}) { + for (auto& ceil_mode : {false, true}) { + for (auto& pooling_type : {"max", "avg"}) { + bool adaptive = false; + bool 
use_quantizer = false; + std::vector dims; + for (auto& batch : {1, 2}) { + for (auto& h : {1, 2, 3, 4, 11, 19, 32, 28}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_pool_fp32( + dims, + {kh, kw}, + {stride, stride}, + {pad_top, pad_bottom, pad_left, pad_right}, + ceil_mode, + flag_global, + exclusive, + adaptive, + use_quantizer, + pooling_type, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_pool_fp32(dims, - {kh, kw}, - {stride, stride}, - {pad, pad}, - ceil_mode, - flag_global, - exclusive, - adaptive, - use_quantizer, - pooling_type, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -443,7 +457,7 @@ TEST(TesPoolCustom, test_pool_fp32_custom_size) { {DDim({FLAGS_batch, FLAGS_in_channel, FLAGS_in_height, FLAGS_in_width})}, {FLAGS_kernel_h, FLAGS_kernel_w}, {FLAGS_stride_h, FLAGS_stride_w}, - {FLAGS_pad_h, FLAGS_pad_w}, + {FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w}, FLAGS_ceil_mode, FLAGS_flag_global, FLAGS_exclusive, diff --git a/lite/tests/math/sgemm_c4_compute_test.cc b/lite/tests/math/sgemm_c4_compute_test.cc new file mode 100644 index 0000000000..886dba6ac5 --- /dev/null +++ b/lite/tests/math/sgemm_c4_compute_test.cc @@ -0,0 +1,236 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
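+
+// NOTE: a brief sketch of the c4 packing this test assumes, inferred from
+// basic_trans_mat_to_c4 in lite/tests/utils/naive_math_impl.h (not an
+// authoritative spec of sgemm_prepack_c4). Rows are interleaved in groups of
+// four: for A (m x k, row-major) the packed buffer stores, for each row block
+// i and each column j,
+//   {A[4i+0][j], A[4i+1][j], A[4i+2][j], A[4i+3][j]},
+// with m and k zero-padded to multiples of 4 (m_round, k_round). B (k x n) is
+// packed the same way along k, while n stays unpadded (pack_k == false). The
+// reference output tc_basic is written in the same layout, so it can be
+// compared element-wise against the result of sgemm_prepack_c4.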
+ +#include +#include +#include "lite/tests/utils/fill_data.h" +#include "lite/tests/utils/naive_math_impl.h" +#ifdef LITE_WITH_ARM +#include "lite/backends/arm/math/funcs.h" +#endif // LITE_WITH_ARM +#include "lite/core/context.h" +#include "lite/core/profile/timer.h" +#include "lite/core/tensor.h" +#include "lite/tests/utils/tensor_utils.h" + +typedef paddle::lite::Tensor Tensor; +using paddle::lite::profile::Timer; + +DEFINE_int32(power_mode, + 3, + "power mode: " + "0 for POWER_HIGH;" + "1 for POWER_LOW;" + "2 for POWER_FULL;" + "3 for NO_BIND"); +DEFINE_int32(threads, 1, "threads num"); +DEFINE_int32(warmup, 0, "warmup times"); +DEFINE_int32(repeats, 1, "repeats times"); +DEFINE_bool(basic_test, false, "do all tests"); +DEFINE_bool(check_result, true, "check the result"); + +DEFINE_int32(M, 512, "gemm_c4: M"); +DEFINE_int32(N, 512, "gemm_c4: N"); +DEFINE_int32(K, 512, "gemm_c4: K"); + +DEFINE_bool(flag_relu, false, "do relu"); +DEFINE_bool(flag_bias, false, "with bias"); + +bool test_sgemm_c4( + int m, int n, int k, bool has_bias, bool has_relu, int cls, int ths) { + int m_round = (m + 3) / 4 * 4; + int k_round = (k + 3) / 4 * 4; + int size_a = m * k; + int size_b = n * k; + int size_a_c4 = m_round * k_round; + int size_b_c4 = k_round * n; + + Tensor ta; + Tensor tb; + Tensor ta_c4; + Tensor tb_c4; + Tensor tc; + Tensor tc_basic; + Tensor tc_backup; + Tensor tbias; + + ta.Resize({size_a}); + tb.Resize({size_b}); + ta_c4.Resize({size_a_c4}); + tb_c4.Resize({size_b_c4}); + tc.Resize({m_round * n}); + tc_basic.Resize({m_round * n}); + tbias.Resize({m}); + + ta.set_precision(PRECISION(kFloat)); + tb.set_precision(PRECISION(kFloat)); + ta_c4.set_precision(PRECISION(kFloat)); + tb_c4.set_precision(PRECISION(kFloat)); + tc.set_precision(PRECISION(kFloat)); + tc_basic.set_precision(PRECISION(kFloat)); + tbias.set_precision(PRECISION(kFloat)); + + fill_tensor_rand(ta, -1.f, 1.f); + fill_tensor_rand(tb, -1.f, 1.f); + fill_tensor_rand(tbias, -1.f, 1.f); + fill_tensor_rand(tc, -1.f, 1.f); + + auto da = ta.mutable_data(); + auto db = tb.mutable_data(); + auto da_c4 = ta_c4.mutable_data(); + auto db_c4 = tb_c4.mutable_data(); + auto dc_basic = tc_basic.mutable_data(); + auto dbias = tbias.mutable_data(); + + // trans A, B to c4 + basic_trans_mat_to_c4(da, da_c4, k, m, k, true); + basic_trans_mat_to_c4(db, db_c4, n, k, n, false); + + LOG(INFO) << "sgemm_c4 M: " << m << ", N: " << n << ", K: " << k + << ", relu: " << (has_relu ? "true" : "false") + << ", bias: " << (has_bias ? "true" : "false"); + + if (FLAGS_check_result) { + basic_gemm_c4(false, + false, + m, + n, + k, + 1.f, + da, + k, + db, + n, + 0.f, + dc_basic, + n, + dbias, + has_bias, + has_relu); + } + Timer t0; +#ifdef LITE_WITH_ARM + //! 
compute + double ops = 2.0 * m_round * n * k_round; + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), ths); + auto dc = tc.mutable_data(); + for (int j = 0; j < FLAGS_warmup; ++j) { + paddle::lite::arm::math::sgemm_prepack_c4( + m, n, k, da_c4, db_c4, dc, dbias, has_bias, has_relu, &ctx); + } + + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + paddle::lite::arm::math::sgemm_prepack_c4( + m, n, k, da_c4, db_c4, dc, dbias, has_bias, has_relu, &ctx); + t0.Stop(); + } + LOG(INFO) << "M: " << m << ", N: " << n << ", K: " << k + << ", power_mode: " << cls << ", threads: " << ths + << ", GOPS: " << ops * 1e-9f + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() + << " GOPs"; + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tc_basic, tc, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-4f && std::abs(max_diff) > 5e-5f) { + Tensor tdiff; + tdiff.set_precision(PRECISION(kFloat)); + tdiff.Resize(tc.dims()); + tensor_diff(tc_basic, tc, tdiff); + LOG(INFO) << "a: "; + print_tensor(ta); + LOG(INFO) << "a_c4: "; + print_tensor(ta_c4); + LOG(INFO) << "b: "; + print_tensor(tb); + LOG(INFO) << "b_c4: "; + print_tensor(tb_c4); + LOG(INFO) << "basic result: "; + print_tensor(tc_basic); + LOG(INFO) << "lite result: "; + print_tensor(tc); + LOG(INFO) << "diff result: "; + print_tensor(tdiff); + return false; + } + } +#endif + return true; +} + +TEST(TestSgemmC4, test_func_sgemm_c4_prepacked) { + if (FLAGS_basic_test) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + LOG(INFO) << "run basic sgemm_c4 test"; + for (auto& m : {1, 3, 8, 32, 397}) { + for (auto& n : {1, 2, 3, 4, 13, 141, 789}) { + for (auto& k : {1, 3, 8, 59, 234}) { + for (auto& has_bias : {false, true}) { + for (auto& has_relu : {false, true}) { + for (auto& th : {1, 2, 4}) { + auto flag = test_sgemm_c4( + m, n, k, has_bias, has_relu, FLAGS_power_mode, th); + if (flag) { + LOG(INFO) << "test m = " << m << ", n=" << n << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? "true" : "false") + << " passed\n"; + } else { + LOG(FATAL) << "test m = " << m << ", n=" << n << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? 
"true" : "false") + << " failed\n"; + } + } + } + } + } + } + } + } +} + +TEST(TestSgemmC4Custom, test_func_sgemm_c4_prepacked_custom) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + auto flag = test_sgemm_c4(FLAGS_M, + FLAGS_N, + FLAGS_K, + FLAGS_flag_bias, + FLAGS_flag_relu, + FLAGS_power_mode, + FLAGS_threads); + if (!flag) { + LOG(FATAL) << "test m = " << FLAGS_M << ", n=" << FLAGS_N + << ", k=" << FLAGS_K << ", bias: " << FLAGS_flag_bias + << ", relu: " << FLAGS_flag_relu << " failed!!"; + } + LOG(INFO) << "test m = " << FLAGS_M << ", n=" << FLAGS_N << ", k=" << FLAGS_K + << ", bias: " << FLAGS_flag_bias << ", relu: " << FLAGS_flag_relu + << " passed!!"; +} diff --git a/lite/tests/math/sgemm_compute_test.cc b/lite/tests/math/sgemm_compute_test.cc index 1621ceb904..6df5e671fe 100644 --- a/lite/tests/math/sgemm_compute_test.cc +++ b/lite/tests/math/sgemm_compute_test.cc @@ -20,12 +20,12 @@ #include "lite/backends/arm/math/funcs.h" #endif // LITE_WITH_ARM #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/core/tensor.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" typedef paddle::lite::Tensor Tensor; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DEFINE_int32(power_mode, 3, @@ -171,7 +171,7 @@ bool test_sgemm(bool tra, if (i == FLAGS_repeats - 1) { memcpy(dc, dc_backup, sizeof(float) * m * ldc); } - t0.start(); + t0.Start(); paddle::lite::arm::math::sgemm_prepack(trb, m, n, @@ -186,15 +186,15 @@ bool test_sgemm(bool tra, has_bias, has_relu, &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "M: " << m << ", N: " << n << ", K: " << k << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; if (FLAGS_check_result) { diff --git a/lite/tests/math/sgemv_compute_test.cc b/lite/tests/math/sgemv_compute_test.cc new file mode 100644 index 0000000000..5dd2d32295 --- /dev/null +++ b/lite/tests/math/sgemv_compute_test.cc @@ -0,0 +1,194 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "lite/tests/utils/fill_data.h" +#include "lite/tests/utils/naive_math_impl.h" +#ifdef LITE_WITH_ARM +#include "lite/backends/arm/math/funcs.h" +#endif // LITE_WITH_ARM +#include "lite/core/context.h" +#include "lite/core/profile/timer.h" +#include "lite/core/tensor.h" +#include "lite/tests/utils/tensor_utils.h" + +typedef paddle::lite::Tensor Tensor; + +DEFINE_int32(cluster, 3, "cluster id"); +DEFINE_int32(threads, 1, "threads num"); +DEFINE_int32(warmup, 0, "warmup times"); +DEFINE_int32(repeats, 1, "repeats times"); +DEFINE_bool(basic_test, true, "do all tests"); +DEFINE_bool(check_result, true, "check the result"); + +DEFINE_int32(M, 512, "sgemv: M"); +DEFINE_int32(K, 512, "sgemv: K"); + +DEFINE_bool(traA, false, "gemv: A transpose"); + +DEFINE_bool(flag_relu, false, "do relu"); +DEFINE_bool(flag_bias, false, "with bias"); + +bool test_sgemv( + bool tra, int m, int k, bool has_bias, bool has_relu, int cls, int ths) { + Tensor ta; + Tensor tb; + Tensor tc; + Tensor tc_basic; + Tensor tbias; + + ta.Resize({m, k}); + tb.Resize({k, 1}); + tc.Resize({m, 1}); + tc_basic.Resize({m, 1}); + tbias.Resize({m}); + + ta.set_precision(PRECISION(kFloat)); + tb.set_precision(PRECISION(kFloat)); + tc.set_precision(PRECISION(kFloat)); + tc_basic.set_precision(PRECISION(kFloat)); + tbias.set_precision(PRECISION(kFloat)); + + fill_tensor_rand(ta, -1.f, 1.f); + // fill_tensor_const(ta, 1.f); + fill_tensor_rand(tb, -1.f, 1.f); + // fill_tensor_const(tb, 1.f); + fill_tensor_rand(tbias, -1.f, 1.f); + + LOG(INFO) << "sgemv M: " << m << ", K: " << k + << ", transA: " << (tra ? "true" : "false") + << ", relu: " << (has_relu ? "true" : "false") + << ", bias: " << (has_bias ? "true" : "false"); +#ifdef LITE_WITH_ARM + + auto da = ta.mutable_data(); + auto db = tb.mutable_data(); + auto dc = tc.mutable_data(); + auto dc_basic = tc_basic.mutable_data(); + auto dbias = tbias.mutable_data(); + + if (FLAGS_check_result) { + basic_gemv( + m, k, da, db, dbias, dc_basic, 1.f, 0.f, tra, has_bias, has_relu); + } + paddle::lite::profile::Timer t0; + //! 
compute + double ops = 2.0 * m * k; + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), ths); + /// warmup + for (int j = 0; j < FLAGS_warmup; ++j) { + paddle::lite::arm::math::sgemv( + da, db, dc, tra, m, k, has_bias, dbias, has_relu, &ctx); + } + + t0.Reset(); + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + paddle::lite::arm::math::sgemv( + da, db, dc, tra, m, k, has_bias, dbias, has_relu, &ctx); + t0.Stop(); + } + LOG(INFO) << "gemv output: M: " << m << ", K: " << k << ", cluster: " << cls + << ", threads: " << ths << ", GOPS: " << ops * 1e-9f + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() + << " GOPs"; + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + /// fp32 result + tensor_cmp_host(tc_basic, tc, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-4f && std::abs(max_diff) > 5e-5f) { + Tensor tdiff; + tdiff.set_precision(PRECISION(kFloat)); + tdiff.Resize(tc.dims()); + tensor_diff(tc_basic, tc, tdiff); + LOG(INFO) << "basic result: "; + print_tensor(tc_basic); + LOG(INFO) << "saber result: "; + print_tensor(tc); + LOG(INFO) << "diff result: "; + print_tensor(tdiff); + return false; + } + } +#endif + return true; +} + +TEST(TestLiteSgemv, Sgemv) { + if (FLAGS_basic_test) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + LOG(INFO) << "run basic sgemv test"; + for (auto& m : {1, 3, 8, 21, 32, 397}) { + for (auto& k : {1, 3, 8, 17, 59, 234}) { + for (auto& tra : {true, false}) { + for (auto& has_bias : {false, true}) { + for (auto& has_relu : {false, true}) { + for (auto& th : {1, 2, 4}) { + auto flag = test_sgemv( + tra, m, k, has_bias, has_relu, FLAGS_cluster, th); + if (flag) { + LOG(INFO) << "test m = " << m << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? "true" : "false") + << ", trans A: " << (tra ? "true" : "false") + << ", threads: " << th << " passed\n"; + } else { + LOG(FATAL) << "test m = " << m << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? "true" : "false") + << ", trans A: " << (tra ? 
"true" : "false") + << ", threads: " << th << " failed\n"; + } + } + } + } + } + } + } + } +} + +TEST(TestSgemvCustom, Sgemv_custom) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + auto flag = test_sgemv(FLAGS_traA, + FLAGS_M, + FLAGS_K, + FLAGS_flag_bias, + FLAGS_flag_relu, + FLAGS_cluster, + FLAGS_threads); + if (!flag) { + LOG(FATAL) << "test m = " << FLAGS_M << ", k=" << FLAGS_K + << ", trans A: " << FLAGS_traA << ", bias: " << FLAGS_flag_bias + << ", relu: " << FLAGS_flag_relu << " failed!!"; + } + LOG(INFO) << "test m = " << FLAGS_M << ", k=" << FLAGS_K + << ", trans A: " << FLAGS_traA << ", bias: " << FLAGS_flag_bias + << ", relu: " << FLAGS_flag_relu << " passed!!"; +} diff --git a/lite/tests/utils/naive_math_impl.h b/lite/tests/utils/naive_math_impl.h index 846126ac24..fd868e85ac 100644 --- a/lite/tests/utils/naive_math_impl.h +++ b/lite/tests/utils/naive_math_impl.h @@ -14,6 +14,108 @@ #pragma once +template +static void basic_trans_mat_to_c4(const type* input, + type* output, + const int ldin, + const int M, + const int K, + bool pack_k) { + const int m_round = (M + 3) / 4 * 4; + int k_round = (K + 3) / 4 * 4; + if (!pack_k) { + k_round = K; + } + const int m_loop = m_round / 4; + type zero_buf[K]; + memset(zero_buf, 0, K * sizeof(type)); + for (int i = 0; i < m_loop; ++i) { + const type* in0 = input + i * 4 * ldin; + const type* in1 = in0 + ldin; + const type* in2 = in1 + ldin; + const type* in3 = in2 + ldin; + if (4 * (i + 1) - M > 0) { + switch (4 * (i + 1) - M) { + case 3: + in1 = zero_buf; + case 2: + in2 = zero_buf; + case 1: + in3 = zero_buf; + default: + break; + } + } + for (int j = 0; j < K; ++j) { + *output++ = *in0++; + *output++ = *in1++; + *output++ = *in2++; + *output++ = *in3++; + } + for (int j = K; j < k_round; ++j) { + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + } + } +} + +template +static void basic_gemm_c4(bool trans_a, + bool trans_b, + int m, + int n, + int k, + type2 alpha, + const type* a, + int lda, + const type* b, + int ldb, + type2 beta, + type2* c, + int ldc, + const type2* bias, + bool flag_bias = false, + bool flag_relu = false) { + type2* tmp_c = reinterpret_cast(malloc(m * ldc * sizeof(type2))); + memset(tmp_c, 0, m * ldc * sizeof(type2)); +#pragma omp parallel for + for (int i = 0; i < m; ++i) { + auto bias_data = static_cast(0); + if (flag_bias) { + bias_data = bias[i]; + } + for (int j = 0; j < n; ++j) { + auto sum = static_cast(0); + for (int l = 0; l < k; ++l) { + type av; + type bv; + if (trans_a) { + av = a[l * lda + i]; + } else { + av = a[i * lda + l]; + } + if (trans_b) { + bv = b[j * ldb + l]; + } else { + bv = b[l * ldb + j]; + } + sum += av * bv; + } + type2 tmp = alpha * sum + beta * tmp_c[i * ldc + j] + bias_data; + if (flag_relu) { + tmp_c[i * ldc + j] = tmp > (type2)0 ? tmp : (type2)0; + } else { + tmp_c[i * ldc + j] = tmp; + } + } + } + //! 
trans c to c4 + basic_trans_mat_to_c4(tmp_c, c, ldc, m, n, false); + free(tmp_c); +} + template static void basic_gemm(bool trans_a, bool trans_b, @@ -228,8 +330,10 @@ static void col2im(const Dtype* data_col, const int width, const int kernel_h, const int kernel_w, - const int pad_h, - const int pad_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, const int stride_h, const int stride_w, const int dilation_h, @@ -237,21 +341,24 @@ static void col2im(const Dtype* data_col, Dtype* data_im) { memset(data_im, 0, height * width * channels * sizeof(Dtype)); const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / stride_w + + 1; const int channel_size = height * width; for (int channel = channels; channel--; data_im += channel_size) { for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; + int input_row = -pad_h0 + kernel_row * dilation_h; for (int output_rows = output_h; output_rows; output_rows--) { if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { data_col += output_w; } else { - int input_col = -pad_w + kernel_col * dilation_w; + int input_col = -pad_w0 + kernel_col * dilation_w; for (int output_col = output_w; output_col; output_col--) { if (is_a_ge_zero_and_a_lt_b(input_col, width)) { @@ -289,8 +396,10 @@ void deconv_basic(const Dtype1* din, int stride_h, int dila_w, int dila_h, - int pad_w, - int pad_h, + int pad_w0, + int pad_w1, + int pad_h0, + int pad_h1, bool flag_bias, bool flag_relu) { int m = chout * kernel_w * kernel_h / group; @@ -302,8 +411,9 @@ void deconv_basic(const Dtype1* din, int group_size_coldata = m * n; int group_size_weights = chin * chout * kernel_w * kernel_h / (group * group); bool flag_1x1s1p1 = (kernel_w == 1) && (kernel_h == 1) && (stride_h == 1) && - (stride_w == 1) && (pad_w == 1) && (pad_h == 1) && - (dila_w == 1) && (dila_h == 1); + (stride_w == 1) && (pad_w0 == 0) && (pad_h0 == 0) && + (pad_w1 == 0) && (pad_h1 == 0) && (dila_w == 1) && + (dila_h == 1); Dtype2* workspace_ptr = static_cast(malloc(sizeof(float) * m * n * group)); @@ -316,7 +426,7 @@ void deconv_basic(const Dtype1* din, if (flag_1x1s1p1) { col_data = dout_batch; } - memset(col_data, 0, sizeof(Dtype2) * group_size_coldata); + memset(col_data, 0, sizeof(Dtype2) * group_size_coldata * group); for (int g = 0; g < group; ++g) { const Dtype1* din_group = din_batch + g * group_size_in; const Dtype1* weights_group = weights + g * group_size_weights; @@ -346,8 +456,10 @@ void deconv_basic(const Dtype1* din, wout, kernel_h, kernel_w, - pad_h, - pad_w, + pad_h0, + pad_h1, + pad_w0, + pad_w1, stride_h, stride_w, dila_h, diff --git a/lite/tools/build.sh b/lite/tools/build.sh index 4873e70773..319f26ff82 100755 --- a/lite/tools/build.sh +++ b/lite/tools/build.sh @@ -20,6 +20,7 @@ BUILD_DIR=$(pwd) OPTMODEL_DIR="" BUILD_TAILOR=OFF BUILD_CV=OFF +SHUTDOWN_LOG=ON readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz @@ -93,7 +94,7 @@ function make_tiny_publish_so { -DWITH_TESTING=OFF \ -DLITE_WITH_JAVA=$BUILD_JAVA \ -DLITE_WITH_PYTHON=$BUILD_PYTHON \ - -DLITE_SHUTDOWN_LOG=ON \ + -DLITE_SHUTDOWN_LOG=$SHUTDOWN_LOG \ 
-DLITE_ON_TINY_PUBLISH=ON \ -DANDROID_STL_TYPE=$android_stl \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ @@ -136,7 +137,7 @@ function make_full_publish_so { -DWITH_TESTING=OFF \ -DLITE_WITH_JAVA=$BUILD_JAVA \ -DLITE_WITH_PYTHON=$BUILD_PYTHON \ - -DLITE_SHUTDOWN_LOG=ON \ + -DLITE_SHUTDOWN_LOG=$SHUTDOWN_LOG \ -DANDROID_STL_TYPE=$android_stl \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ -DLITE_WITH_CV=$BUILD_CV \ @@ -236,10 +237,10 @@ function make_cuda { -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \ -DWITH_TESTING=OFF \ -DLITE_WITH_ARM=OFF \ - -DLITE_WITH_PYTHON=ON \ + -DLITE_WITH_PYTHON=${BUILD_PYTHON} \ -DLITE_BUILD_EXTRA=ON - - make publish_inference_python_lib -j8 + + make publish_inference -j4 cd - } @@ -290,6 +291,7 @@ function print_usage { echo -e " ./build.sh --arm_os= --arm_abi= --arm_lang= test" echo echo -e "optional argument:" + echo -e "--shutdown_log: (OFF|ON); controls whether to shutdown log, default is ON" echo -e "--build_extra: (OFF|ON); controls whether to publish extra operators and kernels for (sequence-related model such as OCR or NLP)" echo -e "--build_python: (OFF|ON); controls whether to publish python api lib (ANDROID and IOS is not supported)" echo -e "--build_java: (OFF|ON); controls whether to publish java api lib (Only ANDROID is supported)" @@ -366,6 +368,10 @@ function main { BUILD_TAILOR="${i#*=}" shift ;; + --shutdown_log=*) + SHUTDOWN_LOG="${i#*=}" + shift + ;; tiny_publish) make_tiny_publish_so $ARM_OS $ARM_ABI $ARM_LANG $ANDROID_STL shift diff --git a/lite/tools/build_npu.sh b/lite/tools/build_npu.sh index 03a74046f1..1509f563b2 100755 --- a/lite/tools/build_npu.sh +++ b/lite/tools/build_npu.sh @@ -5,8 +5,8 @@ set -ex ARM_OS="android" # android only yet ARM_ABI="armv8" # armv8, armv7 ARM_LANG="gcc" # gcc only yet -ANDROID_STL="c++_static" # c++_shared, c++_static -DDK_ROOT="$(pwd)/ai_ddk_lib/" # HIAI SDK from https://developer.huawei.com/consumer/cn/hiai/ +ANDROID_STL="c++_shared" # c++_shared/c++_static, c++_shared is used by HiAI DDK 310 +DDK_ROOT="$(pwd)/ai_ddk_lib/" # HiAI DDK 310 from https://developer.huawei.com/consumer/cn/hiai/ TARGET_NAME="test_npu_pass" # default target BUILD_EXTRA=OFF # ON(with sequence ops)/OFF WITH_JAVA=ON # ON(build jar and jni so)/OFF diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh index 8be8e6e6b6..8b5741a7a6 100755 --- a/lite/tools/ci_build.sh +++ b/lite/tools/ci_build.sh @@ -1,9 +1,10 @@ #!/bin/bash +# The git version of CI is 2.7.4. This script is not compatible with git version 1.7.1. set -ex TESTS_FILE="./lite_tests.txt" LIBS_FILE="./lite_libs.txt" - +CUDNN_ROOT="/usr/local/cudnn" readonly ADB_WORK_DIR="/data/local/tmp" readonly common_flags="-DWITH_LITE=ON -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF -DWITH_PYTHON=OFF -DWITH_TESTING=ON -DLITE_WITH_ARM=OFF" @@ -162,6 +163,12 @@ function cmake_x86_for_CI { # make test_generated_code -j$NUM_CORES_FOR_COMPILE } +function cmake_cuda_for_CI { + prepare_workspace # fake an empty __generated_code__.cc to pass cmake. + cmake .. -DLITE_WITH_CUDA=ON -DWITH_MKLDNN=OFF -DLITE_WITH_X86=OFF ${common_flags} -DLITE_WITH_PROFILE=ON -DWITH_MKL=OFF \ + -DLITE_BUILD_EXTRA=ON -DCUDNN_ROOT=${CUDNN_ROOT} +} + function cmake_gpu { prepare_workspace cmake .. " -DWITH_GPU=ON {common_flags} -DLITE_WITH_GPU=ON" @@ -195,7 +202,6 @@ function test_server { # Due to the missing of x86 kernels, we skip the following tests temporarily. 
# TODO(xxx) clear the skip list latter local skip_list=("test_paddle_api" "test_cxx_api" - "test_mobilenetv1_lite_x86" "test_mobilenetv2_lite_x86" "test_light_api" "test_apis" "test_model_bin" ) @@ -227,6 +233,16 @@ function build_test_server { test_model_optimize_tool_compile } +# The CUDA version of CI is cuda_10.1.243_418.87.00_linux. +# The cuDNN version is cudnn-10.1-linux-x64-v7.5.0.56. +function build_test_cuda_server { + mkdir -p ./build + cd ./build + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib" + cmake_cuda_for_CI + build +} + function build_test_train { mkdir -p ./build cd ./build @@ -951,6 +967,10 @@ function main { test_arm_android $TEST_NAME $ARM_PORT shift ;; + build_test_cuda_server) + build_test_cuda_server + shift + ;; build_test_server) build_test_server shift diff --git a/lite/tools/debug/debug_utils.h b/lite/tools/debug/debug_utils.h index 7f77b90488..ff08c47e52 100644 --- a/lite/tools/debug/debug_utils.h +++ b/lite/tools/debug/debug_utils.h @@ -27,7 +27,7 @@ #include "lite/model_parser/pb/var_desc.h" #include "lite/utils/all.h" -DEFINE_string(model_dir, "", "Model dir path"); +DEFINE_string(model_path, "", "Model dir path"); DEFINE_string(input_file, "", "Input datas file path"); DEFINE_string(topo_output_file, "", "Runtime topology order output file path"); DEFINE_bool(output_topo, true, "Dump runtime topology or not"); @@ -185,7 +185,7 @@ void ParseConfig(DebugConfig* conf) { CHECK(conf); #define CHECK_NON_EMPTY(name__) \ CHECK(!FLAGS_##name__.empty()) << "Option " << #name__ << " can't be empty." - CHECK_NON_EMPTY(model_dir); + CHECK_NON_EMPTY(model_path); if (FLAGS_output_topo) { CHECK_NON_EMPTY(topo_output_file); } @@ -193,7 +193,7 @@ void ParseConfig(DebugConfig* conf) { CHECK_NON_EMPTY(tensor_output_file); } #undef CHECK_NON_EMPTY - conf->model_dir = FLAGS_model_dir; + conf->model_dir = FLAGS_model_path; conf->topo_output_file = FLAGS_topo_output_file; conf->tensor_output_file = FLAGS_tensor_output_file; conf->input_file = FLAGS_input_file; diff --git a/lite/utils/cv/paddle_image_preprocess.cc b/lite/utils/cv/paddle_image_preprocess.cc index 0bccfe2804..f180475568 100644 --- a/lite/utils/cv/paddle_image_preprocess.cc +++ b/lite/utils/cv/paddle_image_preprocess.cc @@ -69,240 +69,6 @@ void ImagePreprocess::imageResize(const uint8_t* src, int dstw, int dsth) { resize(src, dst, srcFormat, srcw, srch, dstw, dsth); - /* - int size = srcw * srch; - if (srcw == dstw && srch == dsth) { - if (srcFormat == NV12 || srcFormat == NV21) { - size = srcw * (floor(1.5 * srch)); - } else if (srcFormat == BGR || srcFormat == RGB) { - size = 3 * srcw * srch; - } else if (srcFormat == BGRA || srcFormat == RGBA) { - size = 4 * srcw * srch; - } - memcpy(dst, src, sizeof(uint8_t) * size); - return; - } - double scale_x = static_cast(srcw / dstw); - double scale_y = static_cast(srch / dsth); - - int* buf = new int[dstw * 2 + dsth * 2]; - - int* xofs = buf; - int* yofs = buf + dstw; - int16_t* ialpha = reinterpret_cast(buf + dstw + dsth); - int16_t* ibeta = reinterpret_cast(buf + 2 * dstw + dsth); - - compute_xy( - srcw, srch, dstw, dsth, scale_x, scale_y, xofs, yofs, ialpha, ibeta); - - int w_out = dstw; - int w_in = srcw; - int num = 1; - int orih = dsth; - if (srcFormat == GRAY) { - num = 1; - } else if (srcFormat == NV12 || srcFormat == NV21) { - num = 1; - int hout = static_cast(0.5 * dsth); - dsth += hout; - } else if (srcFormat == BGR || srcFormat == RGB) { - w_in = srcw * 3; - w_out = dstw * 3; - num = 3; - - } else if (srcFormat == BGRA || 
srcFormat == RGBA) { - w_in = srcw * 4; - w_out = dstw * 4; - num = 4; - } - - int* xofs1 = nullptr; - int* yofs1 = nullptr; - int16_t* ialpha1 = nullptr; - if (orih < dsth) { // uv - int tmp = dsth - orih; - int w = dstw / 2; - xofs1 = new int[w]; - yofs1 = new int[tmp]; - ialpha1 = new int16_t[srcw]; - compute_xy(srcw / 2, - srch / 2, - w, - tmp, - scale_x, - scale_y, - xofs1, - yofs1, - ialpha1, - ibeta + orih); - } - int cnt = w_out >> 3; - int remain = w_out % 8; - int32x4_t _v2 = vdupq_n_s32(2); - #pragma omp parallel for - for (int dy = 0; dy < dsth; dy++) { - int16_t* rowsbuf0 = new int16_t[w_out]; - int16_t* rowsbuf1 = new int16_t[w_out]; - int sy = yofs[dy]; - if (dy >= orih) { - xofs = xofs1; - yofs = yofs1; - ialpha = ialpha1; - } - if (sy < 0) { - memset(rowsbuf0, 0, sizeof(uint16_t) * w_out); - const uint8_t* S1 = src + srcw * (sy + 1); - const int16_t* ialphap = ialpha; - int16_t* rows1p = rowsbuf1; - for (int dx = 0; dx < dstw; dx++) { - int sx = xofs[dx] * num; // num = 4 - int16_t a0 = ialphap[0]; - int16_t a1 = ialphap[1]; - - const uint8_t* S1pl = S1 + sx; - const uint8_t* S1pr = S1 + sx + num; - if (sx < 0) { - S1pl = S1; - } - for (int i = 0; i < num; i++) { - if (sx < 0) { - *rows1p++ = ((*S1pl++) * a1) >> 4; - } else { - *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4; - } - } - ialphap += 2; - } - } else { - // hresize two rows - const uint8_t* S0 = src + w_in * (sy); - const uint8_t* S1 = src + w_in * (sy + 1); - const int16_t* ialphap = ialpha; - int16_t* rows0p = rowsbuf0; - int16_t* rows1p = rowsbuf1; - for (int dx = 0; dx < dstw; dx++) { - int sx = xofs[dx] * num; // num = 4 - int16_t a0 = ialphap[0]; - int16_t a1 = ialphap[1]; - - const uint8_t* S0pl = S0 + sx; - const uint8_t* S0pr = S0 + sx + num; - const uint8_t* S1pl = S1 + sx; - const uint8_t* S1pr = S1 + sx + num; - if (sx < 0) { - S0pl = S0; - S1pl = S1; - } - for (int i = 0; i < num; i++) { - if (sx < 0) { - *rows0p = ((*S0pl++) * a1) >> 4; - *rows1p = ((*S1pl++) * a1) >> 4; - rows0p++; - rows1p++; - } else { - *rows0p++ = ((*S0pl++) * a0 + (*S0pr++) * a1) >> 4; - *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4; - } - } - ialphap += 2; - } - } - int ind = dy * 2; - int16_t b0 = ibeta[ind]; - int16_t b1 = ibeta[ind + 1]; - int16x8_t _b0 = vdupq_n_s16(b0); - int16x8_t _b1 = vdupq_n_s16(b1); - uint8_t* dp_ptr = dst + dy * w_out; - int16_t* rows0p = rowsbuf0; - int16_t* rows1p = rowsbuf1; - int re_cnt = cnt; - if (re_cnt > 0) { - #ifdef __aarch64__ - asm volatile( - "1: \n" - "ld1 {v0.8h}, [%[rows0p]], #16 \n" - "ld1 {v1.8h}, [%[rows1p]], #16 \n" - "orr v6.16b, %w[_v2].16b, %w[_v2].16b \n" - "orr v7.16b, %w[_v2].16b, %w[_v2].16b \n" - "smull v2.4s, v0.4h, %w[_b0].4h \n" - "smull2 v4.4s, v0.8h, %w[_b0].8h \n" - "smull v3.4s, v1.4h, %w[_b1].4h \n" - "smull2 v5.4s, v1.8h, %w[_b1].8h \n" - - "ssra v6.4s, v2.4s, #16 \n" - "ssra v7.4s, v4.4s, #16 \n" - "ssra v6.4s, v3.4s, #16 \n" - "ssra v7.4s, v5.4s, #16 \n" - - "shrn v0.4h, v6.4s, #2 \n" - "shrn2 v0.8h, v7.4s, #2 \n" - "subs %w[cnt], %w[cnt], #1 \n" - "sqxtun v1.8b, v0.8h \n" - "st1 {v1.8b}, [%[dp]], #8 \n" - "bne 1b \n" - : [rows0p] "+r"(rows0p), - [rows1p] "+r"(rows1p), - [cnt] "+r"(re_cnt), - [dp] "+r"(dp_ptr) - : [_b0] "w"(_b0), [_b1] "w"(_b1), [_v2] "w"(_v2) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); - #else - asm volatile( - "mov r4, #2 \n" - "vdup.s32 q12, r4 \n" - "0: \n" - "vld1.s16 {d2-d3}, [%[rows0p]]!\n" - "vld1.s16 {d6-d7}, [%[rows1p]]!\n" - "vorr.s32 q10, q12, q12 \n" - "vorr.s32 q11, q12, q12 \n" - - "vmull.s16 
q0, d2, %[_b0] \n" - "vmull.s16 q1, d3, %[_b0] \n" - "vmull.s16 q2, d6, %[_b1] \n" - "vmull.s16 q3, d7, %[_b1] \n" - - "vsra.s32 q10, q0, #16 \n" - "vsra.s32 q11, q1, #16 \n" - "vsra.s32 q10, q2, #16 \n" - "vsra.s32 q11, q3, #16 \n" - - "vshrn.s32 d20, q10, #2 \n" - "vshrn.s32 d21, q11, #2 \n" - "subs %[cnt], #1 \n" - "vqmovun.s16 d20, q10 \n" - "vst1.8 {d20}, [%[dp]]! \n" - "bne 0b \n" - : [rows0p] "+r"(rows0p), - [rows1p] "+r"(rows1p), - [cnt] "+r"(re_cnt), - [dp] "+r"(dp_ptr) - : [_b0] "w"(_b0), [_b1] "w"(_b1) - : "cc", - "memory", - "r4", - "q0", - "q1", - "q2", - "q3", - "q8", - "q9", - "q10", - "q11", - "q12"); - - #endif // __aarch64__ - } - for (int i = 0; i < remain; i++) { - // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> - // INTER_RESIZE_COEF_BITS; - *dp_ptr++ = - (uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) + - (int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >> - 2); - } - } - delete[] buf; - */ } void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst) { diff --git a/lite/utils/cv/paddle_image_preprocess.h b/lite/utils/cv/paddle_image_preprocess.h index 11673e1904..5a46a9e48e 100644 --- a/lite/utils/cv/paddle_image_preprocess.h +++ b/lite/utils/cv/paddle_image_preprocess.h @@ -133,7 +133,7 @@ class ImagePreprocess { * color format support 1-channel image, 3-channel image and 4-channel image * param src: input image data * param dst: output image data - * param srcFormat: input image format, support GRAY, BGR(GRB) and BGRA(RGBA) + * param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA) * param srcw: input image width * param srch: input image height * param degree: Rotate degree, support 90, 180 and 270 @@ -158,7 +158,7 @@ class ImagePreprocess { * color format support 1-channel image, 3-channel image and 4-channel image * param src: input image data * param dst: output image data - * param srcFormat: input image format, support GRAY, BGR(GRB) and BGRA(RGBA) + * param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA) * param srcw: input image width * param srch: input image height * param flip_param: flip parameter, support X, Y and XY @@ -190,7 +190,7 @@ class ImagePreprocess { * NCHW * param src: input image data * param dstTensor: output tensor data - * param srcFormat: input image format, support BGR(GRB) and BGRA(RGBA) + * param srcFormat: input image format, support BGR(RGB) and BGRA(RGBA) * param srcw: input image width * param srch: input image height * param layout: output tensor layout,support NHWC and NCHW diff --git a/lite/utils/io.h b/lite/utils/io.h index 98a0f39b08..92405cae86 100644 --- a/lite/utils/io.h +++ b/lite/utils/io.h @@ -14,9 +14,12 @@ #pragma once +#include #include +#include #include #include +#include #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" @@ -46,11 +49,68 @@ static void MkDirRecur(const std::string& path) { // read buffer from file static std::string ReadFile(const std::string& filename) { std::ifstream ifile(filename.c_str()); + if (!ifile.is_open()) { + LOG(FATAL) << "Open file: [" << filename << "] failed."; + } std::ostringstream buf; char ch; while (buf && ifile.get(ch)) buf.put(ch); + ifile.close(); return buf.str(); } +// read lines from file +static std::vector ReadLines(const std::string& filename) { + std::ifstream ifile(filename.c_str()); + if (!ifile.is_open()) { + LOG(FATAL) << "Open file: [" << filename << "] failed."; + } + std::vector res; + std::string tmp; + while (getline(ifile, tmp)) res.push_back(tmp); + ifile.close(); + return res; +} + +static 
void WriteLines(const std::vector& lines, + const std::string& filename) { + std::ofstream ofile(filename.c_str()); + if (!ofile.is_open()) { + LOG(FATAL) << "Open file: [" << filename << "] failed."; + } + for (const auto& line : lines) { + ofile << line << "\n"; + } + ofile.close(); +} + +static bool IsDir(const std::string& path) { + DIR* dir_fd = opendir(path.c_str()); + if (dir_fd == nullptr) return false; + closedir(dir_fd); + return true; +} + +static std::vector ListDir(const std::string& path, + bool only_dir = false) { + if (!IsDir(path)) { + LOG(FATAL) << "[" << path << "] is not a valid dir path."; + } + + std::vector paths; + DIR* parent_dir_fd = opendir(path.c_str()); + dirent* dp; + while ((dp = readdir(parent_dir_fd)) != nullptr) { + // Exclude '.', '..' and hidden dir + std::string name(dp->d_name); + if (name == "." || name == ".." || name[0] == '.') continue; + if (IsDir(Join({path, name}, "/"))) { + paths.push_back(name); + } + } + closedir(parent_dir_fd); + return paths; +} + } // namespace lite } // namespace paddle diff --git a/lite/utils/logging.cc b/lite/utils/logging.cc index 6351be95ac..e9ee5861ba 100644 --- a/lite/utils/logging.cc +++ b/lite/utils/logging.cc @@ -43,10 +43,10 @@ void gen_log(STL::ostream& log_stream_, gettimeofday(&tv, NULL); // print date / time - log_stream_ << '[' << level << ' ' << std::setw(2) << 1 + tm_time.tm_mon - << '/' << std::setw(2) << tm_time.tm_mday << ' ' << std::setw(2) - << tm_time.tm_hour << ':' << std::setw(2) << tm_time.tm_min << ':' - << std::setw(2) << tm_time.tm_sec << '.' << std::setw(3) + log_stream_ << '[' << level << ' ' << STL::setw(2) << 1 + tm_time.tm_mon + << '/' << STL::setw(2) << tm_time.tm_mday << ' ' << STL::setw(2) + << tm_time.tm_hour << ':' << STL::setw(2) << tm_time.tm_min << ':' + << STL::setw(2) << tm_time.tm_sec << '.' 
<< STL::setw(3) << tv.tv_usec / 1000 << " "; if (len > kMaxLen) { diff --git a/lite/utils/logging.h b/lite/utils/logging.h index e85753ec30..c2c999fd70 100644 --- a/lite/utils/logging.h +++ b/lite/utils/logging.h @@ -30,6 +30,18 @@ #include #include "lite/utils/replace_stl/stream.h" +#ifdef LITE_WITH_ANDROID +#include +// Android log macors +#define ANDROID_LOG_TAG "Paddle-Lite" +#define ANDROID_LOG_I(msg) \ + __android_log_print(ANDROID_LOG_INFO, ANDROID_LOG_TAG, msg) +#define ANDROID_LOG_W(msg) \ + __android_log_print(ANDROID_LOG_WARN, ANDROID_LOG_TAG, msg) +#define ANDROID_LOG_F(msg) \ + __android_log_print(ANDROID_LOG_FATAL, ANDROID_LOG_TAG, msg) +#endif + // NOLINTFILE() // LOG() @@ -93,11 +105,22 @@ class LogMessage { const char* func, int lineno, const char* level = "I") { + level_ = level; paddle::lite::gen_log(log_stream_, file, func, lineno, level); } ~LogMessage() { log_stream_ << '\n'; +#ifdef LITE_WITH_ANDROID + if (level_ == "I") { + ANDROID_LOG_I(log_stream_.str().c_str()); + } else if (level_ == "W") { + ANDROID_LOG_W(log_stream_.str().c_str()); + } else { + fprintf(stderr, "Unsupported log level: %s", level_.c_str()); + assert(false); + } +#endif fprintf(stderr, "%s", log_stream_.str().c_str()); } @@ -105,6 +128,7 @@ class LogMessage { protected: STL::stringstream log_stream_; + std::string level_; LogMessage(const LogMessage&) = delete; void operator=(const LogMessage&) = delete; @@ -121,7 +145,11 @@ class LogMessageFatal : public LogMessage { ~LogMessageFatal() { log_stream_ << '\n'; +#ifdef LITE_WITH_ANDROID + ANDROID_LOG_F(log_stream_.str().c_str()); +#endif fprintf(stderr, "%s", log_stream_.str().c_str()); + #ifndef LITE_ON_TINY_PUBLISH abort(); #else @@ -152,6 +180,9 @@ class VLogMessage { return; } log_stream_ << '\n'; +#ifdef LITE_WITH_ANDROID + ANDROID_LOG_I(log_stream_.str().c_str()); +#endif fprintf(stderr, "%s", log_stream_.str().c_str()); } diff --git a/lite/utils/replace_stl/stream.cc b/lite/utils/replace_stl/stream.cc index 61999a79e3..d821078e36 100644 --- a/lite/utils/replace_stl/stream.cc +++ b/lite/utils/replace_stl/stream.cc @@ -13,6 +13,8 @@ // limitations under the License. 
#include "lite/utils/replace_stl/stream.h" +#include +#include #ifdef LITE_ON_TINY_PUBLISH @@ -20,93 +22,119 @@ namespace paddle { namespace lite { namespace replace_stl { +void ostream::pad(const std::string& text) { + if (display_width_ > 0) { + if (display_width_ < text.size()) { + fprintf(stderr, "Replace STL IO display length less than text\n"); + assert(false); + } else { + for (int i = 0; i < display_width_ - text.size(); ++i) { + data_.push_back(' '); + } + display_width_ = -1; + } + } +} + #ifdef LITE_SHUTDOWN_LOG #define ADD_DATA_AS_STRING(data_, obj_) #else -#define ADD_DATA_AS_STRING(data_, obj_) data_ = data_ + std::to_string(obj_) +#define ADD_DATA_AS_STRING(data_, obj_) \ + std::string text = std::to_string(obj_); \ + pad(text); \ + data_ = data_ + text; + #endif template <> ostream& ostream::operator<<(const char* obj) { - _data = _data + std::string(obj); + data_ = data_ + std::string(obj); return *this; } template <> ostream& ostream::operator<<(const char& obj) { - _data = _data + obj; + data_ = data_ + obj; return *this; } template <> ostream& ostream::operator<<(const std::string& obj) { - _data = _data + obj; + data_ = data_ + obj; return *this; } template <> ostream& ostream::operator<<(const int16_t& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const int& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const bool& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const long& obj) { // NOLINT - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const long long& obj) { // NOLINT - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const unsigned& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const unsigned long& obj) { // NOLINT - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const unsigned long long& obj) { // NOLINT - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const float& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const double& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const long double& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); + return *this; +} + +template <> +ostream& ostream::operator<<(const LiteIoWidth& obj) { + int width = obj.width; + assert(width > 0); + display_width_ = width; return *this; } diff --git a/lite/utils/replace_stl/stream.h b/lite/utils/replace_stl/stream.h index e6bb261706..3288a19869 100644 --- a/lite/utils/replace_stl/stream.h +++ b/lite/utils/replace_stl/stream.h @@ -29,18 +29,25 @@ namespace lite { namespace replace_stl { +struct LiteIoWidth { + explicit LiteIoWidth(int value) : width(value) {} + int width; +}; + +static LiteIoWidth setw(int width) { return LiteIoWidth(width); } + class ostream { public: ostream() {} - explicit ostream(const std::string& x) : _data(x) {} + explicit ostream(const 
std::string& x) : data_(x) {} ~ostream() {} - const char* c_str() { return _data.c_str(); } + const char* c_str() { return data_.c_str(); } - const std::string& str() { return _data; } + const std::string& str() { return data_; } const std::string& str(const std::string& x) { - _data = x; - return _data; + data_ = x; + return data_; } template @@ -50,7 +57,9 @@ class ostream { ostream& operator<<(const T* obj); private: - std::string _data; + void pad(const std::string& text); + std::string data_; + int display_width_{-1}; // -1 refers to no setting }; class stringstream : public ostream { diff --git a/mobile/src/fpga/V2/api.cpp b/mobile/src/fpga/V2/api.cpp index f39d012e08..1a90cb5bdc 100644 --- a/mobile/src/fpga/V2/api.cpp +++ b/mobile/src/fpga/V2/api.cpp @@ -623,7 +623,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->concat_arg.images_in[i] = (int8_t *)arg->conv_arg[i].output.address; // NOLINT - arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; + arg->concat_arg.scales_in[i] = out->scale; arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; expand_conv_arg(&arg->conv_arg[i]); diff --git a/mobile/src/fpga/V2/image.cpp b/mobile/src/fpga/V2/image.cpp old mode 100644 new mode 100755 index dc3c3356e8..917491c371 --- a/mobile/src/fpga/V2/image.cpp +++ b/mobile/src/fpga/V2/image.cpp @@ -83,11 +83,6 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out, height * align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) * sizeof(int8_t)); - for (j = 0; - j < height * align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT); - j++) { - images_in_tmp[i][j] = (int8_t)(images_in[i][j] * Ck + 0.5); - } } align_each_out_area_cw = align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT); @@ -102,7 +97,7 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out, memcpy( (int8_t *)image_out + tmp_channel + // NOLINT k * align_each_out_area_cw_differ, - images_in_tmp[i] + j * channel_num[i] + k * align_each_in_area_cw, + images_in[i] + j * channel_num[i] + k * align_each_in_area_cw, channel_num[i] * sizeof(int8_t)); tmp_channel += channel_num[i]; @@ -110,6 +105,10 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out, } } fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int8_t)); + for (i = 0; i < image_num; i++) { + fpga_free(images_in_tmp[i]); + } + fpga_free(images_in_tmp); } void split_image(int8_t *image_in, void **images_out, int image_num, diff --git a/mobile/src/fpga/V2/pe.cpp b/mobile/src/fpga/V2/pe.cpp old mode 100644 new mode 100755 index aa150e0c6c..a3c179994a --- a/mobile/src/fpga/V2/pe.cpp +++ b/mobile/src/fpga/V2/pe.cpp @@ -109,7 +109,7 @@ using namespace std; // NOLINT #define REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT 0x868 #define REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT 0x870 #define REG_POOLING_RESULT_AMOUNT_ALIGN_32 0x878 -#define REG_POOLING_RESULT_AMOUNT_ALIGN_64 0x880 +#define REG_POOLING_RESULT_AMOUNT_ALIGN_16 0x880 #define REG_POOLING_IMAGE_CALCU_HEIGHT 0x888 #define REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW 0x898 #define REG_POOLING_MODE_RECIPROCAL 0x890 @@ -248,8 +248,8 @@ int ComputeBasicConv(const struct ConvArgs &args) { // DLOG << " activation_type:" << active_args.activation_type // << " leaky_relu_negative_slope:" // << active_args.leaky_relu_negative_slope; - // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; - + DLOG << " reg_ActivationArgs:"; + uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); 
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) { ret = -EIO; @@ -257,6 +257,10 @@ int ComputeBasicConv(const struct ConvArgs &args) { pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; } + // reg_writeq(reg_ActivationArgs, + // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion + + reg_writeq(output_scale, REG_SCALE_PARAMETER); // new reg_writeq((args.driver.row_padding_down << 45) | (args.driver.row_padding_up << 34) | @@ -270,10 +274,10 @@ int ComputeBasicConv(const struct ConvArgs &args) { args.driver.filter_pad_width_mul_channel, REG_CONV_REG1); - reg_writeq((args.driver.stride_h << 48) | (args.driver.skip_window << 28) | - (args.driver.filter_row << 8) | - (args.driver.filter_height << 4) | args.driver.filter_width, - REG_CONV_REG2); + reg_writeq((args.driver.stride_h << 50) | (args.driver.skip_window << 30) | + (args.driver.filter_row << 10) | + (args.driver.filter_height << 5) | args.driver.filter_width, + REG_CONV_REG2); reg_writeq((args.driver.filter_num << 42) | (args.driver.filter_align << 26) | (args.driver.prog_full_cnt << 16) | @@ -358,7 +362,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { << " out_scale_address:" << args.output.scale_address; #endif #ifdef PADDLE_MOBILE_ZU5 - DLOG << "Polling"; // return 0; uint64_t output_scale = 0; uint64_t timer_cnt = 0; @@ -366,66 +369,74 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { uint64_t cmd = 0; uint64_t image_physical_address = 0; uint64_t output_physical_address = 0; - - // uint64_t reg_ActivationArgs = 0; - // active function:{none,leakeyrelu,sigmoid,tanh} - // ActivationArgs active_args; - // active_args.activation_type = LEAKYRELU; - // active_args.activation_type = args.output.activation.activation_type; - - // active_args.leaky_relu_negative_slope = - // args.output.activation.leaky_relu_negative_slope; - - // reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | - // active_args.leaky_relu_negative_slope; - - // DLOG << " activation_type:" << active_args.activation_type - // << " leaky_relu_negative_slope:" - // << active_args.leaky_relu_negative_slope; - // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; - - image_physical_address = vaddr_to_paddr_driver(args.image.address); - output_physical_address = vaddr_to_paddr_driver(args.output.address); - uint32_t output_height = (uint32_t)( +uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); + image_physical_address = vaddr_to_paddr(args.image.address); + output_physical_address = vaddr_to_paddr(args.output.address); + uint64_t C_paral_64 = align_to_x((uint64_t)args.image.channels, 64); + uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32); + uint64_t output_height = (uint64_t)( (args.image.height + args.image.pad_height * 2 - args.kernel.height) / - args.kernel.stride_h + - 1); - uint32_t output_width = (uint32_t)( + args.kernel.stride_h + 1); + uint64_t output_width = (uint64_t)( (args.image.width + args.image.pad_width * 2 - args.kernel.width) / - args.kernel.stride_w + - 1); + args.kernel.stride_w + 1); + uint64_t image_amount_per_row = align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t image_one_pad_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT) + - (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; - uint64_t image_two_pad_per_row = align_to_x( - ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) * - (uint64_t)args.image.channels, - 
IMAGE_ALIGNMENT); - uint64_t image_row_mul_pooling_hight = - image_amount_per_row * (uint64_t)args.kernel.height; - uint64_t image_row_mul_pad_hight = - image_amount_per_row * (uint64_t)args.image.pad_height; - uint64_t image_row_mul_step_hight = - image_amount_per_row * (uint64_t)args.kernel.stride_h; - uint64_t result_amount_align_32 = - align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT); - uint64_t result_amount_align_64 = align_to_x( - (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t image_calcu_height = - (uint64_t)args.kernel.height + - ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h; - uint64_t image_pad_left = args.image.channels * args.image.pad_width; - uint64_t image_skip_window = args.image.channels * args.kernel.stride_w; - uint64_t image_padleft_skipwindow = - (image_skip_window << 32) | image_pad_left; - uint64_t mode_reciprocal = (uint64_t)0 | ((uint64_t)args.mode) << 16 | - (((uint64_t)args.kernel_reciprocal)); - + uint64_t image_one_pad_per_row = (uint64_t)args.image.width * + (uint64_t)args.image.channels +(uint64_t)args.image.pad_width * + (uint64_t)args.image.channels; + + uint64_t result_amount_align_32 = align_to_x((uint64_t)output_width * + (uint64_t)args.image.channels, 32); + uint64_t result_addr_row = + (result_amount_align_32 << 32) | output_physical_address; + uint64_t row_padding_down = + (uint64_t)args.image.height + (uint64_t)args.image.pad_height; + uint64_t kernel_width_sub1 = + (uint64_t)args.kernel.width - 1; + uint64_t kernel_padding_step = row_padding_down | + ((uint64_t)args.image.pad_height << 16) | + ((uint64_t)args.kernel.stride_h << 24) | + ((uint64_t)kernel_width_sub1<<32) | + ((uint64_t)args.kernel.height << 40) | + ((uint64_t)(args.kernel.height-1) << 48); + uint64_t image_calcu_height = (uint64_t)args.kernel.height + + (output_height - 1) * (uint64_t)args.kernel.stride_h; + uint64_t result_size_calcu_height = (output_height - 1) | + ((output_width - 1) << 16) | (image_calcu_height << 32); + uint64_t col_padding_down = ((uint64_t)args.image.width + + (uint64_t)args.image.pad_width) * (uint64_t)args.image.channels; + + uint64_t image_row_col_padding_down = + image_amount_per_row | (col_padding_down << 32); + uint64_t image_rowXpadding_h = + image_amount_per_row * (uint64_t)args.image.pad_height; + uint64_t image_rowXstep_h = + image_amount_per_row * (uint64_t)args.kernel.stride_h; + uint64_t image_rowXpad_h_rowXstep_h = + image_rowXpadding_h | (image_rowXstep_h << 32); + uint64_t channelXpad_w = + (uint64_t)args.image.channels * (uint64_t)args.image.pad_width; + uint64_t channelXstep_w = + (uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w; + uint64_t channelXpad_w_channelXstep_w = + channelXpad_w | (channelXstep_w << 32); + uint64_t filter_row_align = + C_align_32 * (uint64_t)args.kernel.width; + uint64_t sub_filter_amount_align = C_align_32 * + (uint64_t)args.kernel.width * (uint64_t)args.kernel.height; + uint64_t mult_factor = 0; + float average_reciprocal = args.kernel_reciprocal; + uint32_t* kernel_reciprocal; + kernel_reciprocal =(reinterpret_cast(&average_reciprocal)); + if (args.mode == 1) + mult_factor = (uint64_t)(*kernel_reciprocal) | + ((uint64_t)1 << 32) | ((uint64_t)1 << 40); + else + mult_factor = + (uint64_t)0x3f800000 | ((uint64_t)1 << 32) | ((uint64_t)1 << 40); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { ret = -EIO; @@ -433,41 +444,21 @@ int 
ComputeFpgaPool(const struct PoolingArgs &args) { pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; } - - // reg_writeq(reg_ActivationArgs, - // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion - - // reg_writeq(output_scale, REG_SCALE_PARAMETER); - reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); - reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); - reg_writeq( - ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), - REG_POOLING_IMAGE_PIXEL); - reg_writeq( - ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), - REG_POOLING_WINDOW_SIZE); - reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32), - REG_POOLING_RESULT_PIXEL); - reg_writeq(((uint64_t)args.image.pad_height) | - (((uint64_t)args.image.pad_width) << 32), - REG_POOLING_PAD_PIXEL); - reg_writeq(((uint64_t)args.kernel.stride_h) | - (((uint64_t)args.kernel.stride_w) << 32), - REG_POOLING_STEP_PIXEL); - reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER); - reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW); - reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW); - reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW); - reg_writeq(image_row_mul_pooling_hight, - REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT); - reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT); - reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT); - reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32); - reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64); - reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT); - reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW); - reg_writeq(mode_reciprocal, REG_POOLING_MODE_RECIPROCAL); - reg_writeq(cmd, REG_POOLING_CMD); + reg_writeq(output_scale, REG_SCALE_PARAMETER); + reg_writeq(image_physical_address, 0x808); + reg_writeq(result_addr_row, 0x810); + reg_writeq(kernel_padding_step, 0x818); + reg_writeq(result_size_calcu_height, 0x820); + reg_writeq((uint64_t)args.image.channels, 0x828); + reg_writeq(image_row_col_padding_down, 0x830); + reg_writeq(image_rowXpad_h_rowXstep_h, 0x838); + reg_writeq(mult_factor, 0x840); // dw donot care + reg_writeq(channelXpad_w_channelXstep_w, 0x848); + if (args.mode == 1) + cmd = (uint64_t)4; + else + cmd = (uint64_t)8; + reg_writeq(cmd, 0x800); DLOG << "before reg poll"; if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { @@ -478,14 +469,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { } DLOG << "after reg poll"; - // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); - // output_scale = reg_readq(REG_SCALE_PARAMETER); - // output_scale = (output_scale << 32) | (output_scale >> 32); - // fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - - // active_args.activation_type = NONE; - // reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; @@ -518,19 +501,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { #endif #ifdef PADDLE_MOBILE_ZU5 int ret = 0; - uint64_t output_scale = 0; - - // uint64_t reg_ActivationArgs = 0; - // ActivationArgs active_args; - // active_args.activation_type = args.output.activation.activation_type; - // active_args.leaky_relu_negative_slope = - // args.output.activation.leaky_relu_negative_slope; - // reg_ActivationArgs = 
(uint64_t(active_args.activation_type) << 32) | - // active_args.leaky_relu_negative_slope; - // DLOG << " activation_type:" << active_args.activation_type - // << " leaky_relu_negative_slope:" - // << active_args.leaky_relu_negative_slope; - // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; +uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) { @@ -540,18 +511,47 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { return ret; } - // reg_writeq(reg_ActivationArgs, - // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion - reg_writeq(output_scale, REG_SCALE_PARAMETER); - reg_writeq(args.driver.image0_address_phy, REG_EW_IMAGE0_BASE_ADDR); - reg_writeq(args.driver.image1_address_phy, REG_EW_IMAGE1_BASE_ADDR); - reg_writeq(args.driver.datalen, REG_EW_DATA_LEN); - reg_writeq(args.driver.image_image_pixel, REG_EW_IMAGE_PIXEL); - reg_writeq(args.driver.image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW); - reg_writeq(args.driver.output_address_phy, REG_EW_RESULT_BASE_ADDR); - reg_writeq(args.driver.coefficient, REG_EW_COEFFICIENT); - reg_writeq(args.driver.cmd, REG_EW_CMD); + uint64_t image0_physical_address = 0; + uint64_t image1_physical_address = 0; + uint64_t image_physical_address = 0; + uint64_t output_physical_address = 0; + image0_physical_address = vaddr_to_paddr(args.image0.address); + image1_physical_address = vaddr_to_paddr(args.image1.address); + image_physical_address = + image0_physical_address | (image1_physical_address << 32); + output_physical_address = vaddr_to_paddr(args.output.address); + uint64_t image_amount_per_row = + align_to_x((uint64_t)args.image0.width * + (uint64_t)args.image0.channels, IMAGE_ALIGNMENT); + uint64_t result_addr_row = + output_physical_address | (image_amount_per_row << 32); + uint64_t kernel_padding_step = 0; + kernel_padding_step = ((uint64_t)args.image0.height * 2) | + ((uint64_t)2 << 24) | ((uint64_t)2 << 40) | ((uint64_t)1 << 48); + uint64_t result_size_calcu_height = ((uint64_t)args.image0.height - 1) | + ((image_amount_per_row / 32 - 1) << 16) | + (((uint64_t)args.image0.height * 2) << 32); + uint64_t image_row_col_padding_down = image_amount_per_row | + (image_amount_per_row << 32); + float quantParam = (args.output.scale_address)[0]; + uint32_t* ew_scale = reinterpret_cast<uint32_t *>(&quantParam); + uint64_t ew_scale_mult_factor = (*ew_scale) | + ((uint64_t)args.const0 << 32) | ((uint64_t)args.const1 << 40); + reg_writeq(0ul, REG_SCALE_PARAMETER); + reg_writeq(image_physical_address, 0x808); + reg_writeq(result_addr_row, 0x810); + reg_writeq(kernel_padding_step, 0x818); + reg_writeq(result_size_calcu_height, 0x820); + reg_writeq(32, 0x828); + reg_writeq(image_row_col_padding_down, 0x830); + reg_writeq(((image_amount_per_row*2) << 32), 0x838); + reg_writeq(ew_scale_mult_factor, 0x840); // dw donot care + reg_writeq(((uint64_t)32 << 32), 0x848); + reg_writeq(0, 0x858); + uint64_t cmd = 0; + cmd = (uint64_t)2 | (((uint64_t)args.relu_enabled) << 8); + reg_writeq(cmd, 0x800); if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { g_fpgainfo.pe_data->pes[PE_IDX_EW]->status = ERROR; @@ -560,12 +560,6 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { PADDLE_MOBILE_ENFORCE(0, "EW Wait Irq Timeout!"); } - // output_scale = reg_readq(REG_SCALE_PARAMETER); - // output_scale = (output_scale << 32) | (output_scale >> 32); - // fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - //
active_args.activation_type = NONE; - // reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; #endif @@ -870,7 +864,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) { #endif } - if (sub_conv_num > 1) { + /*if (sub_conv_num > 1) { float max_scale = -1.0f; #ifdef COST_TIME_PRINT gettimeofday(&start, NULL); @@ -894,19 +888,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) { << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" << std::endl; #endif - - // fpga_flush(args.output.scale_address, 2 * sizeof(float)); - /*#ifdef COST_TIME_PRINT - gettimeofday(&start,NULL); - #endif - //deconv_post_process(args); - #ifdef COST_TIME_PRINT - gettimeofday(&end,NULL); - dif_sec = end.tv_sec - start.tv_sec; - dif_usec = end.tv_usec - start.tv_usec; - std::cout << "deconv_post_process " << " cost time: " << - (dif_sec*1000000+dif_usec) << "us" << std::endl; #endif*/ - } + }*/ return 0; } // ComputeFpgaDeconv @@ -940,8 +922,8 @@ int ComputeDWConv(const struct DWconvArgs &args) { << " image_width:" << args.image.width << " pad_height:" << args.image.pad_height << " pad_width:" << args.image.pad_width; - DLOG << " filter_address:" << args.filter_address - << " bias_address:" << args.bias_address; + DLOG << " filter_address:" << args.filter_address; + //<< " bias_address:" << args.bias_address; DLOG << " kernel_height:" << args.kernel.height << " kernel_width:" << args.kernel.width << " stride_h:" << args.kernel.stride_h @@ -951,11 +933,10 @@ int ComputeDWConv(const struct DWconvArgs &args) { #endif #ifdef PADDLE_MOBILE_ZU5 DLOG << "DWConv"; + uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); // return 0; - uint64_t output_scale = 0; uint64_t timer_cnt = 0; int ret = 0; - // uint64_t cmd = args.relu_enabled; uint64_t cmd = 0; uint64_t image_physical_address = 0; uint64_t output_physical_address = 0; @@ -966,57 +947,69 @@ int ComputeDWConv(const struct DWconvArgs &args) { output_physical_address = vaddr_to_paddr(args.output.address); filter_physical_address = vaddr_to_paddr(args.filter_address); bias_physical_address = vaddr_to_paddr(args.bias_address); - uint64_t filter_N_align = - align_to_x((uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t filter_amount_per_row_align = - filter_N_align * (uint64_t)args.kernel.width; - uint64_t sub_filter_amount_align = filter_N_align * - (uint64_t)args.kernel.width * - (uint64_t)args.kernel.height; - uint64_t filter_amount_align = - sub_filter_amount_align * (uint64_t)args.sub_conv_num; - - uint32_t output_height = (uint32_t)( - (args.image.height + args.image.pad_height * 2 - args.kernel.height) / - args.kernel.stride_h + - 1); - uint32_t output_width = (uint32_t)( - ((args.image.width + args.image.pad_width * 2 - args.kernel.width) / - args.kernel.stride_w + - 1) * - args.sub_conv_num); + uint64_t C_align_64 = align_to_x((uint64_t)args.image.channels, 64); + uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32); + uint64_t output_height = (uint64_t) + ((args.image.height + args.image.pad_height * 2 - + args.kernel.height) / args.kernel.stride_h +1); + uint64_t output_width = (uint64_t) + (((args.image.width + args.image.pad_width * 2 - args.kernel.width) / + args.kernel.stride_w + 1) * args.sub_conv_num); uint64_t image_amount_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - IMAGE_ALIGNMENT); + align_to_x((uint64_t)args.image.width * + (uint64_t)args.image.channels, IMAGE_ALIGNMENT); uint64_t 
image_one_pad_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT) + - (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; - uint64_t image_two_pad_per_row = align_to_x( - ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) * - (uint64_t)args.image.channels, - IMAGE_ALIGNMENT); - uint64_t image_row_mul_pooling_hight = - image_amount_per_row * (uint64_t)args.kernel.height; - uint64_t image_row_mul_pad_hight = - image_amount_per_row * (uint64_t)args.image.pad_height; - uint64_t image_row_mul_step_hight = - image_amount_per_row * (uint64_t)args.kernel.stride_h; - uint64_t result_amount_align_32 = - align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT); - uint64_t result_amount_align_64 = align_to_x( - (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t image_calcu_height = - (uint64_t)args.kernel.height + - ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h; - uint64_t image_pad_left = args.image.channels * args.image.pad_width; - uint64_t image_skip_window = args.image.channels * args.kernel.stride_w; - - uint64_t image_padleft_skipwindow = - (image_skip_window << 32) | image_pad_left; - + (uint64_t)args.image.width * (uint64_t)args.image.channels + + (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; + + uint64_t result_amount_align_32 = align_to_x( + (uint64_t)output_width * (uint64_t)args.image.channels, 32); + uint64_t result_addr_row = + (result_amount_align_32 << 32) | output_physical_address; + uint64_t row_padding_down = + (uint64_t)args.image.height + (uint64_t)args.image.pad_height; + uint64_t kernel_width_sub1 = (uint64_t)args.kernel.width - 1; + uint64_t kernel_padding_step = row_padding_down | + ((uint64_t)args.image.pad_height << 16) | + ((uint64_t)args.kernel.stride_h << 24) | + ((uint64_t)kernel_width_sub1<<32) | + ((uint64_t)args.kernel.height << 40) | + ((uint64_t)(args.kernel.height-1) << 48); + uint64_t image_calcu_height = (uint64_t)args.kernel.height + + (output_height - 1) * (uint64_t)args.kernel.stride_h; + uint64_t result_size_calcu_height = (output_height - 1) | + ((output_width - 1) << 16) | (image_calcu_height << 32); + uint64_t col_padding_down = ((uint64_t)args.image.width + + (uint64_t)args.image.pad_width) * (uint64_t)args.image.channels; + + uint64_t image_row_col_padding_down = + image_amount_per_row | (col_padding_down << 32); + uint64_t image_rowXpadding_h = + image_amount_per_row * (uint64_t)args.image.pad_height; + uint64_t image_rowXstep_h = + image_amount_per_row * (uint64_t)args.kernel.stride_h; + uint64_t image_rowXpad_h_rowXstep_h = + image_rowXpadding_h | (image_rowXstep_h << 32); + uint64_t channelXpad_w = + (uint64_t)args.image.channels * (uint64_t)args.image.pad_width; + uint64_t channelXstep_w = + (uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w; + uint64_t channelXpad_w_channelXstep_w = + channelXpad_w | (channelXstep_w << 32); + + uint64_t filter_row_align = + C_align_64 * (uint64_t)args.kernel.width; + uint64_t sub_filter_amount_align = C_align_64 * + (uint64_t)args.kernel.width * + (uint64_t)args.kernel.height; + uint64_t filter_amount_align = + sub_filter_amount_align * (uint64_t)args.sub_conv_num; + uint64_t filter_param = filter_row_align | (filter_amount_align << 16) | + (sub_filter_amount_align << 32) | + (((uint64_t)args.sub_conv_num -1) << 48); + uint64_t channel_parameter = + (uint64_t)args.image.channels | (C_align_64 << 16); 
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { ret = -EIO; @@ -1024,73 +1017,31 @@ int ComputeDWConv(const struct DWconvArgs &args) { pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; } - - /*restart scale*/ - reg_writeq(output_scale, REG_SCALE_PARAMETER); - - reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); - reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); - reg_writeq((bias_physical_address << 32 | filter_physical_address), - REG_DWCONV_FILTER_BASE_ADDR); - reg_writeq(filter_amount_per_row_align | (filter_amount_align << 32), - REG_DWCONV_FILTER_SHAPE); - reg_writeq(sub_filter_amount_align | (((uint64_t)args.sub_conv_num) << 32), - REG_DWCONV_FILTER_SUBNUMBER); - reg_writeq(filter_N_align, REG_DWCONV_FILTER_N_ALIGN); - - reg_writeq( - ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), - REG_POOLING_IMAGE_PIXEL); - reg_writeq( - ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), - REG_POOLING_WINDOW_SIZE); - - reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32), - REG_POOLING_RESULT_PIXEL); - - reg_writeq(((uint64_t)args.image.pad_height) | - (((uint64_t)args.image.pad_width) << 32), - REG_POOLING_PAD_PIXEL); - reg_writeq(((uint64_t)args.kernel.stride_h) | - (((uint64_t)args.kernel.stride_w) << 32), - REG_POOLING_STEP_PIXEL); - - reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER); - - reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW); - reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW); - reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW); - - reg_writeq(image_row_mul_pooling_hight, - REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT); - reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT); - reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT); - - reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32); - reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64); - - reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT); - - reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW); - - /*SDK刷Cache保证数据一致性*/ - - reg_writeq(cmd, REG_DWCONV_CMD); + reg_writeq(0ul, REG_SCALE_PARAMETER); + reg_writeq(image_physical_address, 0x808); + reg_writeq(result_addr_row, 0x810); + reg_writeq(kernel_padding_step, 0x818); + reg_writeq(result_size_calcu_height, 0x820); + reg_writeq(channel_parameter, 0x828); + reg_writeq(image_row_col_padding_down, 0x830); + reg_writeq(image_rowXpad_h_rowXstep_h, 0x838); + reg_writeq(0, 0x840); + reg_writeq(channelXpad_w_channelXstep_w, 0x848); + reg_writeq(filter_physical_address, 0x850); + reg_writeq(filter_param, 0x858); + reg_writeq(((bias_physical_address+C_align_64*4) | + (bias_physical_address << 32)), 0x860); + cmd = (uint64_t)1 | (((uint64_t)args.relu_enabled) << 8); + reg_writeq(cmd, 0x800); DLOG << "before reg poll"; if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR; ret = -EIO; - DLOG << "Pooling Wait Irq Timeout!"; + DLOG << "DWconv Wait Irq Timeout!"; PADDLE_MOBILE_ENFORCE(0, "DWConv Wait Irq Timeout"); } DLOG << "after reg poll"; - - // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); - output_scale = reg_readq(REG_SCALE_PARAMETER); - output_scale = (output_scale << 32) | (output_scale >> 32); - fpga_copy(args.output.scale_address, 
&output_scale, sizeof(float) * 2); - DLOG << "output_scale:" << output_scale; pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; #endif diff --git a/mobile/src/fpga/common/driver.cpp b/mobile/src/fpga/common/driver.cpp old mode 100644 new mode 100755 index 911704965a..b7ce4d3247 --- a/mobile/src/fpga/common/driver.cpp +++ b/mobile/src/fpga/common/driver.cpp @@ -134,9 +134,9 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) { uint64_t i = 0; /* timeout accuracy still to be confirmed */ int64_t timeout = time * 6; - usleep(1); for (i = 0; i < timeout; i++) { + usleep(1); if (val == reg_readq(reg)) { break; } diff --git a/mobile/src/fpga/common/fpga_common.h b/mobile/src/fpga/common/fpga_common.h old mode 100644 new mode 100755 index a798d54459..a767cd2606 --- a/mobile/src/fpga/common/fpga_common.h +++ b/mobile/src/fpga/common/fpga_common.h @@ -211,6 +211,7 @@ struct ConcatArgs { uint32_t out_channel; uint32_t height; uint32_t width; + std::vector<std::shared_ptr<char>> vector_concat_space; }; struct SplitConvArgs { diff --git a/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp old mode 100644 new mode 100755 index 951fbb5f37..56cc8927f0 --- a/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp @@ -37,7 +37,7 @@ bool AnchorGeneratorKernel::Init( int anchors_offset[] = {-2, -2, 18, 18, -10, -9, 26, 25, -23, -20, 39, 36, -43, -34, 59, 49, -63, -54, 79, 69, -96, -77, 112, 93, -137, -118, 153, - 134, -204, -188, 220, 204, -281, -395, 296, 441}; + 134, -204, -188, 220, 204, -281, -395, 296, 411}; int anchors_offset2[] = {0, 0, 51, 77, 0, 0, 30, 35, 0, 0, 81, 103, 0, 0, 20, 21, 0, 0, 36, 44, 0, 0, 43, 58, diff --git a/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp old mode 100644 new mode 100755 index 716531fcab..8442eef8b2 --- a/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp @@ -53,6 +53,15 @@ bool ConcatKernel::Init(ConcatParam *param) { concatArgs.channel_num = channel_num; concatArgs.height = height; concatArgs.width = width; + + auto deleter = [](void *p) { fpga::fpga_free(p); }; + concatArgs.vector_concat_space.push_back(std::shared_ptr<char>( + reinterpret_cast<char *>(concatArgs.images_in), deleter)); + concatArgs.vector_concat_space.push_back(std::shared_ptr<char>( + reinterpret_cast<char *>(concatArgs.scales_in), deleter)); + concatArgs.vector_concat_space.push_back(std::shared_ptr<char>( + reinterpret_cast<char *>(concatArgs.channel_num), deleter)); + param->SetFpgaArgs(concatArgs); return true; } diff --git a/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp old mode 100644 new mode 100755 index 43b9355c99..57ccf9f00d --- a/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp @@ -12,12 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ #ifdef ELEMENTWISEADD_OP - +#include #include "operators/kernel/elementwise_add_kernel.h" -#include -#include "fpga/V2/api.h" - namespace paddle_mobile { namespace operators { @@ -60,10 +57,36 @@ bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { return true; } +void ComputeCPUEWAdd(fpga::EWAddArgs ewaddArgs) { + int inputc = ewaddArgs.image0.channels; + int inputh = ewaddArgs.image0.height; + int inputw = ewaddArgs.image0.width; + float inScale0 = + (reinterpret_cast<float *>(ewaddArgs.image0.scale_address))[0]; + float inScale1 = + (reinterpret_cast<float *>(ewaddArgs.image1.scale_address))[0]; + float outScale = + (reinterpret_cast<float *>(ewaddArgs.output.scale_address))[0]; + int8_t* inPtr0 = reinterpret_cast<int8_t *>(ewaddArgs.image0.address); + int8_t* inPtr1 = reinterpret_cast<int8_t *>(ewaddArgs.image1.address); + int8_t* outPtr = reinterpret_cast<int8_t *>(ewaddArgs.output.address); + int datasize = inputc * inputh * inputw; + float const0 = inScale0 / outScale; + float const1 = inScale1 / outScale; + fpga::fpga_invalidate(inPtr0, datasize * sizeof(int8_t)); + fpga::fpga_invalidate(inPtr1, datasize * sizeof(int8_t)); + for (int i = 0; i < datasize; i++) { + float tmpF = inPtr0[i] * const0 + inPtr1[i] * const1; + int tmpI = static_cast<int>(round(tmpF)); + outPtr[i] = (int8_t)((tmpI > 127 ? 127 : (tmpI < -127 ? -127 : tmpI))); + } + fpga::fpga_flush(outPtr, datasize * sizeof(int8_t)); +} template <> void ElementwiseAddKernel::Compute( const ElementwiseAddParam &param) { - fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + // fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + ComputeCPUEWAdd(param.FpgaArgs()); } } // namespace operators } // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp old mode 100644 new mode 100755 index 6d5ad50573..de60341874 --- a/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifdef FUSION_ELEMENTWISEADDRELU_OP - +#include #include "operators/kernel/elementwise_add_relu_kernel.h" namespace paddle_mobile { @@ -58,10 +58,37 @@ bool ElementwiseAddReluKernel::Init( return true; } +void ComputeCPUEWAddRelu(fpga::EWAddArgs ewaddArgs) { + int inputc = ewaddArgs.image0.channels; + int inputh = ewaddArgs.image0.height; + int inputw = ewaddArgs.image0.width; + float inScale0 = + (reinterpret_cast<float *>(ewaddArgs.image0.scale_address))[0]; + float inScale1 = + (reinterpret_cast<float *>(ewaddArgs.image1.scale_address))[0]; + float outScale = + (reinterpret_cast<float *>(ewaddArgs.output.scale_address))[0]; + int8_t* inPtr0 = reinterpret_cast<int8_t *>(ewaddArgs.image0.address); + int8_t* inPtr1 = reinterpret_cast<int8_t *>(ewaddArgs.image1.address); + int8_t* outPtr = reinterpret_cast<int8_t *>(ewaddArgs.output.address); + int datasize = inputc * inputh * inputw; + float const0 = inScale0 / outScale; + float const1 = inScale1 / outScale; + fpga::fpga_invalidate(inPtr0, datasize * sizeof(int8_t)); + fpga::fpga_invalidate(inPtr1, datasize * sizeof(int8_t)); + for (int i = 0; i < datasize; i++) { + float tmpF = inPtr0[i] * const0 + inPtr1[i] * const1; + int tmpI = static_cast<int>(round(tmpF)); + outPtr[i] = (int8_t)((tmpI > 127 ? 127 : (tmpI < 0 ?
0 : tmpI))); + } + fpga::fpga_flush(outPtr, datasize * sizeof(int8_t)); +} + template <> void ElementwiseAddReluKernel::Compute( const ElementwiseAddReluParam &param) { - fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + // fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + ComputeCPUEWAddRelu(param.FpgaArgs()); } } // namespace operators } // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp old mode 100644 new mode 100755 index fcf0889b4a..c7cd6575e4 --- a/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp @@ -110,7 +110,27 @@ void Reshape2Kernel::Compute(const Reshape2Param &param) { } } output->Resize(framework::make_ddim(shape)); + + bool reshapeNeedFlg = 1; if (output->dims() == input->dims()) { + reshapeNeedFlg = 0; + } else if (output->dims().size() != input->dims().size()) { + auto inputdimsize = input->dims().size(); + auto outputdimsize = output->dims().size(); + int smallersize = + inputdimsize > outputdimsize ? outputdimsize : inputdimsize; + int i = 0; + for (i = 0; i < smallersize; i++) { + if ((input->dims())[i] != (output->dims())[i]) + break; + } + if (i == smallersize) { + reshapeNeedFlg = 0; + } + } + if (reshapeNeedFlg) { + reshape(input, output); + } else { DLOG << "No need to reshape"; output->ShareDataWith(*input); framework::LoD lod = input->lod(); @@ -118,9 +138,6 @@ void Reshape2Kernel::Compute(const Reshape2Param &param) { output->scale[0] = input->scale[0]; return; } - - reshape(input, output); - // } } // namespace operators diff --git a/mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp index 194fd5a305..44aae4be32 100644 --- a/mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp @@ -48,7 +48,7 @@ bool SigmoidKernel::Init(SigmoidParam *param) { template <> void SigmoidKernel::Compute(const SigmoidParam &param) { fpga::PerformBypass(param.FpgaArgs()); - param.Out()->scale[0] = 127.0; + param.Out()->scale[0] = 1.0; } } // namespace operators diff --git a/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp old mode 100644 new mode 100755 index a1500ecdb0..d32dddb307 --- a/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp @@ -30,6 +30,7 @@ bool SliceKernel::Init(SliceParam* param) { } return true; } + template <> void SliceKernel::Compute(const SliceParam& param) { // Only support slicing in channel dimension @@ -38,6 +39,8 @@ void SliceKernel::Compute(const SliceParam& param) { auto input = param.input_; auto output = param.output_; + int H = input->dims()[2]; + int W = input->dims()[3]; int HW = input->dims()[2] * input->dims()[3]; int channel = input->dims()[1]; auto input_ptr = input->data(); @@ -53,10 +56,32 @@ void SliceKernel::Compute(const SliceParam& param) { end = end > channel ?
channel : end; int len = end - start; size_t size = len * sizeof(int8_t); + DLOG << input->fpga_data_num; + fpga::fpga_invalidate(input_ptr, input->fpga_data_num*sizeof(int8_t)); + DLOG << output->fpga_data_num; + fpga::fpga_invalidate(output_ptr, output->fpga_data_num*sizeof(int8_t)); + int unalignedWC = len * W; + int alignedWC = fpga::align_to_x(W * len, IMAGE_ALIGNMENT); - for (int i = 0; i < HW; i++) { - memcpy(output_ptr + len * i, input_ptr + i * channel + start, size); + if (unalignedWC != alignedWC) { + auto tmpOutput = reinterpret_cast<int8_t *> + (fpga::fpga_malloc(len*HW * sizeof(int8_t))); + for (int i = 0; i < HW; i++) { + memcpy(tmpOutput + len * i, input_ptr + i * channel + start, size); + } + for (int i = 0; i < H; i++) { + for (int j = 0; j < unalignedWC; j++) { + *(output_ptr + alignedWC * i + j) = + *(tmpOutput + unalignedWC * i + j); + } + } + fpga::fpga_free(tmpOutput); + } else { + for (int i = 0; i < HW; i++) { + memcpy(output_ptr + len * i, input_ptr + i * channel + start, size); + } } + fpga::fpga_flush(output_ptr, output->fpga_data_num*sizeof(int8_t)); } } // namespace operators } // namespace paddle_mobile diff --git a/mobile/src/operators/math/depthwise_conv3x3.cpp b/mobile/src/operators/math/depthwise_conv3x3.cpp index 11fce28605..4f8b7a7b30 100644 --- a/mobile/src/operators/math/depthwise_conv3x3.cpp +++ b/mobile/src/operators/math/depthwise_conv3x3.cpp @@ -150,7 +150,8 @@ void DepthwiseConv3x3S1(const framework::Tensor &input, const int out_image_size = output_h * output_w; const int valid_h_start = padding_h; const int valid_h_end = output_h - valid_h_start; - const int valid_h = valid_h_end - valid_h_start; + const int valid_h = + valid_h_end - valid_h_start > 0 ? valid_h_end - valid_h_start : 0; const int valid_w_start = padding_w; const int valid_w_end = output_w - valid_w_start; const int valid_w = valid_w_end - valid_w_start; @@ -631,7 +632,7 @@ void DepthwiseConv3x3S1(const framework::Tensor &input, } } // pad bottom - for (int h = valid_h_end; h < output_h; ++h) { + for (int h = valid_h_end; (h < output_h) && (h > valid_h_start - 1); ++h) { DepthwiseConv3x3NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h, input_w, padding_h, padding_w, output_w, output_ptr, _ker); @@ -659,7 +660,8 @@ void DepthwiseConv3x3S2(const framework::Tensor &input, const int valid_h_start = (padding_h + 1) / 2; const int valid_h_end = std::max((input_h + padding_h - 1) / 2, valid_h_start); - const int valid_h = valid_h_end - valid_h_start; + const int valid_h = + valid_h_end - valid_h_start > 0 ? valid_h_end - valid_h_start : 0; const int valid_w_start = (padding_w + 1) / 2; const int valid_w_end = std::max((input_w + padding_w - 1) / 2, valid_w_start); @@ -1045,7 +1047,7 @@ void DepthwiseConv3x3S2(const framework::Tensor &input, } } // pad bottom - for (int h = valid_h_end; h < output_h; ++h) { + for (int h = valid_h_end; (h < output_h) && (h > valid_h_start - 1); ++h) { DepthwiseConv3x3NormalRow<2, 2>(input_ptr, filter_ptr, h, input_h, input_w, padding_h, padding_w, output_w, output_ptr, _ker); -- GitLab
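The pooling, elementwise-add, and depthwise-conv paths in this patch all program the V2 PE by packing several narrow fields into single 64-bit registers (kernel_padding_step at 0x818, result_size_calcu_height at 0x820, mult_factor at 0x840, and so on). The following is a minimal sketch of that packing, not part of the patch: the struct, helper names, and field widths are inferred from the shift-and-OR expressions in ComputeFpgaPool and ComputeDWConv, not from a documented register map.

#include <cstdint>
#include <cstring>

// Field layout implied by the kernel_padding_step expression above
// (bit positions taken from the shifts; field widths are assumptions).
struct KernelPaddingStep {
  uint16_t row_padding_down;    // bits  0..15: image.height + image.pad_height
  uint8_t pad_height;           // bits 16..23
  uint8_t stride_h;             // bits 24..31
  uint8_t kernel_width_sub1;    // bits 32..39: kernel.width - 1
  uint8_t kernel_height;        // bits 40..47
  uint8_t kernel_height_sub1;   // bits 48..55: kernel.height - 1
};

static inline uint64_t pack_kernel_padding_step(const KernelPaddingStep &f) {
  return (uint64_t)f.row_padding_down | ((uint64_t)f.pad_height << 16) |
         ((uint64_t)f.stride_h << 24) | ((uint64_t)f.kernel_width_sub1 << 32) |
         ((uint64_t)f.kernel_height << 40) |
         ((uint64_t)f.kernel_height_sub1 << 48);
}

// The average-pooling mult_factor embeds the raw IEEE-754 bits of the float
// reciprocal; memcpy sidesteps the strict-aliasing concern of the
// reinterpret_cast used in the patch. 1.0f yields 0x3f800000, which is the
// max-pooling default written above.
static inline uint32_t float_bits(float v) {
  uint32_t bits = 0;
  std::memcpy(&bits, &v, sizeof(bits));
  return bits;
}

Packing through a named struct and one helper keeps the bit positions in a single place, which would make it easier to keep ComputeFpgaPool and ComputeDWConv in sync if the register map changes.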
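ComputeCPUEWAdd and ComputeCPUEWAddRelu introduced above differ only in the lower clamp bound (-127 versus 0); both rescale two int8 inputs into the output quantization scale, round, and saturate. A condensed sketch of that shared arithmetic follows, with hypothetical names; it assumes per-tensor float scales as in EWAddArgs and leaves out the cache invalidate/flush calls.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical standalone form of the requantized add used by both CPU
// fallbacks: out = clamp(round(a * scale0/out_scale + b * scale1/out_scale)).
static void ewadd_int8(const int8_t *a, const int8_t *b, int8_t *out, int n,
                       float scale0, float scale1, float out_scale, bool relu) {
  const float c0 = scale0 / out_scale;
  const float c1 = scale1 / out_scale;
  const int lo = relu ? 0 : -127;  // the ReLU variant clamps negatives to zero
  for (int i = 0; i < n; ++i) {
    int v = static_cast<int>(std::round(a[i] * c0 + b[i] * c1));
    out[i] = static_cast<int8_t>(std::min(127, std::max(lo, v)));
  }
}

With a shared helper like this, the two kernels would differ only in the relu flag they pass, which mirrors how ComputeFpgaEWAdd folds relu_enabled into bit 8 of the 0x800 command word.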